//===-- AArch64ISelLowering.cpp - AArch64 DAG Lowering Implementation ----===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file implements the AArch64TargetLowering class.
//
//===----------------------------------------------------------------------===//

#include "AArch64ISelLowering.h"
#include "AArch64CallingConvention.h"
#include "AArch64MachineFunctionInfo.h"
#include "AArch64PerfectShuffle.h"
#include "AArch64Subtarget.h"
#include "AArch64TargetMachine.h"
#include "AArch64TargetObjectFile.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetOptions.h"
using namespace llvm;

#define DEBUG_TYPE "aarch64-lower"

STATISTIC(NumTailCalls, "Number of tail calls");
STATISTIC(NumShiftInserts, "Number of vector shift inserts");

namespace {
enum AlignMode {
  StrictAlign,
  NoStrictAlign
};
}

static cl::opt<AlignMode>
Align(cl::desc("Load/store alignment support"),
      cl::Hidden, cl::init(NoStrictAlign),
      cl::values(
          clEnumValN(StrictAlign, "aarch64-strict-align",
                     "Disallow all unaligned memory accesses"),
          clEnumValN(NoStrictAlign, "aarch64-no-strict-align",
                     "Allow unaligned memory accesses"),
          clEnumValEnd));

// Placeholder until extr generation is tested fully.
static cl::opt<bool>
EnableAArch64ExtrGeneration("aarch64-extr-generation", cl::Hidden,
                            cl::desc("Allow AArch64 (or (shift)(shift))->extract"),
                            cl::init(true));

static cl::opt<bool>
EnableAArch64SlrGeneration("aarch64-shift-insert-generation", cl::Hidden,
                           cl::desc("Allow AArch64 SLI/SRI formation"),
                           cl::init(false));

// FIXME: The necessary dtprel relocations don't seem to be supported
// well in the GNU bfd and gold linkers at the moment. Therefore, by
// default, for now, fall back to GeneralDynamic code generation.
cl::opt<bool> EnableAArch64ELFLocalDynamicTLSGeneration(
    "aarch64-elf-ldtls-generation", cl::Hidden,
    cl::desc("Allow AArch64 Local Dynamic TLS code generation"),
    cl::init(false));

AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM)
    : TargetLowering(TM) {
  Subtarget = &TM.getSubtarget<AArch64Subtarget>();

  // AArch64 doesn't have comparisons which set GPRs or setcc instructions, so
  // we have to make something up. Arbitrarily, choose ZeroOrOne.
  setBooleanContents(ZeroOrOneBooleanContent);
  // When comparing vectors the result sets the different elements in the
  // vector to all-one or all-zero.
  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
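
  // For example, with ZeroOrOneBooleanContent an i32 (setcc eq ...) can be
  // selected as a compare plus "cset w0, eq" (an alias of CSINC from WZR),
  // yielding exactly 0 or 1 (illustrative; the actual selection happens
  // later in this file).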

  // Set up the register classes.
  addRegisterClass(MVT::i32, &AArch64::GPR32allRegClass);
  addRegisterClass(MVT::i64, &AArch64::GPR64allRegClass);

  if (Subtarget->hasFPARMv8()) {
    addRegisterClass(MVT::f16, &AArch64::FPR16RegClass);
    addRegisterClass(MVT::f32, &AArch64::FPR32RegClass);
    addRegisterClass(MVT::f64, &AArch64::FPR64RegClass);
    addRegisterClass(MVT::f128, &AArch64::FPR128RegClass);
  }

  if (Subtarget->hasNEON()) {
    addRegisterClass(MVT::v16i8, &AArch64::FPR8RegClass);
    addRegisterClass(MVT::v8i16, &AArch64::FPR16RegClass);
    // Someone set us up the NEON.
    addDRTypeForNEON(MVT::v2f32);
    addDRTypeForNEON(MVT::v8i8);
    addDRTypeForNEON(MVT::v4i16);
    addDRTypeForNEON(MVT::v2i32);
    addDRTypeForNEON(MVT::v1i64);
    addDRTypeForNEON(MVT::v1f64);
    addDRTypeForNEON(MVT::v4f16);

    addQRTypeForNEON(MVT::v4f32);
    addQRTypeForNEON(MVT::v2f64);
    addQRTypeForNEON(MVT::v16i8);
    addQRTypeForNEON(MVT::v8i16);
    addQRTypeForNEON(MVT::v4i32);
    addQRTypeForNEON(MVT::v2i64);
    addQRTypeForNEON(MVT::v8f16);
  }

  // Compute derived properties from the register classes.
  computeRegisterProperties();

  // Provide all sorts of operation actions.
  setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
  setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
  setOperationAction(ISD::SETCC, MVT::i32, Custom);
  setOperationAction(ISD::SETCC, MVT::i64, Custom);
  setOperationAction(ISD::SETCC, MVT::f32, Custom);
  setOperationAction(ISD::SETCC, MVT::f64, Custom);
  setOperationAction(ISD::BRCOND, MVT::Other, Expand);
  setOperationAction(ISD::BR_CC, MVT::i32, Custom);
  setOperationAction(ISD::BR_CC, MVT::i64, Custom);
  setOperationAction(ISD::BR_CC, MVT::f32, Custom);
  setOperationAction(ISD::BR_CC, MVT::f64, Custom);
  setOperationAction(ISD::SELECT, MVT::i32, Custom);
  setOperationAction(ISD::SELECT, MVT::i64, Custom);
  setOperationAction(ISD::SELECT, MVT::f32, Custom);
  setOperationAction(ISD::SELECT, MVT::f64, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::i64, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::f64, Custom);
  setOperationAction(ISD::BR_JT, MVT::Other, Expand);
  setOperationAction(ISD::JumpTable, MVT::i64, Custom);

  setOperationAction(ISD::SHL_PARTS, MVT::i64, Custom);
  setOperationAction(ISD::SRA_PARTS, MVT::i64, Custom);
  setOperationAction(ISD::SRL_PARTS, MVT::i64, Custom);

  setOperationAction(ISD::FREM, MVT::f32, Expand);
  setOperationAction(ISD::FREM, MVT::f64, Expand);
  setOperationAction(ISD::FREM, MVT::f80, Expand);

  // Custom lowering hooks are needed for XOR
  // to fold it into CSINC/CSINV.
  setOperationAction(ISD::XOR, MVT::i32, Custom);
  setOperationAction(ISD::XOR, MVT::i64, Custom);
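
  // For example, (xor x, (select_cc a, b, cc, 0, -1)) is rewritten by
  // LowerXOR below into a CSEL of x and (xor x, -1), which the instruction
  // selector then matches as a single CSINV.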

  // Virtually no operation on f128 is legal, but LLVM can't expand them when
  // there's a valid register class, so we need custom operations in most
  // cases.
  setOperationAction(ISD::FABS, MVT::f128, Expand);
  setOperationAction(ISD::FADD, MVT::f128, Custom);
  setOperationAction(ISD::FCOPYSIGN, MVT::f128, Expand);
  setOperationAction(ISD::FCOS, MVT::f128, Expand);
  setOperationAction(ISD::FDIV, MVT::f128, Custom);
  setOperationAction(ISD::FMA, MVT::f128, Expand);
  setOperationAction(ISD::FMUL, MVT::f128, Custom);
  setOperationAction(ISD::FNEG, MVT::f128, Expand);
  setOperationAction(ISD::FPOW, MVT::f128, Expand);
  setOperationAction(ISD::FREM, MVT::f128, Expand);
  setOperationAction(ISD::FRINT, MVT::f128, Expand);
  setOperationAction(ISD::FSIN, MVT::f128, Expand);
  setOperationAction(ISD::FSINCOS, MVT::f128, Expand);
  setOperationAction(ISD::FSQRT, MVT::f128, Expand);
  setOperationAction(ISD::FSUB, MVT::f128, Custom);
  setOperationAction(ISD::FTRUNC, MVT::f128, Expand);
  setOperationAction(ISD::SETCC, MVT::f128, Custom);
  setOperationAction(ISD::BR_CC, MVT::f128, Custom);
  setOperationAction(ISD::SELECT, MVT::f128, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::f128, Custom);
  setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom);

  // Lowering for many of the conversions is actually specified by the non-f128
  // type. The LowerXXX function will be trivial when f128 isn't involved.
  setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
  setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
  setOperationAction(ISD::FP_TO_SINT, MVT::i128, Custom);
  setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
  setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
  setOperationAction(ISD::FP_TO_UINT, MVT::i128, Custom);
  setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
  setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
  setOperationAction(ISD::SINT_TO_FP, MVT::i128, Custom);
  setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
  setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
  setOperationAction(ISD::UINT_TO_FP, MVT::i128, Custom);
  setOperationAction(ISD::FP_ROUND, MVT::f32, Custom);
  setOperationAction(ISD::FP_ROUND, MVT::f64, Custom);
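
  // The custom f128 arithmetic above ends up as soft-float libcalls; e.g. an
  // (fadd f128 %a, %b) goes through LowerF128Call (defined below) and becomes
  // a call to __addtf3.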

  // Variable arguments.
  setOperationAction(ISD::VASTART, MVT::Other, Custom);
  setOperationAction(ISD::VAARG, MVT::Other, Custom);
  setOperationAction(ISD::VACOPY, MVT::Other, Custom);
  setOperationAction(ISD::VAEND, MVT::Other, Expand);

  // Variable-sized objects.
  setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
  setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand);

  // Exception handling.
  // FIXME: These are guesses. Has this been defined yet?
  setExceptionPointerRegister(AArch64::X0);
  setExceptionSelectorRegister(AArch64::X1);

  // Constant pool entries
  setOperationAction(ISD::ConstantPool, MVT::i64, Custom);

  // BlockAddress
  setOperationAction(ISD::BlockAddress, MVT::i64, Custom);

  // Add/Sub overflow ops with MVT::Glue are lowered to NZCV dependences.
  setOperationAction(ISD::ADDC, MVT::i32, Custom);
  setOperationAction(ISD::ADDE, MVT::i32, Custom);
  setOperationAction(ISD::SUBC, MVT::i32, Custom);
  setOperationAction(ISD::SUBE, MVT::i32, Custom);
  setOperationAction(ISD::ADDC, MVT::i64, Custom);
  setOperationAction(ISD::ADDE, MVT::i64, Custom);
  setOperationAction(ISD::SUBC, MVT::i64, Custom);
  setOperationAction(ISD::SUBE, MVT::i64, Custom);

  // AArch64 lacks both left-rotate and popcount instructions.
  setOperationAction(ISD::ROTL, MVT::i32, Expand);
  setOperationAction(ISD::ROTL, MVT::i64, Expand);

  // AArch64 doesn't have {U|S}MUL_LOHI.
  setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
  setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);

  // Expand the undefined-at-zero variants of cttz/ctlz to their
  // defined-at-zero counterparts, which AArch64 supports directly.
  setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Expand);
  setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Expand);
  setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand);
  setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Expand);

  setOperationAction(ISD::CTPOP, MVT::i32, Custom);
  setOperationAction(ISD::CTPOP, MVT::i64, Custom);

  setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
  setOperationAction(ISD::SDIVREM, MVT::i64, Expand);
  setOperationAction(ISD::SREM, MVT::i32, Expand);
  setOperationAction(ISD::SREM, MVT::i64, Expand);
  setOperationAction(ISD::UDIVREM, MVT::i32, Expand);
  setOperationAction(ISD::UDIVREM, MVT::i64, Expand);
  setOperationAction(ISD::UREM, MVT::i32, Expand);
  setOperationAction(ISD::UREM, MVT::i64, Expand);

  // Custom lower Add/Sub/Mul with overflow.
  setOperationAction(ISD::SADDO, MVT::i32, Custom);
  setOperationAction(ISD::SADDO, MVT::i64, Custom);
  setOperationAction(ISD::UADDO, MVT::i32, Custom);
  setOperationAction(ISD::UADDO, MVT::i64, Custom);
  setOperationAction(ISD::SSUBO, MVT::i32, Custom);
  setOperationAction(ISD::SSUBO, MVT::i64, Custom);
  setOperationAction(ISD::USUBO, MVT::i32, Custom);
  setOperationAction(ISD::USUBO, MVT::i64, Custom);
  setOperationAction(ISD::SMULO, MVT::i32, Custom);
  setOperationAction(ISD::SMULO, MVT::i64, Custom);
  setOperationAction(ISD::UMULO, MVT::i32, Custom);
  setOperationAction(ISD::UMULO, MVT::i64, Custom);

  setOperationAction(ISD::FSIN, MVT::f32, Expand);
  setOperationAction(ISD::FSIN, MVT::f64, Expand);
  setOperationAction(ISD::FCOS, MVT::f32, Expand);
  setOperationAction(ISD::FCOS, MVT::f64, Expand);
  setOperationAction(ISD::FPOW, MVT::f32, Expand);
  setOperationAction(ISD::FPOW, MVT::f64, Expand);
  setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
  setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);

  // f16 is storage-only, so we promote operations to f32 if we know this is
  // valid, and ignore them otherwise. The operations not mentioned here will
  // fail to select, but this is not a major problem as no source language
  // should be emitting native f16 operations yet.
  setOperationAction(ISD::FADD, MVT::f16, Promote);
  setOperationAction(ISD::FDIV, MVT::f16, Promote);
  setOperationAction(ISD::FMUL, MVT::f16, Promote);
  setOperationAction(ISD::FSUB, MVT::f16, Promote);
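
  // Under the f16 promotion above, scalar half arithmetic is done in f32;
  // e.g. (fadd f16 %a, %b) plausibly becomes
  //   fcvt s0, h0 ; fcvt s1, h1 ; fadd s0, s0, s1 ; fcvt h0, s0
  // rather than failing to select.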

  // v4f16 is also a storage-only type, so promote it to v4f32 when that is
  // known to be safe.
  setOperationAction(ISD::FADD, MVT::v4f16, Promote);
  setOperationAction(ISD::FSUB, MVT::v4f16, Promote);
  setOperationAction(ISD::FMUL, MVT::v4f16, Promote);
  setOperationAction(ISD::FDIV, MVT::v4f16, Promote);
  setOperationAction(ISD::FP_EXTEND, MVT::v4f16, Promote);
  setOperationAction(ISD::FP_ROUND, MVT::v4f16, Promote);
  AddPromotedToType(ISD::FADD, MVT::v4f16, MVT::v4f32);
  AddPromotedToType(ISD::FSUB, MVT::v4f16, MVT::v4f32);
  AddPromotedToType(ISD::FMUL, MVT::v4f16, MVT::v4f32);
  AddPromotedToType(ISD::FDIV, MVT::v4f16, MVT::v4f32);
  AddPromotedToType(ISD::FP_EXTEND, MVT::v4f16, MVT::v4f32);
  AddPromotedToType(ISD::FP_ROUND, MVT::v4f16, MVT::v4f32);

  // Expand all other v4f16 operations.
  // FIXME: We could generate better code by promoting some operations to
  // a pair of v4f32s.
  setOperationAction(ISD::FABS, MVT::v4f16, Expand);
  setOperationAction(ISD::FCEIL, MVT::v4f16, Expand);
  setOperationAction(ISD::FCOPYSIGN, MVT::v4f16, Expand);
  setOperationAction(ISD::FCOS, MVT::v4f16, Expand);
  setOperationAction(ISD::FFLOOR, MVT::v4f16, Expand);
  setOperationAction(ISD::FMA, MVT::v4f16, Expand);
  setOperationAction(ISD::FNEARBYINT, MVT::v4f16, Expand);
  setOperationAction(ISD::FNEG, MVT::v4f16, Expand);
  setOperationAction(ISD::FPOW, MVT::v4f16, Expand);
  setOperationAction(ISD::FPOWI, MVT::v4f16, Expand);
  setOperationAction(ISD::FREM, MVT::v4f16, Expand);
  setOperationAction(ISD::FROUND, MVT::v4f16, Expand);
  setOperationAction(ISD::FRINT, MVT::v4f16, Expand);
  setOperationAction(ISD::FSIN, MVT::v4f16, Expand);
  setOperationAction(ISD::FSINCOS, MVT::v4f16, Expand);
  setOperationAction(ISD::FSQRT, MVT::v4f16, Expand);
  setOperationAction(ISD::FTRUNC, MVT::v4f16, Expand);
  setOperationAction(ISD::SETCC, MVT::v4f16, Expand);
  setOperationAction(ISD::BR_CC, MVT::v4f16, Expand);
  setOperationAction(ISD::SELECT, MVT::v4f16, Expand);
  setOperationAction(ISD::SELECT_CC, MVT::v4f16, Expand);
  setOperationAction(ISD::FEXP, MVT::v4f16, Expand);
  setOperationAction(ISD::FEXP2, MVT::v4f16, Expand);
  setOperationAction(ISD::FLOG, MVT::v4f16, Expand);
  setOperationAction(ISD::FLOG2, MVT::v4f16, Expand);
  setOperationAction(ISD::FLOG10, MVT::v4f16, Expand);
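
  // Of the v4f16 handling above: the promoted operations are performed in
  // v4f32 (roughly FCVTL the inputs to .4s, operate, FCVTN back to .4h),
  // while the expanded ones fall back to scalarized code.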

  // v8f16 is also a storage-only type, so expand it.
  setOperationAction(ISD::FABS, MVT::v8f16, Expand);
  setOperationAction(ISD::FADD, MVT::v8f16, Expand);
  setOperationAction(ISD::FCEIL, MVT::v8f16, Expand);
  setOperationAction(ISD::FCOPYSIGN, MVT::v8f16, Expand);
  setOperationAction(ISD::FCOS, MVT::v8f16, Expand);
  setOperationAction(ISD::FDIV, MVT::v8f16, Expand);
  setOperationAction(ISD::FFLOOR, MVT::v8f16, Expand);
  setOperationAction(ISD::FMA, MVT::v8f16, Expand);
  setOperationAction(ISD::FMUL, MVT::v8f16, Expand);
  setOperationAction(ISD::FNEARBYINT, MVT::v8f16, Expand);
  setOperationAction(ISD::FNEG, MVT::v8f16, Expand);
  setOperationAction(ISD::FPOW, MVT::v8f16, Expand);
  setOperationAction(ISD::FPOWI, MVT::v8f16, Expand);
  setOperationAction(ISD::FREM, MVT::v8f16, Expand);
  setOperationAction(ISD::FROUND, MVT::v8f16, Expand);
  setOperationAction(ISD::FRINT, MVT::v8f16, Expand);
  setOperationAction(ISD::FSIN, MVT::v8f16, Expand);
  setOperationAction(ISD::FSINCOS, MVT::v8f16, Expand);
  setOperationAction(ISD::FSQRT, MVT::v8f16, Expand);
  setOperationAction(ISD::FSUB, MVT::v8f16, Expand);
  setOperationAction(ISD::FTRUNC, MVT::v8f16, Expand);
  setOperationAction(ISD::SETCC, MVT::v8f16, Expand);
  setOperationAction(ISD::BR_CC, MVT::v8f16, Expand);
  setOperationAction(ISD::SELECT, MVT::v8f16, Expand);
  setOperationAction(ISD::SELECT_CC, MVT::v8f16, Expand);
  setOperationAction(ISD::FP_EXTEND, MVT::v8f16, Expand);
  setOperationAction(ISD::FEXP, MVT::v8f16, Expand);
  setOperationAction(ISD::FEXP2, MVT::v8f16, Expand);
  setOperationAction(ISD::FLOG, MVT::v8f16, Expand);
  setOperationAction(ISD::FLOG2, MVT::v8f16, Expand);
  setOperationAction(ISD::FLOG10, MVT::v8f16, Expand);

  // AArch64 has implementations of a lot of rounding-like FP operations.
  static MVT RoundingTypes[] = { MVT::f32, MVT::f64 };
  for (unsigned I = 0; I < array_lengthof(RoundingTypes); ++I) {
    MVT Ty = RoundingTypes[I];
    setOperationAction(ISD::FFLOOR, Ty, Legal);
    setOperationAction(ISD::FNEARBYINT, Ty, Legal);
    setOperationAction(ISD::FCEIL, Ty, Legal);
    setOperationAction(ISD::FRINT, Ty, Legal);
    setOperationAction(ISD::FTRUNC, Ty, Legal);
    setOperationAction(ISD::FROUND, Ty, Legal);
  }

  setOperationAction(ISD::PREFETCH, MVT::Other, Custom);

  if (Subtarget->isTargetMachO()) {
    // For iOS, we don't want the normal expansion of a libcall to sincos; we
    // want to issue a libcall to __sincos_stret to avoid memory traffic.
    setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
    setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
  } else {
    setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
    setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
  }

  // Make floating-point constants legal for the large code model, so they
  // don't become loads from the constant pool.
  if (Subtarget->isTargetMachO() && TM.getCodeModel() == CodeModel::Large) {
    setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
    setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
  }
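
  // Rationale (assumed): in the large code model the constant pool's address
  // itself needs a MOVZ/MOVK sequence, so materializing the FP bits directly
  // (e.g. mov x8, #bits ; fmov d0, x8) is no worse than the load.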

  // AArch64 does not have floating-point extending loads, i1 sign-extending
  // load, floating-point truncating stores, or v2i32->v2i16 truncating store.
  for (MVT VT : MVT::fp_valuetypes()) {
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::f64, Expand);
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::f80, Expand);
  }
  for (MVT VT : MVT::integer_valuetypes())
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Expand);

  setTruncStoreAction(MVT::f32, MVT::f16, Expand);
  setTruncStoreAction(MVT::f64, MVT::f32, Expand);
  setTruncStoreAction(MVT::f64, MVT::f16, Expand);
  setTruncStoreAction(MVT::f128, MVT::f80, Expand);
  setTruncStoreAction(MVT::f128, MVT::f64, Expand);
  setTruncStoreAction(MVT::f128, MVT::f32, Expand);
  setTruncStoreAction(MVT::f128, MVT::f16, Expand);

  setOperationAction(ISD::BITCAST, MVT::i16, Custom);
  setOperationAction(ISD::BITCAST, MVT::f16, Custom);

  // Indexed loads and stores are supported.
  for (unsigned im = (unsigned)ISD::PRE_INC;
       im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
    setIndexedLoadAction(im, MVT::i8, Legal);
    setIndexedLoadAction(im, MVT::i16, Legal);
    setIndexedLoadAction(im, MVT::i32, Legal);
    setIndexedLoadAction(im, MVT::i64, Legal);
    setIndexedLoadAction(im, MVT::f64, Legal);
    setIndexedLoadAction(im, MVT::f32, Legal);
    setIndexedStoreAction(im, MVT::i8, Legal);
    setIndexedStoreAction(im, MVT::i16, Legal);
    setIndexedStoreAction(im, MVT::i32, Legal);
    setIndexedStoreAction(im, MVT::i64, Legal);
    setIndexedStoreAction(im, MVT::f64, Legal);
    setIndexedStoreAction(im, MVT::f32, Legal);
  }

  // Trap.
  setOperationAction(ISD::TRAP, MVT::Other, Legal);

  // We combine OR nodes for bitfield operations.
  setTargetDAGCombine(ISD::OR);
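  // For example, (or (and X, 0xffff0000), (and Y, 0x0000ffff)) can become a
  // single bitfield move (BFI/BFXIL) instead of two ANDs and an ORR.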

  // Vector add and sub nodes may conceal a high-half opportunity.
  // Also, try to fold ADD into CSINC/CSINV.
  setTargetDAGCombine(ISD::ADD);
  setTargetDAGCombine(ISD::SUB);

  setTargetDAGCombine(ISD::XOR);
  setTargetDAGCombine(ISD::SINT_TO_FP);
  setTargetDAGCombine(ISD::UINT_TO_FP);

  setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);

  setTargetDAGCombine(ISD::ANY_EXTEND);
  setTargetDAGCombine(ISD::ZERO_EXTEND);
  setTargetDAGCombine(ISD::SIGN_EXTEND);
  setTargetDAGCombine(ISD::BITCAST);
  setTargetDAGCombine(ISD::CONCAT_VECTORS);
  setTargetDAGCombine(ISD::STORE);

  setTargetDAGCombine(ISD::MUL);

  setTargetDAGCombine(ISD::SELECT);
  setTargetDAGCombine(ISD::VSELECT);

  setTargetDAGCombine(ISD::INTRINSIC_VOID);
  setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN);
  setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);

  MaxStoresPerMemset = MaxStoresPerMemsetOptSize = 8;
  MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = 4;
  MaxStoresPerMemmove = MaxStoresPerMemmoveOptSize = 4;

  setStackPointerRegisterToSaveRestore(AArch64::SP);

  setSchedulingPreference(Sched::Hybrid);

  // Enable TBZ/TBNZ.
  MaskAndBranchFoldingIsLegal = true;

  setMinFunctionAlignment(2);

  RequireStrictAlign = (Align == StrictAlign);

  setHasExtractBitsInsn(true);

  if (Subtarget->hasNEON()) {
    // FIXME: v1f64 shouldn't be legal if we can avoid it, because it leads to
    // silliness like this:
    setOperationAction(ISD::FABS, MVT::v1f64, Expand);
    setOperationAction(ISD::FADD, MVT::v1f64, Expand);
    setOperationAction(ISD::FCEIL, MVT::v1f64, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::v1f64, Expand);
    setOperationAction(ISD::FCOS, MVT::v1f64, Expand);
    setOperationAction(ISD::FDIV, MVT::v1f64, Expand);
    setOperationAction(ISD::FFLOOR, MVT::v1f64, Expand);
    setOperationAction(ISD::FMA, MVT::v1f64, Expand);
    setOperationAction(ISD::FMUL, MVT::v1f64, Expand);
    setOperationAction(ISD::FNEARBYINT, MVT::v1f64, Expand);
    setOperationAction(ISD::FNEG, MVT::v1f64, Expand);
    setOperationAction(ISD::FPOW, MVT::v1f64, Expand);
    setOperationAction(ISD::FREM, MVT::v1f64, Expand);
    setOperationAction(ISD::FROUND, MVT::v1f64, Expand);
    setOperationAction(ISD::FRINT, MVT::v1f64, Expand);
    setOperationAction(ISD::FSIN, MVT::v1f64, Expand);
    setOperationAction(ISD::FSINCOS, MVT::v1f64, Expand);
    setOperationAction(ISD::FSQRT, MVT::v1f64, Expand);
    setOperationAction(ISD::FSUB, MVT::v1f64, Expand);
    setOperationAction(ISD::FTRUNC, MVT::v1f64, Expand);
    setOperationAction(ISD::SETCC, MVT::v1f64, Expand);
    setOperationAction(ISD::BR_CC, MVT::v1f64, Expand);
    setOperationAction(ISD::SELECT, MVT::v1f64, Expand);
    setOperationAction(ISD::SELECT_CC, MVT::v1f64, Expand);
    setOperationAction(ISD::FP_EXTEND, MVT::v1f64, Expand);

    setOperationAction(ISD::FP_TO_SINT, MVT::v1i64, Expand);
    setOperationAction(ISD::FP_TO_UINT, MVT::v1i64, Expand);
    setOperationAction(ISD::SINT_TO_FP, MVT::v1i64, Expand);
    setOperationAction(ISD::UINT_TO_FP, MVT::v1i64, Expand);
    setOperationAction(ISD::FP_ROUND, MVT::v1f64, Expand);

    setOperationAction(ISD::MUL, MVT::v1i64, Expand);

    // AArch64 doesn't have direct vector->f32 conversion instructions for
    // elements smaller than i32, so promote the input to i32 first.
    setOperationAction(ISD::UINT_TO_FP, MVT::v4i8, Promote);
    setOperationAction(ISD::SINT_TO_FP, MVT::v4i8, Promote);
    setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Promote);
    setOperationAction(ISD::SINT_TO_FP, MVT::v4i16, Promote);
    // Similarly, there is no direct i32 -> f64 vector conversion instruction.
    setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom);
    setOperationAction(ISD::SINT_TO_FP, MVT::v2i64, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v2i64, Custom);

    // AArch64 doesn't have MUL.2d:
    setOperationAction(ISD::MUL, MVT::v2i64, Expand);
    // Custom handling for some quad-vector types to detect MULL.
    setOperationAction(ISD::MUL, MVT::v8i16, Custom);
    setOperationAction(ISD::MUL, MVT::v4i32, Custom);
    setOperationAction(ISD::MUL, MVT::v2i64, Custom);

    setOperationAction(ISD::ANY_EXTEND, MVT::v4i32, Legal);
    setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
    // Likewise, narrowing and extending vector loads/stores aren't handled
    // directly.
    for (MVT VT : MVT::vector_valuetypes()) {
      setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);

      setOperationAction(ISD::MULHS, VT, Expand);
      setOperationAction(ISD::SMUL_LOHI, VT, Expand);
      setOperationAction(ISD::MULHU, VT, Expand);
      setOperationAction(ISD::UMUL_LOHI, VT, Expand);

      setOperationAction(ISD::BSWAP, VT, Expand);

      for (MVT InnerVT : MVT::vector_valuetypes()) {
        setTruncStoreAction(VT, InnerVT, Expand);
        setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
        setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
        setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
      }
    }

    // AArch64 has implementations of a lot of rounding-like FP operations.
    static MVT RoundingVecTypes[] = { MVT::v2f32, MVT::v4f32, MVT::v2f64 };
    for (unsigned I = 0; I < array_lengthof(RoundingVecTypes); ++I) {
      MVT Ty = RoundingVecTypes[I];
      setOperationAction(ISD::FFLOOR, Ty, Legal);
      setOperationAction(ISD::FNEARBYINT, Ty, Legal);
      setOperationAction(ISD::FCEIL, Ty, Legal);
      setOperationAction(ISD::FRINT, Ty, Legal);
      setOperationAction(ISD::FTRUNC, Ty, Legal);
      setOperationAction(ISD::FROUND, Ty, Legal);
    }
  }

  // Prefer likely predicted branches to selects on out-of-order cores.
  if (Subtarget->isCortexA57())
    PredictableSelectIsExpensive = true;
}

void AArch64TargetLowering::addTypeForNEON(EVT VT, EVT PromotedBitwiseVT) {
  if (VT == MVT::v2f32 || VT == MVT::v4f16) {
    setOperationAction(ISD::LOAD, VT.getSimpleVT(), Promote);
    AddPromotedToType(ISD::LOAD, VT.getSimpleVT(), MVT::v2i32);

    setOperationAction(ISD::STORE, VT.getSimpleVT(), Promote);
    AddPromotedToType(ISD::STORE, VT.getSimpleVT(), MVT::v2i32);
  } else if (VT == MVT::v2f64 || VT == MVT::v4f32 || VT == MVT::v8f16) {
    setOperationAction(ISD::LOAD, VT.getSimpleVT(), Promote);
    AddPromotedToType(ISD::LOAD, VT.getSimpleVT(), MVT::v2i64);

    setOperationAction(ISD::STORE, VT.getSimpleVT(), Promote);
    AddPromotedToType(ISD::STORE, VT.getSimpleVT(), MVT::v2i64);
  }
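
  // The promotion above keeps the bits unchanged (the same 64- or 128-bit
  // LDR/STR is used either way); it is assumed to exist so a single set of
  // integer-vector load/store patterns also covers the float-vector types.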

  // Mark vector float intrinsics as expand.
  if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64) {
    setOperationAction(ISD::FSIN, VT.getSimpleVT(), Expand);
    setOperationAction(ISD::FCOS, VT.getSimpleVT(), Expand);
    setOperationAction(ISD::FPOWI, VT.getSimpleVT(), Expand);
    setOperationAction(ISD::FPOW, VT.getSimpleVT(), Expand);
    setOperationAction(ISD::FLOG, VT.getSimpleVT(), Expand);
    setOperationAction(ISD::FLOG2, VT.getSimpleVT(), Expand);
    setOperationAction(ISD::FLOG10, VT.getSimpleVT(), Expand);
    setOperationAction(ISD::FEXP, VT.getSimpleVT(), Expand);
    setOperationAction(ISD::FEXP2, VT.getSimpleVT(), Expand);
  }

  setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT.getSimpleVT(), Custom);
  setOperationAction(ISD::INSERT_VECTOR_ELT, VT.getSimpleVT(), Custom);
  setOperationAction(ISD::BUILD_VECTOR, VT.getSimpleVT(), Custom);
  setOperationAction(ISD::VECTOR_SHUFFLE, VT.getSimpleVT(), Custom);
  setOperationAction(ISD::EXTRACT_SUBVECTOR, VT.getSimpleVT(), Custom);
  setOperationAction(ISD::SRA, VT.getSimpleVT(), Custom);
  setOperationAction(ISD::SRL, VT.getSimpleVT(), Custom);
  setOperationAction(ISD::SHL, VT.getSimpleVT(), Custom);
  setOperationAction(ISD::AND, VT.getSimpleVT(), Custom);
  setOperationAction(ISD::OR, VT.getSimpleVT(), Custom);
  setOperationAction(ISD::SETCC, VT.getSimpleVT(), Custom);
  setOperationAction(ISD::CONCAT_VECTORS, VT.getSimpleVT(), Legal);

  setOperationAction(ISD::SELECT, VT.getSimpleVT(), Expand);
  setOperationAction(ISD::SELECT_CC, VT.getSimpleVT(), Expand);
  setOperationAction(ISD::VSELECT, VT.getSimpleVT(), Expand);
  for (MVT InnerVT : MVT::all_valuetypes())
    setLoadExtAction(ISD::EXTLOAD, InnerVT, VT.getSimpleVT(), Expand);

  // CNT supports only B element sizes.
  if (VT != MVT::v8i8 && VT != MVT::v16i8)
    setOperationAction(ISD::CTPOP, VT.getSimpleVT(), Expand);

  setOperationAction(ISD::UDIV, VT.getSimpleVT(), Expand);
  setOperationAction(ISD::SDIV, VT.getSimpleVT(), Expand);
  setOperationAction(ISD::UREM, VT.getSimpleVT(), Expand);
  setOperationAction(ISD::SREM, VT.getSimpleVT(), Expand);
  setOperationAction(ISD::FREM, VT.getSimpleVT(), Expand);

  setOperationAction(ISD::FP_TO_SINT, VT.getSimpleVT(), Custom);
  setOperationAction(ISD::FP_TO_UINT, VT.getSimpleVT(), Custom);

  if (Subtarget->isLittleEndian()) {
    for (unsigned im = (unsigned)ISD::PRE_INC;
         im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
      setIndexedLoadAction(im, VT.getSimpleVT(), Legal);
      setIndexedStoreAction(im, VT.getSimpleVT(), Legal);
    }
  }
}

void AArch64TargetLowering::addDRTypeForNEON(MVT VT) {
  addRegisterClass(VT, &AArch64::FPR64RegClass);
  addTypeForNEON(VT, MVT::v2i32);
}

void AArch64TargetLowering::addQRTypeForNEON(MVT VT) {
  addRegisterClass(VT, &AArch64::FPR128RegClass);
  addTypeForNEON(VT, MVT::v4i32);
}

EVT AArch64TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
  if (!VT.isVector())
    return MVT::i32;
  return VT.changeVectorElementTypeToInteger();
}
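
// For example, getSetCCResultType(v2f64) is v2i64: vector compares such as
// FCMEQ leave an all-ones or all-zero mask of the element width in each lane.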

/// computeKnownBitsForTargetNode - Determine which of the bits specified in
/// Mask are known to be either zero or one and return them in the
/// KnownZero/KnownOne bitsets.
void AArch64TargetLowering::computeKnownBitsForTargetNode(
    const SDValue Op, APInt &KnownZero, APInt &KnownOne,
    const SelectionDAG &DAG, unsigned Depth) const {
  switch (Op.getOpcode()) {
  default:
    break;
  case AArch64ISD::CSEL: {
    APInt KnownZero2, KnownOne2;
    DAG.computeKnownBits(Op->getOperand(0), KnownZero, KnownOne, Depth + 1);
    DAG.computeKnownBits(Op->getOperand(1), KnownZero2, KnownOne2, Depth + 1);
    KnownZero &= KnownZero2;
    KnownOne &= KnownOne2;
    break;
  }
  case ISD::INTRINSIC_W_CHAIN: {
    ConstantSDNode *CN = cast<ConstantSDNode>(Op->getOperand(1));
    Intrinsic::ID IntID = static_cast<Intrinsic::ID>(CN->getZExtValue());
    switch (IntID) {
    default: return;
    case Intrinsic::aarch64_ldaxr:
    case Intrinsic::aarch64_ldxr: {
      unsigned BitWidth = KnownOne.getBitWidth();
      EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
      unsigned MemBits = VT.getScalarType().getSizeInBits();
      KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
      return;
    }
    }
    break;
  }
  case ISD::INTRINSIC_WO_CHAIN:
  case ISD::INTRINSIC_VOID: {
    unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
    switch (IntNo) {
    default:
      break;
    case Intrinsic::aarch64_neon_umaxv:
    case Intrinsic::aarch64_neon_uminv: {
      // Figure out the datatype of the vector operand. The UMINV instruction
      // will zero extend the result, so we can mark as known zero all the
      // bits larger than the element datatype. 32-bit or larger doesn't need
      // this as those are legal types and will be handled by isel directly.
      MVT VT = Op.getOperand(1).getValueType().getSimpleVT();
      unsigned BitWidth = KnownZero.getBitWidth();
      if (VT == MVT::v8i8 || VT == MVT::v16i8) {
        assert(BitWidth >= 8 && "Unexpected width!");
        APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 8);
        KnownZero |= Mask;
      } else if (VT == MVT::v4i16 || VT == MVT::v8i16) {
        assert(BitWidth >= 16 && "Unexpected width!");
        APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 16);
        KnownZero |= Mask;
      }
      break;
    } break;
    }
  }
  }
}
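
// For example, an i32 (aarch64_neon_umaxv (v8i8 ...)) can only produce values
// in 0..255, so the code above reports bits 8..31 as known zero.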

MVT AArch64TargetLowering::getScalarShiftAmountTy(EVT LHSTy) const {
  return MVT::i64;
}

unsigned AArch64TargetLowering::getMaximalGlobalOffset() const {
  // FIXME: On AArch64, this depends on the type.
  // Basically, the addressable offsets are up to 4095 * Ty.getSizeInBytes(),
  // and the offset has to be a multiple of the related size in bytes.
  return 4095;
}

FastISel *
AArch64TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
                                      const TargetLibraryInfo *libInfo) const {
  return AArch64::createFastISel(funcInfo, libInfo);
}

const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
  switch (Opcode) {
  default:
    return nullptr;
  case AArch64ISD::CALL: return "AArch64ISD::CALL";
  case AArch64ISD::ADRP: return "AArch64ISD::ADRP";
  case AArch64ISD::ADDlow: return "AArch64ISD::ADDlow";
  case AArch64ISD::LOADgot: return "AArch64ISD::LOADgot";
  case AArch64ISD::RET_FLAG: return "AArch64ISD::RET_FLAG";
  case AArch64ISD::BRCOND: return "AArch64ISD::BRCOND";
  case AArch64ISD::CSEL: return "AArch64ISD::CSEL";
  case AArch64ISD::FCSEL: return "AArch64ISD::FCSEL";
  case AArch64ISD::CSINV: return "AArch64ISD::CSINV";
  case AArch64ISD::CSNEG: return "AArch64ISD::CSNEG";
  case AArch64ISD::CSINC: return "AArch64ISD::CSINC";
  case AArch64ISD::THREAD_POINTER: return "AArch64ISD::THREAD_POINTER";
  case AArch64ISD::TLSDESC_CALLSEQ: return "AArch64ISD::TLSDESC_CALLSEQ";
  case AArch64ISD::ADC: return "AArch64ISD::ADC";
  case AArch64ISD::SBC: return "AArch64ISD::SBC";
  case AArch64ISD::ADDS: return "AArch64ISD::ADDS";
  case AArch64ISD::SUBS: return "AArch64ISD::SUBS";
  case AArch64ISD::ADCS: return "AArch64ISD::ADCS";
  case AArch64ISD::SBCS: return "AArch64ISD::SBCS";
  case AArch64ISD::ANDS: return "AArch64ISD::ANDS";
  case AArch64ISD::FCMP: return "AArch64ISD::FCMP";
  case AArch64ISD::FMIN: return "AArch64ISD::FMIN";
  case AArch64ISD::FMAX: return "AArch64ISD::FMAX";
  case AArch64ISD::DUP: return "AArch64ISD::DUP";
  case AArch64ISD::DUPLANE8: return "AArch64ISD::DUPLANE8";
  case AArch64ISD::DUPLANE16: return "AArch64ISD::DUPLANE16";
  case AArch64ISD::DUPLANE32: return "AArch64ISD::DUPLANE32";
  case AArch64ISD::DUPLANE64: return "AArch64ISD::DUPLANE64";
  case AArch64ISD::MOVI: return "AArch64ISD::MOVI";
  case AArch64ISD::MOVIshift: return "AArch64ISD::MOVIshift";
  case AArch64ISD::MOVIedit: return "AArch64ISD::MOVIedit";
  case AArch64ISD::MOVImsl: return "AArch64ISD::MOVImsl";
  case AArch64ISD::FMOV: return "AArch64ISD::FMOV";
  case AArch64ISD::MVNIshift: return "AArch64ISD::MVNIshift";
  case AArch64ISD::MVNImsl: return "AArch64ISD::MVNImsl";
  case AArch64ISD::BICi: return "AArch64ISD::BICi";
  case AArch64ISD::ORRi: return "AArch64ISD::ORRi";
  case AArch64ISD::BSL: return "AArch64ISD::BSL";
  case AArch64ISD::NEG: return "AArch64ISD::NEG";
  case AArch64ISD::EXTR: return "AArch64ISD::EXTR";
  case AArch64ISD::ZIP1: return "AArch64ISD::ZIP1";
  case AArch64ISD::ZIP2: return "AArch64ISD::ZIP2";
  case AArch64ISD::UZP1: return "AArch64ISD::UZP1";
  case AArch64ISD::UZP2: return "AArch64ISD::UZP2";
  case AArch64ISD::TRN1: return "AArch64ISD::TRN1";
  case AArch64ISD::TRN2: return "AArch64ISD::TRN2";
  case AArch64ISD::REV16: return "AArch64ISD::REV16";
  case AArch64ISD::REV32: return "AArch64ISD::REV32";
  case AArch64ISD::REV64: return "AArch64ISD::REV64";
  case AArch64ISD::EXT: return "AArch64ISD::EXT";
  case AArch64ISD::VSHL: return "AArch64ISD::VSHL";
  case AArch64ISD::VLSHR: return "AArch64ISD::VLSHR";
  case AArch64ISD::VASHR: return "AArch64ISD::VASHR";
  case AArch64ISD::CMEQ: return "AArch64ISD::CMEQ";
  case AArch64ISD::CMGE: return "AArch64ISD::CMGE";
  case AArch64ISD::CMGT: return "AArch64ISD::CMGT";
  case AArch64ISD::CMHI: return "AArch64ISD::CMHI";
  case AArch64ISD::CMHS: return "AArch64ISD::CMHS";
  case AArch64ISD::FCMEQ: return "AArch64ISD::FCMEQ";
  case AArch64ISD::FCMGE: return "AArch64ISD::FCMGE";
  case AArch64ISD::FCMGT: return "AArch64ISD::FCMGT";
  case AArch64ISD::CMEQz: return "AArch64ISD::CMEQz";
  case AArch64ISD::CMGEz: return "AArch64ISD::CMGEz";
  case AArch64ISD::CMGTz: return "AArch64ISD::CMGTz";
  case AArch64ISD::CMLEz: return "AArch64ISD::CMLEz";
  case AArch64ISD::CMLTz: return "AArch64ISD::CMLTz";
  case AArch64ISD::FCMEQz: return "AArch64ISD::FCMEQz";
  case AArch64ISD::FCMGEz: return "AArch64ISD::FCMGEz";
  case AArch64ISD::FCMGTz: return "AArch64ISD::FCMGTz";
  case AArch64ISD::FCMLEz: return "AArch64ISD::FCMLEz";
  case AArch64ISD::FCMLTz: return "AArch64ISD::FCMLTz";
  case AArch64ISD::NOT: return "AArch64ISD::NOT";
  case AArch64ISD::BIT: return "AArch64ISD::BIT";
  case AArch64ISD::CBZ: return "AArch64ISD::CBZ";
  case AArch64ISD::CBNZ: return "AArch64ISD::CBNZ";
  case AArch64ISD::TBZ: return "AArch64ISD::TBZ";
  case AArch64ISD::TBNZ: return "AArch64ISD::TBNZ";
  case AArch64ISD::TC_RETURN: return "AArch64ISD::TC_RETURN";
  case AArch64ISD::SITOF: return "AArch64ISD::SITOF";
  case AArch64ISD::UITOF: return "AArch64ISD::UITOF";
  case AArch64ISD::NVCAST: return "AArch64ISD::NVCAST";
  case AArch64ISD::SQSHL_I: return "AArch64ISD::SQSHL_I";
  case AArch64ISD::UQSHL_I: return "AArch64ISD::UQSHL_I";
  case AArch64ISD::SRSHR_I: return "AArch64ISD::SRSHR_I";
  case AArch64ISD::URSHR_I: return "AArch64ISD::URSHR_I";
  case AArch64ISD::SQSHLU_I: return "AArch64ISD::SQSHLU_I";
  case AArch64ISD::WrapperLarge: return "AArch64ISD::WrapperLarge";
  case AArch64ISD::LD2post: return "AArch64ISD::LD2post";
  case AArch64ISD::LD3post: return "AArch64ISD::LD3post";
  case AArch64ISD::LD4post: return "AArch64ISD::LD4post";
  case AArch64ISD::ST2post: return "AArch64ISD::ST2post";
  case AArch64ISD::ST3post: return "AArch64ISD::ST3post";
  case AArch64ISD::ST4post: return "AArch64ISD::ST4post";
  case AArch64ISD::LD1x2post: return "AArch64ISD::LD1x2post";
  case AArch64ISD::LD1x3post: return "AArch64ISD::LD1x3post";
  case AArch64ISD::LD1x4post: return "AArch64ISD::LD1x4post";
  case AArch64ISD::ST1x2post: return "AArch64ISD::ST1x2post";
  case AArch64ISD::ST1x3post: return "AArch64ISD::ST1x3post";
  case AArch64ISD::ST1x4post: return "AArch64ISD::ST1x4post";
  case AArch64ISD::LD1DUPpost: return "AArch64ISD::LD1DUPpost";
  case AArch64ISD::LD2DUPpost: return "AArch64ISD::LD2DUPpost";
  case AArch64ISD::LD3DUPpost: return "AArch64ISD::LD3DUPpost";
  case AArch64ISD::LD4DUPpost: return "AArch64ISD::LD4DUPpost";
  case AArch64ISD::LD1LANEpost: return "AArch64ISD::LD1LANEpost";
  case AArch64ISD::LD2LANEpost: return "AArch64ISD::LD2LANEpost";
  case AArch64ISD::LD3LANEpost: return "AArch64ISD::LD3LANEpost";
  case AArch64ISD::LD4LANEpost: return "AArch64ISD::LD4LANEpost";
  case AArch64ISD::ST2LANEpost: return "AArch64ISD::ST2LANEpost";
  case AArch64ISD::ST3LANEpost: return "AArch64ISD::ST3LANEpost";
  case AArch64ISD::ST4LANEpost: return "AArch64ISD::ST4LANEpost";
  case AArch64ISD::SMULL: return "AArch64ISD::SMULL";
  case AArch64ISD::UMULL: return "AArch64ISD::UMULL";
  }
}

MachineBasicBlock *
AArch64TargetLowering::EmitF128CSEL(MachineInstr *MI,
                                    MachineBasicBlock *MBB) const {
  // We materialise the F128CSEL pseudo-instruction as some control flow and a
  // phi node:

  // OrigBB:
  //     [... previous instrs leading to comparison ...]
  //     b.ne TrueBB
  //     b EndBB
  // TrueBB:
  //     ; Fallthrough
  // EndBB:
  //     Dest = PHI [IfTrue, TrueBB], [IfFalse, OrigBB]

  const TargetInstrInfo *TII =
      getTargetMachine().getSubtargetImpl()->getInstrInfo();
  MachineFunction *MF = MBB->getParent();
  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
  DebugLoc DL = MI->getDebugLoc();
  MachineFunction::iterator It = MBB;
  ++It;

  unsigned DestReg = MI->getOperand(0).getReg();
  unsigned IfTrueReg = MI->getOperand(1).getReg();
  unsigned IfFalseReg = MI->getOperand(2).getReg();
  unsigned CondCode = MI->getOperand(3).getImm();
  bool NZCVKilled = MI->getOperand(4).isKill();

  MachineBasicBlock *TrueBB = MF->CreateMachineBasicBlock(LLVM_BB);
  MachineBasicBlock *EndBB = MF->CreateMachineBasicBlock(LLVM_BB);
  MF->insert(It, TrueBB);
  MF->insert(It, EndBB);

  // Transfer rest of current basic-block to EndBB.
  EndBB->splice(EndBB->begin(), MBB, std::next(MachineBasicBlock::iterator(MI)),
                MBB->end());
  EndBB->transferSuccessorsAndUpdatePHIs(MBB);

  BuildMI(MBB, DL, TII->get(AArch64::Bcc)).addImm(CondCode).addMBB(TrueBB);
  BuildMI(MBB, DL, TII->get(AArch64::B)).addMBB(EndBB);
  MBB->addSuccessor(TrueBB);
  MBB->addSuccessor(EndBB);

  // TrueBB falls through to the end.
  TrueBB->addSuccessor(EndBB);

  if (!NZCVKilled) {
    TrueBB->addLiveIn(AArch64::NZCV);
    EndBB->addLiveIn(AArch64::NZCV);
  }

  BuildMI(*EndBB, EndBB->begin(), DL, TII->get(AArch64::PHI), DestReg)
      .addReg(IfTrueReg)
      .addMBB(TrueBB)
      .addReg(IfFalseReg)
      .addMBB(MBB);

  MI->eraseFromParent();
  return EndBB;
}

MachineBasicBlock *
AArch64TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
                                                   MachineBasicBlock *BB) const {
  switch (MI->getOpcode()) {
  default:
#ifndef NDEBUG
    MI->dump();
#endif
    llvm_unreachable("Unexpected instruction for custom inserter!");

  case AArch64::F128CSEL:
    return EmitF128CSEL(MI, BB);

  case TargetOpcode::STACKMAP:
  case TargetOpcode::PATCHPOINT:
    return emitPatchPoint(MI, BB);
  }
}

//===----------------------------------------------------------------------===//
// AArch64 Lowering private implementation.
//===----------------------------------------------------------------------===//

//===----------------------------------------------------------------------===//
// Lowering Code
//===----------------------------------------------------------------------===//

/// changeIntCCToAArch64CC - Convert a DAG integer condition code to an AArch64
/// CC.
static AArch64CC::CondCode changeIntCCToAArch64CC(ISD::CondCode CC) {
  switch (CC) {
  default:
    llvm_unreachable("Unknown condition code!");
  case ISD::SETNE:
    return AArch64CC::NE;
  case ISD::SETEQ:
    return AArch64CC::EQ;
  case ISD::SETGT:
    return AArch64CC::GT;
  case ISD::SETGE:
    return AArch64CC::GE;
  case ISD::SETLT:
    return AArch64CC::LT;
  case ISD::SETLE:
    return AArch64CC::LE;
  case ISD::SETUGT:
    return AArch64CC::HI;
  case ISD::SETUGE:
    return AArch64CC::HS;
  case ISD::SETULT:
    return AArch64CC::LO;
  case ISD::SETULE:
    return AArch64CC::LS;
  }
}
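
// For example, (setugt x, y) maps to HI, so an unsigned compare-and-branch
// becomes SUBS followed by B.HI.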

/// changeFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 CC.
static void changeFPCCToAArch64CC(ISD::CondCode CC,
                                  AArch64CC::CondCode &CondCode,
                                  AArch64CC::CondCode &CondCode2) {
  CondCode2 = AArch64CC::AL;
  switch (CC) {
  default:
    llvm_unreachable("Unknown FP condition!");
  case ISD::SETEQ:
  case ISD::SETOEQ:
    CondCode = AArch64CC::EQ;
    break;
  case ISD::SETGT:
  case ISD::SETOGT:
    CondCode = AArch64CC::GT;
    break;
  case ISD::SETGE:
  case ISD::SETOGE:
    CondCode = AArch64CC::GE;
    break;
  case ISD::SETOLT:
    CondCode = AArch64CC::MI;
    break;
  case ISD::SETOLE:
    CondCode = AArch64CC::LS;
    break;
  case ISD::SETONE:
    CondCode = AArch64CC::MI;
    CondCode2 = AArch64CC::GT;
    break;
  case ISD::SETO:
    CondCode = AArch64CC::VC;
    break;
  case ISD::SETUO:
    CondCode = AArch64CC::VS;
    break;
  case ISD::SETUEQ:
    CondCode = AArch64CC::EQ;
    CondCode2 = AArch64CC::VS;
    break;
  case ISD::SETUGT:
    CondCode = AArch64CC::HI;
    break;
  case ISD::SETUGE:
    CondCode = AArch64CC::PL;
    break;
  case ISD::SETLT:
  case ISD::SETULT:
    CondCode = AArch64CC::LT;
    break;
  case ISD::SETLE:
  case ISD::SETULE:
    CondCode = AArch64CC::LE;
    break;
  case ISD::SETNE:
  case ISD::SETUNE:
    CondCode = AArch64CC::NE;
    break;
  }
}

/// changeVectorFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64
/// CC usable with the vector instructions. Fewer operations are available
/// without a real NZCV register, so we have to use less efficient combinations
/// to get the same effect.
static void changeVectorFPCCToAArch64CC(ISD::CondCode CC,
                                        AArch64CC::CondCode &CondCode,
                                        AArch64CC::CondCode &CondCode2,
                                        bool &Invert) {
  Invert = false;
  switch (CC) {
  default:
    // Mostly the scalar mappings work fine.
    changeFPCCToAArch64CC(CC, CondCode, CondCode2);
    break;
  case ISD::SETUO:
    Invert = true; // Fallthrough
  case ISD::SETO:
    CondCode = AArch64CC::MI;
    CondCode2 = AArch64CC::GE;
    break;
  case ISD::SETUEQ:
  case ISD::SETULT:
  case ISD::SETULE:
  case ISD::SETUGT:
  case ISD::SETUGE:
    // All of the compare-mask comparisons are ordered, but we can switch
    // between the two by a double inversion. E.g. ULE == !OGT.
    Invert = true;
    changeFPCCToAArch64CC(getSetCCInverse(CC, false), CondCode, CondCode2);
    break;
  }
}

static bool isLegalArithImmed(uint64_t C) {
  // Matches AArch64DAGToDAGISel::SelectArithImmed().
  return (C >> 12 == 0) || ((C & 0xFFFULL) == 0 && C >> 24 == 0);
}
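
// For example, 4095 (0xfff) and 0x123000 (0x123 << 12) are encodable, while
// 0x1001 is not: it needs more than 12 bits and is not a shifted 12-bit value.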

static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC,
                              SDLoc dl, SelectionDAG &DAG) {
  EVT VT = LHS.getValueType();

  if (VT.isFloatingPoint())
    return DAG.getNode(AArch64ISD::FCMP, dl, VT, LHS, RHS);

  // The CMP instruction is just an alias for SUBS, and representing it as
  // SUBS means that it's possible to get CSE with subtract operations.
  // A later phase can perform the optimization of setting the destination
  // register to WZR/XZR if it ends up being unused.
  unsigned Opcode = AArch64ISD::SUBS;

  if (RHS.getOpcode() == ISD::SUB && isa<ConstantSDNode>(RHS.getOperand(0)) &&
      cast<ConstantSDNode>(RHS.getOperand(0))->getZExtValue() == 0 &&
      (CC == ISD::SETEQ || CC == ISD::SETNE)) {
    // We'd like to combine a (CMP op1, (sub 0, op2)) into a CMN instruction on
    // the grounds that "op1 - (-op2) == op1 + op2". However, the C and V flags
    // can be set differently by this operation. It comes down to whether
    // "SInt(~op2)+1 == SInt(~op2+1)" (and the same for UInt). If they are then
    // everything is fine. If not then the optimization is wrong. Thus general
    // comparisons are only valid if op2 != 0.

    // So, finally, the only LLVM-native comparisons that don't mention C and V
    // are SETEQ and SETNE. They're the only ones we can safely use CMN for in
    // the absence of information about op2.
    Opcode = AArch64ISD::ADDS;
    RHS = RHS.getOperand(1);
  } else if (LHS.getOpcode() == ISD::AND && isa<ConstantSDNode>(RHS) &&
             cast<ConstantSDNode>(RHS)->getZExtValue() == 0 &&
             !isUnsignedIntSetCC(CC)) {
    // Similarly, (CMP (and X, Y), 0) can be implemented with a TST
    // (a.k.a. ANDS) except that the flags are only guaranteed to work for one
    // of the signed comparisons.
    Opcode = AArch64ISD::ANDS;
    RHS = LHS.getOperand(1);
    LHS = LHS.getOperand(0);
  }

  return DAG.getNode(Opcode, dl, DAG.getVTList(VT, MVT::i32), LHS, RHS)
      .getValue(1);
}
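
// For example, (seteq x, (sub 0, y)) is emitted as ADDS x, y (assembly: CMN),
// which as noted above is only safe for the EQ/NE comparisons.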

static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
                             SDValue &AArch64cc, SelectionDAG &DAG, SDLoc dl) {
  SDValue Cmp;
  AArch64CC::CondCode AArch64CC;
  if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
    EVT VT = RHS.getValueType();
    uint64_t C = RHSC->getZExtValue();
    if (!isLegalArithImmed(C)) {
      // Constant does not fit, try adjusting it by one?
      switch (CC) {
      default:
        break;
      case ISD::SETLT:
      case ISD::SETGE:
        if ((VT == MVT::i32 && C != 0x80000000 &&
             isLegalArithImmed((uint32_t)(C - 1))) ||
            (VT == MVT::i64 && C != 0x80000000ULL &&
             isLegalArithImmed(C - 1ULL))) {
          CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
          C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1;
          RHS = DAG.getConstant(C, VT);
        }
        break;
      case ISD::SETULT:
      case ISD::SETUGE:
        if ((VT == MVT::i32 && C != 0 &&
             isLegalArithImmed((uint32_t)(C - 1))) ||
            (VT == MVT::i64 && C != 0ULL && isLegalArithImmed(C - 1ULL))) {
          CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT;
          C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1;
          RHS = DAG.getConstant(C, VT);
        }
        break;
      case ISD::SETLE:
      case ISD::SETGT:
        if ((VT == MVT::i32 && C != INT32_MAX &&
             isLegalArithImmed((uint32_t)(C + 1))) ||
            (VT == MVT::i64 && C != INT64_MAX &&
             isLegalArithImmed(C + 1ULL))) {
          CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
          C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
          RHS = DAG.getConstant(C, VT);
        }
        break;
      case ISD::SETULE:
      case ISD::SETUGT:
        if ((VT == MVT::i32 && C != UINT32_MAX &&
             isLegalArithImmed((uint32_t)(C + 1))) ||
            (VT == MVT::i64 && C != UINT64_MAX &&
             isLegalArithImmed(C + 1ULL))) {
          CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
          C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
          RHS = DAG.getConstant(C, VT);
        }
        break;
      }
    }
  }
  // The imm operand of ADDS is an unsigned immediate, in the range 0 to 4095.
  // For the i8 operand, the largest immediate is 255, so this can be easily
  // encoded in the compare instruction. For the i16 operand, however, the
  // largest immediate cannot be encoded in the compare.
  // Therefore, use a sign extending load and cmn to avoid materializing the
  // -1 constant. For example,
  //   movz w1, #65535
  //   ldrh w0, [x0, #0]
  //   cmp w0, w1
  // ==>
  //   ldrsh w0, [x0, #0]
  //   cmn w0, #1
  // Fundamentally, we're relying on the property that (zext LHS) == (zext RHS)
  // if and only if (sext LHS) == (sext RHS). The checks are in place to ensure
  // both the LHS and RHS are truly zero extended and to make sure the
  // transformation is profitable.
  if ((CC == ISD::SETEQ || CC == ISD::SETNE) && isa<ConstantSDNode>(RHS)) {
    if ((cast<ConstantSDNode>(RHS)->getZExtValue() >> 16 == 0) &&
        isa<LoadSDNode>(LHS)) {
      if (cast<LoadSDNode>(LHS)->getExtensionType() == ISD::ZEXTLOAD &&
          cast<LoadSDNode>(LHS)->getMemoryVT() == MVT::i16 &&
          LHS.getNode()->hasNUsesOfValue(1, 0)) {
        int16_t ValueofRHS = cast<ConstantSDNode>(RHS)->getZExtValue();
        if (ValueofRHS < 0 && isLegalArithImmed(-ValueofRHS)) {
          SDValue SExt =
              DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, LHS.getValueType(), LHS,
                          DAG.getValueType(MVT::i16));
          Cmp = emitComparison(SExt,
                               DAG.getConstant(ValueofRHS, RHS.getValueType()),
                               CC, dl, DAG);
          AArch64CC = changeIntCCToAArch64CC(CC);
          AArch64cc = DAG.getConstant(AArch64CC, MVT::i32);
          return Cmp;
        }
      }
    }
  }
  Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
  AArch64CC = changeIntCCToAArch64CC(CC);
  AArch64cc = DAG.getConstant(AArch64CC, MVT::i32);
  return Cmp;
}
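
// For example, (setlt x, 0x1001) has an unencodable immediate; the switch
// above rewrites it to the equivalent (setle x, 0x1000), whose constant is a
// legal shifted 12-bit immediate.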

static std::pair<SDValue, SDValue>
getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) {
  assert((Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::i64) &&
         "Unsupported value type");
  SDValue Value, Overflow;
  SDLoc DL(Op);
  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  unsigned Opc = 0;
  switch (Op.getOpcode()) {
  default:
    llvm_unreachable("Unknown overflow instruction!");
  case ISD::SADDO:
    Opc = AArch64ISD::ADDS;
    CC = AArch64CC::VS;
    break;
  case ISD::UADDO:
    Opc = AArch64ISD::ADDS;
    CC = AArch64CC::HS;
    break;
  case ISD::SSUBO:
    Opc = AArch64ISD::SUBS;
    CC = AArch64CC::VS;
    break;
  case ISD::USUBO:
    Opc = AArch64ISD::SUBS;
    CC = AArch64CC::LO;
    break;
  // Multiply needs a little bit extra work.
  case ISD::SMULO:
  case ISD::UMULO: {
    CC = AArch64CC::NE;
    bool IsSigned = (Op.getOpcode() == ISD::SMULO) ? true : false;
    if (Op.getValueType() == MVT::i32) {
      unsigned ExtendOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
      // For a 32 bit multiply with overflow check we want the instruction
      // selector to generate a widening multiply (SMADDL/UMADDL). For that we
      // need to generate the following pattern:
      // (i64 add 0, (i64 mul (i64 sext|zext i32 %a), (i64 sext|zext i32 %b)))
      LHS = DAG.getNode(ExtendOpc, DL, MVT::i64, LHS);
      RHS = DAG.getNode(ExtendOpc, DL, MVT::i64, RHS);
      SDValue Mul = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
      SDValue Add = DAG.getNode(ISD::ADD, DL, MVT::i64, Mul,
                                DAG.getConstant(0, MVT::i64));
      // On AArch64 the upper 32 bits are always zero extended for a 32 bit
      // operation. We need to clear out the upper 32 bits, because we used a
      // widening multiply that wrote all 64 bits. In the end this should be a
      // noop.
      Value = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Add);
      if (IsSigned) {
        // The signed overflow check requires more than just a simple check for
        // any bit set in the upper 32 bits of the result. These bits could be
        // just the sign bits of a negative number. To perform the overflow
        // check we have to arithmetic shift right the 32nd bit of the result
        // by 31 bits. Then we compare the result to the upper 32 bits.
        SDValue UpperBits = DAG.getNode(ISD::SRL, DL, MVT::i64, Add,
                                        DAG.getConstant(32, MVT::i64));
        UpperBits = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, UpperBits);
        SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i32, Value,
                                        DAG.getConstant(31, MVT::i64));
        // It is important that LowerBits is last, otherwise the arithmetic
        // shift will not be folded into the compare (SUBS).
        SDVTList VTs = DAG.getVTList(MVT::i32, MVT::i32);
        Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits)
                       .getValue(1);
      } else {
        // The overflow check for unsigned multiply is easy. We only need to
        // check if any of the upper 32 bits are set. This can be done with a
        // CMP (shifted register). For that we need to generate the following
        // pattern:
        // (i64 AArch64ISD::SUBS i64 0, (i64 srl i64 %Mul, i64 32))
        SDValue UpperBits = DAG.getNode(ISD::SRL, DL, MVT::i64, Mul,
                                        DAG.getConstant(32, MVT::i64));
        SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
        Overflow =
            DAG.getNode(AArch64ISD::SUBS, DL, VTs, DAG.getConstant(0, MVT::i64),
                        UpperBits).getValue(1);
      }
      break;
    }
    assert(Op.getValueType() == MVT::i64 && "Expected an i64 value type");
    // For the 64 bit multiply
    Value = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
    if (IsSigned) {
      SDValue UpperBits = DAG.getNode(ISD::MULHS, DL, MVT::i64, LHS, RHS);
      SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i64, Value,
                                      DAG.getConstant(63, MVT::i64));
      // It is important that LowerBits is last, otherwise the arithmetic
      // shift will not be folded into the compare (SUBS).
      SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
      Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits)
                     .getValue(1);
    } else {
      SDValue UpperBits = DAG.getNode(ISD::MULHU, DL, MVT::i64, LHS, RHS);
      SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
      Overflow =
          DAG.getNode(AArch64ISD::SUBS, DL, VTs, DAG.getConstant(0, MVT::i64),
                      UpperBits).getValue(1);
    }
    break;
  }
  } // switch (...)
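
  // For the i32 SMULO path above, a plausible (not guaranteed) final sequence
  // is: smull x8, w0, w1 ; lsr x9, x8, #32 ; cmp w9, w8, asr #31, with the
  // overflow result taken from the NE condition.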

  if (Opc) {
    SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32);

    // Emit the AArch64 operation with overflow check.
    Value = DAG.getNode(Opc, DL, VTs, LHS, RHS);
    Overflow = Value.getValue(1);
  }
  return std::make_pair(Value, Overflow);
}

SDValue AArch64TargetLowering::LowerF128Call(SDValue Op, SelectionDAG &DAG,
                                             RTLIB::Libcall Call) const {
  SmallVector<SDValue, 2> Ops;
  for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i)
    Ops.push_back(Op.getOperand(i));

  return makeLibCall(DAG, Call, MVT::f128, &Ops[0], Ops.size(), false,
                     SDLoc(Op)).first;
}

static SDValue LowerXOR(SDValue Op, SelectionDAG &DAG) {
  SDValue Sel = Op.getOperand(0);
  SDValue Other = Op.getOperand(1);

  // If neither operand is a SELECT_CC, give up.
  if (Sel.getOpcode() != ISD::SELECT_CC)
    std::swap(Sel, Other);
  if (Sel.getOpcode() != ISD::SELECT_CC)
    return Op;

  // The folding we want to perform is:
  // (xor x, (select_cc a, b, cc, 0, -1))
  // -->
  // (csel x, (xor x, -1), cc ...)
  //
  // The latter will get matched to a CSINV instruction.

  ISD::CondCode CC = cast<CondCodeSDNode>(Sel.getOperand(4))->get();
  SDValue LHS = Sel.getOperand(0);
  SDValue RHS = Sel.getOperand(1);
  SDValue TVal = Sel.getOperand(2);
  SDValue FVal = Sel.getOperand(3);
  SDLoc dl(Sel);

  // FIXME: This could be generalized to non-integer comparisons.
  if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
    return Op;

  ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
  ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);

  // If the values aren't constants, this isn't the pattern we're looking for.
  if (!CFVal || !CTVal)
    return Op;

  // We can commute the SELECT_CC by inverting the condition. This
  // might be needed to make this fit into a CSINV pattern.
  if (CTVal->isAllOnesValue() && CFVal->isNullValue()) {
    std::swap(TVal, FVal);
    std::swap(CTVal, CFVal);
    CC = ISD::getSetCCInverse(CC, true);
  }

  // If the constants line up, perform the transform!
  if (CTVal->isNullValue() && CFVal->isAllOnesValue()) {
    SDValue CCVal;
    SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);

    FVal = Other;
    TVal = DAG.getNode(ISD::XOR, dl, Other.getValueType(), Other,
                       DAG.getConstant(-1ULL, Other.getValueType()));

    return DAG.getNode(AArch64ISD::CSEL, dl, Sel.getValueType(), FVal, TVal,
                       CCVal, Cmp);
  }

  return Op;
}
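
// For example, when the select produces 0/-1 from an integer compare, the
// CSEL built above is matched as roughly "csinv w0, wOther, wOther, <cc>"
// after the flag-setting compare.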
  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
    return SDValue();

  SDVTList VTs = DAG.getVTList(VT, MVT::i32);

  unsigned Opc;
  bool ExtraOp = false;
  switch (Op.getOpcode()) {
  default:
    llvm_unreachable("Invalid code");
  case ISD::ADDC:
    Opc = AArch64ISD::ADDS;
    break;
  case ISD::SUBC:
    Opc = AArch64ISD::SUBS;
    break;
  case ISD::ADDE:
    Opc = AArch64ISD::ADCS;
    ExtraOp = true;
    break;
  case ISD::SUBE:
    Opc = AArch64ISD::SBCS;
    ExtraOp = true;
    break;
  }

  if (!ExtraOp)
    return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), Op.getOperand(1));
  return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), Op.getOperand(1),
                     Op.getOperand(2));
}

static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
  // Let legalize expand this if it isn't a legal type yet.
  if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()))
    return SDValue();

  AArch64CC::CondCode CC;
  // The actual operation that sets the overflow or carry flag.
  SDValue Value, Overflow;
  std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Op, DAG);

  // We use 0 and 1 as false and true values.
  SDValue TVal = DAG.getConstant(1, MVT::i32);
  SDValue FVal = DAG.getConstant(0, MVT::i32);

  // We use an inverted condition, because the conditional select is inverted
  // too. This will allow it to be selected to a single instruction:
  // CSINC Wd, WZR, WZR, invert(cond).
  SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), MVT::i32);
  Overflow = DAG.getNode(AArch64ISD::CSEL, SDLoc(Op), MVT::i32, FVal, TVal,
                         CCVal, Overflow);

  SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
  return DAG.getNode(ISD::MERGE_VALUES, SDLoc(Op), VTs, Value, Overflow);
}

// Prefetch operands are:
// 1: Address to prefetch
// 2: bool isWrite
// 3: int locality (0 = no locality ... 3 = extreme locality)
// 4: bool isDataCache
static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG) {
  SDLoc DL(Op);
  unsigned IsWrite = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
  unsigned Locality = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
  unsigned IsData = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue();

  bool IsStream = !Locality;
  // When the locality number is set (non-zero), remap it for the encoding.
  if (Locality) {
    // The front-end should have filtered out the out-of-range values.
    assert(Locality <= 3 && "Prefetch locality out-of-range");
    // The locality degree is the opposite of the cache speed.
    // Put the number the other way around.
    // The encoding starts at 0 for level 1.
    Locality = 3 - Locality;
  }

  // Build the mask value encoding the expected behavior.
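  // Added worked example (not part of the original source): a read prefetch
  // of data with maximal locality, e.g. llvm.prefetch(ptr, /*rw=*/0,
  // /*locality=*/3, /*cache=*/1), has IsWrite = 0, IsData = 1, IsStream = 0,
  // and Locality remapped above to 3 - 3 = 0 (level 1). The mask built below
  // is therefore (0 << 4) | (0 << 3) | (0 << 1) | 0 = 0, i.e. PLDL1KEEP;
  // with locality 0 only the stream bit is set instead, giving 1 (PLDL1STRM).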
1501 unsigned PrfOp = (IsWrite << 4) | // Load/Store bit 1502 (!IsData << 3) | // IsDataCache bit 1503 (Locality << 1) | // Cache level bits 1504 (unsigned)IsStream; // Stream bit 1505 return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Op.getOperand(0), 1506 DAG.getConstant(PrfOp, MVT::i32), Op.getOperand(1)); 1507} 1508 1509SDValue AArch64TargetLowering::LowerFP_EXTEND(SDValue Op, 1510 SelectionDAG &DAG) const { 1511 assert(Op.getValueType() == MVT::f128 && "Unexpected lowering"); 1512 1513 RTLIB::Libcall LC; 1514 LC = RTLIB::getFPEXT(Op.getOperand(0).getValueType(), Op.getValueType()); 1515 1516 return LowerF128Call(Op, DAG, LC); 1517} 1518 1519SDValue AArch64TargetLowering::LowerFP_ROUND(SDValue Op, 1520 SelectionDAG &DAG) const { 1521 if (Op.getOperand(0).getValueType() != MVT::f128) { 1522 // It's legal except when f128 is involved 1523 return Op; 1524 } 1525 1526 RTLIB::Libcall LC; 1527 LC = RTLIB::getFPROUND(Op.getOperand(0).getValueType(), Op.getValueType()); 1528 1529 // FP_ROUND node has a second operand indicating whether it is known to be 1530 // precise. That doesn't take part in the LibCall so we can't directly use 1531 // LowerF128Call. 1532 SDValue SrcVal = Op.getOperand(0); 1533 return makeLibCall(DAG, LC, Op.getValueType(), &SrcVal, 1, 1534 /*isSigned*/ false, SDLoc(Op)).first; 1535} 1536 1537static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG) { 1538 // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp. 1539 // Any additional optimization in this function should be recorded 1540 // in the cost tables. 1541 EVT InVT = Op.getOperand(0).getValueType(); 1542 EVT VT = Op.getValueType(); 1543 1544 if (VT.getSizeInBits() < InVT.getSizeInBits()) { 1545 SDLoc dl(Op); 1546 SDValue Cv = 1547 DAG.getNode(Op.getOpcode(), dl, InVT.changeVectorElementTypeToInteger(), 1548 Op.getOperand(0)); 1549 return DAG.getNode(ISD::TRUNCATE, dl, VT, Cv); 1550 } 1551 1552 if (VT.getSizeInBits() > InVT.getSizeInBits()) { 1553 SDLoc dl(Op); 1554 MVT ExtVT = 1555 MVT::getVectorVT(MVT::getFloatingPointVT(VT.getScalarSizeInBits()), 1556 VT.getVectorNumElements()); 1557 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, dl, ExtVT, Op.getOperand(0)); 1558 return DAG.getNode(Op.getOpcode(), dl, VT, Ext); 1559 } 1560 1561 // Type changing conversions are illegal. 1562 return Op; 1563} 1564 1565SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op, 1566 SelectionDAG &DAG) const { 1567 if (Op.getOperand(0).getValueType().isVector()) 1568 return LowerVectorFP_TO_INT(Op, DAG); 1569 1570 if (Op.getOperand(0).getValueType() != MVT::f128) { 1571 // It's legal except when f128 is involved 1572 return Op; 1573 } 1574 1575 RTLIB::Libcall LC; 1576 if (Op.getOpcode() == ISD::FP_TO_SINT) 1577 LC = RTLIB::getFPTOSINT(Op.getOperand(0).getValueType(), Op.getValueType()); 1578 else 1579 LC = RTLIB::getFPTOUINT(Op.getOperand(0).getValueType(), Op.getValueType()); 1580 1581 SmallVector<SDValue, 2> Ops; 1582 for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) 1583 Ops.push_back(Op.getOperand(i)); 1584 1585 return makeLibCall(DAG, LC, Op.getValueType(), &Ops[0], Ops.size(), false, 1586 SDLoc(Op)).first; 1587} 1588 1589static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) { 1590 // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp. 1591 // Any additional optimization in this function should be recorded 1592 // in the cost tables. 
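  // Added sketch of the cases handled below (not part of the original
  // source): a narrowing conversion such as v2i64 -> v2f32 is done by
  // converting to v2f64 first and then applying FP_ROUND, a widening
  // conversion such as v2i32 -> v2f64 sign- or zero-extends the input to
  // v2i64 before converting, and same-width conversions are already legal
  // and are returned unchanged.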
1593 EVT VT = Op.getValueType(); 1594 SDLoc dl(Op); 1595 SDValue In = Op.getOperand(0); 1596 EVT InVT = In.getValueType(); 1597 1598 if (VT.getSizeInBits() < InVT.getSizeInBits()) { 1599 MVT CastVT = 1600 MVT::getVectorVT(MVT::getFloatingPointVT(InVT.getScalarSizeInBits()), 1601 InVT.getVectorNumElements()); 1602 In = DAG.getNode(Op.getOpcode(), dl, CastVT, In); 1603 return DAG.getNode(ISD::FP_ROUND, dl, VT, In, DAG.getIntPtrConstant(0)); 1604 } 1605 1606 if (VT.getSizeInBits() > InVT.getSizeInBits()) { 1607 unsigned CastOpc = 1608 Op.getOpcode() == ISD::SINT_TO_FP ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; 1609 EVT CastVT = VT.changeVectorElementTypeToInteger(); 1610 In = DAG.getNode(CastOpc, dl, CastVT, In); 1611 return DAG.getNode(Op.getOpcode(), dl, VT, In); 1612 } 1613 1614 return Op; 1615} 1616 1617SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op, 1618 SelectionDAG &DAG) const { 1619 if (Op.getValueType().isVector()) 1620 return LowerVectorINT_TO_FP(Op, DAG); 1621 1622 // i128 conversions are libcalls. 1623 if (Op.getOperand(0).getValueType() == MVT::i128) 1624 return SDValue(); 1625 1626 // Other conversions are legal, unless it's to the completely software-based 1627 // fp128. 1628 if (Op.getValueType() != MVT::f128) 1629 return Op; 1630 1631 RTLIB::Libcall LC; 1632 if (Op.getOpcode() == ISD::SINT_TO_FP) 1633 LC = RTLIB::getSINTTOFP(Op.getOperand(0).getValueType(), Op.getValueType()); 1634 else 1635 LC = RTLIB::getUINTTOFP(Op.getOperand(0).getValueType(), Op.getValueType()); 1636 1637 return LowerF128Call(Op, DAG, LC); 1638} 1639 1640SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op, 1641 SelectionDAG &DAG) const { 1642 // For iOS, we want to call an alternative entry point: __sincos_stret, 1643 // which returns the values in two S / D registers. 1644 SDLoc dl(Op); 1645 SDValue Arg = Op.getOperand(0); 1646 EVT ArgVT = Arg.getValueType(); 1647 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); 1648 1649 ArgListTy Args; 1650 ArgListEntry Entry; 1651 1652 Entry.Node = Arg; 1653 Entry.Ty = ArgTy; 1654 Entry.isSExt = false; 1655 Entry.isZExt = false; 1656 Args.push_back(Entry); 1657 1658 const char *LibcallName = 1659 (ArgVT == MVT::f64) ? 
"__sincos_stret" : "__sincosf_stret"; 1660 SDValue Callee = DAG.getExternalSymbol(LibcallName, getPointerTy()); 1661 1662 StructType *RetTy = StructType::get(ArgTy, ArgTy, nullptr); 1663 TargetLowering::CallLoweringInfo CLI(DAG); 1664 CLI.setDebugLoc(dl).setChain(DAG.getEntryNode()) 1665 .setCallee(CallingConv::Fast, RetTy, Callee, std::move(Args), 0); 1666 1667 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI); 1668 return CallResult.first; 1669} 1670 1671static SDValue LowerBITCAST(SDValue Op, SelectionDAG &DAG) { 1672 if (Op.getValueType() != MVT::f16) 1673 return SDValue(); 1674 1675 assert(Op.getOperand(0).getValueType() == MVT::i16); 1676 SDLoc DL(Op); 1677 1678 Op = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op.getOperand(0)); 1679 Op = DAG.getNode(ISD::BITCAST, DL, MVT::f32, Op); 1680 return SDValue( 1681 DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, MVT::f16, Op, 1682 DAG.getTargetConstant(AArch64::hsub, MVT::i32)), 1683 0); 1684} 1685 1686static EVT getExtensionTo64Bits(const EVT &OrigVT) { 1687 if (OrigVT.getSizeInBits() >= 64) 1688 return OrigVT; 1689 1690 assert(OrigVT.isSimple() && "Expecting a simple value type"); 1691 1692 MVT::SimpleValueType OrigSimpleTy = OrigVT.getSimpleVT().SimpleTy; 1693 switch (OrigSimpleTy) { 1694 default: llvm_unreachable("Unexpected Vector Type"); 1695 case MVT::v2i8: 1696 case MVT::v2i16: 1697 return MVT::v2i32; 1698 case MVT::v4i8: 1699 return MVT::v4i16; 1700 } 1701} 1702 1703static SDValue addRequiredExtensionForVectorMULL(SDValue N, SelectionDAG &DAG, 1704 const EVT &OrigTy, 1705 const EVT &ExtTy, 1706 unsigned ExtOpcode) { 1707 // The vector originally had a size of OrigTy. It was then extended to ExtTy. 1708 // We expect the ExtTy to be 128-bits total. If the OrigTy is less than 1709 // 64-bits we need to insert a new extension so that it will be 64-bits. 1710 assert(ExtTy.is128BitVector() && "Unexpected extension size"); 1711 if (OrigTy.getSizeInBits() >= 64) 1712 return N; 1713 1714 // Must extend size to at least 64 bits to be used as an operand for VMULL. 
1715 EVT NewVT = getExtensionTo64Bits(OrigTy); 1716 1717 return DAG.getNode(ExtOpcode, SDLoc(N), NewVT, N); 1718} 1719 1720static bool isExtendedBUILD_VECTOR(SDNode *N, SelectionDAG &DAG, 1721 bool isSigned) { 1722 EVT VT = N->getValueType(0); 1723 1724 if (N->getOpcode() != ISD::BUILD_VECTOR) 1725 return false; 1726 1727 for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) { 1728 SDNode *Elt = N->getOperand(i).getNode(); 1729 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) { 1730 unsigned EltSize = VT.getVectorElementType().getSizeInBits(); 1731 unsigned HalfSize = EltSize / 2; 1732 if (isSigned) { 1733 if (!isIntN(HalfSize, C->getSExtValue())) 1734 return false; 1735 } else { 1736 if (!isUIntN(HalfSize, C->getZExtValue())) 1737 return false; 1738 } 1739 continue; 1740 } 1741 return false; 1742 } 1743 1744 return true; 1745} 1746 1747static SDValue skipExtensionForVectorMULL(SDNode *N, SelectionDAG &DAG) { 1748 if (N->getOpcode() == ISD::SIGN_EXTEND || N->getOpcode() == ISD::ZERO_EXTEND) 1749 return addRequiredExtensionForVectorMULL(N->getOperand(0), DAG, 1750 N->getOperand(0)->getValueType(0), 1751 N->getValueType(0), 1752 N->getOpcode()); 1753 1754 assert(N->getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR"); 1755 EVT VT = N->getValueType(0); 1756 unsigned EltSize = VT.getVectorElementType().getSizeInBits() / 2; 1757 unsigned NumElts = VT.getVectorNumElements(); 1758 MVT TruncVT = MVT::getIntegerVT(EltSize); 1759 SmallVector<SDValue, 8> Ops; 1760 for (unsigned i = 0; i != NumElts; ++i) { 1761 ConstantSDNode *C = cast<ConstantSDNode>(N->getOperand(i)); 1762 const APInt &CInt = C->getAPIntValue(); 1763 // Element types smaller than 32 bits are not legal, so use i32 elements. 1764 // The values are implicitly truncated so sext vs. zext doesn't matter. 1765 Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), MVT::i32)); 1766 } 1767 return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(N), 1768 MVT::getVectorVT(TruncVT, NumElts), Ops); 1769} 1770 1771static bool isSignExtended(SDNode *N, SelectionDAG &DAG) { 1772 if (N->getOpcode() == ISD::SIGN_EXTEND) 1773 return true; 1774 if (isExtendedBUILD_VECTOR(N, DAG, true)) 1775 return true; 1776 return false; 1777} 1778 1779static bool isZeroExtended(SDNode *N, SelectionDAG &DAG) { 1780 if (N->getOpcode() == ISD::ZERO_EXTEND) 1781 return true; 1782 if (isExtendedBUILD_VECTOR(N, DAG, false)) 1783 return true; 1784 return false; 1785} 1786 1787static bool isAddSubSExt(SDNode *N, SelectionDAG &DAG) { 1788 unsigned Opcode = N->getOpcode(); 1789 if (Opcode == ISD::ADD || Opcode == ISD::SUB) { 1790 SDNode *N0 = N->getOperand(0).getNode(); 1791 SDNode *N1 = N->getOperand(1).getNode(); 1792 return N0->hasOneUse() && N1->hasOneUse() && 1793 isSignExtended(N0, DAG) && isSignExtended(N1, DAG); 1794 } 1795 return false; 1796} 1797 1798static bool isAddSubZExt(SDNode *N, SelectionDAG &DAG) { 1799 unsigned Opcode = N->getOpcode(); 1800 if (Opcode == ISD::ADD || Opcode == ISD::SUB) { 1801 SDNode *N0 = N->getOperand(0).getNode(); 1802 SDNode *N1 = N->getOperand(1).getNode(); 1803 return N0->hasOneUse() && N1->hasOneUse() && 1804 isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG); 1805 } 1806 return false; 1807} 1808 1809static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) { 1810 // Multiplications are only custom-lowered for 128-bit vectors so that 1811 // VMULL can be detected. Otherwise v2i64 multiplications are not legal. 
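  // Added illustrative example (not part of the original source):
  // (mul (sext v2i32 %a to v2i64), (sext v2i32 %b to v2i64)) is matched below
  // as AArch64ISD::SMULL %a, %b, and the zero-extended form becomes UMULL.
  // A BUILD_VECTOR whose constants fit in the half-width element type is
  // treated as an extension for this purpose (see isExtendedBUILD_VECTOR
  // above).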
1812 EVT VT = Op.getValueType(); 1813 assert(VT.is128BitVector() && VT.isInteger() && 1814 "unexpected type for custom-lowering ISD::MUL"); 1815 SDNode *N0 = Op.getOperand(0).getNode(); 1816 SDNode *N1 = Op.getOperand(1).getNode(); 1817 unsigned NewOpc = 0; 1818 bool isMLA = false; 1819 bool isN0SExt = isSignExtended(N0, DAG); 1820 bool isN1SExt = isSignExtended(N1, DAG); 1821 if (isN0SExt && isN1SExt) 1822 NewOpc = AArch64ISD::SMULL; 1823 else { 1824 bool isN0ZExt = isZeroExtended(N0, DAG); 1825 bool isN1ZExt = isZeroExtended(N1, DAG); 1826 if (isN0ZExt && isN1ZExt) 1827 NewOpc = AArch64ISD::UMULL; 1828 else if (isN1SExt || isN1ZExt) { 1829 // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these 1830 // into (s/zext A * s/zext C) + (s/zext B * s/zext C) 1831 if (isN1SExt && isAddSubSExt(N0, DAG)) { 1832 NewOpc = AArch64ISD::SMULL; 1833 isMLA = true; 1834 } else if (isN1ZExt && isAddSubZExt(N0, DAG)) { 1835 NewOpc = AArch64ISD::UMULL; 1836 isMLA = true; 1837 } else if (isN0ZExt && isAddSubZExt(N1, DAG)) { 1838 std::swap(N0, N1); 1839 NewOpc = AArch64ISD::UMULL; 1840 isMLA = true; 1841 } 1842 } 1843 1844 if (!NewOpc) { 1845 if (VT == MVT::v2i64) 1846 // Fall through to expand this. It is not legal. 1847 return SDValue(); 1848 else 1849 // Other vector multiplications are legal. 1850 return Op; 1851 } 1852 } 1853 1854 // Legalize to a S/UMULL instruction 1855 SDLoc DL(Op); 1856 SDValue Op0; 1857 SDValue Op1 = skipExtensionForVectorMULL(N1, DAG); 1858 if (!isMLA) { 1859 Op0 = skipExtensionForVectorMULL(N0, DAG); 1860 assert(Op0.getValueType().is64BitVector() && 1861 Op1.getValueType().is64BitVector() && 1862 "unexpected types for extended operands to VMULL"); 1863 return DAG.getNode(NewOpc, DL, VT, Op0, Op1); 1864 } 1865 // Optimizing (zext A + zext B) * C, to (S/UMULL A, C) + (S/UMULL B, C) during 1866 // isel lowering to take advantage of no-stall back to back s/umul + s/umla. 
1867 // This is true for CPUs with accumulate forwarding such as Cortex-A53/A57 1868 SDValue N00 = skipExtensionForVectorMULL(N0->getOperand(0).getNode(), DAG); 1869 SDValue N01 = skipExtensionForVectorMULL(N0->getOperand(1).getNode(), DAG); 1870 EVT Op1VT = Op1.getValueType(); 1871 return DAG.getNode(N0->getOpcode(), DL, VT, 1872 DAG.getNode(NewOpc, DL, VT, 1873 DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1), 1874 DAG.getNode(NewOpc, DL, VT, 1875 DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1)); 1876} 1877 1878SDValue AArch64TargetLowering::LowerOperation(SDValue Op, 1879 SelectionDAG &DAG) const { 1880 switch (Op.getOpcode()) { 1881 default: 1882 llvm_unreachable("unimplemented operand"); 1883 return SDValue(); 1884 case ISD::BITCAST: 1885 return LowerBITCAST(Op, DAG); 1886 case ISD::GlobalAddress: 1887 return LowerGlobalAddress(Op, DAG); 1888 case ISD::GlobalTLSAddress: 1889 return LowerGlobalTLSAddress(Op, DAG); 1890 case ISD::SETCC: 1891 return LowerSETCC(Op, DAG); 1892 case ISD::BR_CC: 1893 return LowerBR_CC(Op, DAG); 1894 case ISD::SELECT: 1895 return LowerSELECT(Op, DAG); 1896 case ISD::SELECT_CC: 1897 return LowerSELECT_CC(Op, DAG); 1898 case ISD::JumpTable: 1899 return LowerJumpTable(Op, DAG); 1900 case ISD::ConstantPool: 1901 return LowerConstantPool(Op, DAG); 1902 case ISD::BlockAddress: 1903 return LowerBlockAddress(Op, DAG); 1904 case ISD::VASTART: 1905 return LowerVASTART(Op, DAG); 1906 case ISD::VACOPY: 1907 return LowerVACOPY(Op, DAG); 1908 case ISD::VAARG: 1909 return LowerVAARG(Op, DAG); 1910 case ISD::ADDC: 1911 case ISD::ADDE: 1912 case ISD::SUBC: 1913 case ISD::SUBE: 1914 return LowerADDC_ADDE_SUBC_SUBE(Op, DAG); 1915 case ISD::SADDO: 1916 case ISD::UADDO: 1917 case ISD::SSUBO: 1918 case ISD::USUBO: 1919 case ISD::SMULO: 1920 case ISD::UMULO: 1921 return LowerXALUO(Op, DAG); 1922 case ISD::FADD: 1923 return LowerF128Call(Op, DAG, RTLIB::ADD_F128); 1924 case ISD::FSUB: 1925 return LowerF128Call(Op, DAG, RTLIB::SUB_F128); 1926 case ISD::FMUL: 1927 return LowerF128Call(Op, DAG, RTLIB::MUL_F128); 1928 case ISD::FDIV: 1929 return LowerF128Call(Op, DAG, RTLIB::DIV_F128); 1930 case ISD::FP_ROUND: 1931 return LowerFP_ROUND(Op, DAG); 1932 case ISD::FP_EXTEND: 1933 return LowerFP_EXTEND(Op, DAG); 1934 case ISD::FRAMEADDR: 1935 return LowerFRAMEADDR(Op, DAG); 1936 case ISD::RETURNADDR: 1937 return LowerRETURNADDR(Op, DAG); 1938 case ISD::INSERT_VECTOR_ELT: 1939 return LowerINSERT_VECTOR_ELT(Op, DAG); 1940 case ISD::EXTRACT_VECTOR_ELT: 1941 return LowerEXTRACT_VECTOR_ELT(Op, DAG); 1942 case ISD::BUILD_VECTOR: 1943 return LowerBUILD_VECTOR(Op, DAG); 1944 case ISD::VECTOR_SHUFFLE: 1945 return LowerVECTOR_SHUFFLE(Op, DAG); 1946 case ISD::EXTRACT_SUBVECTOR: 1947 return LowerEXTRACT_SUBVECTOR(Op, DAG); 1948 case ISD::SRA: 1949 case ISD::SRL: 1950 case ISD::SHL: 1951 return LowerVectorSRA_SRL_SHL(Op, DAG); 1952 case ISD::SHL_PARTS: 1953 return LowerShiftLeftParts(Op, DAG); 1954 case ISD::SRL_PARTS: 1955 case ISD::SRA_PARTS: 1956 return LowerShiftRightParts(Op, DAG); 1957 case ISD::CTPOP: 1958 return LowerCTPOP(Op, DAG); 1959 case ISD::FCOPYSIGN: 1960 return LowerFCOPYSIGN(Op, DAG); 1961 case ISD::AND: 1962 return LowerVectorAND(Op, DAG); 1963 case ISD::OR: 1964 return LowerVectorOR(Op, DAG); 1965 case ISD::XOR: 1966 return LowerXOR(Op, DAG); 1967 case ISD::PREFETCH: 1968 return LowerPREFETCH(Op, DAG); 1969 case ISD::SINT_TO_FP: 1970 case ISD::UINT_TO_FP: 1971 return LowerINT_TO_FP(Op, DAG); 1972 case ISD::FP_TO_SINT: 1973 case ISD::FP_TO_UINT: 1974 return LowerFP_TO_INT(Op, DAG); 1975 
case ISD::FSINCOS: 1976 return LowerFSINCOS(Op, DAG); 1977 case ISD::MUL: 1978 return LowerMUL(Op, DAG); 1979 } 1980} 1981 1982/// getFunctionAlignment - Return the Log2 alignment of this function. 1983unsigned AArch64TargetLowering::getFunctionAlignment(const Function *F) const { 1984 return 2; 1985} 1986 1987//===----------------------------------------------------------------------===// 1988// Calling Convention Implementation 1989//===----------------------------------------------------------------------===// 1990 1991#include "AArch64GenCallingConv.inc" 1992 1993/// Selects the correct CCAssignFn for a given CallingConvention value. 1994CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC, 1995 bool IsVarArg) const { 1996 switch (CC) { 1997 default: 1998 llvm_unreachable("Unsupported calling convention."); 1999 case CallingConv::WebKit_JS: 2000 return CC_AArch64_WebKit_JS; 2001 case CallingConv::GHC: 2002 return CC_AArch64_GHC; 2003 case CallingConv::C: 2004 case CallingConv::Fast: 2005 if (!Subtarget->isTargetDarwin()) 2006 return CC_AArch64_AAPCS; 2007 return IsVarArg ? CC_AArch64_DarwinPCS_VarArg : CC_AArch64_DarwinPCS; 2008 } 2009} 2010 2011SDValue AArch64TargetLowering::LowerFormalArguments( 2012 SDValue Chain, CallingConv::ID CallConv, bool isVarArg, 2013 const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc DL, SelectionDAG &DAG, 2014 SmallVectorImpl<SDValue> &InVals) const { 2015 MachineFunction &MF = DAG.getMachineFunction(); 2016 MachineFrameInfo *MFI = MF.getFrameInfo(); 2017 2018 // Assign locations to all of the incoming arguments. 2019 SmallVector<CCValAssign, 16> ArgLocs; 2020 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs, 2021 *DAG.getContext()); 2022 2023 // At this point, Ins[].VT may already be promoted to i32. To correctly 2024 // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and 2025 // i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT. 2026 // Since AnalyzeFormalArguments uses Ins[].VT for both ValVT and LocVT, here 2027 // we use a special version of AnalyzeFormalArguments to pass in ValVT and 2028 // LocVT. 2029 unsigned NumArgs = Ins.size(); 2030 Function::const_arg_iterator CurOrigArg = MF.getFunction()->arg_begin(); 2031 unsigned CurArgIdx = 0; 2032 for (unsigned i = 0; i != NumArgs; ++i) { 2033 MVT ValVT = Ins[i].VT;
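    // Added illustrative note (not part of the original source): for a small
    // integer argument such as i8, Ins[i].VT has typically been promoted to
    // i32 by this point; passing the promoted type as ValVT and the original
    // type as LocVT, as described above, is what lets CC_AArch64_AAPCS still
    // pass the value as an i8 on the stack rather than as an i32.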
993static void changeFPCCToAArch64CC(ISD::CondCode CC, 994 AArch64CC::CondCode &CondCode, 995 AArch64CC::CondCode &CondCode2) { 996 CondCode2 = AArch64CC::AL; 997 switch (CC) { 998 default: 999 llvm_unreachable("Unknown FP condition!"); 1000 case ISD::SETEQ: 1001 case ISD::SETOEQ: 1002 CondCode = AArch64CC::EQ; 1003 break; 1004 case ISD::SETGT: 1005 case ISD::SETOGT: 1006 CondCode = AArch64CC::GT; 1007 break; 1008 case ISD::SETGE: 1009 case ISD::SETOGE: 1010 CondCode = AArch64CC::GE; 1011 break; 1012 case ISD::SETOLT: 1013 CondCode = AArch64CC::MI; 1014 break; 1015 case ISD::SETOLE: 1016 CondCode = AArch64CC::LS; 1017 break; 1018 case ISD::SETONE: 1019 CondCode = AArch64CC::MI; 1020 CondCode2 = AArch64CC::GT; 1021 break; 1022 case ISD::SETO: 1023 CondCode = AArch64CC::VC; 1024 break; 1025 case ISD::SETUO: 1026 CondCode = AArch64CC::VS; 1027 break; 1028 case ISD::SETUEQ: 1029 CondCode = AArch64CC::EQ; 1030 CondCode2 = AArch64CC::VS; 1031 break; 1032 case ISD::SETUGT: 1033 CondCode = AArch64CC::HI; 1034 break; 1035 case ISD::SETUGE: 1036 CondCode = AArch64CC::PL; 1037 break; 1038 case ISD::SETLT: 1039 case ISD::SETULT: 1040 CondCode = AArch64CC::LT; 1041 break; 1042 case ISD::SETLE: 1043 case ISD::SETULE: 1044 CondCode = AArch64CC::LE; 1045 break; 1046 case ISD::SETNE: 1047 case ISD::SETUNE: 1048 CondCode = AArch64CC::NE; 1049 break; 1050 } 1051} 1052 1053/// changeVectorFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 1054/// CC usable with the vector instructions. Fewer operations are available 1055/// without a real NZCV register, so we have to use less efficient combinations 1056/// to get the same effect. 1057static void changeVectorFPCCToAArch64CC(ISD::CondCode CC, 1058 AArch64CC::CondCode &CondCode, 1059 AArch64CC::CondCode &CondCode2, 1060 bool &Invert) { 1061 Invert = false; 1062 switch (CC) { 1063 default: 1064 // Mostly the scalar mappings work fine. 1065 changeFPCCToAArch64CC(CC, CondCode, CondCode2); 1066 break; 1067 case ISD::SETUO: 1068 Invert = true; // Fallthrough 1069 case ISD::SETO: 1070 CondCode = AArch64CC::MI; 1071 CondCode2 = AArch64CC::GE; 1072 break; 1073 case ISD::SETUEQ: 1074 case ISD::SETULT: 1075 case ISD::SETULE: 1076 case ISD::SETUGT: 1077 case ISD::SETUGE: 1078 // All of the compare-mask comparisons are ordered, but we can switch 1079 // between the two by a double inversion. E.g. ULE == !OGT. 1080 Invert = true; 1081 changeFPCCToAArch64CC(getSetCCInverse(CC, false), CondCode, CondCode2); 1082 break; 1083 } 1084} 1085 1086static bool isLegalArithImmed(uint64_t C) { 1087 // Matches AArch64DAGToDAGISel::SelectArithImmed(). 1088 return (C >> 12 == 0) || ((C & 0xFFFULL) == 0 && C >> 24 == 0); 1089} 1090 1091static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC, 1092 SDLoc dl, SelectionDAG &DAG) { 1093 EVT VT = LHS.getValueType(); 1094 1095 if (VT.isFloatingPoint()) 1096 return DAG.getNode(AArch64ISD::FCMP, dl, VT, LHS, RHS); 1097 1098 // The CMP instruction is just an alias for SUBS, and representing it as 1099 // SUBS means that it's possible to get CSE with subtract operations. 1100 // A later phase can perform the optimization of setting the destination 1101 // register to WZR/XZR if it ends up being unused. 
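  // For instance, "cmp w0, w1" is simply the alias "subs wzr, w0, w1", so a
  // nearby "sub w2, w0, w1" can share the same SUBS node once CSE has run.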
1102 unsigned Opcode = AArch64ISD::SUBS; 1103 1104 if (RHS.getOpcode() == ISD::SUB && isa<ConstantSDNode>(RHS.getOperand(0)) && 1105 cast<ConstantSDNode>(RHS.getOperand(0))->getZExtValue() == 0 && 1106 (CC == ISD::SETEQ || CC == ISD::SETNE)) { 1107 // We'd like to combine a (CMP op1, (sub 0, op2) into a CMN instruction on 1108 // the grounds that "op1 - (-op2) == op1 + op2". However, the C and V flags 1109 // can be set differently by this operation. It comes down to whether 1110 // "SInt(~op2)+1 == SInt(~op2+1)" (and the same for UInt). If they are then 1111 // everything is fine. If not then the optimization is wrong. Thus general 1112 // comparisons are only valid if op2 != 0. 1113 1114 // So, finally, the only LLVM-native comparisons that don't mention C and V 1115 // are SETEQ and SETNE. They're the only ones we can safely use CMN for in 1116 // the absence of information about op2. 1117 Opcode = AArch64ISD::ADDS; 1118 RHS = RHS.getOperand(1); 1119 } else if (LHS.getOpcode() == ISD::AND && isa<ConstantSDNode>(RHS) && 1120 cast<ConstantSDNode>(RHS)->getZExtValue() == 0 && 1121 !isUnsignedIntSetCC(CC)) { 1122 // Similarly, (CMP (and X, Y), 0) can be implemented with a TST 1123 // (a.k.a. ANDS) except that the flags are only guaranteed to work for one 1124 // of the signed comparisons. 1125 Opcode = AArch64ISD::ANDS; 1126 RHS = LHS.getOperand(1); 1127 LHS = LHS.getOperand(0); 1128 } 1129 1130 return DAG.getNode(Opcode, dl, DAG.getVTList(VT, MVT::i32), LHS, RHS) 1131 .getValue(1); 1132} 1133 1134static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, 1135 SDValue &AArch64cc, SelectionDAG &DAG, SDLoc dl) { 1136 SDValue Cmp; 1137 AArch64CC::CondCode AArch64CC; 1138 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) { 1139 EVT VT = RHS.getValueType(); 1140 uint64_t C = RHSC->getZExtValue(); 1141 if (!isLegalArithImmed(C)) { 1142 // Constant does not fit, try adjusting it by one? 1143 switch (CC) { 1144 default: 1145 break; 1146 case ISD::SETLT: 1147 case ISD::SETGE: 1148 if ((VT == MVT::i32 && C != 0x80000000 && 1149 isLegalArithImmed((uint32_t)(C - 1))) || 1150 (VT == MVT::i64 && C != 0x80000000ULL && 1151 isLegalArithImmed(C - 1ULL))) { 1152 CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT; 1153 C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1; 1154 RHS = DAG.getConstant(C, VT); 1155 } 1156 break; 1157 case ISD::SETULT: 1158 case ISD::SETUGE: 1159 if ((VT == MVT::i32 && C != 0 && 1160 isLegalArithImmed((uint32_t)(C - 1))) || 1161 (VT == MVT::i64 && C != 0ULL && isLegalArithImmed(C - 1ULL))) { 1162 CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT; 1163 C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1; 1164 RHS = DAG.getConstant(C, VT); 1165 } 1166 break; 1167 case ISD::SETLE: 1168 case ISD::SETGT: 1169 if ((VT == MVT::i32 && C != INT32_MAX && 1170 isLegalArithImmed((uint32_t)(C + 1))) || 1171 (VT == MVT::i64 && C != INT64_MAX && 1172 isLegalArithImmed(C + 1ULL))) { 1173 CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE; 1174 C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1; 1175 RHS = DAG.getConstant(C, VT); 1176 } 1177 break; 1178 case ISD::SETULE: 1179 case ISD::SETUGT: 1180 if ((VT == MVT::i32 && C != UINT32_MAX && 1181 isLegalArithImmed((uint32_t)(C + 1))) || 1182 (VT == MVT::i64 && C != UINT64_MAX && 1183 isLegalArithImmed(C + 1ULL))) { 1184 CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE; 1185 C = (VT == MVT::i32) ? 
(uint32_t)(C + 1) : C + 1; 1186 RHS = DAG.getConstant(C, VT); 1187 } 1188 break; 1189 } 1190 } 1191 } 1192 // The imm operand of ADDS is an unsigned immediate, in the range 0 to 4095. 1193 // For the i8 operand, the largest immediate is 255, so this can be easily 1194 // encoded in the compare instruction. For the i16 operand, however, the 1195 // largest immediate cannot be encoded in the compare. 1196 // Therefore, use a sign extending load and cmn to avoid materializing the -1 1197 // constant. For example, 1198 // movz w1, #65535 1199 // ldrh w0, [x0, #0] 1200 // cmp w0, w1 1201 // > 1202 // ldrsh w0, [x0, #0] 1203 // cmn w0, #1 1204 // Fundamental, we're relying on the property that (zext LHS) == (zext RHS) 1205 // if and only if (sext LHS) == (sext RHS). The checks are in place to ensure 1206 // both the LHS and RHS are truely zero extended and to make sure the 1207 // transformation is profitable. 1208 if ((CC == ISD::SETEQ || CC == ISD::SETNE) && isa<ConstantSDNode>(RHS)) { 1209 if ((cast<ConstantSDNode>(RHS)->getZExtValue() >> 16 == 0) && 1210 isa<LoadSDNode>(LHS)) { 1211 if (cast<LoadSDNode>(LHS)->getExtensionType() == ISD::ZEXTLOAD && 1212 cast<LoadSDNode>(LHS)->getMemoryVT() == MVT::i16 && 1213 LHS.getNode()->hasNUsesOfValue(1, 0)) { 1214 int16_t ValueofRHS = cast<ConstantSDNode>(RHS)->getZExtValue(); 1215 if (ValueofRHS < 0 && isLegalArithImmed(-ValueofRHS)) { 1216 SDValue SExt = 1217 DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, LHS.getValueType(), LHS, 1218 DAG.getValueType(MVT::i16)); 1219 Cmp = emitComparison(SExt, 1220 DAG.getConstant(ValueofRHS, RHS.getValueType()), 1221 CC, dl, DAG); 1222 AArch64CC = changeIntCCToAArch64CC(CC); 1223 AArch64cc = DAG.getConstant(AArch64CC, MVT::i32); 1224 return Cmp; 1225 } 1226 } 1227 } 1228 } 1229 Cmp = emitComparison(LHS, RHS, CC, dl, DAG); 1230 AArch64CC = changeIntCCToAArch64CC(CC); 1231 AArch64cc = DAG.getConstant(AArch64CC, MVT::i32); 1232 return Cmp; 1233} 1234 1235static std::pair<SDValue, SDValue> 1236getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) { 1237 assert((Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::i64) && 1238 "Unsupported value type"); 1239 SDValue Value, Overflow; 1240 SDLoc DL(Op); 1241 SDValue LHS = Op.getOperand(0); 1242 SDValue RHS = Op.getOperand(1); 1243 unsigned Opc = 0; 1244 switch (Op.getOpcode()) { 1245 default: 1246 llvm_unreachable("Unknown overflow instruction!"); 1247 case ISD::SADDO: 1248 Opc = AArch64ISD::ADDS; 1249 CC = AArch64CC::VS; 1250 break; 1251 case ISD::UADDO: 1252 Opc = AArch64ISD::ADDS; 1253 CC = AArch64CC::HS; 1254 break; 1255 case ISD::SSUBO: 1256 Opc = AArch64ISD::SUBS; 1257 CC = AArch64CC::VS; 1258 break; 1259 case ISD::USUBO: 1260 Opc = AArch64ISD::SUBS; 1261 CC = AArch64CC::LO; 1262 break; 1263 // Multiply needs a little bit extra work. 1264 case ISD::SMULO: 1265 case ISD::UMULO: { 1266 CC = AArch64CC::NE; 1267 bool IsSigned = (Op.getOpcode() == ISD::SMULO) ? true : false; 1268 if (Op.getValueType() == MVT::i32) { 1269 unsigned ExtendOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; 1270 // For a 32 bit multiply with overflow check we want the instruction 1271 // selector to generate a widening multiply (SMADDL/UMADDL). 
For that we 1272 // need to generate the following pattern: 1273 // (i64 add 0, (i64 mul (i64 sext|zext i32 %a), (i64 sext|zext i32 %b)) 1274 LHS = DAG.getNode(ExtendOpc, DL, MVT::i64, LHS); 1275 RHS = DAG.getNode(ExtendOpc, DL, MVT::i64, RHS); 1276 SDValue Mul = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS); 1277 SDValue Add = DAG.getNode(ISD::ADD, DL, MVT::i64, Mul, 1278 DAG.getConstant(0, MVT::i64)); 1279 // On AArch64 the upper 32 bits are always zero extended for a 32 bit 1280 // operation. We need to clear out the upper 32 bits, because we used a 1281 // widening multiply that wrote all 64 bits. In the end this should be a 1282 // noop. 1283 Value = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Add); 1284 if (IsSigned) { 1285 // The signed overflow check requires more than just a simple check for 1286 // any bit set in the upper 32 bits of the result. These bits could be 1287 // just the sign bits of a negative number. To perform the overflow 1288 // check we have to arithmetic shift right the 32nd bit of the result by 1289 // 31 bits. Then we compare the result to the upper 32 bits. 1290 SDValue UpperBits = DAG.getNode(ISD::SRL, DL, MVT::i64, Add, 1291 DAG.getConstant(32, MVT::i64)); 1292 UpperBits = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, UpperBits); 1293 SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i32, Value, 1294 DAG.getConstant(31, MVT::i64)); 1295 // It is important that LowerBits is last, otherwise the arithmetic 1296 // shift will not be folded into the compare (SUBS). 1297 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::i32); 1298 Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits) 1299 .getValue(1); 1300 } else { 1301 // The overflow check for unsigned multiply is easy. We only need to 1302 // check if any of the upper 32 bits are set. This can be done with a 1303 // CMP (shifted register). For that we need to generate the following 1304 // pattern: 1305 // (i64 AArch64ISD::SUBS i64 0, (i64 srl i64 %Mul, i64 32) 1306 SDValue UpperBits = DAG.getNode(ISD::SRL, DL, MVT::i64, Mul, 1307 DAG.getConstant(32, MVT::i64)); 1308 SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32); 1309 Overflow = 1310 DAG.getNode(AArch64ISD::SUBS, DL, VTs, DAG.getConstant(0, MVT::i64), 1311 UpperBits).getValue(1); 1312 } 1313 break; 1314 } 1315 assert(Op.getValueType() == MVT::i64 && "Expected an i64 value type"); 1316 // For the 64 bit multiply 1317 Value = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS); 1318 if (IsSigned) { 1319 SDValue UpperBits = DAG.getNode(ISD::MULHS, DL, MVT::i64, LHS, RHS); 1320 SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i64, Value, 1321 DAG.getConstant(63, MVT::i64)); 1322 // It is important that LowerBits is last, otherwise the arithmetic 1323 // shift will not be folded into the compare (SUBS). 1324 SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32); 1325 Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits) 1326 .getValue(1); 1327 } else { 1328 SDValue UpperBits = DAG.getNode(ISD::MULHU, DL, MVT::i64, LHS, RHS); 1329 SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32); 1330 Overflow = 1331 DAG.getNode(AArch64ISD::SUBS, DL, VTs, DAG.getConstant(0, MVT::i64), 1332 UpperBits).getValue(1); 1333 } 1334 break; 1335 } 1336 } // switch (...) 1337 1338 if (Opc) { 1339 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32); 1340 1341 // Emit the AArch64 operation with overflow check. 
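    // The extra MVT::i32 result in VTs models the NZCV flags; Overflow below
    // is that flag value, which callers such as LowerXALUO turn into a
    // CSEL/CSINC on the appropriate condition.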
1342 Value = DAG.getNode(Opc, DL, VTs, LHS, RHS); 1343 Overflow = Value.getValue(1); 1344 } 1345 return std::make_pair(Value, Overflow); 1346} 1347 1348SDValue AArch64TargetLowering::LowerF128Call(SDValue Op, SelectionDAG &DAG, 1349 RTLIB::Libcall Call) const { 1350 SmallVector<SDValue, 2> Ops; 1351 for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) 1352 Ops.push_back(Op.getOperand(i)); 1353 1354 return makeLibCall(DAG, Call, MVT::f128, &Ops[0], Ops.size(), false, 1355 SDLoc(Op)).first; 1356} 1357 1358static SDValue LowerXOR(SDValue Op, SelectionDAG &DAG) { 1359 SDValue Sel = Op.getOperand(0); 1360 SDValue Other = Op.getOperand(1); 1361 1362 // If neither operand is a SELECT_CC, give up. 1363 if (Sel.getOpcode() != ISD::SELECT_CC) 1364 std::swap(Sel, Other); 1365 if (Sel.getOpcode() != ISD::SELECT_CC) 1366 return Op; 1367 1368 // The folding we want to perform is: 1369 // (xor x, (select_cc a, b, cc, 0, -1) ) 1370 // --> 1371 // (csel x, (xor x, -1), cc ...) 1372 // 1373 // The latter will get matched to a CSINV instruction. 1374 1375 ISD::CondCode CC = cast<CondCodeSDNode>(Sel.getOperand(4))->get(); 1376 SDValue LHS = Sel.getOperand(0); 1377 SDValue RHS = Sel.getOperand(1); 1378 SDValue TVal = Sel.getOperand(2); 1379 SDValue FVal = Sel.getOperand(3); 1380 SDLoc dl(Sel); 1381 1382 // FIXME: This could be generalized to non-integer comparisons. 1383 if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64) 1384 return Op; 1385 1386 ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal); 1387 ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal); 1388 1389 // The the values aren't constants, this isn't the pattern we're looking for. 1390 if (!CFVal || !CTVal) 1391 return Op; 1392 1393 // We can commute the SELECT_CC by inverting the condition. This 1394 // might be needed to make this fit into a CSINV pattern. 1395 if (CTVal->isAllOnesValue() && CFVal->isNullValue()) { 1396 std::swap(TVal, FVal); 1397 std::swap(CTVal, CFVal); 1398 CC = ISD::getSetCCInverse(CC, true); 1399 } 1400 1401 // If the constants line up, perform the transform! 1402 if (CTVal->isNullValue() && CFVal->isAllOnesValue()) { 1403 SDValue CCVal; 1404 SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl); 1405 1406 FVal = Other; 1407 TVal = DAG.getNode(ISD::XOR, dl, Other.getValueType(), Other, 1408 DAG.getConstant(-1ULL, Other.getValueType())); 1409 1410 return DAG.getNode(AArch64ISD::CSEL, dl, Sel.getValueType(), FVal, TVal, 1411 CCVal, Cmp); 1412 } 1413 1414 return Op; 1415} 1416 1417static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) { 1418 EVT VT = Op.getValueType(); 1419 1420 // Let legalize expand this if it isn't a legal type yet. 
1421 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT)) 1422 return SDValue(); 1423 1424 SDVTList VTs = DAG.getVTList(VT, MVT::i32); 1425 1426 unsigned Opc; 1427 bool ExtraOp = false; 1428 switch (Op.getOpcode()) { 1429 default: 1430 llvm_unreachable("Invalid code"); 1431 case ISD::ADDC: 1432 Opc = AArch64ISD::ADDS; 1433 break; 1434 case ISD::SUBC: 1435 Opc = AArch64ISD::SUBS; 1436 break; 1437 case ISD::ADDE: 1438 Opc = AArch64ISD::ADCS; 1439 ExtraOp = true; 1440 break; 1441 case ISD::SUBE: 1442 Opc = AArch64ISD::SBCS; 1443 ExtraOp = true; 1444 break; 1445 } 1446 1447 if (!ExtraOp) 1448 return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), Op.getOperand(1)); 1449 return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), Op.getOperand(1), 1450 Op.getOperand(2)); 1451} 1452 1453static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) { 1454 // Let legalize expand this if it isn't a legal type yet. 1455 if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType())) 1456 return SDValue(); 1457 1458 AArch64CC::CondCode CC; 1459 // The actual operation that sets the overflow or carry flag. 1460 SDValue Value, Overflow; 1461 std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Op, DAG); 1462 1463 // We use 0 and 1 as false and true values. 1464 SDValue TVal = DAG.getConstant(1, MVT::i32); 1465 SDValue FVal = DAG.getConstant(0, MVT::i32); 1466 1467 // We use an inverted condition, because the conditional select is inverted 1468 // too. This will allow it to be selected to a single instruction: 1469 // CSINC Wd, WZR, WZR, invert(cond). 1470 SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), MVT::i32); 1471 Overflow = DAG.getNode(AArch64ISD::CSEL, SDLoc(Op), MVT::i32, FVal, TVal, 1472 CCVal, Overflow); 1473 1474 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); 1475 return DAG.getNode(ISD::MERGE_VALUES, SDLoc(Op), VTs, Value, Overflow); 1476} 1477 1478// Prefetch operands are: 1479// 1: Address to prefetch 1480// 2: bool isWrite 1481// 3: int locality (0 = no locality ... 3 = extreme locality) 1482// 4: bool isDataCache 1483static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG) { 1484 SDLoc DL(Op); 1485 unsigned IsWrite = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue(); 1486 unsigned Locality = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue(); 1487 unsigned IsData = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue(); 1488 1489 bool IsStream = !Locality; 1490 // When the locality number is set 1491 if (Locality) { 1492 // The front-end should have filtered out the out-of-range values 1493 assert(Locality <= 3 && "Prefetch locality out-of-range"); 1494 // The locality degree is the opposite of the cache speed. 1495 // Put the number the other way around. 1496 // The encoding starts at 0 for level 1 1497 Locality = 3 - Locality; 1498 } 1499 1500 // built the mask value encoding the expected behavior. 
1501 unsigned PrfOp = (IsWrite << 4) | // Load/Store bit 1502 (!IsData << 3) | // IsDataCache bit 1503 (Locality << 1) | // Cache level bits 1504 (unsigned)IsStream; // Stream bit 1505 return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Op.getOperand(0), 1506 DAG.getConstant(PrfOp, MVT::i32), Op.getOperand(1)); 1507} 1508 1509SDValue AArch64TargetLowering::LowerFP_EXTEND(SDValue Op, 1510 SelectionDAG &DAG) const { 1511 assert(Op.getValueType() == MVT::f128 && "Unexpected lowering"); 1512 1513 RTLIB::Libcall LC; 1514 LC = RTLIB::getFPEXT(Op.getOperand(0).getValueType(), Op.getValueType()); 1515 1516 return LowerF128Call(Op, DAG, LC); 1517} 1518 1519SDValue AArch64TargetLowering::LowerFP_ROUND(SDValue Op, 1520 SelectionDAG &DAG) const { 1521 if (Op.getOperand(0).getValueType() != MVT::f128) { 1522 // It's legal except when f128 is involved 1523 return Op; 1524 } 1525 1526 RTLIB::Libcall LC; 1527 LC = RTLIB::getFPROUND(Op.getOperand(0).getValueType(), Op.getValueType()); 1528 1529 // FP_ROUND node has a second operand indicating whether it is known to be 1530 // precise. That doesn't take part in the LibCall so we can't directly use 1531 // LowerF128Call. 1532 SDValue SrcVal = Op.getOperand(0); 1533 return makeLibCall(DAG, LC, Op.getValueType(), &SrcVal, 1, 1534 /*isSigned*/ false, SDLoc(Op)).first; 1535} 1536 1537static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG) { 1538 // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp. 1539 // Any additional optimization in this function should be recorded 1540 // in the cost tables. 1541 EVT InVT = Op.getOperand(0).getValueType(); 1542 EVT VT = Op.getValueType(); 1543 1544 if (VT.getSizeInBits() < InVT.getSizeInBits()) { 1545 SDLoc dl(Op); 1546 SDValue Cv = 1547 DAG.getNode(Op.getOpcode(), dl, InVT.changeVectorElementTypeToInteger(), 1548 Op.getOperand(0)); 1549 return DAG.getNode(ISD::TRUNCATE, dl, VT, Cv); 1550 } 1551 1552 if (VT.getSizeInBits() > InVT.getSizeInBits()) { 1553 SDLoc dl(Op); 1554 MVT ExtVT = 1555 MVT::getVectorVT(MVT::getFloatingPointVT(VT.getScalarSizeInBits()), 1556 VT.getVectorNumElements()); 1557 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, dl, ExtVT, Op.getOperand(0)); 1558 return DAG.getNode(Op.getOpcode(), dl, VT, Ext); 1559 } 1560 1561 // Type changing conversions are illegal. 1562 return Op; 1563} 1564 1565SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op, 1566 SelectionDAG &DAG) const { 1567 if (Op.getOperand(0).getValueType().isVector()) 1568 return LowerVectorFP_TO_INT(Op, DAG); 1569 1570 if (Op.getOperand(0).getValueType() != MVT::f128) { 1571 // It's legal except when f128 is involved 1572 return Op; 1573 } 1574 1575 RTLIB::Libcall LC; 1576 if (Op.getOpcode() == ISD::FP_TO_SINT) 1577 LC = RTLIB::getFPTOSINT(Op.getOperand(0).getValueType(), Op.getValueType()); 1578 else 1579 LC = RTLIB::getFPTOUINT(Op.getOperand(0).getValueType(), Op.getValueType()); 1580 1581 SmallVector<SDValue, 2> Ops; 1582 for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) 1583 Ops.push_back(Op.getOperand(i)); 1584 1585 return makeLibCall(DAG, LC, Op.getValueType(), &Ops[0], Ops.size(), false, 1586 SDLoc(Op)).first; 1587} 1588 1589static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) { 1590 // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp. 1591 // Any additional optimization in this function should be recorded 1592 // in the cost tables. 
1593 EVT VT = Op.getValueType(); 1594 SDLoc dl(Op); 1595 SDValue In = Op.getOperand(0); 1596 EVT InVT = In.getValueType(); 1597 1598 if (VT.getSizeInBits() < InVT.getSizeInBits()) { 1599 MVT CastVT = 1600 MVT::getVectorVT(MVT::getFloatingPointVT(InVT.getScalarSizeInBits()), 1601 InVT.getVectorNumElements()); 1602 In = DAG.getNode(Op.getOpcode(), dl, CastVT, In); 1603 return DAG.getNode(ISD::FP_ROUND, dl, VT, In, DAG.getIntPtrConstant(0)); 1604 } 1605 1606 if (VT.getSizeInBits() > InVT.getSizeInBits()) { 1607 unsigned CastOpc = 1608 Op.getOpcode() == ISD::SINT_TO_FP ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; 1609 EVT CastVT = VT.changeVectorElementTypeToInteger(); 1610 In = DAG.getNode(CastOpc, dl, CastVT, In); 1611 return DAG.getNode(Op.getOpcode(), dl, VT, In); 1612 } 1613 1614 return Op; 1615} 1616 1617SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op, 1618 SelectionDAG &DAG) const { 1619 if (Op.getValueType().isVector()) 1620 return LowerVectorINT_TO_FP(Op, DAG); 1621 1622 // i128 conversions are libcalls. 1623 if (Op.getOperand(0).getValueType() == MVT::i128) 1624 return SDValue(); 1625 1626 // Other conversions are legal, unless it's to the completely software-based 1627 // fp128. 1628 if (Op.getValueType() != MVT::f128) 1629 return Op; 1630 1631 RTLIB::Libcall LC; 1632 if (Op.getOpcode() == ISD::SINT_TO_FP) 1633 LC = RTLIB::getSINTTOFP(Op.getOperand(0).getValueType(), Op.getValueType()); 1634 else 1635 LC = RTLIB::getUINTTOFP(Op.getOperand(0).getValueType(), Op.getValueType()); 1636 1637 return LowerF128Call(Op, DAG, LC); 1638} 1639 1640SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op, 1641 SelectionDAG &DAG) const { 1642 // For iOS, we want to call an alternative entry point: __sincos_stret, 1643 // which returns the values in two S / D registers. 1644 SDLoc dl(Op); 1645 SDValue Arg = Op.getOperand(0); 1646 EVT ArgVT = Arg.getValueType(); 1647 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); 1648 1649 ArgListTy Args; 1650 ArgListEntry Entry; 1651 1652 Entry.Node = Arg; 1653 Entry.Ty = ArgTy; 1654 Entry.isSExt = false; 1655 Entry.isZExt = false; 1656 Args.push_back(Entry); 1657 1658 const char *LibcallName = 1659 (ArgVT == MVT::f64) ? 
"__sincos_stret" : "__sincosf_stret"; 1660 SDValue Callee = DAG.getExternalSymbol(LibcallName, getPointerTy()); 1661 1662 StructType *RetTy = StructType::get(ArgTy, ArgTy, nullptr); 1663 TargetLowering::CallLoweringInfo CLI(DAG); 1664 CLI.setDebugLoc(dl).setChain(DAG.getEntryNode()) 1665 .setCallee(CallingConv::Fast, RetTy, Callee, std::move(Args), 0); 1666 1667 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI); 1668 return CallResult.first; 1669} 1670 1671static SDValue LowerBITCAST(SDValue Op, SelectionDAG &DAG) { 1672 if (Op.getValueType() != MVT::f16) 1673 return SDValue(); 1674 1675 assert(Op.getOperand(0).getValueType() == MVT::i16); 1676 SDLoc DL(Op); 1677 1678 Op = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op.getOperand(0)); 1679 Op = DAG.getNode(ISD::BITCAST, DL, MVT::f32, Op); 1680 return SDValue( 1681 DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, MVT::f16, Op, 1682 DAG.getTargetConstant(AArch64::hsub, MVT::i32)), 1683 0); 1684} 1685 1686static EVT getExtensionTo64Bits(const EVT &OrigVT) { 1687 if (OrigVT.getSizeInBits() >= 64) 1688 return OrigVT; 1689 1690 assert(OrigVT.isSimple() && "Expecting a simple value type"); 1691 1692 MVT::SimpleValueType OrigSimpleTy = OrigVT.getSimpleVT().SimpleTy; 1693 switch (OrigSimpleTy) { 1694 default: llvm_unreachable("Unexpected Vector Type"); 1695 case MVT::v2i8: 1696 case MVT::v2i16: 1697 return MVT::v2i32; 1698 case MVT::v4i8: 1699 return MVT::v4i16; 1700 } 1701} 1702 1703static SDValue addRequiredExtensionForVectorMULL(SDValue N, SelectionDAG &DAG, 1704 const EVT &OrigTy, 1705 const EVT &ExtTy, 1706 unsigned ExtOpcode) { 1707 // The vector originally had a size of OrigTy. It was then extended to ExtTy. 1708 // We expect the ExtTy to be 128-bits total. If the OrigTy is less than 1709 // 64-bits we need to insert a new extension so that it will be 64-bits. 1710 assert(ExtTy.is128BitVector() && "Unexpected extension size"); 1711 if (OrigTy.getSizeInBits() >= 64) 1712 return N; 1713 1714 // Must extend size to at least 64 bits to be used as an operand for VMULL. 
1715 EVT NewVT = getExtensionTo64Bits(OrigTy); 1716 1717 return DAG.getNode(ExtOpcode, SDLoc(N), NewVT, N); 1718} 1719 1720static bool isExtendedBUILD_VECTOR(SDNode *N, SelectionDAG &DAG, 1721 bool isSigned) { 1722 EVT VT = N->getValueType(0); 1723 1724 if (N->getOpcode() != ISD::BUILD_VECTOR) 1725 return false; 1726 1727 for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) { 1728 SDNode *Elt = N->getOperand(i).getNode(); 1729 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) { 1730 unsigned EltSize = VT.getVectorElementType().getSizeInBits(); 1731 unsigned HalfSize = EltSize / 2; 1732 if (isSigned) { 1733 if (!isIntN(HalfSize, C->getSExtValue())) 1734 return false; 1735 } else { 1736 if (!isUIntN(HalfSize, C->getZExtValue())) 1737 return false; 1738 } 1739 continue; 1740 } 1741 return false; 1742 } 1743 1744 return true; 1745} 1746 1747static SDValue skipExtensionForVectorMULL(SDNode *N, SelectionDAG &DAG) { 1748 if (N->getOpcode() == ISD::SIGN_EXTEND || N->getOpcode() == ISD::ZERO_EXTEND) 1749 return addRequiredExtensionForVectorMULL(N->getOperand(0), DAG, 1750 N->getOperand(0)->getValueType(0), 1751 N->getValueType(0), 1752 N->getOpcode()); 1753 1754 assert(N->getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR"); 1755 EVT VT = N->getValueType(0); 1756 unsigned EltSize = VT.getVectorElementType().getSizeInBits() / 2; 1757 unsigned NumElts = VT.getVectorNumElements(); 1758 MVT TruncVT = MVT::getIntegerVT(EltSize); 1759 SmallVector<SDValue, 8> Ops; 1760 for (unsigned i = 0; i != NumElts; ++i) { 1761 ConstantSDNode *C = cast<ConstantSDNode>(N->getOperand(i)); 1762 const APInt &CInt = C->getAPIntValue(); 1763 // Element types smaller than 32 bits are not legal, so use i32 elements. 1764 // The values are implicitly truncated so sext vs. zext doesn't matter. 1765 Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), MVT::i32)); 1766 } 1767 return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(N), 1768 MVT::getVectorVT(TruncVT, NumElts), Ops); 1769} 1770 1771static bool isSignExtended(SDNode *N, SelectionDAG &DAG) { 1772 if (N->getOpcode() == ISD::SIGN_EXTEND) 1773 return true; 1774 if (isExtendedBUILD_VECTOR(N, DAG, true)) 1775 return true; 1776 return false; 1777} 1778 1779static bool isZeroExtended(SDNode *N, SelectionDAG &DAG) { 1780 if (N->getOpcode() == ISD::ZERO_EXTEND) 1781 return true; 1782 if (isExtendedBUILD_VECTOR(N, DAG, false)) 1783 return true; 1784 return false; 1785} 1786 1787static bool isAddSubSExt(SDNode *N, SelectionDAG &DAG) { 1788 unsigned Opcode = N->getOpcode(); 1789 if (Opcode == ISD::ADD || Opcode == ISD::SUB) { 1790 SDNode *N0 = N->getOperand(0).getNode(); 1791 SDNode *N1 = N->getOperand(1).getNode(); 1792 return N0->hasOneUse() && N1->hasOneUse() && 1793 isSignExtended(N0, DAG) && isSignExtended(N1, DAG); 1794 } 1795 return false; 1796} 1797 1798static bool isAddSubZExt(SDNode *N, SelectionDAG &DAG) { 1799 unsigned Opcode = N->getOpcode(); 1800 if (Opcode == ISD::ADD || Opcode == ISD::SUB) { 1801 SDNode *N0 = N->getOperand(0).getNode(); 1802 SDNode *N1 = N->getOperand(1).getNode(); 1803 return N0->hasOneUse() && N1->hasOneUse() && 1804 isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG); 1805 } 1806 return false; 1807} 1808 1809static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) { 1810 // Multiplications are only custom-lowered for 128-bit vectors so that 1811 // VMULL can be detected. Otherwise v2i64 multiplications are not legal. 
1812 EVT VT = Op.getValueType(); 1813 assert(VT.is128BitVector() && VT.isInteger() && 1814 "unexpected type for custom-lowering ISD::MUL"); 1815 SDNode *N0 = Op.getOperand(0).getNode(); 1816 SDNode *N1 = Op.getOperand(1).getNode(); 1817 unsigned NewOpc = 0; 1818 bool isMLA = false; 1819 bool isN0SExt = isSignExtended(N0, DAG); 1820 bool isN1SExt = isSignExtended(N1, DAG); 1821 if (isN0SExt && isN1SExt) 1822 NewOpc = AArch64ISD::SMULL; 1823 else { 1824 bool isN0ZExt = isZeroExtended(N0, DAG); 1825 bool isN1ZExt = isZeroExtended(N1, DAG); 1826 if (isN0ZExt && isN1ZExt) 1827 NewOpc = AArch64ISD::UMULL; 1828 else if (isN1SExt || isN1ZExt) { 1829 // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these 1830 // into (s/zext A * s/zext C) + (s/zext B * s/zext C) 1831 if (isN1SExt && isAddSubSExt(N0, DAG)) { 1832 NewOpc = AArch64ISD::SMULL; 1833 isMLA = true; 1834 } else if (isN1ZExt && isAddSubZExt(N0, DAG)) { 1835 NewOpc = AArch64ISD::UMULL; 1836 isMLA = true; 1837 } else if (isN0ZExt && isAddSubZExt(N1, DAG)) { 1838 std::swap(N0, N1); 1839 NewOpc = AArch64ISD::UMULL; 1840 isMLA = true; 1841 } 1842 } 1843 1844 if (!NewOpc) { 1845 if (VT == MVT::v2i64) 1846 // Fall through to expand this. It is not legal. 1847 return SDValue(); 1848 else 1849 // Other vector multiplications are legal. 1850 return Op; 1851 } 1852 } 1853 1854 // Legalize to a S/UMULL instruction 1855 SDLoc DL(Op); 1856 SDValue Op0; 1857 SDValue Op1 = skipExtensionForVectorMULL(N1, DAG); 1858 if (!isMLA) { 1859 Op0 = skipExtensionForVectorMULL(N0, DAG); 1860 assert(Op0.getValueType().is64BitVector() && 1861 Op1.getValueType().is64BitVector() && 1862 "unexpected types for extended operands to VMULL"); 1863 return DAG.getNode(NewOpc, DL, VT, Op0, Op1); 1864 } 1865 // Optimizing (zext A + zext B) * C, to (S/UMULL A, C) + (S/UMULL B, C) during 1866 // isel lowering to take advantage of no-stall back to back s/umul + s/umla. 
1867 // This is true for CPUs with accumulate forwarding such as Cortex-A53/A57 1868 SDValue N00 = skipExtensionForVectorMULL(N0->getOperand(0).getNode(), DAG); 1869 SDValue N01 = skipExtensionForVectorMULL(N0->getOperand(1).getNode(), DAG); 1870 EVT Op1VT = Op1.getValueType(); 1871 return DAG.getNode(N0->getOpcode(), DL, VT, 1872 DAG.getNode(NewOpc, DL, VT, 1873 DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1), 1874 DAG.getNode(NewOpc, DL, VT, 1875 DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1)); 1876} 1877 1878SDValue AArch64TargetLowering::LowerOperation(SDValue Op, 1879 SelectionDAG &DAG) const { 1880 switch (Op.getOpcode()) { 1881 default: 1882 llvm_unreachable("unimplemented operand"); 1883 return SDValue(); 1884 case ISD::BITCAST: 1885 return LowerBITCAST(Op, DAG); 1886 case ISD::GlobalAddress: 1887 return LowerGlobalAddress(Op, DAG); 1888 case ISD::GlobalTLSAddress: 1889 return LowerGlobalTLSAddress(Op, DAG); 1890 case ISD::SETCC: 1891 return LowerSETCC(Op, DAG); 1892 case ISD::BR_CC: 1893 return LowerBR_CC(Op, DAG); 1894 case ISD::SELECT: 1895 return LowerSELECT(Op, DAG); 1896 case ISD::SELECT_CC: 1897 return LowerSELECT_CC(Op, DAG); 1898 case ISD::JumpTable: 1899 return LowerJumpTable(Op, DAG); 1900 case ISD::ConstantPool: 1901 return LowerConstantPool(Op, DAG); 1902 case ISD::BlockAddress: 1903 return LowerBlockAddress(Op, DAG); 1904 case ISD::VASTART: 1905 return LowerVASTART(Op, DAG); 1906 case ISD::VACOPY: 1907 return LowerVACOPY(Op, DAG); 1908 case ISD::VAARG: 1909 return LowerVAARG(Op, DAG); 1910 case ISD::ADDC: 1911 case ISD::ADDE: 1912 case ISD::SUBC: 1913 case ISD::SUBE: 1914 return LowerADDC_ADDE_SUBC_SUBE(Op, DAG); 1915 case ISD::SADDO: 1916 case ISD::UADDO: 1917 case ISD::SSUBO: 1918 case ISD::USUBO: 1919 case ISD::SMULO: 1920 case ISD::UMULO: 1921 return LowerXALUO(Op, DAG); 1922 case ISD::FADD: 1923 return LowerF128Call(Op, DAG, RTLIB::ADD_F128); 1924 case ISD::FSUB: 1925 return LowerF128Call(Op, DAG, RTLIB::SUB_F128); 1926 case ISD::FMUL: 1927 return LowerF128Call(Op, DAG, RTLIB::MUL_F128); 1928 case ISD::FDIV: 1929 return LowerF128Call(Op, DAG, RTLIB::DIV_F128); 1930 case ISD::FP_ROUND: 1931 return LowerFP_ROUND(Op, DAG); 1932 case ISD::FP_EXTEND: 1933 return LowerFP_EXTEND(Op, DAG); 1934 case ISD::FRAMEADDR: 1935 return LowerFRAMEADDR(Op, DAG); 1936 case ISD::RETURNADDR: 1937 return LowerRETURNADDR(Op, DAG); 1938 case ISD::INSERT_VECTOR_ELT: 1939 return LowerINSERT_VECTOR_ELT(Op, DAG); 1940 case ISD::EXTRACT_VECTOR_ELT: 1941 return LowerEXTRACT_VECTOR_ELT(Op, DAG); 1942 case ISD::BUILD_VECTOR: 1943 return LowerBUILD_VECTOR(Op, DAG); 1944 case ISD::VECTOR_SHUFFLE: 1945 return LowerVECTOR_SHUFFLE(Op, DAG); 1946 case ISD::EXTRACT_SUBVECTOR: 1947 return LowerEXTRACT_SUBVECTOR(Op, DAG); 1948 case ISD::SRA: 1949 case ISD::SRL: 1950 case ISD::SHL: 1951 return LowerVectorSRA_SRL_SHL(Op, DAG); 1952 case ISD::SHL_PARTS: 1953 return LowerShiftLeftParts(Op, DAG); 1954 case ISD::SRL_PARTS: 1955 case ISD::SRA_PARTS: 1956 return LowerShiftRightParts(Op, DAG); 1957 case ISD::CTPOP: 1958 return LowerCTPOP(Op, DAG); 1959 case ISD::FCOPYSIGN: 1960 return LowerFCOPYSIGN(Op, DAG); 1961 case ISD::AND: 1962 return LowerVectorAND(Op, DAG); 1963 case ISD::OR: 1964 return LowerVectorOR(Op, DAG); 1965 case ISD::XOR: 1966 return LowerXOR(Op, DAG); 1967 case ISD::PREFETCH: 1968 return LowerPREFETCH(Op, DAG); 1969 case ISD::SINT_TO_FP: 1970 case ISD::UINT_TO_FP: 1971 return LowerINT_TO_FP(Op, DAG); 1972 case ISD::FP_TO_SINT: 1973 case ISD::FP_TO_UINT: 1974 return LowerFP_TO_INT(Op, DAG); 1975 
case ISD::FSINCOS: 1976 return LowerFSINCOS(Op, DAG); 1977 case ISD::MUL: 1978 return LowerMUL(Op, DAG); 1979 } 1980} 1981 1982/// getFunctionAlignment - Return the Log2 alignment of this function. 1983unsigned AArch64TargetLowering::getFunctionAlignment(const Function *F) const { 1984 return 2; 1985} 1986 1987//===----------------------------------------------------------------------===// 1988// Calling Convention Implementation 1989//===----------------------------------------------------------------------===// 1990 1991#include "AArch64GenCallingConv.inc" 1992 1993/// Selects the correct CCAssignFn for a given CallingConvention value. 1994CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC, 1995 bool IsVarArg) const { 1996 switch (CC) { 1997 default: 1998 llvm_unreachable("Unsupported calling convention."); 1999 case CallingConv::WebKit_JS: 2000 return CC_AArch64_WebKit_JS; 2001 case CallingConv::GHC: 2002 return CC_AArch64_GHC; 2003 case CallingConv::C: 2004 case CallingConv::Fast: 2005 if (!Subtarget->isTargetDarwin()) 2006 return CC_AArch64_AAPCS; 2007 return IsVarArg ? CC_AArch64_DarwinPCS_VarArg : CC_AArch64_DarwinPCS; 2008 } 2009} 2010 2011SDValue AArch64TargetLowering::LowerFormalArguments( 2012 SDValue Chain, CallingConv::ID CallConv, bool isVarArg, 2013 const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc DL, SelectionDAG &DAG, 2014 SmallVectorImpl<SDValue> &InVals) const { 2015 MachineFunction &MF = DAG.getMachineFunction(); 2016 MachineFrameInfo *MFI = MF.getFrameInfo(); 2017 2018 // Assign locations to all of the incoming arguments. 2019 SmallVector<CCValAssign, 16> ArgLocs; 2020 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs, 2021 *DAG.getContext()); 2022 2023 // At this point, Ins[].VT may already be promoted to i32. To correctly 2024 // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and 2025 // i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT. 2026 // Since AnalyzeFormalArguments uses Ins[].VT for both ValVT and LocVT, here 2027 // we use a special version of AnalyzeFormalArguments to pass in ValVT and 2028 // LocVT. 2029 unsigned NumArgs = Ins.size(); 2030 Function::const_arg_iterator CurOrigArg = MF.getFunction()->arg_begin(); 2031 unsigned CurArgIdx = 0; 2032 for (unsigned i = 0; i != NumArgs; ++i) { 2033 MVT ValVT = Ins[i].VT;
2046 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, /*IsVarArg=*/false); 2047 bool Res = 2048 AssignFn(i, ValVT, ValVT, CCValAssign::Full, Ins[i].Flags, CCInfo); 2049 assert(!Res && "Call operand has unhandled type"); 2050 (void)Res; 2051 } 2052 assert(ArgLocs.size() == Ins.size()); 2053 SmallVector<SDValue, 16> ArgValues; 2054 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 2055 CCValAssign &VA = ArgLocs[i]; 2056 2057 if (Ins[i].Flags.isByVal()) { 2058 // Byval is used for HFAs in the PCS, but the system should work in a 2059 // non-compliant manner for larger structs. 2060 EVT PtrTy = getPointerTy(); 2061 int Size = Ins[i].Flags.getByValSize(); 2062 unsigned NumRegs = (Size + 7) / 8; 2063 2064 // FIXME: This works on big-endian for composite byvals, which are the common 2065 // case. It should also work for fundamental types too. 2066 unsigned FrameIdx = 2067 MFI->CreateFixedObject(8 * NumRegs, VA.getLocMemOffset(), false); 2068 SDValue FrameIdxN = DAG.getFrameIndex(FrameIdx, PtrTy); 2069 InVals.push_back(FrameIdxN); 2070 2071 continue; 2072 } 2073 2074 if (VA.isRegLoc()) { 2075 // Arguments stored in registers. 2076 EVT RegVT = VA.getLocVT(); 2077 2078 SDValue ArgValue; 2079 const TargetRegisterClass *RC; 2080 2081 if (RegVT == MVT::i32) 2082 RC = &AArch64::GPR32RegClass; 2083 else if (RegVT == MVT::i64) 2084 RC = &AArch64::GPR64RegClass; 2085 else if (RegVT == MVT::f16) 2086 RC = &AArch64::FPR16RegClass; 2087 else if (RegVT == MVT::f32) 2088 RC = &AArch64::FPR32RegClass; 2089 else if (RegVT == MVT::f64 || RegVT.is64BitVector()) 2090 RC = &AArch64::FPR64RegClass; 2091 else if (RegVT == MVT::f128 || RegVT.is128BitVector()) 2092 RC = &AArch64::FPR128RegClass; 2093 else 2094 llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering"); 2095 2096 // Transform the arguments in physical registers into virtual ones. 2097 unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); 2098 ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT); 2099 2100 // If this is an 8, 16 or 32-bit value, it is really passed promoted 2101 // to 64 bits. Insert an assert[sz]ext to capture this, then 2102 // truncate to the right size. 2103 switch (VA.getLocInfo()) { 2104 default: 2105 llvm_unreachable("Unknown loc info!"); 2106 case CCValAssign::Full: 2107 break; 2108 case CCValAssign::BCvt: 2109 ArgValue = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), ArgValue); 2110 break; 2111 case CCValAssign::AExt: 2112 case CCValAssign::SExt: 2113 case CCValAssign::ZExt: 2114 // SelectionDAGBuilder will insert appropriate AssertZExt & AssertSExt 2115 // nodes after our lowering. 2116 assert(RegVT == Ins[i].VT && "incorrect register location selected"); 2117 break; 2118 } 2119 2120 InVals.push_back(ArgValue); 2121 2122 } else { // VA.isRegLoc() 2123 assert(VA.isMemLoc() && "CCValAssign is neither reg nor mem"); 2124 unsigned ArgOffset = VA.getLocMemOffset(); 2125 unsigned ArgSize = VA.getValVT().getSizeInBits() / 8; 2126 2127 uint32_t BEAlign = 0; 2128 if (!Subtarget->isLittleEndian() && ArgSize < 8 && 2129 !Ins[i].Flags.isInConsecutiveRegs()) 2130 BEAlign = 8 - ArgSize; 2131 2132 int FI = MFI->CreateFixedObject(ArgSize, ArgOffset + BEAlign, true); 2133 2134 // Create load nodes to retrieve arguments from the stack. 
2135 SDValue FIN = DAG.getFrameIndex(FI, getPointerTy()); 2136 SDValue ArgValue; 2137 2138 // For NON_EXTLOAD, generic code in getLoad assert(ValVT == MemVT) 2139 ISD::LoadExtType ExtType = ISD::NON_EXTLOAD; 2140 MVT MemVT = VA.getValVT(); 2141 2142 switch (VA.getLocInfo()) { 2143 default: 2144 break; 2145 case CCValAssign::BCvt: 2146 MemVT = VA.getLocVT(); 2147 break; 2148 case CCValAssign::SExt: 2149 ExtType = ISD::SEXTLOAD; 2150 break; 2151 case CCValAssign::ZExt: 2152 ExtType = ISD::ZEXTLOAD; 2153 break; 2154 case CCValAssign::AExt: 2155 ExtType = ISD::EXTLOAD; 2156 break; 2157 } 2158 2159 ArgValue = DAG.getExtLoad(ExtType, DL, VA.getLocVT(), Chain, FIN, 2160 MachinePointerInfo::getFixedStack(FI), 2161 MemVT, false, false, false, 0); 2162 2163 InVals.push_back(ArgValue); 2164 } 2165 } 2166 2167 // varargs 2168 if (isVarArg) { 2169 if (!Subtarget->isTargetDarwin()) { 2170 // The AAPCS variadic function ABI is identical to the non-variadic 2171 // one. As a result there may be more arguments in registers and we should 2172 // save them for future reference. 2173 saveVarArgRegisters(CCInfo, DAG, DL, Chain); 2174 } 2175 2176 AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); 2177 // This will point to the next argument passed via stack. 2178 unsigned StackOffset = CCInfo.getNextStackOffset(); 2179 // We currently pass all varargs at 8-byte alignment. 2180 StackOffset = ((StackOffset + 7) & ~7); 2181 AFI->setVarArgsStackIndex(MFI->CreateFixedObject(4, StackOffset, true)); 2182 } 2183 2184 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>(); 2185 unsigned StackArgSize = CCInfo.getNextStackOffset(); 2186 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt; 2187 if (DoesCalleeRestoreStack(CallConv, TailCallOpt)) { 2188 // This is a non-standard ABI so by fiat I say we're allowed to make full 2189 // use of the stack area to be popped, which must be aligned to 16 bytes in 2190 // any case: 2191 StackArgSize = RoundUpToAlignment(StackArgSize, 16); 2192 2193 // If we're expected to restore the stack (e.g. fastcc) then we'll be adding 2194 // a multiple of 16. 2195 FuncInfo->setArgumentStackToRestore(StackArgSize); 2196 2197 // This realignment carries over to the available bytes below. Our own 2198 // callers will guarantee the space is free by giving an aligned value to 2199 // CALLSEQ_START. 2200 } 2201 // Even if we're not expected to free up the space, it's useful to know how 2202 // much is there while considering tail calls (because we can reuse it). 
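  // (isEligibleForTailCallOptimization compares a prospective callee's stack
  // argument size against this value.)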
2203 FuncInfo->setBytesInStackArgArea(StackArgSize); 2204 2205 return Chain; 2206} 2207 2208void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo, 2209 SelectionDAG &DAG, SDLoc DL, 2210 SDValue &Chain) const { 2211 MachineFunction &MF = DAG.getMachineFunction(); 2212 MachineFrameInfo *MFI = MF.getFrameInfo(); 2213 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>(); 2214 2215 SmallVector<SDValue, 8> MemOps; 2216 2217 static const MCPhysReg GPRArgRegs[] = { AArch64::X0, AArch64::X1, AArch64::X2, 2218 AArch64::X3, AArch64::X4, AArch64::X5, 2219 AArch64::X6, AArch64::X7 }; 2220 static const unsigned NumGPRArgRegs = array_lengthof(GPRArgRegs); 2221 unsigned FirstVariadicGPR = 2222 CCInfo.getFirstUnallocated(GPRArgRegs, NumGPRArgRegs); 2223 2224 unsigned GPRSaveSize = 8 * (NumGPRArgRegs - FirstVariadicGPR); 2225 int GPRIdx = 0; 2226 if (GPRSaveSize != 0) { 2227 GPRIdx = MFI->CreateStackObject(GPRSaveSize, 8, false); 2228 2229 SDValue FIN = DAG.getFrameIndex(GPRIdx, getPointerTy()); 2230 2231 for (unsigned i = FirstVariadicGPR; i < NumGPRArgRegs; ++i) { 2232 unsigned VReg = MF.addLiveIn(GPRArgRegs[i], &AArch64::GPR64RegClass); 2233 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64); 2234 SDValue Store = 2235 DAG.getStore(Val.getValue(1), DL, Val, FIN, 2236 MachinePointerInfo::getStack(i * 8), false, false, 0); 2237 MemOps.push_back(Store); 2238 FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), FIN, 2239 DAG.getConstant(8, getPointerTy())); 2240 } 2241 } 2242 FuncInfo->setVarArgsGPRIndex(GPRIdx); 2243 FuncInfo->setVarArgsGPRSize(GPRSaveSize); 2244 2245 if (Subtarget->hasFPARMv8()) { 2246 static const MCPhysReg FPRArgRegs[] = { 2247 AArch64::Q0, AArch64::Q1, AArch64::Q2, AArch64::Q3, 2248 AArch64::Q4, AArch64::Q5, AArch64::Q6, AArch64::Q7}; 2249 static const unsigned NumFPRArgRegs = array_lengthof(FPRArgRegs); 2250 unsigned FirstVariadicFPR = 2251 CCInfo.getFirstUnallocated(FPRArgRegs, NumFPRArgRegs); 2252 2253 unsigned FPRSaveSize = 16 * (NumFPRArgRegs - FirstVariadicFPR); 2254 int FPRIdx = 0; 2255 if (FPRSaveSize != 0) { 2256 FPRIdx = MFI->CreateStackObject(FPRSaveSize, 16, false); 2257 2258 SDValue FIN = DAG.getFrameIndex(FPRIdx, getPointerTy()); 2259 2260 for (unsigned i = FirstVariadicFPR; i < NumFPRArgRegs; ++i) { 2261 unsigned VReg = MF.addLiveIn(FPRArgRegs[i], &AArch64::FPR128RegClass); 2262 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f128); 2263 2264 SDValue Store = 2265 DAG.getStore(Val.getValue(1), DL, Val, FIN, 2266 MachinePointerInfo::getStack(i * 16), false, false, 0); 2267 MemOps.push_back(Store); 2268 FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), FIN, 2269 DAG.getConstant(16, getPointerTy())); 2270 } 2271 } 2272 FuncInfo->setVarArgsFPRIndex(FPRIdx); 2273 FuncInfo->setVarArgsFPRSize(FPRSaveSize); 2274 } 2275 2276 if (!MemOps.empty()) { 2277 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps); 2278 } 2279} 2280 2281/// LowerCallResult - Lower the result values of a call into the 2282/// appropriate copies out of appropriate physical registers. 2283SDValue AArch64TargetLowering::LowerCallResult( 2284 SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg, 2285 const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc DL, SelectionDAG &DAG, 2286 SmallVectorImpl<SDValue> &InVals, bool isThisReturn, 2287 SDValue ThisVal) const { 2288 CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS 2289 ? RetCC_AArch64_WebKit_JS 2290 : RetCC_AArch64_AAPCS; 2291 // Assign locations to each value returned by this call. 
2292 SmallVector<CCValAssign, 16> RVLocs; 2293 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs, 2294 *DAG.getContext()); 2295 CCInfo.AnalyzeCallResult(Ins, RetCC); 2296 2297 // Copy all of the result registers out of their specified physreg. 2298 for (unsigned i = 0; i != RVLocs.size(); ++i) { 2299 CCValAssign VA = RVLocs[i]; 2300 2301 // Pass 'this' value directly from the argument to return value, to avoid 2302 // reg unit interference 2303 if (i == 0 && isThisReturn) { 2304 assert(!VA.needsCustom() && VA.getLocVT() == MVT::i64 && 2305 "unexpected return calling convention register assignment"); 2306 InVals.push_back(ThisVal); 2307 continue; 2308 } 2309 2310 SDValue Val = 2311 DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InFlag); 2312 Chain = Val.getValue(1); 2313 InFlag = Val.getValue(2); 2314 2315 switch (VA.getLocInfo()) { 2316 default: 2317 llvm_unreachable("Unknown loc info!"); 2318 case CCValAssign::Full: 2319 break; 2320 case CCValAssign::BCvt: 2321 Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val); 2322 break; 2323 } 2324 2325 InVals.push_back(Val); 2326 } 2327 2328 return Chain; 2329} 2330 2331bool AArch64TargetLowering::isEligibleForTailCallOptimization( 2332 SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, 2333 bool isCalleeStructRet, bool isCallerStructRet, 2334 const SmallVectorImpl<ISD::OutputArg> &Outs, 2335 const SmallVectorImpl<SDValue> &OutVals, 2336 const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const { 2337 // For CallingConv::C this function knows whether the ABI needs 2338 // changing. That's not true for other conventions so they will have to opt in 2339 // manually. 2340 if (!IsTailCallConvention(CalleeCC) && CalleeCC != CallingConv::C) 2341 return false; 2342 2343 const MachineFunction &MF = DAG.getMachineFunction(); 2344 const Function *CallerF = MF.getFunction(); 2345 CallingConv::ID CallerCC = CallerF->getCallingConv(); 2346 bool CCMatch = CallerCC == CalleeCC; 2347 2348 // Byval parameters hand the function a pointer directly into the stack area 2349 // we want to reuse during a tail call. Working around this *is* possible (see 2350 // X86) but less efficient and uglier in LowerCall. 2351 for (Function::const_arg_iterator i = CallerF->arg_begin(), 2352 e = CallerF->arg_end(); 2353 i != e; ++i) 2354 if (i->hasByValAttr()) 2355 return false; 2356 2357 if (getTargetMachine().Options.GuaranteedTailCallOpt) { 2358 if (IsTailCallConvention(CalleeCC) && CCMatch) 2359 return true; 2360 return false; 2361 } 2362 2363 // Externally-defined functions with weak linkage should not be 2364 // tail-called on AArch64 when the OS does not support dynamic 2365 // pre-emption of symbols, as the AAELF spec requires normal calls 2366 // to undefined weak functions to be replaced with a NOP or jump to the 2367 // next instruction. The behaviour of branch instructions in this 2368 // situation (as used for tail calls) is implementation-defined, so we 2369 // cannot rely on the linker replacing the tail call with a return. 2370 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) { 2371 const GlobalValue *GV = G->getGlobal(); 2372 const Triple TT(getTargetMachine().getTargetTriple()); 2373 if (GV->hasExternalWeakLinkage() && 2374 (!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO())) 2375 return false; 2376 } 2377 2378 // Now we search for cases where we can use a tail call without changing the 2379 // ABI. 

  // I want anyone implementing a new calling convention to think long and hard
  // about this assert.
  assert((!isVarArg || CalleeCC == CallingConv::C) &&
         "Unexpected variadic calling convention");

  if (isVarArg && !Outs.empty()) {
    // At least two cases here: if caller is fastcc then we can't have any
    // memory arguments (we'd be expected to clean up the stack afterwards). If
    // caller is C then we could potentially use its argument area.

    // FIXME: for now we take the most conservative of these in both cases:
    // disallow all variadic memory operands.
    SmallVector<CCValAssign, 16> ArgLocs;
    CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), ArgLocs,
                   *DAG.getContext());

    CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, true));
    for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
      if (!ArgLocs[i].isRegLoc())
        return false;
  }

  // If the calling conventions do not match, then we'd better make sure the
  // results are returned in the same way as what the caller expects.
  if (!CCMatch) {
    SmallVector<CCValAssign, 16> RVLocs1;
    CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(), RVLocs1,
                    *DAG.getContext());
    CCInfo1.AnalyzeCallResult(Ins, CCAssignFnForCall(CalleeCC, isVarArg));

    SmallVector<CCValAssign, 16> RVLocs2;
    CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(), RVLocs2,
                    *DAG.getContext());
    CCInfo2.AnalyzeCallResult(Ins, CCAssignFnForCall(CallerCC, isVarArg));

    if (RVLocs1.size() != RVLocs2.size())
      return false;
    for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) {
      if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc())
        return false;
      if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo())
        return false;
      if (RVLocs1[i].isRegLoc()) {
        if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg())
          return false;
      } else {
        if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset())
          return false;
      }
    }
  }

  // Nothing more to check if the callee is taking no arguments.
  if (Outs.empty())
    return true;

  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), ArgLocs,
                 *DAG.getContext());

  CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, isVarArg));

  const AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();

  // If the stack arguments for this call would fit into our own save area then
  // the call can be made tail.
  return CCInfo.getNextStackOffset() <= FuncInfo->getBytesInStackArgArea();
}

SDValue AArch64TargetLowering::addTokenForArgument(SDValue Chain,
                                                   SelectionDAG &DAG,
                                                   MachineFrameInfo *MFI,
                                                   int ClobberedFI) const {
  SmallVector<SDValue, 8> ArgChains;
  int64_t FirstByte = MFI->getObjectOffset(ClobberedFI);
  int64_t LastByte = FirstByte + MFI->getObjectSize(ClobberedFI) - 1;

  // Include the original chain at the beginning of the list. When this is
  // used by target LowerCall hooks, this helps legalize find the
  // CALLSEQ_BEGIN node.
  ArgChains.push_back(Chain);

  // Add a chain value for each stack argument load that overlaps the area
  // being clobbered.
  for (SDNode::use_iterator U = DAG.getEntryNode().getNode()->use_begin(),
                            UE = DAG.getEntryNode().getNode()->use_end();
       U != UE; ++U)
    if (LoadSDNode *L = dyn_cast<LoadSDNode>(*U))
      if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr()))
        if (FI->getIndex() < 0) {
          int64_t InFirstByte = MFI->getObjectOffset(FI->getIndex());
          int64_t InLastByte = InFirstByte;
          InLastByte += MFI->getObjectSize(FI->getIndex()) - 1;

          if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
              (FirstByte <= InFirstByte && InFirstByte <= LastByte))
            ArgChains.push_back(SDValue(L, 1));
        }

  // Build a tokenfactor for all the chains.
  return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
}

bool AArch64TargetLowering::DoesCalleeRestoreStack(CallingConv::ID CallCC,
                                                   bool TailCallOpt) const {
  return CallCC == CallingConv::Fast && TailCallOpt;
}

bool AArch64TargetLowering::IsTailCallConvention(CallingConv::ID CallCC) const {
  return CallCC == CallingConv::Fast;
}

/// LowerCall - Lower a call to a callseq_start + CALL + callseq_end chain,
/// and add input and output parameter nodes.
SDValue
AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
                                 SmallVectorImpl<SDValue> &InVals) const {
  SelectionDAG &DAG = CLI.DAG;
  SDLoc &DL = CLI.DL;
  SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
  SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
  SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
  SDValue Chain = CLI.Chain;
  SDValue Callee = CLI.Callee;
  bool &IsTailCall = CLI.IsTailCall;
  CallingConv::ID CallConv = CLI.CallConv;
  bool IsVarArg = CLI.IsVarArg;

  MachineFunction &MF = DAG.getMachineFunction();
  bool IsStructRet = (Outs.empty()) ? false : Outs[0].Flags.isSRet();
  bool IsThisReturn = false;

  AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
  bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
  bool IsSibCall = false;

  if (IsTailCall) {
    // Check if it's really possible to do a tail call.
    IsTailCall = isEligibleForTailCallOptimization(
        Callee, CallConv, IsVarArg, IsStructRet,
        MF.getFunction()->hasStructRetAttr(), Outs, OutVals, Ins, DAG);
    if (!IsTailCall && CLI.CS && CLI.CS->isMustTailCall())
      report_fatal_error("failed to perform tail call elimination on a call "
                         "site marked musttail");

    // A sibling call is one where we're under the usual C ABI and not planning
    // to change that but can still do a tail call:
    if (!TailCallOpt && IsTailCall)
      IsSibCall = true;

    if (IsTailCall)
      ++NumTailCalls;
  }

  // Analyze operands of the call, assigning locations to each operand.
  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), ArgLocs,
                 *DAG.getContext());

  if (IsVarArg) {
    // Handle fixed and variable vector arguments differently.
    // Variable vector arguments always go into memory.
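    // As an illustrative sketch (not from this file): in a Darwin-style call
    // such as foo(float32x4_t fixed, ...) where another float32x4_t is passed
    // through the "...", the fixed argument can take Q0 while the anonymous
    // one is assigned a stack slot by the variadic assignment function.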
    unsigned NumArgs = Outs.size();

    for (unsigned i = 0; i != NumArgs; ++i) {
      MVT ArgVT = Outs[i].VT;
      ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
      CCAssignFn *AssignFn = CCAssignFnForCall(CallConv,
                                               /*IsVarArg=*/ !Outs[i].IsFixed);
      bool Res = AssignFn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo);
      assert(!Res && "Call operand has unhandled type");
      (void)Res;
    }
  } else {
    // At this point, Outs[].VT may already be promoted to i32. To correctly
    // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and
    // i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT.
    // Since AnalyzeCallOperands uses Outs[].VT for both ValVT and LocVT, here
    // we use a special version of AnalyzeCallOperands to pass in ValVT and
    // LocVT.
    unsigned NumArgs = Outs.size();
    for (unsigned i = 0; i != NumArgs; ++i) {
      MVT ValVT = Outs[i].VT;
      // Get type of the original argument.
      EVT ActualVT = getValueType(CLI.getArgs()[Outs[i].OrigArgIndex].Ty,
                                  /*AllowUnknown*/ true);
      MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : ValVT;
      ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
      // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
      if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
        ValVT = MVT::i8;
      else if (ActualMVT == MVT::i16)
        ValVT = MVT::i16;

      CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, /*IsVarArg=*/false);
      bool Res = AssignFn(i, ValVT, ValVT, CCValAssign::Full, ArgFlags, CCInfo);
      assert(!Res && "Call operand has unhandled type");
      (void)Res;
    }
  }

  // Get a count of how many bytes are to be pushed on the stack.
  unsigned NumBytes = CCInfo.getNextStackOffset();

  if (IsSibCall) {
    // Since we're not changing the ABI to make this a tail call, the memory
    // operands are already available in the caller's incoming argument space.
    NumBytes = 0;
  }

  // FPDiff is the byte offset of the call's argument area from the callee's.
  // Stores to callee stack arguments will be placed in FixedStackSlots offset
  // by this amount for a tail call. In a sibling call it must be 0 because the
  // caller will deallocate the entire stack and the callee still expects its
  // arguments to begin at SP+0. Completely unused for non-tail calls.
  int FPDiff = 0;

  if (IsTailCall && !IsSibCall) {
    unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea();

    // Since the callee will pop the argument stack as a tail call, we must
    // keep the popped size 16-byte aligned.
    NumBytes = RoundUpToAlignment(NumBytes, 16);

    // FPDiff will be negative if this tail call requires more space than we
    // would automatically have in our incoming argument space. Positive if we
    // can actually shrink the stack.
    FPDiff = NumReusableBytes - NumBytes;

    // The stack pointer must be 16-byte aligned at all times it's used for a
    // memory operation, which in practice means at *all* times and in
    // particular across call boundaries. Therefore our own arguments started
    // at a 16-byte aligned SP and the delta applied for the tail call should
    // satisfy the same constraint.
    assert(FPDiff % 16 == 0 && "unaligned stack on tail call");
  }

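  // As an illustrative sketch (not from this file): if the caller received
  // 32 bytes of stack argument space but this tail call needs 48 bytes after
  // rounding, FPDiff is 32 - 48 = -16, and the fixed stack slots created
  // below are biased by that amount so the callee's arguments still start at
  // SP+0 after the jump.
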
  // Adjust the stack pointer for the new arguments...
  // These operations are automatically eliminated by the prolog/epilog pass.
  if (!IsSibCall)
    Chain =
        DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true), DL);

  SDValue StackPtr = DAG.getCopyFromReg(Chain, DL, AArch64::SP, getPointerTy());

  SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
  SmallVector<SDValue, 8> MemOpChains;

  // Walk the register/memloc assignments, inserting copies/loads.
  for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); i != e;
       ++i, ++realArgIdx) {
    CCValAssign &VA = ArgLocs[i];
    SDValue Arg = OutVals[realArgIdx];
    ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;

    // Promote the value if needed.
    switch (VA.getLocInfo()) {
    default:
      llvm_unreachable("Unknown loc info!");
    case CCValAssign::Full:
      break;
    case CCValAssign::SExt:
      Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
      break;
    case CCValAssign::ZExt:
      Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
      break;
    case CCValAssign::AExt:
      if (Outs[realArgIdx].ArgVT == MVT::i1) {
        // AAPCS requires i1 to be zero-extended to 8 bits by the caller.
        Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
        Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i8, Arg);
      }
      Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
      break;
    case CCValAssign::BCvt:
      Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
      break;
    case CCValAssign::FPExt:
      Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
      break;
    }

    if (VA.isRegLoc()) {
      if (realArgIdx == 0 && Flags.isReturned() && Outs[0].VT == MVT::i64) {
        assert(VA.getLocVT() == MVT::i64 &&
               "unexpected calling convention register assignment");
        assert(!Ins.empty() && Ins[0].VT == MVT::i64 &&
               "unexpected use of 'returned'");
        IsThisReturn = true;
      }
      RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
    } else {
      assert(VA.isMemLoc());

      SDValue DstAddr;
      MachinePointerInfo DstInfo;

      // FIXME: This works on big-endian for composite byvals, which are the
      // common case. It should also work for fundamental types too.
      uint32_t BEAlign = 0;
      unsigned OpSize = Flags.isByVal() ? Flags.getByValSize() * 8
                                        : VA.getValVT().getSizeInBits();
      OpSize = (OpSize + 7) / 8;
      if (!Subtarget->isLittleEndian() && !Flags.isByVal() &&
          !Flags.isInConsecutiveRegs()) {
        if (OpSize < 8)
          BEAlign = 8 - OpSize;
      }
      unsigned LocMemOffset = VA.getLocMemOffset();
      int32_t Offset = LocMemOffset + BEAlign;
      SDValue PtrOff = DAG.getIntPtrConstant(Offset);
      PtrOff = DAG.getNode(ISD::ADD, DL, getPointerTy(), StackPtr, PtrOff);

      if (IsTailCall) {
        Offset = Offset + FPDiff;
        int FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true);

        DstAddr = DAG.getFrameIndex(FI, getPointerTy());
        DstInfo = MachinePointerInfo::getFixedStack(FI);

        // Make sure any stack arguments overlapping with where we're storing
        // are loaded before this eventual operation. Otherwise they'll be
        // clobbered.
        Chain = addTokenForArgument(Chain, DAG, MF.getFrameInfo(), FI);
      } else {
        SDValue PtrOff = DAG.getIntPtrConstant(Offset);

        DstAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), StackPtr, PtrOff);
        DstInfo = MachinePointerInfo::getStack(LocMemOffset);
      }

      if (Outs[i].Flags.isByVal()) {
        SDValue SizeNode =
            DAG.getConstant(Outs[i].Flags.getByValSize(), MVT::i64);
        SDValue Cpy = DAG.getMemcpy(
            Chain, DL, DstAddr, Arg, SizeNode, Outs[i].Flags.getByValAlign(),
            /*isVol = */ false,
            /*AlwaysInline = */ false, DstInfo, MachinePointerInfo());

        MemOpChains.push_back(Cpy);
      } else {
        // Since we pass i1/i8/i16 as i1/i8/i16 on stack and Arg is already
        // promoted to a legal register type i32, we should truncate Arg back
        // to i1/i8/i16.
        if (VA.getValVT() == MVT::i1 || VA.getValVT() == MVT::i8 ||
            VA.getValVT() == MVT::i16)
          Arg = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Arg);

        SDValue Store =
            DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, false, false, 0);
        MemOpChains.push_back(Store);
      }
    }
  }

  if (!MemOpChains.empty())
    Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);

  // Build a sequence of copy-to-reg nodes chained together with token chain
  // and flag operands which copy the outgoing args into the appropriate regs.
  SDValue InFlag;
  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
    Chain = DAG.getCopyToReg(Chain, DL, RegsToPass[i].first,
                             RegsToPass[i].second, InFlag);
    InFlag = Chain.getValue(1);
  }

  // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
  // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
  // node so that legalize doesn't hack it.
  if (getTargetMachine().getCodeModel() == CodeModel::Large &&
      Subtarget->isTargetMachO()) {
    if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
      const GlobalValue *GV = G->getGlobal();
      bool InternalLinkage = GV->hasInternalLinkage();
      if (InternalLinkage)
        Callee = DAG.getTargetGlobalAddress(GV, DL, getPointerTy(), 0, 0);
      else {
        Callee = DAG.getTargetGlobalAddress(GV, DL, getPointerTy(), 0,
                                            AArch64II::MO_GOT);
        Callee = DAG.getNode(AArch64ISD::LOADgot, DL, getPointerTy(), Callee);
      }
    } else if (ExternalSymbolSDNode *S =
                   dyn_cast<ExternalSymbolSDNode>(Callee)) {
      const char *Sym = S->getSymbol();
      Callee =
          DAG.getTargetExternalSymbol(Sym, getPointerTy(), AArch64II::MO_GOT);
      Callee = DAG.getNode(AArch64ISD::LOADgot, DL, getPointerTy(), Callee);
    }
  } else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
    const GlobalValue *GV = G->getGlobal();
    Callee = DAG.getTargetGlobalAddress(GV, DL, getPointerTy(), 0, 0);
  } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
    const char *Sym = S->getSymbol();
    Callee = DAG.getTargetExternalSymbol(Sym, getPointerTy(), 0);
  }

  // We don't usually want to end the call-sequence here because we would tidy
  // the frame up *after* the call, however in the ABI-changing tail-call case
  // we've carefully laid out the parameters so that when sp is reset they'll
  // be in the correct location.
  if (IsTailCall && !IsSibCall) {
    Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
                               DAG.getIntPtrConstant(0, true), InFlag, DL);
    InFlag = Chain.getValue(1);
  }

  std::vector<SDValue> Ops;
  Ops.push_back(Chain);
  Ops.push_back(Callee);

  if (IsTailCall) {
    // Each tail call may have to adjust the stack by a different amount, so
    // this information must travel along with the operation for eventual
    // consumption by emitEpilogue.
    Ops.push_back(DAG.getTargetConstant(FPDiff, MVT::i32));
  }

  // Add argument registers to the end of the list so that they are known live
  // into the call.
  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
    Ops.push_back(DAG.getRegister(RegsToPass[i].first,
                                  RegsToPass[i].second.getValueType()));

  // Add a register mask operand representing the call-preserved registers.
  const uint32_t *Mask;
  const TargetRegisterInfo *TRI =
      getTargetMachine().getSubtargetImpl()->getRegisterInfo();
  const AArch64RegisterInfo *ARI =
      static_cast<const AArch64RegisterInfo *>(TRI);
  if (IsThisReturn) {
    // For 'this' returns, use the X0-preserving mask if applicable.
    Mask = ARI->getThisReturnPreservedMask(CallConv);
    if (!Mask) {
      IsThisReturn = false;
      Mask = ARI->getCallPreservedMask(CallConv);
    }
  } else
    Mask = ARI->getCallPreservedMask(CallConv);

  assert(Mask && "Missing call preserved mask for calling convention");
  Ops.push_back(DAG.getRegisterMask(Mask));

  if (InFlag.getNode())
    Ops.push_back(InFlag);

  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);

  // If we're doing a tail call, use a TC_RETURN here rather than an
  // actual call instruction.
  if (IsTailCall)
    return DAG.getNode(AArch64ISD::TC_RETURN, DL, NodeTys, Ops);

  // Returns a chain and a flag for retval copy to use.
  Chain = DAG.getNode(AArch64ISD::CALL, DL, NodeTys, Ops);
  InFlag = Chain.getValue(1);

  uint64_t CalleePopBytes = DoesCalleeRestoreStack(CallConv, TailCallOpt)
                                ? RoundUpToAlignment(NumBytes, 16)
                                : 0;

  Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
                             DAG.getIntPtrConstant(CalleePopBytes, true),
                             InFlag, DL);
  if (!Ins.empty())
    InFlag = Chain.getValue(1);

  // Handle result values, copying them out of physregs into vregs that we
  // return.
  return LowerCallResult(Chain, InFlag, CallConv, IsVarArg, Ins, DL, DAG,
                         InVals, IsThisReturn,
                         IsThisReturn ? OutVals[0] : SDValue());
}

bool AArch64TargetLowering::CanLowerReturn(
    CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
    const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
  CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS
                          ? RetCC_AArch64_WebKit_JS
                          : RetCC_AArch64_AAPCS;
  SmallVector<CCValAssign, 16> RVLocs;
  CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
  return CCInfo.CheckReturn(Outs, RetCC);
}

SDValue
AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
                                   bool isVarArg,
                                   const SmallVectorImpl<ISD::OutputArg> &Outs,
                                   const SmallVectorImpl<SDValue> &OutVals,
                                   SDLoc DL, SelectionDAG &DAG) const {
  CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS
                          ? RetCC_AArch64_WebKit_JS
                          : RetCC_AArch64_AAPCS;
  SmallVector<CCValAssign, 16> RVLocs;
  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
                 *DAG.getContext());
  CCInfo.AnalyzeReturn(Outs, RetCC);

  // Copy the result values into the output registers.
  SDValue Flag;
  SmallVector<SDValue, 4> RetOps(1, Chain);
  for (unsigned i = 0, realRVLocIdx = 0; i != RVLocs.size();
       ++i, ++realRVLocIdx) {
    CCValAssign &VA = RVLocs[i];
    assert(VA.isRegLoc() && "Can only return in registers!");
    SDValue Arg = OutVals[realRVLocIdx];

    switch (VA.getLocInfo()) {
    default:
      llvm_unreachable("Unknown loc info!");
    case CCValAssign::Full:
      if (Outs[i].ArgVT == MVT::i1) {
        // AAPCS requires i1 to be zero-extended to i8 by the producer of the
        // value. This is strictly redundant on Darwin (which uses "zeroext
        // i1"), but will be optimised out before ISel.
        Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
        Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
      }
      break;
    case CCValAssign::BCvt:
      Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
      break;
    }

    Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Flag);
    Flag = Chain.getValue(1);
    RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
  }

  RetOps[0] = Chain; // Update chain.

  // Add the flag if we have it.
  if (Flag.getNode())
    RetOps.push_back(Flag);

  return DAG.getNode(AArch64ISD::RET_FLAG, DL, MVT::Other, RetOps);
}

//===----------------------------------------------------------------------===//
// Other Lowering Code
//===----------------------------------------------------------------------===//

SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op,
                                                  SelectionDAG &DAG) const {
  EVT PtrVT = getPointerTy();
  SDLoc DL(Op);
  const GlobalAddressSDNode *GN = cast<GlobalAddressSDNode>(Op);
  const GlobalValue *GV = GN->getGlobal();
  unsigned char OpFlags =
      Subtarget->ClassifyGlobalReference(GV, getTargetMachine());

  assert(cast<GlobalAddressSDNode>(Op)->getOffset() == 0 &&
         "unexpected offset in global node");

  // This also catches the large code model case for Darwin.
  if ((OpFlags & AArch64II::MO_GOT) != 0) {
    SDValue GotAddr = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, OpFlags);
    // FIXME: Once remat is capable of dealing with instructions with register
    // operands, expand this into two nodes instead of using a wrapper node.
    return DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, GotAddr);
  }

  if ((OpFlags & AArch64II::MO_CONSTPOOL) != 0) {
    assert(getTargetMachine().getCodeModel() == CodeModel::Small &&
           "use of MO_CONSTPOOL only supported on small model");
    SDValue Hi = DAG.getTargetConstantPool(GV, PtrVT, 0, 0, AArch64II::MO_PAGE);
    SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, Hi);
    unsigned char LoFlags = AArch64II::MO_PAGEOFF | AArch64II::MO_NC;
    SDValue Lo = DAG.getTargetConstantPool(GV, PtrVT, 0, 0, LoFlags);
    SDValue PoolAddr = DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, Lo);
    SDValue GlobalAddr = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), PoolAddr,
                                     MachinePointerInfo::getConstantPool(),
                                     /*isVolatile=*/ false,
                                     /*isNonTemporal=*/ true,
                                     /*isInvariant=*/ true, 8);
    if (GN->getOffset() != 0)
      return DAG.getNode(ISD::ADD, DL, PtrVT, GlobalAddr,
                         DAG.getConstant(GN->getOffset(), PtrVT));
    return GlobalAddr;
  }

  if (getTargetMachine().getCodeModel() == CodeModel::Large) {
    const unsigned char MO_NC = AArch64II::MO_NC;
    return DAG.getNode(
        AArch64ISD::WrapperLarge, DL, PtrVT,
        DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_G3),
        DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_G2 | MO_NC),
        DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_G1 | MO_NC),
        DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_G0 | MO_NC));
  } else {
    // Use ADRP/ADD or ADRP/LDR for everything else: the small model on ELF and
    // the only correct model on Darwin.
    SDValue Hi = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0,
                                            OpFlags | AArch64II::MO_PAGE);
    unsigned char LoFlags = OpFlags | AArch64II::MO_PAGEOFF | AArch64II::MO_NC;
    SDValue Lo = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, LoFlags);

    SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, Hi);
    return DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, Lo);
  }
}

/// \brief Convert a TLS address reference into the correct sequence of loads
/// and calls to compute the variable's address (for Darwin, currently) and
/// return an SDValue containing the final node.
///
/// Darwin only has one TLS scheme which must be capable of dealing with the
/// fully general situation, in the worst case. This means:
///   + "extern __thread" declaration.
///   + Defined in a possibly unknown dynamic library.
///
/// The general system is that each __thread variable has a [3 x i64] descriptor
/// which contains information used by the runtime to calculate the address. The
/// only part of this the compiler needs to know about is the first xword, which
/// contains a function pointer that must be called with the address of the
/// entire descriptor in "x0".
///
/// Since this descriptor may be in a different unit, in general even the
/// descriptor must be accessed via an indirect load. The "ideal" code sequence
/// is:
///     adrp x0, _var@TLVPPAGE
///     ldr x0, [x0, _var@TLVPPAGEOFF]   ; x0 now contains address of descriptor
///     ldr x1, [x0]                     ; x1 contains 1st entry of descriptor,
///                                      ; the function pointer
///     blr x1                           ; Uses descriptor address in x0
///                                      ; Address of _var is now in x0.
3009/// 3010/// If the address of _var's descriptor *is* known to the linker, then it can 3011/// change the first "ldr" instruction to an appropriate "add x0, x0, #imm" for 3012/// a slight efficiency gain. 3013SDValue 3014AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op, 3015 SelectionDAG &DAG) const { 3016 assert(Subtarget->isTargetDarwin() && "TLS only supported on Darwin"); 3017 3018 SDLoc DL(Op); 3019 MVT PtrVT = getPointerTy(); 3020 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); 3021 3022 SDValue TLVPAddr = 3023 DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS); 3024 SDValue DescAddr = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TLVPAddr); 3025 3026 // The first entry in the descriptor is a function pointer that we must call 3027 // to obtain the address of the variable. 3028 SDValue Chain = DAG.getEntryNode(); 3029 SDValue FuncTLVGet = 3030 DAG.getLoad(MVT::i64, DL, Chain, DescAddr, MachinePointerInfo::getGOT(), 3031 false, true, true, 8); 3032 Chain = FuncTLVGet.getValue(1); 3033 3034 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 3035 MFI->setAdjustsStack(true); 3036 3037 // TLS calls preserve all registers except those that absolutely must be 3038 // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be 3039 // silly). 3040 const TargetRegisterInfo *TRI = 3041 getTargetMachine().getSubtargetImpl()->getRegisterInfo(); 3042 const AArch64RegisterInfo *ARI = 3043 static_cast<const AArch64RegisterInfo *>(TRI); 3044 const uint32_t *Mask = ARI->getTLSCallPreservedMask(); 3045 3046 // Finally, we can make the call. This is just a degenerate version of a 3047 // normal AArch64 call node: x0 takes the address of the descriptor, and 3048 // returns the address of the variable in this thread. 3049 Chain = DAG.getCopyToReg(Chain, DL, AArch64::X0, DescAddr, SDValue()); 3050 Chain = 3051 DAG.getNode(AArch64ISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue), 3052 Chain, FuncTLVGet, DAG.getRegister(AArch64::X0, MVT::i64), 3053 DAG.getRegisterMask(Mask), Chain.getValue(1)); 3054 return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Chain.getValue(1)); 3055} 3056 3057/// When accessing thread-local variables under either the general-dynamic or 3058/// local-dynamic system, we make a "TLS-descriptor" call. The variable will 3059/// have a descriptor, accessible via a PC-relative ADRP, and whose first entry 3060/// is a function pointer to carry out the resolution. 3061/// 3062/// The sequence is: 3063/// adrp x0, :tlsdesc:var 3064/// ldr x1, [x0, #:tlsdesc_lo12:var] 3065/// add x0, x0, #:tlsdesc_lo12:var 3066/// .tlsdesccall var 3067/// blr x1 3068/// (TPIDR_EL0 offset now in x0) 3069/// 3070/// The above sequence must be produced unscheduled, to enable the linker to 3071/// optimize/relax this sequence. 3072/// Therefore, a pseudo-instruction (TLSDESC_CALLSEQ) is used to represent the 3073/// above sequence, and expanded really late in the compilation flow, to ensure 3074/// the sequence is produced as per above. 
SDValue AArch64TargetLowering::LowerELFTLSDescCallSeq(SDValue SymAddr, SDLoc DL,
                                                      SelectionDAG &DAG) const {
  EVT PtrVT = getPointerTy();

  SDValue Chain = DAG.getEntryNode();
  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);

  SmallVector<SDValue, 2> Ops;
  Ops.push_back(Chain);
  Ops.push_back(SymAddr);

  Chain = DAG.getNode(AArch64ISD::TLSDESC_CALLSEQ, DL, NodeTys, Ops);
  SDValue Glue = Chain.getValue(1);

  return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Glue);
}

SDValue
AArch64TargetLowering::LowerELFGlobalTLSAddress(SDValue Op,
                                                SelectionDAG &DAG) const {
  assert(Subtarget->isTargetELF() && "This function expects an ELF target");
  assert(getTargetMachine().getCodeModel() == CodeModel::Small &&
         "ELF TLS only supported in small memory model");
  // Different choices can be made for the maximum size of the TLS area for a
  // module. For the small address model, the default TLS size is 16MiB and the
  // maximum TLS size is 4GiB.
  // FIXME: add -mtls-size command line option and make it control the 16MiB
  // vs. 4GiB code sequence generation.
  const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);

  TLSModel::Model Model = getTargetMachine().getTLSModel(GA->getGlobal());
  if (!EnableAArch64ELFLocalDynamicTLSGeneration) {
    if (Model == TLSModel::LocalDynamic)
      Model = TLSModel::GeneralDynamic;
  }

  SDValue TPOff;
  EVT PtrVT = getPointerTy();
  SDLoc DL(Op);
  const GlobalValue *GV = GA->getGlobal();

  SDValue ThreadBase = DAG.getNode(AArch64ISD::THREAD_POINTER, DL, PtrVT);

  if (Model == TLSModel::LocalExec) {
    SDValue HiVar = DAG.getTargetGlobalAddress(
        GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
    SDValue LoVar = DAG.getTargetGlobalAddress(
        GV, DL, PtrVT, 0,
        AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);

    SDValue TPWithOff_lo =
        SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, ThreadBase,
                                   HiVar, DAG.getTargetConstant(0, MVT::i32)),
                0);
    SDValue TPWithOff =
        SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPWithOff_lo,
                                   LoVar, DAG.getTargetConstant(0, MVT::i32)),
                0);
    return TPWithOff;
  } else if (Model == TLSModel::InitialExec) {
    TPOff = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
    TPOff = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TPOff);
  } else if (Model == TLSModel::LocalDynamic) {
    // Local-dynamic accesses proceed in two phases. A general-dynamic TLS
    // descriptor call against the special symbol _TLS_MODULE_BASE_ to
    // calculate the beginning of the module's TLS region, followed by a
    // DTPREL offset calculation.

    // These accesses will need deduplicating if there's more than one.
    AArch64FunctionInfo *MFI =
        DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
    MFI->incNumLocalDynamicTLSAccesses();

    // The call needs a relocation too for linker relaxation. It doesn't make
    // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
    // the address.
    SDValue SymAddr = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT,
                                                  AArch64II::MO_TLS);

    // Now we can calculate the offset from TPIDR_EL0 to this module's
    // thread-local area.
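    // As an illustrative sketch (not from this file), the resulting
    // local-dynamic access looks roughly like:
    //     <tlsdesc call for _TLS_MODULE_BASE_>  ; module base offset in x0
    //     add x0, x0, #:dtprel_hi12:var
    //     add x0, x0, #:dtprel_lo12_nc:var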
    TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG);

    // Now use :dtprel_whatever: operations to calculate this variable's offset
    // in its thread-storage area.
    SDValue HiVar = DAG.getTargetGlobalAddress(
        GV, DL, MVT::i64, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
    SDValue LoVar = DAG.getTargetGlobalAddress(
        GV, DL, MVT::i64, 0,
        AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);

    TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, HiVar,
                                       DAG.getTargetConstant(0, MVT::i32)),
                    0);
    TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, LoVar,
                                       DAG.getTargetConstant(0, MVT::i32)),
                    0);
  } else if (Model == TLSModel::GeneralDynamic) {
    // The call needs a relocation too for linker relaxation. It doesn't make
    // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
    // the address.
    SDValue SymAddr =
        DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);

    // Finally we can make a call to calculate the offset from tpidr_el0.
    TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG);
  } else
    llvm_unreachable("Unsupported ELF TLS access model");

  return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
}

SDValue AArch64TargetLowering::LowerGlobalTLSAddress(SDValue Op,
                                                     SelectionDAG &DAG) const {
  if (Subtarget->isTargetDarwin())
    return LowerDarwinGlobalTLSAddress(Op, DAG);
  else if (Subtarget->isTargetELF())
    return LowerELFGlobalTLSAddress(Op, DAG);

  llvm_unreachable("Unexpected platform trying to use TLS");
}

SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
  SDValue Chain = Op.getOperand(0);
  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
  SDValue LHS = Op.getOperand(2);
  SDValue RHS = Op.getOperand(3);
  SDValue Dest = Op.getOperand(4);
  SDLoc dl(Op);

  // Handle f128 first, since lowering it will result in comparing the return
  // value of a libcall against zero, which is just what the rest of LowerBR_CC
  // is expecting to deal with.
  if (LHS.getValueType() == MVT::f128) {
    softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl);

    // If softenSetCCOperands returned a scalar, we need to compare the result
    // against zero to select between true and false values.
    if (!RHS.getNode()) {
      RHS = DAG.getConstant(0, LHS.getValueType());
      CC = ISD::SETNE;
    }
  }

  // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
  // instruction.
  unsigned Opc = LHS.getOpcode();
  if (LHS.getResNo() == 1 && isa<ConstantSDNode>(RHS) &&
      cast<ConstantSDNode>(RHS)->isOne() &&
      (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
       Opc == ISD::USUBO || Opc == ISD::SMULO || Opc == ISD::UMULO)) {
    assert((CC == ISD::SETEQ || CC == ISD::SETNE) &&
           "Unexpected condition code.");
    // Only lower legal XALUO ops.
    if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS->getValueType(0)))
      return SDValue();

    // The actual operation with overflow check.
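    // As an illustrative sketch (not from this file): for
    // "br (seteq (sadd.with.overflow x, y).overflow, 1)" this path can emit
    // roughly "adds w0, w0, w1; b.vs dest", branching directly on the V flag.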
    AArch64CC::CondCode OFCC;
    SDValue Value, Overflow;
    std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, LHS.getValue(0), DAG);

    if (CC == ISD::SETNE)
      OFCC = getInvertedCondCode(OFCC);
    SDValue CCVal = DAG.getConstant(OFCC, MVT::i32);

    return DAG.getNode(AArch64ISD::BRCOND, SDLoc(LHS), MVT::Other, Chain, Dest,
                       CCVal, Overflow);
  }

  if (LHS.getValueType().isInteger()) {
    assert((LHS.getValueType() == RHS.getValueType()) &&
           (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));

    // If the RHS of the comparison is zero, we can potentially fold this
    // to a specialized branch.
    const ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS);
    if (RHSC && RHSC->getZExtValue() == 0) {
      if (CC == ISD::SETEQ) {
        // See if we can use a TBZ to fold in an AND as well.
        // TBZ has a smaller branch displacement than CBZ. If the offset is
        // out of bounds, a late MI-layer pass rewrites branches.
        // 403.gcc is an example that hits this case.
        if (LHS.getOpcode() == ISD::AND &&
            isa<ConstantSDNode>(LHS.getOperand(1)) &&
            isPowerOf2_64(LHS.getConstantOperandVal(1))) {
          SDValue Test = LHS.getOperand(0);
          uint64_t Mask = LHS.getConstantOperandVal(1);
          return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, Test,
                             DAG.getConstant(Log2_64(Mask), MVT::i64), Dest);
        }

        return DAG.getNode(AArch64ISD::CBZ, dl, MVT::Other, Chain, LHS, Dest);
      } else if (CC == ISD::SETNE) {
        // See if we can use a TBNZ to fold in an AND as well.
        // TBNZ has a smaller branch displacement than CBNZ. If the offset is
        // out of bounds, a late MI-layer pass rewrites branches.
        // 403.gcc is an example that hits this case.
        if (LHS.getOpcode() == ISD::AND &&
            isa<ConstantSDNode>(LHS.getOperand(1)) &&
            isPowerOf2_64(LHS.getConstantOperandVal(1))) {
          SDValue Test = LHS.getOperand(0);
          uint64_t Mask = LHS.getConstantOperandVal(1);
          return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, Test,
                             DAG.getConstant(Log2_64(Mask), MVT::i64), Dest);
        }

        return DAG.getNode(AArch64ISD::CBNZ, dl, MVT::Other, Chain, LHS, Dest);
      } else if (CC == ISD::SETLT && LHS.getOpcode() != ISD::AND) {
        // Don't combine AND since emitComparison converts the AND to an ANDS
        // (a.k.a. TST) and the test in the test bit and branch instruction
        // becomes redundant. This would also increase register pressure.
        uint64_t Mask = LHS.getValueType().getSizeInBits() - 1;
        return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, LHS,
                           DAG.getConstant(Mask, MVT::i64), Dest);
      }
    }
    if (RHSC && RHSC->getSExtValue() == -1 && CC == ISD::SETGT &&
        LHS.getOpcode() != ISD::AND) {
      // Don't combine AND since emitComparison converts the AND to an ANDS
      // (a.k.a. TST) and the test in the test bit and branch instruction
      // becomes redundant. This would also increase register pressure.
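      // As an illustrative sketch (not from this file): "br (setgt x, -1)"
      // only needs the sign bit, so for an i64 LHS it can lower to
      // "tbz x0, #63, dest".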
      uint64_t Mask = LHS.getValueType().getSizeInBits() - 1;
      return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, LHS,
                         DAG.getConstant(Mask, MVT::i64), Dest);
    }

    SDValue CCVal;
    SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
    return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
                       Cmp);
  }

  assert(LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);

  // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
  // clean. Some of them require two branches to implement.
  SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
  AArch64CC::CondCode CC1, CC2;
  changeFPCCToAArch64CC(CC, CC1, CC2);
  SDValue CC1Val = DAG.getConstant(CC1, MVT::i32);
  SDValue BR1 =
      DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CC1Val, Cmp);
  if (CC2 != AArch64CC::AL) {
    SDValue CC2Val = DAG.getConstant(CC2, MVT::i32);
    return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, BR1, Dest, CC2Val,
                       Cmp);
  }

  return BR1;
}

SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op,
                                              SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  SDLoc DL(Op);

  SDValue In1 = Op.getOperand(0);
  SDValue In2 = Op.getOperand(1);
  EVT SrcVT = In2.getValueType();
  if (SrcVT != VT) {
    if (SrcVT == MVT::f32 && VT == MVT::f64)
      In2 = DAG.getNode(ISD::FP_EXTEND, DL, VT, In2);
    else if (SrcVT == MVT::f64 && VT == MVT::f32)
      In2 = DAG.getNode(ISD::FP_ROUND, DL, VT, In2, DAG.getIntPtrConstant(0));
    else
      // FIXME: Src type is different, bail out for now. Can VT really be a
      // vector type?
      return SDValue();
  }

  EVT VecVT;
  EVT EltVT;
  SDValue EltMask, VecVal1, VecVal2;
  if (VT == MVT::f32 || VT == MVT::v2f32 || VT == MVT::v4f32) {
    EltVT = MVT::i32;
    VecVT = MVT::v4i32;
    EltMask = DAG.getConstant(0x80000000ULL, EltVT);

    if (!VT.isVector()) {
      VecVal1 = DAG.getTargetInsertSubreg(AArch64::ssub, DL, VecVT,
                                          DAG.getUNDEF(VecVT), In1);
      VecVal2 = DAG.getTargetInsertSubreg(AArch64::ssub, DL, VecVT,
                                          DAG.getUNDEF(VecVT), In2);
    } else {
      VecVal1 = DAG.getNode(ISD::BITCAST, DL, VecVT, In1);
      VecVal2 = DAG.getNode(ISD::BITCAST, DL, VecVT, In2);
    }
  } else if (VT == MVT::f64 || VT == MVT::v2f64) {
    EltVT = MVT::i64;
    VecVT = MVT::v2i64;

    // We want to materialize a mask with the high bit set, but the AdvSIMD
    // immediate moves cannot materialize that in a single instruction for
    // 64-bit elements. Instead, materialize zero and then negate it.
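    // As an illustrative sketch (not from this file): FNEG of a +0.0 vector
    // flips only the sign bit of each lane, so negating a v2f64 zero yields
    // the 0x8000000000000000 mask in both 64-bit elements.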
    EltMask = DAG.getConstant(0, EltVT);

    if (!VT.isVector()) {
      VecVal1 = DAG.getTargetInsertSubreg(AArch64::dsub, DL, VecVT,
                                          DAG.getUNDEF(VecVT), In1);
      VecVal2 = DAG.getTargetInsertSubreg(AArch64::dsub, DL, VecVT,
                                          DAG.getUNDEF(VecVT), In2);
    } else {
      VecVal1 = DAG.getNode(ISD::BITCAST, DL, VecVT, In1);
      VecVal2 = DAG.getNode(ISD::BITCAST, DL, VecVT, In2);
    }
  } else {
    llvm_unreachable("Invalid type for copysign!");
  }

  std::vector<SDValue> BuildVectorOps;
  for (unsigned i = 0; i < VecVT.getVectorNumElements(); ++i)
    BuildVectorOps.push_back(EltMask);

  SDValue BuildVec = DAG.getNode(ISD::BUILD_VECTOR, DL, VecVT, BuildVectorOps);

  // If we couldn't materialize the mask above, then the mask vector will be
  // the zero vector, and we need to negate it here.
  if (VT == MVT::f64 || VT == MVT::v2f64) {
    BuildVec = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, BuildVec);
    BuildVec = DAG.getNode(ISD::FNEG, DL, MVT::v2f64, BuildVec);
    BuildVec = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, BuildVec);
  }

  SDValue Sel =
      DAG.getNode(AArch64ISD::BIT, DL, VecVT, VecVal1, VecVal2, BuildVec);

  if (VT == MVT::f32)
    return DAG.getTargetExtractSubreg(AArch64::ssub, DL, VT, Sel);
  else if (VT == MVT::f64)
    return DAG.getTargetExtractSubreg(AArch64::dsub, DL, VT, Sel);
  else
    return DAG.getNode(ISD::BITCAST, DL, VT, Sel);
}

SDValue AArch64TargetLowering::LowerCTPOP(SDValue Op, SelectionDAG &DAG) const {
  if (DAG.getMachineFunction().getFunction()->getAttributes().hasAttribute(
          AttributeSet::FunctionIndex, Attribute::NoImplicitFloat))
    return SDValue();

  if (!Subtarget->hasNEON())
    return SDValue();

  // While there is no integer popcount instruction, it can
  // be more efficiently lowered to the following sequence that uses
  // AdvSIMD registers/instructions as long as the copies to/from
  // the AdvSIMD registers are cheap.
  //   FMOV D0, X0        // copy 64-bit int to vector, high bits zero'd
  //   CNT V0.8B, V0.8B   // 8xbyte pop-counts
  //   ADDV B0, V0.8B     // sum 8xbyte pop-counts
  //   UMOV X0, V0.B[0]   // copy byte result back to integer reg
  SDValue Val = Op.getOperand(0);
  SDLoc DL(Op);
  EVT VT = Op.getValueType();
  SDValue ZeroVec = DAG.getUNDEF(MVT::v8i8);

  SDValue VecVal;
  if (VT == MVT::i32) {
    VecVal = DAG.getNode(ISD::BITCAST, DL, MVT::f32, Val);
    VecVal = DAG.getTargetInsertSubreg(AArch64::ssub, DL, MVT::v8i8, ZeroVec,
                                       VecVal);
  } else {
    VecVal = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Val);
  }

  SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v8i8, VecVal);
  SDValue UaddLV = DAG.getNode(
      ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32,
      DAG.getConstant(Intrinsic::aarch64_neon_uaddlv, MVT::i32), CtPop);

  if (VT == MVT::i64)
    UaddLV = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, UaddLV);
  return UaddLV;
}

SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {

  if (Op.getValueType().isVector())
    return LowerVSETCC(Op, DAG);

  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
  SDLoc dl(Op);

  // We chose ZeroOrOneBooleanContents, so use zero and one.
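  // As an illustrative sketch (not from this file): an integer
  // "setcc eq i32 %a, %b" typically lowers via the CSEL matching below to
  //     cmp  w0, w1
  //     cset w0, eq        // alias of "csinc w0, wzr, wzr, ne"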
  EVT VT = Op.getValueType();
  SDValue TVal = DAG.getConstant(1, VT);
  SDValue FVal = DAG.getConstant(0, VT);

  // Handle f128 first, since one possible outcome is a normal integer
  // comparison which gets picked up by the next if statement.
  if (LHS.getValueType() == MVT::f128) {
    softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl);

    // If softenSetCCOperands returned a scalar, use it.
    if (!RHS.getNode()) {
      assert(LHS.getValueType() == Op.getValueType() &&
             "Unexpected setcc expansion!");
      return LHS;
    }
  }

  if (LHS.getValueType().isInteger()) {
    SDValue CCVal;
    SDValue Cmp =
        getAArch64Cmp(LHS, RHS, ISD::getSetCCInverse(CC, true), CCVal, DAG, dl);

    // Note that we inverted the condition above, so we reverse the order of
    // the true and false operands here. This will allow the setcc to be
    // matched to a single CSINC instruction.
    return DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CCVal, Cmp);
  }

  // Now we know we're dealing with FP values.
  assert(LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);

  // If that fails, we'll need to perform an FCMP + CSEL sequence. Go ahead
  // and do the comparison.
  SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);

  AArch64CC::CondCode CC1, CC2;
  changeFPCCToAArch64CC(CC, CC1, CC2);
  if (CC2 == AArch64CC::AL) {
    changeFPCCToAArch64CC(ISD::getSetCCInverse(CC, false), CC1, CC2);
    SDValue CC1Val = DAG.getConstant(CC1, MVT::i32);

    // Note that we inverted the condition above, so we reverse the order of
    // the true and false operands here. This will allow the setcc to be
    // matched to a single CSINC instruction.
    return DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CC1Val, Cmp);
  } else {
    // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't
    // totally clean. Some of them require two CSELs to implement. As in this
    // case, we emit the first CSEL and then emit a second using the output
    // of the first as the RHS. We're effectively OR'ing the two CC's together.

    // FIXME: It would be nice if we could match the two CSELs to two CSINCs.
    SDValue CC1Val = DAG.getConstant(CC1, MVT::i32);
    SDValue CS1 =
        DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp);

    SDValue CC2Val = DAG.getConstant(CC2, MVT::i32);
    return DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp);
  }
}

/// A SELECT_CC operation is really some kind of max or min if both values
/// being compared are, in some sense, equal to the results in either case.
/// However, it is permissible to compare f32 values and produce directly
/// extended f64 values.
///
/// Extending the comparison operands would also be allowed, but is less likely
/// to happen in practice since their use is right here. Note that truncate
/// operations would *not* be semantically equivalent.
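///
/// As an illustrative sketch (not from this file): with
///     %cmp = fcmp olt float %x, 1.0
///     %sel = select %cmp, double %x.ext, double 1.0
/// the f32 constant 1.0 and the f64 constant 1.0 compare as compatible
/// because the extension is exact, so the select can still become an FMIN.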
static bool selectCCOpsAreFMaxCompatible(SDValue Cmp, SDValue Result) {
  if (Cmp == Result)
    return true;

  ConstantFPSDNode *CCmp = dyn_cast<ConstantFPSDNode>(Cmp);
  ConstantFPSDNode *CResult = dyn_cast<ConstantFPSDNode>(Result);
  if (CCmp && CResult && Cmp.getValueType() == MVT::f32 &&
      Result.getValueType() == MVT::f64) {
    bool Lossy;
    APFloat CmpVal = CCmp->getValueAPF();
    CmpVal.convert(APFloat::IEEEdouble, APFloat::rmNearestTiesToEven, &Lossy);
    return CResult->getValueAPF().bitwiseIsEqual(CmpVal);
  }

  return Result->getOpcode() == ISD::FP_EXTEND && Result->getOperand(0) == Cmp;
}

SDValue AArch64TargetLowering::LowerSELECT(SDValue Op,
                                           SelectionDAG &DAG) const {
  SDValue CC = Op->getOperand(0);
  SDValue TVal = Op->getOperand(1);
  SDValue FVal = Op->getOperand(2);
  SDLoc DL(Op);

  unsigned Opc = CC.getOpcode();
  // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a select
  // instruction.
  if (CC.getResNo() == 1 &&
      (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
       Opc == ISD::USUBO || Opc == ISD::SMULO || Opc == ISD::UMULO)) {
    // Only lower legal XALUO ops.
    if (!DAG.getTargetLoweringInfo().isTypeLegal(CC->getValueType(0)))
      return SDValue();

    AArch64CC::CondCode OFCC;
    SDValue Value, Overflow;
    std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, CC.getValue(0), DAG);
    SDValue CCVal = DAG.getConstant(OFCC, MVT::i32);

    return DAG.getNode(AArch64ISD::CSEL, DL, Op.getValueType(), TVal, FVal,
                       CCVal, Overflow);
  }

  if (CC.getOpcode() == ISD::SETCC)
    return DAG.getSelectCC(DL, CC.getOperand(0), CC.getOperand(1), TVal, FVal,
                           cast<CondCodeSDNode>(CC.getOperand(2))->get());
  else
    return DAG.getSelectCC(DL, CC, DAG.getConstant(0, CC.getValueType()), TVal,
                           FVal, ISD::SETNE);
}

SDValue AArch64TargetLowering::LowerSELECT_CC(SDValue Op,
                                              SelectionDAG &DAG) const {
  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  SDValue TVal = Op.getOperand(2);
  SDValue FVal = Op.getOperand(3);
  SDLoc dl(Op);

  // Handle f128 first, because it will result in a comparison of some RTLIB
  // call result against zero.
  if (LHS.getValueType() == MVT::f128) {
    softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl);

    // If softenSetCCOperands returned a scalar, we need to compare the result
    // against zero to select between true and false values.
    if (!RHS.getNode()) {
      RHS = DAG.getConstant(0, LHS.getValueType());
      CC = ISD::SETNE;
    }
  }

  // Handle integers first.
  if (LHS.getValueType().isInteger()) {
    assert((LHS.getValueType() == RHS.getValueType()) &&
           (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));

    unsigned Opcode = AArch64ISD::CSEL;

    // If both the TVal and the FVal are constants, see if we can swap them in
    // order to form a CSINV or CSINC out of them.
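    // As an illustrative sketch (not from this file):
    //   (select cc, -1, 0)  ->  csinv wD, wzr, wzr, !cc   (materializes ~0)
    //   (select cc,  2, 1)  ->  csinc wD, wN, wN, cc'     (other = same + 1)
    // where wN holds the surviving constant and cc' is cc possibly inverted
    // by the swaps below.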
    ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
    ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);

    if (CTVal && CFVal && CTVal->isAllOnesValue() && CFVal->isNullValue()) {
      std::swap(TVal, FVal);
      std::swap(CTVal, CFVal);
      CC = ISD::getSetCCInverse(CC, true);
    } else if (CTVal && CFVal && CTVal->isOne() && CFVal->isNullValue()) {
      std::swap(TVal, FVal);
      std::swap(CTVal, CFVal);
      CC = ISD::getSetCCInverse(CC, true);
    } else if (TVal.getOpcode() == ISD::XOR) {
      // If TVal is a NOT we want to swap TVal and FVal so that we can match
      // with a CSINV rather than a CSEL.
      ConstantSDNode *CVal = dyn_cast<ConstantSDNode>(TVal.getOperand(1));

      if (CVal && CVal->isAllOnesValue()) {
        std::swap(TVal, FVal);
        std::swap(CTVal, CFVal);
        CC = ISD::getSetCCInverse(CC, true);
      }
    } else if (TVal.getOpcode() == ISD::SUB) {
      // If TVal is a negation (SUB from 0) we want to swap TVal and FVal so
      // that we can match with a CSNEG rather than a CSEL.
      ConstantSDNode *CVal = dyn_cast<ConstantSDNode>(TVal.getOperand(0));

      if (CVal && CVal->isNullValue()) {
        std::swap(TVal, FVal);
        std::swap(CTVal, CFVal);
        CC = ISD::getSetCCInverse(CC, true);
      }
    } else if (CTVal && CFVal) {
      const int64_t TrueVal = CTVal->getSExtValue();
      const int64_t FalseVal = CFVal->getSExtValue();
      bool Swap = false;

      // If both TVal and FVal are constants, see if FVal is the
      // inverse/negation/increment of TVal and generate a CSINV/CSNEG/CSINC
      // instead of a CSEL in that case.
      if (TrueVal == ~FalseVal) {
        Opcode = AArch64ISD::CSINV;
      } else if (TrueVal == -FalseVal) {
        Opcode = AArch64ISD::CSNEG;
      } else if (TVal.getValueType() == MVT::i32) {
        // If our operands are only 32-bit wide, make sure we use 32-bit
        // arithmetic for the check whether we can use CSINC. This ensures that
        // the addition in the check will wrap around properly in case there is
        // an overflow (which would not be the case if we do the check with
        // 64-bit arithmetic).
        const uint32_t TrueVal32 = CTVal->getZExtValue();
        const uint32_t FalseVal32 = CFVal->getZExtValue();

        if ((TrueVal32 == FalseVal32 + 1) || (TrueVal32 + 1 == FalseVal32)) {
          Opcode = AArch64ISD::CSINC;

          if (TrueVal32 > FalseVal32) {
            Swap = true;
          }
        }
        // 64-bit check whether we can use CSINC.
      } else if ((TrueVal == FalseVal + 1) || (TrueVal + 1 == FalseVal)) {
        Opcode = AArch64ISD::CSINC;

        if (TrueVal > FalseVal) {
          Swap = true;
        }
      }

      // Swap TVal and FVal if necessary.
      if (Swap) {
        std::swap(TVal, FVal);
        std::swap(CTVal, CFVal);
        CC = ISD::getSetCCInverse(CC, true);
      }

      if (Opcode != AArch64ISD::CSEL) {
        // Drop FVal since we can get its value by simply inverting/negating
        // TVal.
        FVal = TVal;
      }
    }

    SDValue CCVal;
    SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);

    EVT VT = Op.getValueType();
    return DAG.getNode(Opcode, dl, VT, TVal, FVal, CCVal, Cmp);
  }

  // Now we know we're dealing with FP values.
  assert(LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
  assert(LHS.getValueType() == RHS.getValueType());
  EVT VT = Op.getValueType();

  // Try to match this select into a max/min operation, which has a dedicated
  // opcode in the instruction set.
  // FIXME: This is not correct in the presence of NaNs, so we only enable this
  // in no-NaNs mode.
  if (getTargetMachine().Options.NoNaNsFPMath) {
    SDValue MinMaxLHS = TVal, MinMaxRHS = FVal;
    if (selectCCOpsAreFMaxCompatible(LHS, MinMaxRHS) &&
        selectCCOpsAreFMaxCompatible(RHS, MinMaxLHS)) {
      CC = ISD::getSetCCSwappedOperands(CC);
      std::swap(MinMaxLHS, MinMaxRHS);
    }

    if (selectCCOpsAreFMaxCompatible(LHS, MinMaxLHS) &&
        selectCCOpsAreFMaxCompatible(RHS, MinMaxRHS)) {
      switch (CC) {
      default:
        break;
      case ISD::SETGT:
      case ISD::SETGE:
      case ISD::SETUGT:
      case ISD::SETUGE:
      case ISD::SETOGT:
      case ISD::SETOGE:
        return DAG.getNode(AArch64ISD::FMAX, dl, VT, MinMaxLHS, MinMaxRHS);
      case ISD::SETLT:
      case ISD::SETLE:
      case ISD::SETULT:
      case ISD::SETULE:
      case ISD::SETOLT:
      case ISD::SETOLE:
        return DAG.getNode(AArch64ISD::FMIN, dl, VT, MinMaxLHS, MinMaxRHS);
      }
    }
  }

  // If that fails, we'll need to perform an FCMP + CSEL sequence. Go ahead
  // and do the comparison.
  SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);

  // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
  // clean. Some of them require two CSELs to implement.
  AArch64CC::CondCode CC1, CC2;
  changeFPCCToAArch64CC(CC, CC1, CC2);
  SDValue CC1Val = DAG.getConstant(CC1, MVT::i32);
  SDValue CS1 = DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp);

  // If we need a second CSEL, emit it, using the output of the first as the
  // RHS. We're effectively OR'ing the two CC's together.
  if (CC2 != AArch64CC::AL) {
    SDValue CC2Val = DAG.getConstant(CC2, MVT::i32);
    return DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp);
  }

  // Otherwise, return the output of the first CSEL.
  return CS1;
}

SDValue AArch64TargetLowering::LowerJumpTable(SDValue Op,
                                              SelectionDAG &DAG) const {
  // Jump table entries as PC relative offsets. No additional tweaking
  // is necessary here. Just get the address of the jump table.
  JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
  EVT PtrVT = getPointerTy();
  SDLoc DL(Op);

  if (getTargetMachine().getCodeModel() == CodeModel::Large &&
      !Subtarget->isTargetMachO()) {
    const unsigned char MO_NC = AArch64II::MO_NC;
    return DAG.getNode(
        AArch64ISD::WrapperLarge, DL, PtrVT,
        DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_G3),
        DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_G2 | MO_NC),
        DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_G1 | MO_NC),
        DAG.getTargetJumpTable(JT->getIndex(), PtrVT,
                               AArch64II::MO_G0 | MO_NC));
  }

  SDValue Hi =
      DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_PAGE);
  SDValue Lo = DAG.getTargetJumpTable(JT->getIndex(), PtrVT,
                                      AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
  SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, Hi);
  return DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, Lo);
}

SDValue AArch64TargetLowering::LowerConstantPool(SDValue Op,
                                                 SelectionDAG &DAG) const {
  ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
  EVT PtrVT = getPointerTy();
  SDLoc DL(Op);

  if (getTargetMachine().getCodeModel() == CodeModel::Large) {
    // Use the GOT for the large code model on iOS.
3800 if (Subtarget->isTargetMachO()) { 3801 SDValue GotAddr = DAG.getTargetConstantPool( 3802 CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(), 3803 AArch64II::MO_GOT); 3804 return DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, GotAddr); 3805 } 3806 3807 const unsigned char MO_NC = AArch64II::MO_NC; 3808 return DAG.getNode( 3809 AArch64ISD::WrapperLarge, DL, PtrVT, 3810 DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(), 3811 CP->getOffset(), AArch64II::MO_G3), 3812 DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(), 3813 CP->getOffset(), AArch64II::MO_G2 | MO_NC), 3814 DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(), 3815 CP->getOffset(), AArch64II::MO_G1 | MO_NC), 3816 DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(), 3817 CP->getOffset(), AArch64II::MO_G0 | MO_NC)); 3818 } else { 3819 // Use ADRP/ADD or ADRP/LDR for everything else: the small memory model on 3820 // ELF, the only valid one on Darwin. 3821 SDValue Hi = 3822 DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(), 3823 CP->getOffset(), AArch64II::MO_PAGE); 3824 SDValue Lo = DAG.getTargetConstantPool( 3825 CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(), 3826 AArch64II::MO_PAGEOFF | AArch64II::MO_NC); 3827 3828 SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, Hi); 3829 return DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, Lo); 3830 } 3831} 3832 3833SDValue AArch64TargetLowering::LowerBlockAddress(SDValue Op, 3834 SelectionDAG &DAG) const { 3835 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress(); 3836 EVT PtrVT = getPointerTy(); 3837 SDLoc DL(Op); 3838 if (getTargetMachine().getCodeModel() == CodeModel::Large && 3839 !Subtarget->isTargetMachO()) { 3840 const unsigned char MO_NC = AArch64II::MO_NC; 3841 return DAG.getNode( 3842 AArch64ISD::WrapperLarge, DL, PtrVT, 3843 DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_G3), 3844 DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_G2 | MO_NC), 3845 DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_G1 | MO_NC), 3846 DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_G0 | MO_NC)); 3847 } else { 3848 SDValue Hi = DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_PAGE); 3849 SDValue Lo = DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_PAGEOFF | 3850 AArch64II::MO_NC); 3851 SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, Hi); 3852 return DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, Lo); 3853 } 3854} 3855 3856SDValue AArch64TargetLowering::LowerDarwin_VASTART(SDValue Op, 3857 SelectionDAG &DAG) const { 3858 AArch64FunctionInfo *FuncInfo = 3859 DAG.getMachineFunction().getInfo<AArch64FunctionInfo>(); 3860 3861 SDLoc DL(Op); 3862 SDValue FR = 3863 DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), getPointerTy()); 3864 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); 3865 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1), 3866 MachinePointerInfo(SV), false, false, 0); 3867} 3868 3869SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op, 3870 SelectionDAG &DAG) const { 3871 // The layout of the va_list struct is specified in the AArch64 Procedure Call 3872 // Standard, section B.3. 
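// In C terms, the structure being initialized here is:
//   struct va_list {
//     void *__stack;   // next stacked argument
//     void *__gr_top;  // byte following the GP register save area
//     void *__vr_top;  // byte following the FP/SIMD register save area
//     int __gr_offs;   // negative offset from __gr_top to the next GP arg
//     int __vr_offs;   // negative offset from __vr_top to the next FP/SIMD arg
//   };
// which is what the stores at offsets 0, 8, 16, 24 and 28 below fill in.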
3873 MachineFunction &MF = DAG.getMachineFunction(); 3874 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>(); 3875 SDLoc DL(Op); 3876 3877 SDValue Chain = Op.getOperand(0); 3878 SDValue VAList = Op.getOperand(1); 3879 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); 3880 SmallVector<SDValue, 4> MemOps; 3881 3882 // void *__stack at offset 0 3883 SDValue Stack = 3884 DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), getPointerTy()); 3885 MemOps.push_back(DAG.getStore(Chain, DL, Stack, VAList, 3886 MachinePointerInfo(SV), false, false, 8)); 3887 3888 // void *__gr_top at offset 8 3889 int GPRSize = FuncInfo->getVarArgsGPRSize(); 3890 if (GPRSize > 0) { 3891 SDValue GRTop, GRTopAddr; 3892 3893 GRTopAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList, 3894 DAG.getConstant(8, getPointerTy())); 3895 3896 GRTop = DAG.getFrameIndex(FuncInfo->getVarArgsGPRIndex(), getPointerTy()); 3897 GRTop = DAG.getNode(ISD::ADD, DL, getPointerTy(), GRTop, 3898 DAG.getConstant(GPRSize, getPointerTy())); 3899 3900 MemOps.push_back(DAG.getStore(Chain, DL, GRTop, GRTopAddr, 3901 MachinePointerInfo(SV, 8), false, false, 8)); 3902 } 3903 3904 // void *__vr_top at offset 16 3905 int FPRSize = FuncInfo->getVarArgsFPRSize(); 3906 if (FPRSize > 0) { 3907 SDValue VRTop, VRTopAddr; 3908 VRTopAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList, 3909 DAG.getConstant(16, getPointerTy())); 3910 3911 VRTop = DAG.getFrameIndex(FuncInfo->getVarArgsFPRIndex(), getPointerTy()); 3912 VRTop = DAG.getNode(ISD::ADD, DL, getPointerTy(), VRTop, 3913 DAG.getConstant(FPRSize, getPointerTy())); 3914 3915 MemOps.push_back(DAG.getStore(Chain, DL, VRTop, VRTopAddr, 3916 MachinePointerInfo(SV, 16), false, false, 8)); 3917 } 3918 3919 // int __gr_offs at offset 24 3920 SDValue GROffsAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList, 3921 DAG.getConstant(24, getPointerTy())); 3922 MemOps.push_back(DAG.getStore(Chain, DL, DAG.getConstant(-GPRSize, MVT::i32), 3923 GROffsAddr, MachinePointerInfo(SV, 24), false, 3924 false, 4)); 3925 3926 // int __vr_offs at offset 28 3927 SDValue VROffsAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList, 3928 DAG.getConstant(28, getPointerTy())); 3929 MemOps.push_back(DAG.getStore(Chain, DL, DAG.getConstant(-FPRSize, MVT::i32), 3930 VROffsAddr, MachinePointerInfo(SV, 28), false, 3931 false, 4)); 3932 3933 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps); 3934} 3935 3936SDValue AArch64TargetLowering::LowerVASTART(SDValue Op, 3937 SelectionDAG &DAG) const { 3938 return Subtarget->isTargetDarwin() ? LowerDarwin_VASTART(Op, DAG) 3939 : LowerAAPCS_VASTART(Op, DAG); 3940} 3941 3942SDValue AArch64TargetLowering::LowerVACOPY(SDValue Op, 3943 SelectionDAG &DAG) const { 3944 // AAPCS has three pointers and two ints (= 32 bytes), Darwin has single 3945 // pointer. 3946 unsigned VaListSize = Subtarget->isTargetDarwin() ? 
8 : 32; 3947 const Value *DestSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue(); 3948 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue(); 3949 3950 return DAG.getMemcpy(Op.getOperand(0), SDLoc(Op), Op.getOperand(1), 3951 Op.getOperand(2), DAG.getConstant(VaListSize, MVT::i32), 3952 8, false, false, MachinePointerInfo(DestSV), 3953 MachinePointerInfo(SrcSV)); 3954} 3955 3956SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { 3957 assert(Subtarget->isTargetDarwin() && 3958 "automatic va_arg instruction only works on Darwin"); 3959 3960 const Value *V = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); 3961 EVT VT = Op.getValueType(); 3962 SDLoc DL(Op); 3963 SDValue Chain = Op.getOperand(0); 3964 SDValue Addr = Op.getOperand(1); 3965 unsigned Align = Op.getConstantOperandVal(3); 3966 3967 SDValue VAList = DAG.getLoad(getPointerTy(), DL, Chain, Addr, 3968 MachinePointerInfo(V), false, false, false, 0); 3969 Chain = VAList.getValue(1); 3970 3971 if (Align > 8) { 3972 assert(((Align & (Align - 1)) == 0) && "Expected Align to be a power of 2"); 3973 VAList = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList, 3974 DAG.getConstant(Align - 1, getPointerTy())); 3975 VAList = DAG.getNode(ISD::AND, DL, getPointerTy(), VAList, 3976 DAG.getConstant(-(int64_t)Align, getPointerTy())); 3977 } 3978 3979 Type *ArgTy = VT.getTypeForEVT(*DAG.getContext()); 3980 uint64_t ArgSize = getDataLayout()->getTypeAllocSize(ArgTy); 3981 3982 // Scalar integer and FP values smaller than 64 bits are implicitly extended 3983 // up to 64 bits. At the very least, we have to increase the striding of the 3984 // vaargs list to match this, and for FP values we need to introduce 3985 // FP_ROUND nodes as well. 3986 if (VT.isInteger() && !VT.isVector()) 3987 ArgSize = 8; 3988 bool NeedFPTrunc = false; 3989 if (VT.isFloatingPoint() && !VT.isVector() && VT != MVT::f64) { 3990 ArgSize = 8; 3991 NeedFPTrunc = true; 3992 } 3993 3994 // Increment the pointer, VAList, to the next vaarg 3995 SDValue VANext = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList, 3996 DAG.getConstant(ArgSize, getPointerTy())); 3997 // Store the incremented VAList to the legalized pointer 3998 SDValue APStore = DAG.getStore(Chain, DL, VANext, Addr, MachinePointerInfo(V), 3999 false, false, 0); 4000 4001 // Load the actual argument out of the pointer VAList 4002 if (NeedFPTrunc) { 4003 // Load the value as an f64. 4004 SDValue WideFP = DAG.getLoad(MVT::f64, DL, APStore, VAList, 4005 MachinePointerInfo(), false, false, false, 0); 4006 // Round the value down to an f32. 4007 SDValue NarrowFP = DAG.getNode(ISD::FP_ROUND, DL, VT, WideFP.getValue(0), 4008 DAG.getIntPtrConstant(1)); 4009 SDValue Ops[] = { NarrowFP, WideFP.getValue(1) }; 4010 // Merge the rounded value with the chain output of the load. 
4011 return DAG.getMergeValues(Ops, DL); 4012 } 4013 4014 return DAG.getLoad(VT, DL, APStore, VAList, MachinePointerInfo(), false, 4015 false, false, 0); 4016} 4017 4018SDValue AArch64TargetLowering::LowerFRAMEADDR(SDValue Op, 4019 SelectionDAG &DAG) const { 4020 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 4021 MFI->setFrameAddressIsTaken(true); 4022 4023 EVT VT = Op.getValueType(); 4024 SDLoc DL(Op); 4025 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 4026 SDValue FrameAddr = 4027 DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, VT); 4028 while (Depth--) 4029 FrameAddr = DAG.getLoad(VT, DL, DAG.getEntryNode(), FrameAddr, 4030 MachinePointerInfo(), false, false, false, 0); 4031 return FrameAddr; 4032} 4033 4034// FIXME? Maybe this could be a TableGen attribute on some registers and 4035// this table could be generated automatically from RegInfo. 4036unsigned AArch64TargetLowering::getRegisterByName(const char* RegName, 4037 EVT VT) const { 4038 unsigned Reg = StringSwitch<unsigned>(RegName) 4039 .Case("sp", AArch64::SP) 4040 .Default(0); 4041 if (Reg) 4042 return Reg; 4043 report_fatal_error("Invalid register name global variable"); 4044} 4045 4046SDValue AArch64TargetLowering::LowerRETURNADDR(SDValue Op, 4047 SelectionDAG &DAG) const { 4048 MachineFunction &MF = DAG.getMachineFunction(); 4049 MachineFrameInfo *MFI = MF.getFrameInfo(); 4050 MFI->setReturnAddressIsTaken(true); 4051 4052 EVT VT = Op.getValueType(); 4053 SDLoc DL(Op); 4054 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 4055 if (Depth) { 4056 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG); 4057 SDValue Offset = DAG.getConstant(8, getPointerTy()); 4058 return DAG.getLoad(VT, DL, DAG.getEntryNode(), 4059 DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset), 4060 MachinePointerInfo(), false, false, false, 0); 4061 } 4062 4063 // Return LR, which contains the return address. Mark it an implicit live-in. 4064 unsigned Reg = MF.addLiveIn(AArch64::LR, &AArch64::GPR64RegClass); 4065 return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT); 4066} 4067 4068/// LowerShiftRightParts - Lower SRA_PARTS, which returns two 4069/// i64 values and take a 2 x i64 value to shift plus a shift amount. 4070SDValue AArch64TargetLowering::LowerShiftRightParts(SDValue Op, 4071 SelectionDAG &DAG) const { 4072 assert(Op.getNumOperands() == 3 && "Not a double-shift!"); 4073 EVT VT = Op.getValueType(); 4074 unsigned VTBits = VT.getSizeInBits(); 4075 SDLoc dl(Op); 4076 SDValue ShOpLo = Op.getOperand(0); 4077 SDValue ShOpHi = Op.getOperand(1); 4078 SDValue ShAmt = Op.getOperand(2); 4079 SDValue ARMcc; 4080 unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? 
ISD::SRA : ISD::SRL; 4081 4082 assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS); 4083 4084 SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, 4085 DAG.getConstant(VTBits, MVT::i64), ShAmt); 4086 SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt); 4087 SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, ShAmt, 4088 DAG.getConstant(VTBits, MVT::i64)); 4089 SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt); 4090 4091 SDValue Cmp = emitComparison(ExtraShAmt, DAG.getConstant(0, MVT::i64), 4092 ISD::SETGE, dl, DAG); 4093 SDValue CCVal = DAG.getConstant(AArch64CC::GE, MVT::i32); 4094 4095 SDValue FalseValLo = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2); 4096 SDValue TrueValLo = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt); 4097 SDValue Lo = 4098 DAG.getNode(AArch64ISD::CSEL, dl, VT, TrueValLo, FalseValLo, CCVal, Cmp); 4099 4100 // AArch64 shifts larger than the register width are wrapped rather than 4101 // clamped, so we can't just emit "hi >> x". 4102 SDValue FalseValHi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt); 4103 SDValue TrueValHi = Opc == ISD::SRA 4104 ? DAG.getNode(Opc, dl, VT, ShOpHi, 4105 DAG.getConstant(VTBits - 1, MVT::i64)) 4106 : DAG.getConstant(0, VT); 4107 SDValue Hi = 4108 DAG.getNode(AArch64ISD::CSEL, dl, VT, TrueValHi, FalseValHi, CCVal, Cmp); 4109 4110 SDValue Ops[2] = { Lo, Hi }; 4111 return DAG.getMergeValues(Ops, dl); 4112} 4113 4114/// LowerShiftLeftParts - Lower SHL_PARTS, which returns two 4115/// i64 values and take a 2 x i64 value to shift plus a shift amount. 4116SDValue AArch64TargetLowering::LowerShiftLeftParts(SDValue Op, 4117 SelectionDAG &DAG) const { 4118 assert(Op.getNumOperands() == 3 && "Not a double-shift!"); 4119 EVT VT = Op.getValueType(); 4120 unsigned VTBits = VT.getSizeInBits(); 4121 SDLoc dl(Op); 4122 SDValue ShOpLo = Op.getOperand(0); 4123 SDValue ShOpHi = Op.getOperand(1); 4124 SDValue ShAmt = Op.getOperand(2); 4125 SDValue ARMcc; 4126 4127 assert(Op.getOpcode() == ISD::SHL_PARTS); 4128 SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, 4129 DAG.getConstant(VTBits, MVT::i64), ShAmt); 4130 SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt); 4131 SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, ShAmt, 4132 DAG.getConstant(VTBits, MVT::i64)); 4133 SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt); 4134 SDValue Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt); 4135 4136 SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2); 4137 4138 SDValue Cmp = emitComparison(ExtraShAmt, DAG.getConstant(0, MVT::i64), 4139 ISD::SETGE, dl, DAG); 4140 SDValue CCVal = DAG.getConstant(AArch64CC::GE, MVT::i32); 4141 SDValue Hi = 4142 DAG.getNode(AArch64ISD::CSEL, dl, VT, Tmp3, FalseVal, CCVal, Cmp); 4143 4144 // AArch64 shifts of larger than register sizes are wrapped rather than 4145 // clamped, so we can't just emit "lo << a" if a is too big. 4146 SDValue TrueValLo = DAG.getConstant(0, VT); 4147 SDValue FalseValLo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt); 4148 SDValue Lo = 4149 DAG.getNode(AArch64ISD::CSEL, dl, VT, TrueValLo, FalseValLo, CCVal, Cmp); 4150 4151 SDValue Ops[2] = { Lo, Hi }; 4152 return DAG.getMergeValues(Ops, dl); 4153} 4154 4155bool AArch64TargetLowering::isOffsetFoldingLegal( 4156 const GlobalAddressSDNode *GA) const { 4157 // The AArch64 target doesn't support folding offsets into global addresses. 
4158 return false; 4159} 4160 4161bool AArch64TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const { 4162 // We can materialize #0.0 as fmov $Rd, XZR for 64-bit and 32-bit cases. 4163 // FIXME: We should be able to handle f128 as well with a clever lowering. 4164 if (Imm.isPosZero() && (VT == MVT::f64 || VT == MVT::f32)) 4165 return true; 4166 4167 if (VT == MVT::f64) 4168 return AArch64_AM::getFP64Imm(Imm) != -1; 4169 else if (VT == MVT::f32) 4170 return AArch64_AM::getFP32Imm(Imm) != -1; 4171 return false; 4172} 4173 4174//===----------------------------------------------------------------------===// 4175// AArch64 Optimization Hooks 4176//===----------------------------------------------------------------------===// 4177 4178//===----------------------------------------------------------------------===// 4179// AArch64 Inline Assembly Support 4180//===----------------------------------------------------------------------===// 4181 4182// Table of Constraints 4183// TODO: This is the current set of constraints supported by ARM for the 4184// compiler, not all of them may make sense, e.g. S may be difficult to support. 4185// 4186// r - A general register 4187// w - An FP/SIMD register of some size in the range v0-v31 4188// x - An FP/SIMD register of some size in the range v0-v15 4189// I - Constant that can be used with an ADD instruction 4190// J - Constant that can be used with a SUB instruction 4191// K - Constant that can be used with a 32-bit logical instruction 4192// L - Constant that can be used with a 64-bit logical instruction 4193// M - Constant that can be used as a 32-bit MOV immediate 4194// N - Constant that can be used as a 64-bit MOV immediate 4195// Q - A memory reference with base register and no offset 4196// S - A symbolic address 4197// Y - Floating point constant zero 4198// Z - Integer constant zero 4199// 4200// Note that general register operands will be output using their 64-bit x 4201// register name, whatever the size of the variable, unless the asm operand 4202// is prefixed by the %w modifier. Floating-point and SIMD register operands 4203// will be output with the v prefix unless prefixed by the %b, %h, %s, %d or 4204// %q modifier. 4205 4206/// getConstraintType - Given a constraint letter, return the type of 4207/// constraint it is for this target. 4208AArch64TargetLowering::ConstraintType 4209AArch64TargetLowering::getConstraintType(const std::string &Constraint) const { 4210 if (Constraint.size() == 1) { 4211 switch (Constraint[0]) { 4212 default: 4213 break; 4214 case 'z': 4215 return C_Other; 4216 case 'x': 4217 case 'w': 4218 return C_RegisterClass; 4219 // An address with a single base register. Due to the way we 4220 // currently handle addresses it is the same as 'r'. 4221 case 'Q': 4222 return C_Memory; 4223 } 4224 } 4225 return TargetLowering::getConstraintType(Constraint); 4226} 4227 4228/// Examine constraint type and operand type and determine a weight value. 4229/// This object must already have been set up with the operand type 4230/// and the current alternative constraint selected. 4231TargetLowering::ConstraintWeight 4232AArch64TargetLowering::getSingleConstraintMatchWeight( 4233 AsmOperandInfo &info, const char *constraint) const { 4234 ConstraintWeight weight = CW_Invalid; 4235 Value *CallOperandVal = info.CallOperandVal; 4236 // If we don't have a value, we can't do a match, 4237 // but allow it at the lowest weight. 
4238 if (!CallOperandVal) 4239 return CW_Default; 4240 Type *type = CallOperandVal->getType(); 4241 // Look at the constraint type. 4242 switch (*constraint) { 4243 default: 4244 weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint); 4245 break; 4246 case 'x': 4247 case 'w': 4248 if (type->isFloatingPointTy() || type->isVectorTy()) 4249 weight = CW_Register; 4250 break; 4251 case 'z': 4252 weight = CW_Constant; 4253 break; 4254 } 4255 return weight; 4256} 4257 4258std::pair<unsigned, const TargetRegisterClass *> 4259AArch64TargetLowering::getRegForInlineAsmConstraint( 4260 const std::string &Constraint, MVT VT) const { 4261 if (Constraint.size() == 1) { 4262 switch (Constraint[0]) { 4263 case 'r': 4264 if (VT.getSizeInBits() == 64) 4265 return std::make_pair(0U, &AArch64::GPR64commonRegClass); 4266 return std::make_pair(0U, &AArch64::GPR32commonRegClass); 4267 case 'w': 4268 if (VT == MVT::f32) 4269 return std::make_pair(0U, &AArch64::FPR32RegClass); 4270 if (VT.getSizeInBits() == 64) 4271 return std::make_pair(0U, &AArch64::FPR64RegClass); 4272 if (VT.getSizeInBits() == 128) 4273 return std::make_pair(0U, &AArch64::FPR128RegClass); 4274 break; 4275 // The instructions that this constraint is designed for can 4276 // only take 128-bit registers so just use that regclass. 4277 case 'x': 4278 if (VT.getSizeInBits() == 128) 4279 return std::make_pair(0U, &AArch64::FPR128_loRegClass); 4280 break; 4281 } 4282 } 4283 if (StringRef("{cc}").equals_lower(Constraint)) 4284 return std::make_pair(unsigned(AArch64::NZCV), &AArch64::CCRRegClass); 4285 4286 // Use the default implementation in TargetLowering to convert the register 4287 // constraint into a member of a register class. 4288 std::pair<unsigned, const TargetRegisterClass *> Res; 4289 Res = TargetLowering::getRegForInlineAsmConstraint(Constraint, VT); 4290 4291 // Not found as a standard register? 4292 if (!Res.second) { 4293 unsigned Size = Constraint.size(); 4294 if ((Size == 4 || Size == 5) && Constraint[0] == '{' && 4295 tolower(Constraint[1]) == 'v' && Constraint[Size - 1] == '}') { 4296 const std::string Reg = 4297 std::string(&Constraint[2], &Constraint[Size - 1]); 4298 int RegNo = atoi(Reg.c_str()); 4299 if (RegNo >= 0 && RegNo <= 31) { 4300 // v0 - v31 are aliases of q0 - q31. 4301 // By default we'll emit v0-v31 for this unless there's a modifier where 4302 // we'll emit the correct register as well. 4303 Res.first = AArch64::FPR128RegClass.getRegister(RegNo); 4304 Res.second = &AArch64::FPR128RegClass; 4305 } 4306 } 4307 } 4308 4309 return Res; 4310} 4311 4312/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops 4313/// vector. If it is invalid, don't add anything to Ops. 4314void AArch64TargetLowering::LowerAsmOperandForConstraint( 4315 SDValue Op, std::string &Constraint, std::vector<SDValue> &Ops, 4316 SelectionDAG &DAG) const { 4317 SDValue Result; 4318 4319 // Currently only support length 1 constraints. 4320 if (Constraint.length() != 1) 4321 return; 4322 4323 char ConstraintLetter = Constraint[0]; 4324 switch (ConstraintLetter) { 4325 default: 4326 break; 4327 4328 // This set of constraints deal with valid constants for various instructions. 4329 // Validate and return a target constant for them if we can. 4330 case 'z': { 4331 // 'z' maps to xzr or wzr so it needs an input of 0. 
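// An illustrative (hypothetical) use from C code:
//   asm("str %1, [%0]" :: "r"(p), "z"(0));
// lets the zero operand be encoded directly as xzr/wzr instead of
// materializing 0 in a register.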
4332 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op); 4333 if (!C || C->getZExtValue() != 0) 4334 return; 4335 4336 if (Op.getValueType() == MVT::i64) 4337 Result = DAG.getRegister(AArch64::XZR, MVT::i64); 4338 else 4339 Result = DAG.getRegister(AArch64::WZR, MVT::i32); 4340 break; 4341 } 4342 4343 case 'I': 4344 case 'J': 4345 case 'K': 4346 case 'L': 4347 case 'M': 4348 case 'N': 4349 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op); 4350 if (!C) 4351 return; 4352 4353 // Grab the value and do some validation. 4354 uint64_t CVal = C->getZExtValue(); 4355 switch (ConstraintLetter) { 4356 // The I constraint applies only to simple ADD or SUB immediate operands: 4357 // i.e. 0 to 4095 with optional shift by 12 4358 // The J constraint applies only to ADD or SUB immediates that would be 4359 // valid when negated, i.e. if [an add pattern] were to be output as a SUB 4360 // instruction [or vice versa], in other words -1 to -4095 with optional 4361 // left shift by 12. 4362 case 'I': 4363 if (isUInt<12>(CVal) || isShiftedUInt<12, 12>(CVal)) 4364 break; 4365 return; 4366 case 'J': { 4367 uint64_t NVal = -C->getSExtValue(); 4368 if (isUInt<12>(NVal) || isShiftedUInt<12, 12>(NVal)) { 4369 CVal = C->getSExtValue(); 4370 break; 4371 } 4372 return; 4373 } 4374 // The K and L constraints apply *only* to logical immediates, including 4375 // what used to be the MOVI alias for ORR (though the MOVI alias has now 4376 // been removed and MOV should be used). So these constraints have to 4377 // distinguish between bit patterns that are valid 32-bit or 64-bit 4378 // "bitmask immediates": for example 0xaaaaaaaa is a valid bimm32 (K), but 4379 // not a valid bimm64 (L) where 0xaaaaaaaaaaaaaaaa would be valid, and vice 4380 // versa. 4381 case 'K': 4382 if (AArch64_AM::isLogicalImmediate(CVal, 32)) 4383 break; 4384 return; 4385 case 'L': 4386 if (AArch64_AM::isLogicalImmediate(CVal, 64)) 4387 break; 4388 return; 4389 // The M and N constraints are a superset of K and L respectively, for use 4390 // with the MOV (immediate) alias. As well as the logical immediates they 4391 // also match 32 or 64-bit immediates that can be loaded either using a 4392 // *single* MOVZ or MOVN , such as 32-bit 0x12340000, 0x00001234, 0xffffedca 4393 // (M) or 64-bit 0x1234000000000000 (N) etc. 4394 // As a note some of this code is liberally stolen from the asm parser. 4395 case 'M': { 4396 if (!isUInt<32>(CVal)) 4397 return; 4398 if (AArch64_AM::isLogicalImmediate(CVal, 32)) 4399 break; 4400 if ((CVal & 0xFFFF) == CVal) 4401 break; 4402 if ((CVal & 0xFFFF0000ULL) == CVal) 4403 break; 4404 uint64_t NCVal = ~(uint32_t)CVal; 4405 if ((NCVal & 0xFFFFULL) == NCVal) 4406 break; 4407 if ((NCVal & 0xFFFF0000ULL) == NCVal) 4408 break; 4409 return; 4410 } 4411 case 'N': { 4412 if (AArch64_AM::isLogicalImmediate(CVal, 64)) 4413 break; 4414 if ((CVal & 0xFFFFULL) == CVal) 4415 break; 4416 if ((CVal & 0xFFFF0000ULL) == CVal) 4417 break; 4418 if ((CVal & 0xFFFF00000000ULL) == CVal) 4419 break; 4420 if ((CVal & 0xFFFF000000000000ULL) == CVal) 4421 break; 4422 uint64_t NCVal = ~CVal; 4423 if ((NCVal & 0xFFFFULL) == NCVal) 4424 break; 4425 if ((NCVal & 0xFFFF0000ULL) == NCVal) 4426 break; 4427 if ((NCVal & 0xFFFF00000000ULL) == NCVal) 4428 break; 4429 if ((NCVal & 0xFFFF000000000000ULL) == NCVal) 4430 break; 4431 return; 4432 } 4433 default: 4434 return; 4435 } 4436 4437 // All assembler immediates are 64-bit integers. 
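// (Values validated under 32-bit rules such as 'K' or 'M' are still emitted
// as i64 target constants here.)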
4438 Result = DAG.getTargetConstant(CVal, MVT::i64); 4439 break; 4440 } 4441 4442 if (Result.getNode()) { 4443 Ops.push_back(Result); 4444 return; 4445 } 4446 4447 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG); 4448} 4449 4450//===----------------------------------------------------------------------===// 4451// AArch64 Advanced SIMD Support 4452//===----------------------------------------------------------------------===// 4453 4454/// WidenVector - Given a value in the V64 register class, produce the 4455/// equivalent value in the V128 register class. 4456static SDValue WidenVector(SDValue V64Reg, SelectionDAG &DAG) { 4457 EVT VT = V64Reg.getValueType(); 4458 unsigned NarrowSize = VT.getVectorNumElements(); 4459 MVT EltTy = VT.getVectorElementType().getSimpleVT(); 4460 MVT WideTy = MVT::getVectorVT(EltTy, 2 * NarrowSize); 4461 SDLoc DL(V64Reg); 4462 4463 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideTy, DAG.getUNDEF(WideTy), 4464 V64Reg, DAG.getConstant(0, MVT::i32)); 4465} 4466 4467/// getExtFactor - Determine the adjustment factor for the position when 4468/// generating an "extract from vector registers" instruction. 4469static unsigned getExtFactor(SDValue &V) { 4470 EVT EltType = V.getValueType().getVectorElementType(); 4471 return EltType.getSizeInBits() / 8; 4472} 4473 4474/// NarrowVector - Given a value in the V128 register class, produce the 4475/// equivalent value in the V64 register class. 4476static SDValue NarrowVector(SDValue V128Reg, SelectionDAG &DAG) { 4477 EVT VT = V128Reg.getValueType(); 4478 unsigned WideSize = VT.getVectorNumElements(); 4479 MVT EltTy = VT.getVectorElementType().getSimpleVT(); 4480 MVT NarrowTy = MVT::getVectorVT(EltTy, WideSize / 2); 4481 SDLoc DL(V128Reg); 4482 4483 return DAG.getTargetExtractSubreg(AArch64::dsub, DL, NarrowTy, V128Reg); 4484} 4485 4486// Gather data to see if the operation can be modelled as a 4487// shuffle in combination with VEXTs. 4488SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op, 4489 SelectionDAG &DAG) const { 4490 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!"); 4491 SDLoc dl(Op); 4492 EVT VT = Op.getValueType(); 4493 unsigned NumElts = VT.getVectorNumElements(); 4494 4495 struct ShuffleSourceInfo { 4496 SDValue Vec; 4497 unsigned MinElt; 4498 unsigned MaxElt; 4499 4500 // We may insert some combination of BITCASTs and VEXT nodes to force Vec to 4501 // be compatible with the shuffle we intend to construct. As a result 4502 // ShuffleVec will be some sliding window into the original Vec. 4503 SDValue ShuffleVec; 4504 4505 // Code should guarantee that element i in Vec starts at element "WindowBase 4506 // + i * WindowScale in ShuffleVec". 4507 int WindowBase; 4508 int WindowScale; 4509 4510 bool operator ==(SDValue OtherVec) { return Vec == OtherVec; } 4511 ShuffleSourceInfo(SDValue Vec) 4512 : Vec(Vec), MinElt(UINT_MAX), MaxElt(0), ShuffleVec(Vec), WindowBase(0), 4513 WindowScale(1) {} 4514 }; 4515 4516 // First gather all vectors used as an immediate source for this BUILD_VECTOR 4517 // node. 4518 SmallVector<ShuffleSourceInfo, 2> Sources; 4519 for (unsigned i = 0; i < NumElts; ++i) { 4520 SDValue V = Op.getOperand(i); 4521 if (V.getOpcode() == ISD::UNDEF) 4522 continue; 4523 else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT) { 4524 // A shuffle can only come from building a vector from various 4525 // elements of other vectors. 4526 return SDValue(); 4527 } 4528 4529 // Add this element source to the list if it's not already there. 
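// e.g. (build_vector (extract_elt V0, 1), (extract_elt V1, 0), ...) records
// V0 and V1 once each, widening their observed [MinElt, MaxElt] ranges as
// further extracts from the same vector are seen.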
4530 SDValue SourceVec = V.getOperand(0); 4531 auto Source = std::find(Sources.begin(), Sources.end(), SourceVec); 4532 if (Source == Sources.end()) 4533 Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec)); 4534 4535 // Update the minimum and maximum lane number seen. 4536 unsigned EltNo = cast<ConstantSDNode>(V.getOperand(1))->getZExtValue(); 4537 Source->MinElt = std::min(Source->MinElt, EltNo); 4538 Source->MaxElt = std::max(Source->MaxElt, EltNo); 4539 } 4540 4541 // Currently only do something sane when at most two source vectors 4542 // are involved. 4543 if (Sources.size() > 2) 4544 return SDValue(); 4545 4546 // Find out the smallest element size among result and two sources, and use 4547 // it as element size to build the shuffle_vector. 4548 EVT SmallestEltTy = VT.getVectorElementType(); 4549 for (auto &Source : Sources) { 4550 EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType(); 4551 if (SrcEltTy.bitsLT(SmallestEltTy)) { 4552 SmallestEltTy = SrcEltTy; 4553 } 4554 } 4555 unsigned ResMultiplier = 4556 VT.getVectorElementType().getSizeInBits() / SmallestEltTy.getSizeInBits(); 4557 NumElts = VT.getSizeInBits() / SmallestEltTy.getSizeInBits(); 4558 EVT ShuffleVT = EVT::getVectorVT(*DAG.getContext(), SmallestEltTy, NumElts); 4559 4560 // If the source vector is too wide or too narrow, we may nevertheless be able 4561 // to construct a compatible shuffle either by concatenating it with UNDEF or 4562 // extracting a suitable range of elements. 4563 for (auto &Src : Sources) { 4564 EVT SrcVT = Src.ShuffleVec.getValueType(); 4565 4566 if (SrcVT.getSizeInBits() == VT.getSizeInBits()) 4567 continue; 4568 4569 // This stage of the search produces a source with the same element type as 4570 // the original, but with a total width matching the BUILD_VECTOR output. 4571 EVT EltVT = SrcVT.getVectorElementType(); 4572 unsigned NumSrcElts = VT.getSizeInBits() / EltVT.getSizeInBits(); 4573 EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts); 4574 4575 if (SrcVT.getSizeInBits() < VT.getSizeInBits()) { 4576 assert(2 * SrcVT.getSizeInBits() == VT.getSizeInBits()); 4577 // We can pad out the smaller vector for free, so if it's part of a 4578 // shuffle... 
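// ...we simply concatenate it with UNDEF to reach the full width; the padding
// lanes are never referenced by the shuffle mask built below.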
4579 Src.ShuffleVec = 4580 DAG.getNode(ISD::CONCAT_VECTORS, dl, DestVT, Src.ShuffleVec, 4581 DAG.getUNDEF(Src.ShuffleVec.getValueType())); 4582 continue; 4583 } 4584 4585 assert(SrcVT.getSizeInBits() == 2 * VT.getSizeInBits()); 4586 4587 if (Src.MaxElt - Src.MinElt >= NumSrcElts) { 4588 // Span too large for a VEXT to cope 4589 return SDValue(); 4590 } 4591 4592 if (Src.MinElt >= NumSrcElts) { 4593 // The extraction can just take the second half 4594 Src.ShuffleVec = 4595 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, 4596 DAG.getConstant(NumSrcElts, MVT::i64)); 4597 Src.WindowBase = -NumSrcElts; 4598 } else if (Src.MaxElt < NumSrcElts) { 4599 // The extraction can just take the first half 4600 Src.ShuffleVec = 4601 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, 4602 DAG.getConstant(0, MVT::i64)); 4603 } else { 4604 // An actual VEXT is needed 4605 SDValue VEXTSrc1 = 4606 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, 4607 DAG.getConstant(0, MVT::i64)); 4608 SDValue VEXTSrc2 = 4609 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, 4610 DAG.getConstant(NumSrcElts, MVT::i64)); 4611 unsigned Imm = Src.MinElt * getExtFactor(VEXTSrc1); 4612 4613 Src.ShuffleVec = DAG.getNode(AArch64ISD::EXT, dl, DestVT, VEXTSrc1, 4614 VEXTSrc2, DAG.getConstant(Imm, MVT::i32)); 4615 Src.WindowBase = -Src.MinElt; 4616 } 4617 } 4618 4619 // Another possible incompatibility occurs from the vector element types. We 4620 // can fix this by bitcasting the source vectors to the same type we intend 4621 // for the shuffle. 4622 for (auto &Src : Sources) { 4623 EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType(); 4624 if (SrcEltTy == SmallestEltTy) 4625 continue; 4626 assert(ShuffleVT.getVectorElementType() == SmallestEltTy); 4627 Src.ShuffleVec = DAG.getNode(ISD::BITCAST, dl, ShuffleVT, Src.ShuffleVec); 4628 Src.WindowScale = SrcEltTy.getSizeInBits() / SmallestEltTy.getSizeInBits(); 4629 Src.WindowBase *= Src.WindowScale; 4630 } 4631 4632 // Final sanity check before we try to actually produce a shuffle. 4633 DEBUG( 4634 for (auto Src : Sources) 4635 assert(Src.ShuffleVec.getValueType() == ShuffleVT); 4636 ); 4637 4638 // The stars all align, our next step is to produce the mask for the shuffle. 4639 SmallVector<int, 8> Mask(ShuffleVT.getVectorNumElements(), -1); 4640 int BitsPerShuffleLane = ShuffleVT.getVectorElementType().getSizeInBits(); 4641 for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) { 4642 SDValue Entry = Op.getOperand(i); 4643 if (Entry.getOpcode() == ISD::UNDEF) 4644 continue; 4645 4646 auto Src = std::find(Sources.begin(), Sources.end(), Entry.getOperand(0)); 4647 int EltNo = cast<ConstantSDNode>(Entry.getOperand(1))->getSExtValue(); 4648 4649 // EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit 4650 // trunc. So only std::min(SrcBits, DestBits) actually get defined in this 4651 // segment. 4652 EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType(); 4653 int BitsDefined = std::min(OrigEltTy.getSizeInBits(), 4654 VT.getVectorElementType().getSizeInBits()); 4655 int LanesDefined = BitsDefined / BitsPerShuffleLane; 4656 4657 // This source is expected to fill ResMultiplier lanes of the final shuffle, 4658 // starting at the appropriate offset. 
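// For example, a v4i32 built from v8i16 extracts uses SmallestEltTy == i16 and
// ResMultiplier == 2: each operand owns two i16 lanes of the mask, but only
// min(SrcBits, DestBits) / 16 of them are actually written; the rest stay -1.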
4659 int *LaneMask = &Mask[i * ResMultiplier]; 4660 4661 int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase; 4662 ExtractBase += NumElts * (Src - Sources.begin()); 4663 for (int j = 0; j < LanesDefined; ++j) 4664 LaneMask[j] = ExtractBase + j; 4665 } 4666 4667 // Final check before we try to produce nonsense... 4668 if (!isShuffleMaskLegal(Mask, ShuffleVT)) 4669 return SDValue(); 4670 4671 SDValue ShuffleOps[] = { DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT) }; 4672 for (unsigned i = 0; i < Sources.size(); ++i) 4673 ShuffleOps[i] = Sources[i].ShuffleVec; 4674 4675 SDValue Shuffle = DAG.getVectorShuffle(ShuffleVT, dl, ShuffleOps[0], 4676 ShuffleOps[1], &Mask[0]); 4677 return DAG.getNode(ISD::BITCAST, dl, VT, Shuffle); 4678} 4679 4680// check if an EXT instruction can handle the shuffle mask when the 4681// vector sources of the shuffle are the same. 4682static bool isSingletonEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) { 4683 unsigned NumElts = VT.getVectorNumElements(); 4684 4685 // Assume that the first shuffle index is not UNDEF. Fail if it is. 4686 if (M[0] < 0) 4687 return false; 4688 4689 Imm = M[0]; 4690 4691 // If this is a VEXT shuffle, the immediate value is the index of the first 4692 // element. The other shuffle indices must be the successive elements after 4693 // the first one. 4694 unsigned ExpectedElt = Imm; 4695 for (unsigned i = 1; i < NumElts; ++i) { 4696 // Increment the expected index. If it wraps around, just follow it 4697 // back to index zero and keep going. 4698 ++ExpectedElt; 4699 if (ExpectedElt == NumElts) 4700 ExpectedElt = 0; 4701 4702 if (M[i] < 0) 4703 continue; // ignore UNDEF indices 4704 if (ExpectedElt != static_cast<unsigned>(M[i])) 4705 return false; 4706 } 4707 4708 return true; 4709} 4710 4711// check if an EXT instruction can handle the shuffle mask when the 4712// vector sources of the shuffle are different. 4713static bool isEXTMask(ArrayRef<int> M, EVT VT, bool &ReverseEXT, 4714 unsigned &Imm) { 4715 // Look for the first non-undef element. 4716 const int *FirstRealElt = std::find_if(M.begin(), M.end(), 4717 [](int Elt) {return Elt >= 0;}); 4718 4719 // Benefit from APInt to handle overflow when calculating the expected element. 4720 unsigned NumElts = VT.getVectorNumElements(); 4721 unsigned MaskBits = APInt(32, NumElts * 2).logBase2(); 4722 APInt ExpectedElt = APInt(MaskBits, *FirstRealElt + 1); 4723 // The following shuffle indices must be the successive elements after the 4724 // first real element. 4725 const int *FirstWrongElt = std::find_if(FirstRealElt + 1, M.end(), 4726 [&](int Elt) {return Elt != ExpectedElt++ && Elt != -1;}); 4727 if (FirstWrongElt != M.end()) 4728 return false; 4729 4730 // The index of an EXT is the first element if it is not UNDEF. 4731 // Watch out for the beginning UNDEFs. The EXT index should be the expected 4732 // value of the first element. E.g. 4733 // <-1, -1, 3, ...> is treated as <1, 2, 3, ...>. 4734 // <-1, -1, 0, 1, ...> is treated as <2*NumElts-2, 2*NumElts-1, 0, 1, ...>. 4735 // ExpectedElt is the last mask index plus 1. 4736 Imm = ExpectedElt.getZExtValue(); 4737 4738 // There are two different cases that require reversing the input vectors. 4739 // For example, for vector <4 x i32> we have the following cases, 4740 // Case 1: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, -1, 0>) 4741 // Case 2: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, 7, 0>) 4742 // For both cases, we finally use mask <5, 6, 7, 0>, which requires 4743 // reversing the two input vectors.
4744 if (Imm < NumElts) 4745 ReverseEXT = true; 4746 else 4747 Imm -= NumElts; 4748 4749 return true; 4750} 4751 4752/// isREVMask - Check if a vector shuffle corresponds to a REV 4753/// instruction with the specified blocksize. (The order of the elements 4754/// within each block of the vector is reversed.) 4755static bool isREVMask(ArrayRef<int> M, EVT VT, unsigned BlockSize) { 4756 assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64) && 4757 "Only possible block sizes for REV are: 16, 32, 64"); 4758 4759 unsigned EltSz = VT.getVectorElementType().getSizeInBits(); 4760 if (EltSz == 64) 4761 return false; 4762 4763 unsigned NumElts = VT.getVectorNumElements(); 4764 unsigned BlockElts = M[0] + 1; 4765 // If the first shuffle index is UNDEF, be optimistic. 4766 if (M[0] < 0) 4767 BlockElts = BlockSize / EltSz; 4768 4769 if (BlockSize <= EltSz || BlockSize != BlockElts * EltSz) 4770 return false; 4771 4772 for (unsigned i = 0; i < NumElts; ++i) { 4773 if (M[i] < 0) 4774 continue; // ignore UNDEF indices 4775 if ((unsigned)M[i] != (i - i % BlockElts) + (BlockElts - 1 - i % BlockElts)) 4776 return false; 4777 } 4778 4779 return true; 4780} 4781 4782static bool isZIPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) { 4783 unsigned NumElts = VT.getVectorNumElements(); 4784 WhichResult = (M[0] == 0 ? 0 : 1); 4785 unsigned Idx = WhichResult * NumElts / 2; 4786 for (unsigned i = 0; i != NumElts; i += 2) { 4787 if ((M[i] >= 0 && (unsigned)M[i] != Idx) || 4788 (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx + NumElts)) 4789 return false; 4790 Idx += 1; 4791 } 4792 4793 return true; 4794} 4795 4796static bool isUZPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) { 4797 unsigned NumElts = VT.getVectorNumElements(); 4798 WhichResult = (M[0] == 0 ? 0 : 1); 4799 for (unsigned i = 0; i != NumElts; ++i) { 4800 if (M[i] < 0) 4801 continue; // ignore UNDEF indices 4802 if ((unsigned)M[i] != 2 * i + WhichResult) 4803 return false; 4804 } 4805 4806 return true; 4807} 4808 4809static bool isTRNMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) { 4810 unsigned NumElts = VT.getVectorNumElements(); 4811 WhichResult = (M[0] == 0 ? 0 : 1); 4812 for (unsigned i = 0; i < NumElts; i += 2) { 4813 if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) || 4814 (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + NumElts + WhichResult)) 4815 return false; 4816 } 4817 return true; 4818} 4819 4820/// isZIP_v_undef_Mask - Special case of isZIPMask for canonical form of 4821/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef". 4822/// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>. 4823static bool isZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) { 4824 unsigned NumElts = VT.getVectorNumElements(); 4825 WhichResult = (M[0] == 0 ? 0 : 1); 4826 unsigned Idx = WhichResult * NumElts / 2; 4827 for (unsigned i = 0; i != NumElts; i += 2) { 4828 if ((M[i] >= 0 && (unsigned)M[i] != Idx) || 4829 (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx)) 4830 return false; 4831 Idx += 1; 4832 } 4833 4834 return true; 4835} 4836 4837/// isUZP_v_undef_Mask - Special case of isUZPMask for canonical form of 4838/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef". 4839/// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>, 4840static bool isUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) { 4841 unsigned Half = VT.getVectorNumElements() / 2; 4842 WhichResult = (M[0] == 0 ? 
0 : 1); 4843 for (unsigned j = 0; j != 2; ++j) { 4844 unsigned Idx = WhichResult; 4845 for (unsigned i = 0; i != Half; ++i) { 4846 int MIdx = M[i + j * Half]; 4847 if (MIdx >= 0 && (unsigned)MIdx != Idx) 4848 return false; 4849 Idx += 2; 4850 } 4851 } 4852 4853 return true; 4854} 4855 4856/// isTRN_v_undef_Mask - Special case of isTRNMask for canonical form of 4857/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef". 4858/// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>. 4859static bool isTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) { 4860 unsigned NumElts = VT.getVectorNumElements(); 4861 WhichResult = (M[0] == 0 ? 0 : 1); 4862 for (unsigned i = 0; i < NumElts; i += 2) { 4863 if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) || 4864 (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + WhichResult)) 4865 return false; 4866 } 4867 return true; 4868} 4869 4870static bool isINSMask(ArrayRef<int> M, int NumInputElements, 4871 bool &DstIsLeft, int &Anomaly) { 4872 if (M.size() != static_cast<size_t>(NumInputElements)) 4873 return false; 4874 4875 int NumLHSMatch = 0, NumRHSMatch = 0; 4876 int LastLHSMismatch = -1, LastRHSMismatch = -1; 4877 4878 for (int i = 0; i < NumInputElements; ++i) { 4879 if (M[i] == -1) { 4880 ++NumLHSMatch; 4881 ++NumRHSMatch; 4882 continue; 4883 } 4884 4885 if (M[i] == i) 4886 ++NumLHSMatch; 4887 else 4888 LastLHSMismatch = i; 4889 4890 if (M[i] == i + NumInputElements) 4891 ++NumRHSMatch; 4892 else 4893 LastRHSMismatch = i; 4894 } 4895 4896 if (NumLHSMatch == NumInputElements - 1) { 4897 DstIsLeft = true; 4898 Anomaly = LastLHSMismatch; 4899 return true; 4900 } else if (NumRHSMatch == NumInputElements - 1) { 4901 DstIsLeft = false; 4902 Anomaly = LastRHSMismatch; 4903 return true; 4904 } 4905 4906 return false; 4907} 4908 4909static bool isConcatMask(ArrayRef<int> Mask, EVT VT, bool SplitLHS) { 4910 if (VT.getSizeInBits() != 128) 4911 return false; 4912 4913 unsigned NumElts = VT.getVectorNumElements(); 4914 4915 for (int I = 0, E = NumElts / 2; I != E; I++) { 4916 if (Mask[I] != I) 4917 return false; 4918 } 4919 4920 int Offset = NumElts / 2; 4921 for (int I = NumElts / 2, E = NumElts; I != E; I++) { 4922 if (Mask[I] != I + SplitLHS * Offset) 4923 return false; 4924 } 4925 4926 return true; 4927} 4928 4929static SDValue tryFormConcatFromShuffle(SDValue Op, SelectionDAG &DAG) { 4930 SDLoc DL(Op); 4931 EVT VT = Op.getValueType(); 4932 SDValue V0 = Op.getOperand(0); 4933 SDValue V1 = Op.getOperand(1); 4934 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask(); 4935 4936 if (VT.getVectorElementType() != V0.getValueType().getVectorElementType() || 4937 VT.getVectorElementType() != V1.getValueType().getVectorElementType()) 4938 return SDValue(); 4939 4940 bool SplitV0 = V0.getValueType().getSizeInBits() == 128; 4941 4942 if (!isConcatMask(Mask, VT, SplitV0)) 4943 return SDValue(); 4944 4945 EVT CastVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), 4946 VT.getVectorNumElements() / 2); 4947 if (SplitV0) { 4948 V0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V0, 4949 DAG.getConstant(0, MVT::i64)); 4950 } 4951 if (V1.getValueType().getSizeInBits() == 128) { 4952 V1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V1, 4953 DAG.getConstant(0, MVT::i64)); 4954 } 4955 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, V0, V1); 4956} 4957 4958/// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit 4959/// the specified operations to build the shuffle. 
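/// Each 32-bit PFEntry packs [31:30] cost, [29:26] opcode, [25:13] LHS id and
/// [12:0] RHS id. The ids are PerfectShuffleTable indices, and an index
/// encodes the four mask elements as base-9 digits (source lane 0-7, or 8 for
/// undef), which is why OP_COPY compares against (1*9+2)*9+3 below.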
4960static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS, 4961 SDValue RHS, SelectionDAG &DAG, 4962 SDLoc dl) { 4963 unsigned OpNum = (PFEntry >> 26) & 0x0F; 4964 unsigned LHSID = (PFEntry >> 13) & ((1 << 13) - 1); 4965 unsigned RHSID = (PFEntry >> 0) & ((1 << 13) - 1); 4966 4967 enum { 4968 OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3> 4969 OP_VREV, 4970 OP_VDUP0, 4971 OP_VDUP1, 4972 OP_VDUP2, 4973 OP_VDUP3, 4974 OP_VEXT1, 4975 OP_VEXT2, 4976 OP_VEXT3, 4977 OP_VUZPL, // VUZP, left result 4978 OP_VUZPR, // VUZP, right result 4979 OP_VZIPL, // VZIP, left result 4980 OP_VZIPR, // VZIP, right result 4981 OP_VTRNL, // VTRN, left result 4982 OP_VTRNR // VTRN, right result 4983 }; 4984 4985 if (OpNum == OP_COPY) { 4986 if (LHSID == (1 * 9 + 2) * 9 + 3) 4987 return LHS; 4988 assert(LHSID == ((4 * 9 + 5) * 9 + 6) * 9 + 7 && "Illegal OP_COPY!"); 4989 return RHS; 4990 } 4991 4992 SDValue OpLHS, OpRHS; 4993 OpLHS = GeneratePerfectShuffle(PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl); 4994 OpRHS = GeneratePerfectShuffle(PerfectShuffleTable[RHSID], LHS, RHS, DAG, dl); 4995 EVT VT = OpLHS.getValueType(); 4996 4997 switch (OpNum) { 4998 default: 4999 llvm_unreachable("Unknown shuffle opcode!"); 5000 case OP_VREV: 5001 // VREV divides the vector in half and swaps within the half. 5002 if (VT.getVectorElementType() == MVT::i32 || 5003 VT.getVectorElementType() == MVT::f32) 5004 return DAG.getNode(AArch64ISD::REV64, dl, VT, OpLHS); 5005 // vrev <4 x i16> -> REV32 5006 if (VT.getVectorElementType() == MVT::i16 || 5007 VT.getVectorElementType() == MVT::f16) 5008 return DAG.getNode(AArch64ISD::REV32, dl, VT, OpLHS); 5009 // vrev <4 x i8> -> REV16 5010 assert(VT.getVectorElementType() == MVT::i8); 5011 return DAG.getNode(AArch64ISD::REV16, dl, VT, OpLHS); 5012 case OP_VDUP0: 5013 case OP_VDUP1: 5014 case OP_VDUP2: 5015 case OP_VDUP3: { 5016 EVT EltTy = VT.getVectorElementType(); 5017 unsigned Opcode; 5018 if (EltTy == MVT::i8) 5019 Opcode = AArch64ISD::DUPLANE8; 5020 else if (EltTy == MVT::i16) 5021 Opcode = AArch64ISD::DUPLANE16; 5022 else if (EltTy == MVT::i32 || EltTy == MVT::f32) 5023 Opcode = AArch64ISD::DUPLANE32; 5024 else if (EltTy == MVT::i64 || EltTy == MVT::f64) 5025 Opcode = AArch64ISD::DUPLANE64; 5026 else 5027 llvm_unreachable("Invalid vector element type?"); 5028 5029 if (VT.getSizeInBits() == 64) 5030 OpLHS = WidenVector(OpLHS, DAG); 5031 SDValue Lane = DAG.getConstant(OpNum - OP_VDUP0, MVT::i64); 5032 return DAG.getNode(Opcode, dl, VT, OpLHS, Lane); 5033 } 5034 case OP_VEXT1: 5035 case OP_VEXT2: 5036 case OP_VEXT3: { 5037 unsigned Imm = (OpNum - OP_VEXT1 + 1) * getExtFactor(OpLHS); 5038 return DAG.getNode(AArch64ISD::EXT, dl, VT, OpLHS, OpRHS, 5039 DAG.getConstant(Imm, MVT::i32)); 5040 } 5041 case OP_VUZPL: 5042 return DAG.getNode(AArch64ISD::UZP1, dl, DAG.getVTList(VT, VT), OpLHS, 5043 OpRHS); 5044 case OP_VUZPR: 5045 return DAG.getNode(AArch64ISD::UZP2, dl, DAG.getVTList(VT, VT), OpLHS, 5046 OpRHS); 5047 case OP_VZIPL: 5048 return DAG.getNode(AArch64ISD::ZIP1, dl, DAG.getVTList(VT, VT), OpLHS, 5049 OpRHS); 5050 case OP_VZIPR: 5051 return DAG.getNode(AArch64ISD::ZIP2, dl, DAG.getVTList(VT, VT), OpLHS, 5052 OpRHS); 5053 case OP_VTRNL: 5054 return DAG.getNode(AArch64ISD::TRN1, dl, DAG.getVTList(VT, VT), OpLHS, 5055 OpRHS); 5056 case OP_VTRNR: 5057 return DAG.getNode(AArch64ISD::TRN2, dl, DAG.getVTList(VT, VT), OpLHS, 5058 OpRHS); 5059 } 5060} 5061 5062static SDValue GenerateTBL(SDValue Op, ArrayRef<int> ShuffleMask, 5063 SelectionDAG &DAG) { 5064 // 
Check to see if we can use the TBL instruction. 5065 SDValue V1 = Op.getOperand(0); 5066 SDValue V2 = Op.getOperand(1); 5067 SDLoc DL(Op); 5068 5069 EVT EltVT = Op.getValueType().getVectorElementType(); 5070 unsigned BytesPerElt = EltVT.getSizeInBits() / 8; 5071 5072 SmallVector<SDValue, 8> TBLMask; 5073 for (int Val : ShuffleMask) { 5074 for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte) { 5075 unsigned Offset = Byte + Val * BytesPerElt; 5076 TBLMask.push_back(DAG.getConstant(Offset, MVT::i32)); 5077 } 5078 } 5079 5080 MVT IndexVT = MVT::v8i8; 5081 unsigned IndexLen = 8; 5082 if (Op.getValueType().getSizeInBits() == 128) { 5083 IndexVT = MVT::v16i8; 5084 IndexLen = 16; 5085 } 5086 5087 SDValue V1Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V1); 5088 SDValue V2Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V2); 5089 5090 SDValue Shuffle; 5091 if (V2.getNode()->getOpcode() == ISD::UNDEF) { 5092 if (IndexLen == 8) 5093 V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V1Cst); 5094 Shuffle = DAG.getNode( 5095 ISD::INTRINSIC_WO_CHAIN, DL, IndexVT, 5096 DAG.getConstant(Intrinsic::aarch64_neon_tbl1, MVT::i32), V1Cst, 5097 DAG.getNode(ISD::BUILD_VECTOR, DL, IndexVT, 5098 makeArrayRef(TBLMask.data(), IndexLen))); 5099 } else { 5100 if (IndexLen == 8) { 5101 V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V2Cst); 5102 Shuffle = DAG.getNode( 5103 ISD::INTRINSIC_WO_CHAIN, DL, IndexVT, 5104 DAG.getConstant(Intrinsic::aarch64_neon_tbl1, MVT::i32), V1Cst, 5105 DAG.getNode(ISD::BUILD_VECTOR, DL, IndexVT, 5106 makeArrayRef(TBLMask.data(), IndexLen))); 5107 } else { 5108 // FIXME: We cannot, for the moment, emit a TBL2 instruction because we 5109 // cannot currently represent the register constraints on the input 5110 // table registers. 5111 // Shuffle = DAG.getNode(AArch64ISD::TBL2, DL, IndexVT, V1Cst, V2Cst, 5112 // DAG.getNode(ISD::BUILD_VECTOR, DL, IndexVT, 5113 // &TBLMask[0], IndexLen)); 5114 Shuffle = DAG.getNode( 5115 ISD::INTRINSIC_WO_CHAIN, DL, IndexVT, 5116 DAG.getConstant(Intrinsic::aarch64_neon_tbl2, MVT::i32), V1Cst, V2Cst, 5117 DAG.getNode(ISD::BUILD_VECTOR, DL, IndexVT, 5118 makeArrayRef(TBLMask.data(), IndexLen))); 5119 } 5120 } 5121 return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Shuffle); 5122} 5123 5124static unsigned getDUPLANEOp(EVT EltType) { 5125 if (EltType == MVT::i8) 5126 return AArch64ISD::DUPLANE8; 5127 if (EltType == MVT::i16 || EltType == MVT::f16) 5128 return AArch64ISD::DUPLANE16; 5129 if (EltType == MVT::i32 || EltType == MVT::f32) 5130 return AArch64ISD::DUPLANE32; 5131 if (EltType == MVT::i64 || EltType == MVT::f64) 5132 return AArch64ISD::DUPLANE64; 5133 5134 llvm_unreachable("Invalid vector element type?"); 5135} 5136 5137SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, 5138 SelectionDAG &DAG) const { 5139 SDLoc dl(Op); 5140 EVT VT = Op.getValueType(); 5141 5142 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode()); 5143 5144 // Convert shuffles that are directly supported on NEON to target-specific 5145 // DAG nodes, instead of keeping them as shuffles and matching them again 5146 // during code selection. This is more efficient and avoids the possibility 5147 // of inconsistencies between legalization and selection. 
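// The matchers below are tried roughly from cheapest to most general: DUP and
// DUPLANE splats, REV, EXT, ZIP/UZP/TRN (and their single-source forms), a
// concat or INS pattern, the perfect-shuffle table for 4-element vectors, and
// finally a TBL lookup as the fully general fallback.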
5148 ArrayRef<int> ShuffleMask = SVN->getMask(); 5149 5150 SDValue V1 = Op.getOperand(0); 5151 SDValue V2 = Op.getOperand(1); 5152 5153 if (ShuffleVectorSDNode::isSplatMask(&ShuffleMask[0], 5154 V1.getValueType().getSimpleVT())) { 5155 int Lane = SVN->getSplatIndex(); 5156 // If this is an undef splat, generate it via "just" vdup, if possible. 5157 if (Lane == -1) 5158 Lane = 0; 5159 5160 if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR) 5161 return DAG.getNode(AArch64ISD::DUP, dl, V1.getValueType(), 5162 V1.getOperand(0)); 5163 // Test if V1 is a BUILD_VECTOR and the lane being referenced is a non- 5164 // constant. If so, we can just reference the lane's definition directly. 5165 if (V1.getOpcode() == ISD::BUILD_VECTOR && 5166 !isa<ConstantSDNode>(V1.getOperand(Lane))) 5167 return DAG.getNode(AArch64ISD::DUP, dl, VT, V1.getOperand(Lane)); 5168 5169 // Otherwise, duplicate from the lane of the input vector. 5170 unsigned Opcode = getDUPLANEOp(V1.getValueType().getVectorElementType()); 5171 5172 // SelectionDAGBuilder may have "helpfully" already extracted or concatenated 5173 // to make a vector of the same size as this SHUFFLE. We can ignore the 5174 // extract entirely, and canonicalise the concat using WidenVector. 5175 if (V1.getOpcode() == ISD::EXTRACT_SUBVECTOR) { 5176 Lane += cast<ConstantSDNode>(V1.getOperand(1))->getZExtValue(); 5177 V1 = V1.getOperand(0); 5178 } else if (V1.getOpcode() == ISD::CONCAT_VECTORS) { 5179 unsigned Idx = Lane >= (int)VT.getVectorNumElements() / 2; 5180 Lane -= Idx * VT.getVectorNumElements() / 2; 5181 V1 = WidenVector(V1.getOperand(Idx), DAG); 5182 } else if (VT.getSizeInBits() == 64) 5183 V1 = WidenVector(V1, DAG); 5184 5185 return DAG.getNode(Opcode, dl, VT, V1, DAG.getConstant(Lane, MVT::i64)); 5186 } 5187 5188 if (isREVMask(ShuffleMask, VT, 64)) 5189 return DAG.getNode(AArch64ISD::REV64, dl, V1.getValueType(), V1, V2); 5190 if (isREVMask(ShuffleMask, VT, 32)) 5191 return DAG.getNode(AArch64ISD::REV32, dl, V1.getValueType(), V1, V2); 5192 if (isREVMask(ShuffleMask, VT, 16)) 5193 return DAG.getNode(AArch64ISD::REV16, dl, V1.getValueType(), V1, V2); 5194 5195 bool ReverseEXT = false; 5196 unsigned Imm; 5197 if (isEXTMask(ShuffleMask, VT, ReverseEXT, Imm)) { 5198 if (ReverseEXT) 5199 std::swap(V1, V2); 5200 Imm *= getExtFactor(V1); 5201 return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V2, 5202 DAG.getConstant(Imm, MVT::i32)); 5203 } else if (V2->getOpcode() == ISD::UNDEF && 5204 isSingletonEXTMask(ShuffleMask, VT, Imm)) { 5205 Imm *= getExtFactor(V1); 5206 return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V1, 5207 DAG.getConstant(Imm, MVT::i32)); 5208 } 5209 5210 unsigned WhichResult; 5211 if (isZIPMask(ShuffleMask, VT, WhichResult)) { 5212 unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2; 5213 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2); 5214 } 5215 if (isUZPMask(ShuffleMask, VT, WhichResult)) { 5216 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2; 5217 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2); 5218 } 5219 if (isTRNMask(ShuffleMask, VT, WhichResult)) { 5220 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2; 5221 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2); 5222 } 5223 5224 if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult)) { 5225 unsigned Opc = (WhichResult == 0) ?
AArch64ISD::ZIP1 : AArch64ISD::ZIP2; 5226 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1); 5227 } 5228 if (isUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) { 5229 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2; 5230 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1); 5231 } 5232 if (isTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) { 5233 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2; 5234 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1); 5235 } 5236 5237 SDValue Concat = tryFormConcatFromShuffle(Op, DAG); 5238 if (Concat.getNode()) 5239 return Concat; 5240 5241 bool DstIsLeft; 5242 int Anomaly; 5243 int NumInputElements = V1.getValueType().getVectorNumElements(); 5244 if (isINSMask(ShuffleMask, NumInputElements, DstIsLeft, Anomaly)) { 5245 SDValue DstVec = DstIsLeft ? V1 : V2; 5246 SDValue DstLaneV = DAG.getConstant(Anomaly, MVT::i64); 5247 5248 SDValue SrcVec = V1; 5249 int SrcLane = ShuffleMask[Anomaly]; 5250 if (SrcLane >= NumInputElements) { 5251 SrcVec = V2; 5252 SrcLane -= VT.getVectorNumElements(); 5253 } 5254 SDValue SrcLaneV = DAG.getConstant(SrcLane, MVT::i64); 5255 5256 EVT ScalarVT = VT.getVectorElementType(); 5257 5258 if (ScalarVT.getSizeInBits() < 32 && ScalarVT.isInteger()) 5259 ScalarVT = MVT::i32; 5260 5261 return DAG.getNode( 5262 ISD::INSERT_VECTOR_ELT, dl, VT, DstVec, 5263 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ScalarVT, SrcVec, SrcLaneV), 5264 DstLaneV); 5265 } 5266 5267 // If the shuffle is not directly supported and it has 4 elements, use 5268 // the PerfectShuffle-generated table to synthesize it from other shuffles. 5269 unsigned NumElts = VT.getVectorNumElements(); 5270 if (NumElts == 4) { 5271 unsigned PFIndexes[4]; 5272 for (unsigned i = 0; i != 4; ++i) { 5273 if (ShuffleMask[i] < 0) 5274 PFIndexes[i] = 8; 5275 else 5276 PFIndexes[i] = ShuffleMask[i]; 5277 } 5278 5279 // Compute the index in the perfect shuffle table. 5280 unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 + 5281 PFIndexes[2] * 9 + PFIndexes[3]; 5282 unsigned PFEntry = PerfectShuffleTable[PFTableIndex]; 5283 unsigned Cost = (PFEntry >> 30); 5284 5285 if (Cost <= 4) 5286 return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl); 5287 } 5288 5289 return GenerateTBL(Op, ShuffleMask, DAG); 5290} 5291 5292static bool resolveBuildVector(BuildVectorSDNode *BVN, APInt &CnstBits, 5293 APInt &UndefBits) { 5294 EVT VT = BVN->getValueType(0); 5295 APInt SplatBits, SplatUndef; 5296 unsigned SplatBitSize; 5297 bool HasAnyUndefs; 5298 if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) { 5299 unsigned NumSplats = VT.getSizeInBits() / SplatBitSize; 5300 5301 for (unsigned i = 0; i < NumSplats; ++i) { 5302 CnstBits <<= SplatBitSize; 5303 UndefBits <<= SplatBitSize; 5304 CnstBits |= SplatBits.zextOrTrunc(VT.getSizeInBits()); 5305 UndefBits |= (SplatBits ^ SplatUndef).zextOrTrunc(VT.getSizeInBits()); 5306 } 5307 5308 return true; 5309 } 5310 5311 return false; 5312} 5313 5314SDValue AArch64TargetLowering::LowerVectorAND(SDValue Op, 5315 SelectionDAG &DAG) const { 5316 BuildVectorSDNode *BVN = 5317 dyn_cast<BuildVectorSDNode>(Op.getOperand(1).getNode()); 5318 SDValue LHS = Op.getOperand(0); 5319 SDLoc dl(Op); 5320 EVT VT = Op.getValueType(); 5321 5322 if (!BVN) 5323 return Op; 5324 5325 APInt CnstBits(VT.getSizeInBits(), 0); 5326 APInt UndefBits(VT.getSizeInBits(), 0); 5327 if (resolveBuildVector(BVN, CnstBits, UndefBits)) { 5328 // We only have BIC vector immediate instruction, which is and-not. 
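// To lower (and X, C) we therefore try to encode ~C as a BIC immediate:
// (bic X, ~C) computes X & ~(~C) == X & C.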
static bool resolveBuildVector(BuildVectorSDNode *BVN, APInt &CnstBits,
                               APInt &UndefBits) {
  EVT VT = BVN->getValueType(0);
  APInt SplatBits, SplatUndef;
  unsigned SplatBitSize;
  bool HasAnyUndefs;
  if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
    unsigned NumSplats = VT.getSizeInBits() / SplatBitSize;

    for (unsigned i = 0; i < NumSplats; ++i) {
      CnstBits <<= SplatBitSize;
      UndefBits <<= SplatBitSize;
      CnstBits |= SplatBits.zextOrTrunc(VT.getSizeInBits());
      UndefBits |= (SplatBits ^ SplatUndef).zextOrTrunc(VT.getSizeInBits());
    }

    return true;
  }

  return false;
}
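// Example for LowerVectorAND below (illustrative): (and v4i32:x, splat
// 0xFFFFFF00) inverts to the per-element constant 0x000000FF, which matches
// AdvSIMDModImmType1, so the AND lowers to a single BIC immediate that clears
// the low byte of each lane:
//   bic v0.4s, #0xff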
SDValue AArch64TargetLowering::LowerVectorAND(SDValue Op,
                                              SelectionDAG &DAG) const {
  BuildVectorSDNode *BVN =
      dyn_cast<BuildVectorSDNode>(Op.getOperand(1).getNode());
  SDValue LHS = Op.getOperand(0);
  SDLoc dl(Op);
  EVT VT = Op.getValueType();

  if (!BVN)
    return Op;

  APInt CnstBits(VT.getSizeInBits(), 0);
  APInt UndefBits(VT.getSizeInBits(), 0);
  if (resolveBuildVector(BVN, CnstBits, UndefBits)) {
    // We only have a BIC vector-immediate instruction, which is an and-not.
    CnstBits = ~CnstBits;

    // We make use of a little bit of goto ickiness in order to avoid having to
    // duplicate the immediate matching logic for the undef toggled case.
    bool SecondTry = false;
  AttemptModImm:

    if (CnstBits.getHiBits(64) == CnstBits.getLoBits(64)) {
      CnstBits = CnstBits.zextOrTrunc(64);
      uint64_t CnstVal = CnstBits.getZExtValue();

      if (AArch64_AM::isAdvSIMDModImmType1(CnstVal)) {
        CnstVal = AArch64_AM::encodeAdvSIMDModImmType1(CnstVal);
        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
        SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS,
                                  DAG.getConstant(CnstVal, MVT::i32),
                                  DAG.getConstant(0, MVT::i32));
        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
      }

      if (AArch64_AM::isAdvSIMDModImmType2(CnstVal)) {
        CnstVal = AArch64_AM::encodeAdvSIMDModImmType2(CnstVal);
        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
        SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS,
                                  DAG.getConstant(CnstVal, MVT::i32),
                                  DAG.getConstant(8, MVT::i32));
        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
      }

      if (AArch64_AM::isAdvSIMDModImmType3(CnstVal)) {
        CnstVal = AArch64_AM::encodeAdvSIMDModImmType3(CnstVal);
        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
        SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS,
                                  DAG.getConstant(CnstVal, MVT::i32),
                                  DAG.getConstant(16, MVT::i32));
        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
      }

      if (AArch64_AM::isAdvSIMDModImmType4(CnstVal)) {
        CnstVal = AArch64_AM::encodeAdvSIMDModImmType4(CnstVal);
        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
        SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS,
                                  DAG.getConstant(CnstVal, MVT::i32),
                                  DAG.getConstant(24, MVT::i32));
        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
      }

      if (AArch64_AM::isAdvSIMDModImmType5(CnstVal)) {
        CnstVal = AArch64_AM::encodeAdvSIMDModImmType5(CnstVal);
        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
        SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS,
                                  DAG.getConstant(CnstVal, MVT::i32),
                                  DAG.getConstant(0, MVT::i32));
        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
      }

      if (AArch64_AM::isAdvSIMDModImmType6(CnstVal)) {
        CnstVal = AArch64_AM::encodeAdvSIMDModImmType6(CnstVal);
        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
        SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS,
                                  DAG.getConstant(CnstVal, MVT::i32),
                                  DAG.getConstant(8, MVT::i32));
        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
      }
    }

    if (SecondTry)
      goto FailedModImm;
    SecondTry = true;
    CnstBits = ~UndefBits;
    goto AttemptModImm;
  }

// We can always fall back to a non-immediate AND.
FailedModImm:
  return Op;
}
// Specialized code to quickly find if PotentialBVec is a BuildVector that
// consists of only the same constant int value, returned in reference arg
// ConstVal.
static bool isAllConstantBuildVector(const SDValue &PotentialBVec,
                                     uint64_t &ConstVal) {
  BuildVectorSDNode *Bvec = dyn_cast<BuildVectorSDNode>(PotentialBVec);
  if (!Bvec)
    return false;
  ConstantSDNode *FirstElt = dyn_cast<ConstantSDNode>(Bvec->getOperand(0));
  if (!FirstElt)
    return false;
  EVT VT = Bvec->getValueType(0);
  unsigned NumElts = VT.getVectorNumElements();
  for (unsigned i = 1; i < NumElts; ++i)
    if (dyn_cast<ConstantSDNode>(Bvec->getOperand(i)) != FirstElt)
      return false;
  ConstVal = FirstElt->getZExtValue();
  return true;
}

static unsigned getIntrinsicID(const SDNode *N) {
  unsigned Opcode = N->getOpcode();
  switch (Opcode) {
  default:
    return Intrinsic::not_intrinsic;
  case ISD::INTRINSIC_WO_CHAIN: {
    unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
    if (IID < Intrinsic::num_intrinsics)
      return IID;
    return Intrinsic::not_intrinsic;
  }
  }
}
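// For reference (instruction semantics, not taken from this file): SLI shifts
// each element of its second operand left by the immediate and inserts the
// result into the destination, leaving the low #imm bits of each destination
// element intact; SRI is the right-shifting counterpart that preserves the
// high #imm bits instead. The matcher below looks for an OR of a masked value
// and a constant vector shift that together have this insert shape.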
// Attempt to form a vector S[LR]I from (or (and X, BvecC1), (lsl Y, C2)),
// to (SLI X, Y, C2), where X and Y have matching vector types, BvecC1 is a
// BUILD_VECTOR with constant element C1, C2 is a constant, and C1 == ~C2.
// Also, logical shift right -> sri, with the same structure.
static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG) {
  EVT VT = N->getValueType(0);

  if (!VT.isVector())
    return SDValue();

  SDLoc DL(N);

  // Is the first op an AND?
  const SDValue And = N->getOperand(0);
  if (And.getOpcode() != ISD::AND)
    return SDValue();

  // Is the second op a shl or lshr?
  SDValue Shift = N->getOperand(1);
  // This will have been turned into: AArch64ISD::VSHL vector, #shift
  // or AArch64ISD::VLSHR vector, #shift.
  unsigned ShiftOpc = Shift.getOpcode();
  if ((ShiftOpc != AArch64ISD::VSHL && ShiftOpc != AArch64ISD::VLSHR))
    return SDValue();
  bool IsShiftRight = ShiftOpc == AArch64ISD::VLSHR;

  // Is the shift amount constant?
  ConstantSDNode *C2node = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
  if (!C2node)
    return SDValue();

  // Is the and mask vector all constant?
  uint64_t C1;
  if (!isAllConstantBuildVector(And.getOperand(1), C1))
    return SDValue();

  // Is C1 == ~C2, taking into account how much one can shift elements of a
  // particular size?
  uint64_t C2 = C2node->getZExtValue();
  unsigned ElemSizeInBits = VT.getVectorElementType().getSizeInBits();
  if (C2 > ElemSizeInBits)
    return SDValue();
  // Build the element mask with 64-bit arithmetic so the shift below is
  // well-defined for 32-bit and wider elements.
  uint64_t ElemMask =
      (ElemSizeInBits == 64) ? ~0ULL : ((1ULL << ElemSizeInBits) - 1);
  if ((C1 & ElemMask) != (~C2 & ElemMask))
    return SDValue();

  SDValue X = And.getOperand(0);
  SDValue Y = Shift.getOperand(0);

  unsigned Intrin =
      IsShiftRight ? Intrinsic::aarch64_neon_vsri : Intrinsic::aarch64_neon_vsli;
  SDValue ResultSLI =
      DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
                  DAG.getConstant(Intrin, MVT::i32), X, Y, Shift.getOperand(1));

  DEBUG(dbgs() << "aarch64-lower: transformed: \n");
  DEBUG(N->dump(&DAG));
  DEBUG(dbgs() << "into: \n");
  DEBUG(ResultSLI->dump(&DAG));

  ++NumShiftInserts;
  return ResultSLI;
}
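// Example for LowerVectorOR below (illustrative): (or v4i32:x, splat
// 0x00FF0000) matches AdvSIMDModImmType3 (a single byte at bit 16), so the OR
// lowers to one ORR-immediate instruction rather than materializing the
// constant in a register:
//   orr v0.4s, #0xff, lsl #16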
SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op,
                                             SelectionDAG &DAG) const {
  // Attempt to form a vector S[LR]I from (or (and X, C1), (lsl Y, C2)).
  if (EnableAArch64SlrGeneration) {
    SDValue Res = tryLowerToSLI(Op.getNode(), DAG);
    if (Res.getNode())
      return Res;
  }

  BuildVectorSDNode *BVN =
      dyn_cast<BuildVectorSDNode>(Op.getOperand(0).getNode());
  SDValue LHS = Op.getOperand(1);
  SDLoc dl(Op);
  EVT VT = Op.getValueType();

  // OR commutes, so try swapping the operands.
  if (!BVN) {
    LHS = Op.getOperand(0);
    BVN = dyn_cast<BuildVectorSDNode>(Op.getOperand(1).getNode());
  }
  if (!BVN)
    return Op;

  APInt CnstBits(VT.getSizeInBits(), 0);
  APInt UndefBits(VT.getSizeInBits(), 0);
  if (resolveBuildVector(BVN, CnstBits, UndefBits)) {
    // We make use of a little bit of goto ickiness in order to avoid having to
    // duplicate the immediate matching logic for the undef toggled case.
    bool SecondTry = false;
  AttemptModImm:

    if (CnstBits.getHiBits(64) == CnstBits.getLoBits(64)) {
      CnstBits = CnstBits.zextOrTrunc(64);
      uint64_t CnstVal = CnstBits.getZExtValue();

      if (AArch64_AM::isAdvSIMDModImmType1(CnstVal)) {
        CnstVal = AArch64_AM::encodeAdvSIMDModImmType1(CnstVal);
        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
        SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS,
                                  DAG.getConstant(CnstVal, MVT::i32),
                                  DAG.getConstant(0, MVT::i32));
        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
      }

      if (AArch64_AM::isAdvSIMDModImmType2(CnstVal)) {
        CnstVal = AArch64_AM::encodeAdvSIMDModImmType2(CnstVal);
        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
        SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS,
                                  DAG.getConstant(CnstVal, MVT::i32),
                                  DAG.getConstant(8, MVT::i32));
        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
      }

      if (AArch64_AM::isAdvSIMDModImmType3(CnstVal)) {
        CnstVal = AArch64_AM::encodeAdvSIMDModImmType3(CnstVal);
        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
        SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS,
                                  DAG.getConstant(CnstVal, MVT::i32),
                                  DAG.getConstant(16, MVT::i32));
        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
      }

      if (AArch64_AM::isAdvSIMDModImmType4(CnstVal)) {
        CnstVal = AArch64_AM::encodeAdvSIMDModImmType4(CnstVal);
        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
        SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS,
                                  DAG.getConstant(CnstVal, MVT::i32),
                                  DAG.getConstant(24, MVT::i32));
        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
      }

      if (AArch64_AM::isAdvSIMDModImmType5(CnstVal)) {
        CnstVal = AArch64_AM::encodeAdvSIMDModImmType5(CnstVal);
        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
        SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS,
                                  DAG.getConstant(CnstVal, MVT::i32),
                                  DAG.getConstant(0, MVT::i32));
        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
      }

      if (AArch64_AM::isAdvSIMDModImmType6(CnstVal)) {
        CnstVal = AArch64_AM::encodeAdvSIMDModImmType6(CnstVal);
        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
        SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS,
                                  DAG.getConstant(CnstVal, MVT::i32),
                                  DAG.getConstant(8, MVT::i32));
        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
      }
    }

    if (SecondTry)
      goto FailedModImm;
    SecondTry = true;
    CnstBits = UndefBits;
    goto AttemptModImm;
  }

// We can always fall back to a non-immediate OR.
FailedModImm:
  return Op;
}
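// Example for NormalizeBuildVector below (illustrative): a v4i16 BUILD_VECTOR
// carrying the i32 constant 0x12345 in a lane is normalized so the lane holds
// 0x2345, i.e. the value truncated to the 16-bit element width but still
// carried as an i32 operand, which is what the DAG expects for small element
// types.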
// Normalize the operands of BUILD_VECTOR. The value of constant operands will
// be truncated to fit the element width.
static SDValue NormalizeBuildVector(SDValue Op,
                                    SelectionDAG &DAG) {
  assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
  SDLoc dl(Op);
  EVT VT = Op.getValueType();
  EVT EltTy = VT.getVectorElementType();

  if (EltTy.isFloatingPoint() || EltTy.getSizeInBits() > 16)
    return Op;

  SmallVector<SDValue, 16> Ops;
  for (unsigned I = 0, E = VT.getVectorNumElements(); I != E; ++I) {
    SDValue Lane = Op.getOperand(I);
    if (Lane.getOpcode() == ISD::Constant) {
      APInt LowBits(EltTy.getSizeInBits(),
                    cast<ConstantSDNode>(Lane)->getZExtValue());
      Lane = DAG.getConstant(LowBits.getZExtValue(), MVT::i32);
    }
    Ops.push_back(Lane);
  }
  return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
}
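// Example for LowerBUILD_VECTOR below (illustrative): a v4i32 splat of
// 0x0000FF00 matches AdvSIMDModImmType2 and becomes
//   movi v0.4s, #0xff, lsl #8
// while a splat of 0xFFFF00FF fails the MOVI and FMOV checks, is inverted to
// 0x0000FF00, and becomes
//   mvni v0.4s, #0xff, lsl #8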
SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
                                                 SelectionDAG &DAG) const {
  SDLoc dl(Op);
  EVT VT = Op.getValueType();
  Op = NormalizeBuildVector(Op, DAG);
  BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());

  APInt CnstBits(VT.getSizeInBits(), 0);
  APInt UndefBits(VT.getSizeInBits(), 0);
  if (resolveBuildVector(BVN, CnstBits, UndefBits)) {
    // We make use of a little bit of goto ickiness in order to avoid having to
    // duplicate the immediate matching logic for the undef toggled case.
    bool SecondTry = false;
  AttemptModImm:

    if (CnstBits.getHiBits(64) == CnstBits.getLoBits(64)) {
      CnstBits = CnstBits.zextOrTrunc(64);
      uint64_t CnstVal = CnstBits.getZExtValue();

      // Certain magic vector constants (used to express things like NOT
      // and NEG) are passed through unmodified. This allows codegen patterns
      // for these operations to match. Special-purpose patterns will lower
      // these immediates to MOVIs if it proves necessary.
      if (VT.isInteger() && (CnstVal == 0 || CnstVal == ~0ULL))
        return Op;

      // The many faces of MOVI...
      if (AArch64_AM::isAdvSIMDModImmType10(CnstVal)) {
        CnstVal = AArch64_AM::encodeAdvSIMDModImmType10(CnstVal);
        if (VT.getSizeInBits() == 128) {
          SDValue Mov = DAG.getNode(AArch64ISD::MOVIedit, dl, MVT::v2i64,
                                    DAG.getConstant(CnstVal, MVT::i32));
          return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
        }

        // Support the V64 version via subregister insertion.
        SDValue Mov = DAG.getNode(AArch64ISD::MOVIedit, dl, MVT::f64,
                                  DAG.getConstant(CnstVal, MVT::i32));
        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
      }

      if (AArch64_AM::isAdvSIMDModImmType1(CnstVal)) {
        CnstVal = AArch64_AM::encodeAdvSIMDModImmType1(CnstVal);
        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
        SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy,
                                  DAG.getConstant(CnstVal, MVT::i32),
                                  DAG.getConstant(0, MVT::i32));
        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
      }

      if (AArch64_AM::isAdvSIMDModImmType2(CnstVal)) {
        CnstVal = AArch64_AM::encodeAdvSIMDModImmType2(CnstVal);
        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
        SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy,
                                  DAG.getConstant(CnstVal, MVT::i32),
                                  DAG.getConstant(8, MVT::i32));
        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
      }

      if (AArch64_AM::isAdvSIMDModImmType3(CnstVal)) {
        CnstVal = AArch64_AM::encodeAdvSIMDModImmType3(CnstVal);
        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
        SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy,
                                  DAG.getConstant(CnstVal, MVT::i32),
                                  DAG.getConstant(16, MVT::i32));
        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
      }

      if (AArch64_AM::isAdvSIMDModImmType4(CnstVal)) {
        CnstVal = AArch64_AM::encodeAdvSIMDModImmType4(CnstVal);
        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
        SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy,
                                  DAG.getConstant(CnstVal, MVT::i32),
                                  DAG.getConstant(24, MVT::i32));
        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
      }

      if (AArch64_AM::isAdvSIMDModImmType5(CnstVal)) {
        CnstVal = AArch64_AM::encodeAdvSIMDModImmType5(CnstVal);
        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
        SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy,
                                  DAG.getConstant(CnstVal, MVT::i32),
                                  DAG.getConstant(0, MVT::i32));
        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
      }

      if (AArch64_AM::isAdvSIMDModImmType6(CnstVal)) {
        CnstVal = AArch64_AM::encodeAdvSIMDModImmType6(CnstVal);
        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
        SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy,
                                  DAG.getConstant(CnstVal, MVT::i32),
                                  DAG.getConstant(8, MVT::i32));
        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
      }

      if (AArch64_AM::isAdvSIMDModImmType7(CnstVal)) {
        CnstVal = AArch64_AM::encodeAdvSIMDModImmType7(CnstVal);
        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
        SDValue Mov = DAG.getNode(AArch64ISD::MOVImsl, dl, MovTy,
                                  DAG.getConstant(CnstVal, MVT::i32),
                                  DAG.getConstant(264, MVT::i32));
        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
      }

      if (AArch64_AM::isAdvSIMDModImmType8(CnstVal)) {
        CnstVal = AArch64_AM::encodeAdvSIMDModImmType8(CnstVal);
        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
        SDValue Mov = DAG.getNode(AArch64ISD::MOVImsl, dl, MovTy,
                                  DAG.getConstant(CnstVal, MVT::i32),
                                  DAG.getConstant(272, MVT::i32));
        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
      }

      if (AArch64_AM::isAdvSIMDModImmType9(CnstVal)) {
        CnstVal = AArch64_AM::encodeAdvSIMDModImmType9(CnstVal);
        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v16i8 : MVT::v8i8;
        SDValue Mov = DAG.getNode(AArch64ISD::MOVI, dl, MovTy,
                                  DAG.getConstant(CnstVal, MVT::i32));
        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
      }

      // The few faces of FMOV...
      if (AArch64_AM::isAdvSIMDModImmType11(CnstVal)) {
        CnstVal = AArch64_AM::encodeAdvSIMDModImmType11(CnstVal);
        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4f32 : MVT::v2f32;
        SDValue Mov = DAG.getNode(AArch64ISD::FMOV, dl, MovTy,
                                  DAG.getConstant(CnstVal, MVT::i32));
        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
      }

      if (AArch64_AM::isAdvSIMDModImmType12(CnstVal) &&
          VT.getSizeInBits() == 128) {
        CnstVal = AArch64_AM::encodeAdvSIMDModImmType12(CnstVal);
        SDValue Mov = DAG.getNode(AArch64ISD::FMOV, dl, MVT::v2f64,
                                  DAG.getConstant(CnstVal, MVT::i32));
        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
      }

      // The many faces of MVNI...
      CnstVal = ~CnstVal;
      if (AArch64_AM::isAdvSIMDModImmType1(CnstVal)) {
        CnstVal = AArch64_AM::encodeAdvSIMDModImmType1(CnstVal);
        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
        SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy,
                                  DAG.getConstant(CnstVal, MVT::i32),
                                  DAG.getConstant(0, MVT::i32));
        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
      }

      if (AArch64_AM::isAdvSIMDModImmType2(CnstVal)) {
        CnstVal = AArch64_AM::encodeAdvSIMDModImmType2(CnstVal);
        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
        SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy,
                                  DAG.getConstant(CnstVal, MVT::i32),
                                  DAG.getConstant(8, MVT::i32));
        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
      }

      if (AArch64_AM::isAdvSIMDModImmType3(CnstVal)) {
        CnstVal = AArch64_AM::encodeAdvSIMDModImmType3(CnstVal);
        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
        SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy,
                                  DAG.getConstant(CnstVal, MVT::i32),
                                  DAG.getConstant(16, MVT::i32));
        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
      }

      if (AArch64_AM::isAdvSIMDModImmType4(CnstVal)) {
        CnstVal = AArch64_AM::encodeAdvSIMDModImmType4(CnstVal);
        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
        SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy,
                                  DAG.getConstant(CnstVal, MVT::i32),
                                  DAG.getConstant(24, MVT::i32));
        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
      }

      if (AArch64_AM::isAdvSIMDModImmType5(CnstVal)) {
        CnstVal = AArch64_AM::encodeAdvSIMDModImmType5(CnstVal);
        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
        SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy,
                                  DAG.getConstant(CnstVal, MVT::i32),
                                  DAG.getConstant(0, MVT::i32));
        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
      }

      if (AArch64_AM::isAdvSIMDModImmType6(CnstVal)) {
        CnstVal = AArch64_AM::encodeAdvSIMDModImmType6(CnstVal);
        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
        SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy,
                                  DAG.getConstant(CnstVal, MVT::i32),
                                  DAG.getConstant(8, MVT::i32));
        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
      }

      if (AArch64_AM::isAdvSIMDModImmType7(CnstVal)) {
        CnstVal = AArch64_AM::encodeAdvSIMDModImmType7(CnstVal);
        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
        SDValue Mov = DAG.getNode(AArch64ISD::MVNImsl, dl, MovTy,
                                  DAG.getConstant(CnstVal, MVT::i32),
                                  DAG.getConstant(264, MVT::i32));
        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
      }

      if (AArch64_AM::isAdvSIMDModImmType8(CnstVal)) {
        CnstVal = AArch64_AM::encodeAdvSIMDModImmType8(CnstVal);
        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
        SDValue Mov = DAG.getNode(AArch64ISD::MVNImsl, dl, MovTy,
                                  DAG.getConstant(CnstVal, MVT::i32),
                                  DAG.getConstant(272, MVT::i32));
        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
      }
    }

    if (SecondTry)
      goto FailedModImm;
    SecondTry = true;
    CnstBits = UndefBits;
    goto AttemptModImm;
  }
FailedModImm:

  // Scan through the operands to find some interesting properties we can
  // exploit:
  //   1) If only one value is used, we can use a DUP, or
  //   2) if only the low element is not undef, we can just insert that, or
  //   3) if only one constant value is used (w/ some non-constant lanes),
  //      we can splat the constant value into the whole vector then fill
  //      in the non-constant lanes.
  //   4) FIXME: If different constant values are used, but we can intelligently
  //      select the values we'll be overwriting for the non-constant
  //      lanes such that we can directly materialize the vector
  //      some other way (MOVI, e.g.), we can be sneaky.
  unsigned NumElts = VT.getVectorNumElements();
  bool isOnlyLowElement = true;
  bool usesOnlyOneValue = true;
  bool usesOnlyOneConstantValue = true;
  bool isConstant = true;
  unsigned NumConstantLanes = 0;
  SDValue Value;
  SDValue ConstantValue;
  for (unsigned i = 0; i < NumElts; ++i) {
    SDValue V = Op.getOperand(i);
    if (V.getOpcode() == ISD::UNDEF)
      continue;
    if (i > 0)
      isOnlyLowElement = false;
    if (!isa<ConstantFPSDNode>(V) && !isa<ConstantSDNode>(V))
      isConstant = false;

    if (isa<ConstantSDNode>(V) || isa<ConstantFPSDNode>(V)) {
      ++NumConstantLanes;
      if (!ConstantValue.getNode())
        ConstantValue = V;
      else if (ConstantValue != V)
        usesOnlyOneConstantValue = false;
    }

    if (!Value.getNode())
      Value = V;
    else if (V != Value)
      usesOnlyOneValue = false;
  }

  if (!Value.getNode())
    return DAG.getUNDEF(VT);

  if (isOnlyLowElement)
    return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value);

  // Use DUP for non-constant splats. For f32 constant splats, reduce to
  // i32 and try again.
  if (usesOnlyOneValue) {
    if (!isConstant) {
      if (Value.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
          Value.getValueType() != VT)
        return DAG.getNode(AArch64ISD::DUP, dl, VT, Value);

      // This is actually a DUPLANExx operation, which keeps everything vectory.

      // DUPLANE works on 128-bit vectors, widen it if necessary.
      SDValue Lane = Value.getOperand(1);
      Value = Value.getOperand(0);
      if (Value.getValueType().getSizeInBits() == 64)
        Value = WidenVector(Value, DAG);

      unsigned Opcode = getDUPLANEOp(VT.getVectorElementType());
      return DAG.getNode(Opcode, dl, VT, Value, Lane);
    }

    if (VT.getVectorElementType().isFloatingPoint()) {
      SmallVector<SDValue, 8> Ops;
      MVT NewType =
          (VT.getVectorElementType() == MVT::f32) ? MVT::i32 : MVT::i64;
      for (unsigned i = 0; i < NumElts; ++i)
        Ops.push_back(DAG.getNode(ISD::BITCAST, dl, NewType, Op.getOperand(i)));
      EVT VecVT = EVT::getVectorVT(*DAG.getContext(), NewType, NumElts);
      SDValue Val = DAG.getNode(ISD::BUILD_VECTOR, dl, VecVT, Ops);
      Val = LowerBUILD_VECTOR(Val, DAG);
      if (Val.getNode())
        return DAG.getNode(ISD::BITCAST, dl, VT, Val);
    }
  }
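  // Example for the path below (illustrative): for
  // (i32 42, i32 42, i32 %x, i32 42), the code emits a DUP of 42 into all
  // four lanes followed by a single INSERT_VECTOR_ELT of %x into lane 2,
  // instead of four per-lane scalar inserts.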
  // If a single constant value was used for more than one lane, start by
  // splatting that value, then replace the non-constant lanes. This is better
  // than the default, which will perform a separate initialization for each
  // lane.
  if (NumConstantLanes > 0 && usesOnlyOneConstantValue) {
    SDValue Val = DAG.getNode(AArch64ISD::DUP, dl, VT, ConstantValue);
    // Now insert the non-constant lanes.
    for (unsigned i = 0; i < NumElts; ++i) {
      SDValue V = Op.getOperand(i);
      SDValue LaneIdx = DAG.getConstant(i, MVT::i64);
      if (!isa<ConstantSDNode>(V) && !isa<ConstantFPSDNode>(V)) {
        // Note that type legalization likely mucked about with the VT of the
        // source operand, so we may have to convert it here before inserting.
        Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Val, V, LaneIdx);
      }
    }
    return Val;
  }

  // If all elements are constants and the case above didn't get hit, fall back
  // to the default expansion, which will generate a load from the constant
  // pool.
  if (isConstant)
    return SDValue();

  // Empirical tests suggest this is rarely worth it for vectors of length <= 2.
  if (NumElts >= 4) {
    SDValue shuffle = ReconstructShuffle(Op, DAG);
    if (shuffle != SDValue())
      return shuffle;
  }

  // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we
  // know the default expansion would otherwise fall back on something even
  // worse. For a vector with one or two non-undef values, that's a
  // scalar_to_vector for the elements followed by a shuffle (provided the
  // shuffle is valid for the target); for everything else it is
  // materialization element by element on the stack followed by a load.
  if (!isConstant && !usesOnlyOneValue) {
    SDValue Vec = DAG.getUNDEF(VT);
    SDValue Op0 = Op.getOperand(0);
    unsigned ElemSize = VT.getVectorElementType().getSizeInBits();
    unsigned i = 0;
    // For 32 and 64 bit types, use INSERT_SUBREG for lane zero to
    // a) Avoid a RMW dependency on the full vector register, and
    // b) Allow the register coalescer to fold away the copy if the
    //    value is already in an S or D register.
    if (Op0.getOpcode() != ISD::UNDEF && (ElemSize == 32 || ElemSize == 64)) {
      unsigned SubIdx = ElemSize == 32 ? AArch64::ssub : AArch64::dsub;
      MachineSDNode *N =
          DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, dl, VT, Vec, Op0,
                             DAG.getTargetConstant(SubIdx, MVT::i32));
      Vec = SDValue(N, 0);
      ++i;
    }
    for (; i < NumElts; ++i) {
      SDValue V = Op.getOperand(i);
      if (V.getOpcode() == ISD::UNDEF)
        continue;
      SDValue LaneIdx = DAG.getConstant(i, MVT::i64);
      Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx);
    }
    return Vec;
  }

  // Just use the default expansion. We failed to find a better alternative.
  return SDValue();
}
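// Example for LowerINSERT_VECTOR_ELT below (illustrative): inserting into a
// v4i16 (a 64-bit vector) widens the source to v8i16, performs the
// INSERT_VECTOR_ELT there, and narrows the result back, since the
// lane-indexed INS patterns operate on 128-bit registers.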
SDValue AArch64TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
                                                      SelectionDAG &DAG) const {
  assert(Op.getOpcode() == ISD::INSERT_VECTOR_ELT && "Unknown opcode!");

  // Check for a non-constant or out of range lane.
  EVT VT = Op.getOperand(0).getValueType();
  ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Op.getOperand(2));
  if (!CI || CI->getZExtValue() >= VT.getVectorNumElements())
    return SDValue();

  // Insertion/extraction are legal for V128 types.
  if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 ||
      VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
      VT == MVT::v8f16)
    return Op;

  if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 &&
      VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16)
    return SDValue();

  // For V64 types, we perform insertion by expanding the value
  // to a V128 type and performing the insertion on that.
  SDLoc DL(Op);
  SDValue WideVec = WidenVector(Op.getOperand(0), DAG);
  EVT WideTy = WideVec.getValueType();

  SDValue Node = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, WideTy, WideVec,
                             Op.getOperand(1), Op.getOperand(2));
  // Re-narrow the resultant vector.
  return NarrowVector(Node, DAG);
}

SDValue
AArch64TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
                                               SelectionDAG &DAG) const {
  assert(Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unknown opcode!");

  // Check for a non-constant or out of range lane.
  EVT VT = Op.getOperand(0).getValueType();
  ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Op.getOperand(1));
  if (!CI || CI->getZExtValue() >= VT.getVectorNumElements())
    return SDValue();

  // Insertion/extraction are legal for V128 types.
  if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 ||
      VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
      VT == MVT::v8f16)
    return Op;

  if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 &&
      VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16)
    return SDValue();

  // For V64 types, we perform extraction by expanding the value
  // to a V128 type and performing the extraction on that.
  SDLoc DL(Op);
  SDValue WideVec = WidenVector(Op.getOperand(0), DAG);
  EVT WideTy = WideVec.getValueType();

  EVT ExtrTy = WideTy.getVectorElementType();
  if (ExtrTy == MVT::i16 || ExtrTy == MVT::i8)
    ExtrTy = MVT::i32;

  // For extractions, we just return the result directly.
  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtrTy, WideVec,
                     Op.getOperand(1));
}
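// Example for LowerEXTRACT_SUBVECTOR below (illustrative): extracting
// elements <2, 3> of a v4i32 as a v2i32 is exactly the upper 64 bits of the
// 128-bit register, which is returned unchanged so the patterns can match it
// directly; extracting from index 0 is a plain subregister copy (e.g. dsub).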
SDValue AArch64TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
                                                      SelectionDAG &DAG) const {
  EVT VT = Op.getOperand(0).getValueType();
  SDLoc dl(Op);
  // Just in case...
  if (!VT.isVector())
    return SDValue();

  ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op.getOperand(1));
  if (!Cst)
    return SDValue();
  unsigned Val = Cst->getZExtValue();

  unsigned Size = Op.getValueType().getSizeInBits();
  if (Val == 0) {
    switch (Size) {
    case 8:
      return DAG.getTargetExtractSubreg(AArch64::bsub, dl, Op.getValueType(),
                                        Op.getOperand(0));
    case 16:
      return DAG.getTargetExtractSubreg(AArch64::hsub, dl, Op.getValueType(),
                                        Op.getOperand(0));
    case 32:
      return DAG.getTargetExtractSubreg(AArch64::ssub, dl, Op.getValueType(),
                                        Op.getOperand(0));
    case 64:
      return DAG.getTargetExtractSubreg(AArch64::dsub, dl, Op.getValueType(),
                                        Op.getOperand(0));
    default:
      llvm_unreachable("Unexpected vector type in extract_subvector!");
    }
  }
  // If this is extracting the upper 64-bits of a 128-bit vector, we match
  // that directly.
  if (Size == 64 && Val * VT.getVectorElementType().getSizeInBits() == 64)
    return Op;

  return SDValue();
}

bool AArch64TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
                                               EVT VT) const {
  if (VT.getVectorNumElements() == 4 &&
      (VT.is128BitVector() || VT.is64BitVector())) {
    unsigned PFIndexes[4];
    for (unsigned i = 0; i != 4; ++i) {
      if (M[i] < 0)
        PFIndexes[i] = 8;
      else
        PFIndexes[i] = M[i];
    }

    // Compute the index in the perfect shuffle table.
    unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 +
                            PFIndexes[2] * 9 + PFIndexes[3];
    unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
    unsigned Cost = (PFEntry >> 30);

    if (Cost <= 4)
      return true;
  }

  bool DummyBool;
  int DummyInt;
  unsigned DummyUnsigned;

  return (ShuffleVectorSDNode::isSplatMask(&M[0], VT) || isREVMask(M, VT, 64) ||
          isREVMask(M, VT, 32) || isREVMask(M, VT, 16) ||
          isEXTMask(M, VT, DummyBool, DummyUnsigned) ||
          // isTBLMask(M, VT) || // FIXME: Port TBL support from ARM.
          isTRNMask(M, VT, DummyUnsigned) || isUZPMask(M, VT, DummyUnsigned) ||
          isZIPMask(M, VT, DummyUnsigned) ||
          isTRN_v_undef_Mask(M, VT, DummyUnsigned) ||
          isUZP_v_undef_Mask(M, VT, DummyUnsigned) ||
          isZIP_v_undef_Mask(M, VT, DummyUnsigned) ||
          isINSMask(M, VT.getVectorNumElements(), DummyBool, DummyInt) ||
          isConcatMask(M, VT, VT.getSizeInBits() == 128));
}

/// getVShiftImm - Check if this is a valid build_vector for the immediate
/// operand of a vector shift operation, where all the elements of the
/// build_vector must have the same constant integer value.
static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
  // Ignore bit_converts.
  while (Op.getOpcode() == ISD::BITCAST)
    Op = Op.getOperand(0);
  BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
  APInt SplatBits, SplatUndef;
  unsigned SplatBitSize;
  bool HasAnyUndefs;
  if (!BVN || !BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize,
                                    HasAnyUndefs, ElementBits) ||
      SplatBitSize > ElementBits)
    return false;
  Cnt = SplatBits.getSExtValue();
  return true;
}
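// Example for the shift-immediate helpers below (illustrative): for
// (shl v4i32:x, (build_vector 3, 3, 3, 3)), getVShiftImm reports Cnt == 3,
// isVShiftLImm accepts it (0 <= 3 < 32), and the shift can be selected as the
// immediate form  shl v0.4s, v0.4s, #3.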
/// isVShiftLImm - Check if this is a valid build_vector for the immediate
/// operand of a vector shift left operation. That value must be in the range:
///   0 <= Value < ElementBits for a left shift; or
///   0 <= Value <= ElementBits for a long left shift.
static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) {
  assert(VT.isVector() && "vector shift count is not a vector type");
  unsigned ElementBits = VT.getVectorElementType().getSizeInBits();
  if (!getVShiftImm(Op, ElementBits, Cnt))
    return false;
  return (Cnt >= 0 && (isLong ? Cnt - 1 : Cnt) < ElementBits);
}

/// isVShiftRImm - Check if this is a valid build_vector for the immediate
/// operand of a vector shift right operation. For a shift opcode the value is
/// positive; for an intrinsic the value must be negative. The absolute value
/// must be in the range:
///   1 <= |Value| <= ElementBits for a right shift; or
///   1 <= |Value| <= ElementBits/2 for a narrow right shift.
static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, bool isIntrinsic,
                         int64_t &Cnt) {
  assert(VT.isVector() && "vector shift count is not a vector type");
  unsigned ElementBits = VT.getVectorElementType().getSizeInBits();
  if (!getVShiftImm(Op, ElementBits, Cnt))
    return false;
  if (isIntrinsic)
    Cnt = -Cnt;
  return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits));
}
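// Example for LowerVectorSRA_SRL_SHL below (illustrative): (sra v4i32:x,
// splat 5) becomes the immediate form  sshr v0.4s, v0.4s, #5,  while a
// variable-amount right shift has no direct register form and is emitted as
// NEG of the amount followed by a signed/unsigned SSHL/USHL, as the
// right-shift-register comment in the function explains.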
SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op,
                                                      SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  SDLoc DL(Op);
  int64_t Cnt;

  if (!Op.getOperand(1).getValueType().isVector())
    return Op;
  unsigned EltSize = VT.getVectorElementType().getSizeInBits();

  switch (Op.getOpcode()) {
  default:
    llvm_unreachable("unexpected shift opcode");

  case ISD::SHL:
    if (isVShiftLImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize)
      return DAG.getNode(AArch64ISD::VSHL, SDLoc(Op), VT, Op.getOperand(0),
                         DAG.getConstant(Cnt, MVT::i32));
    return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
                       DAG.getConstant(Intrinsic::aarch64_neon_ushl, MVT::i32),
                       Op.getOperand(0), Op.getOperand(1));
  case ISD::SRA:
  case ISD::SRL:
    // Right shift immediate.
    if (isVShiftRImm(Op.getOperand(1), VT, false, false, Cnt) &&
        Cnt < EltSize) {
      unsigned Opc =
          (Op.getOpcode() == ISD::SRA) ? AArch64ISD::VASHR : AArch64ISD::VLSHR;
      return DAG.getNode(Opc, SDLoc(Op), VT, Op.getOperand(0),
                         DAG.getConstant(Cnt, MVT::i32));
    }

    // Right shift register. Note, there is not a shift right register
    // instruction, but the shift left register instruction takes a signed
    // value, where negative numbers specify a right shift.
    unsigned Opc = (Op.getOpcode() == ISD::SRA) ? Intrinsic::aarch64_neon_sshl
                                                : Intrinsic::aarch64_neon_ushl;
    // Negate the shift amount.
    SDValue NegShift = DAG.getNode(AArch64ISD::NEG, DL, VT, Op.getOperand(1));
    SDValue NegShiftLeft =
        DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
                    DAG.getConstant(Opc, MVT::i32), Op.getOperand(0), NegShift);
    return NegShiftLeft;
  }

  return SDValue();
}
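// Example for EmitVectorComparison below (illustrative): an integer
// (setgt x, 0) takes the one-operand compare-against-zero form
// (cmgt v0.4s, v0.4s, #0) via CMGTz, while (setgt x, y) uses the two-operand
// CMGT; the floating-point cases follow the same shape with the FCMxx nodes.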
static SDValue EmitVectorComparison(SDValue LHS, SDValue RHS,
                                    AArch64CC::CondCode CC, bool NoNans, EVT VT,
                                    SDLoc dl, SelectionDAG &DAG) {
  EVT SrcVT = LHS.getValueType();
  assert(VT.getSizeInBits() == SrcVT.getSizeInBits() &&
         "function only supposed to emit natural comparisons");

  BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(RHS.getNode());
  APInt CnstBits(VT.getSizeInBits(), 0);
  APInt UndefBits(VT.getSizeInBits(), 0);
  bool IsCnst = BVN && resolveBuildVector(BVN, CnstBits, UndefBits);
  bool IsZero = IsCnst && (CnstBits == 0);

  if (SrcVT.getVectorElementType().isFloatingPoint()) {
    switch (CC) {
    default:
      return SDValue();
    case AArch64CC::NE: {
      SDValue Fcmeq;
      if (IsZero)
        Fcmeq = DAG.getNode(AArch64ISD::FCMEQz, dl, VT, LHS);
      else
        Fcmeq = DAG.getNode(AArch64ISD::FCMEQ, dl, VT, LHS, RHS);
      return DAG.getNode(AArch64ISD::NOT, dl, VT, Fcmeq);
    }
    case AArch64CC::EQ:
      if (IsZero)
        return DAG.getNode(AArch64ISD::FCMEQz, dl, VT, LHS);
      return DAG.getNode(AArch64ISD::FCMEQ, dl, VT, LHS, RHS);
    case AArch64CC::GE:
      if (IsZero)
        return DAG.getNode(AArch64ISD::FCMGEz, dl, VT, LHS);
      return DAG.getNode(AArch64ISD::FCMGE, dl, VT, LHS, RHS);
    case AArch64CC::GT:
      if (IsZero)
        return DAG.getNode(AArch64ISD::FCMGTz, dl, VT, LHS);
      return DAG.getNode(AArch64ISD::FCMGT, dl, VT, LHS, RHS);
    case AArch64CC::LS:
      if (IsZero)
        return DAG.getNode(AArch64ISD::FCMLEz, dl, VT, LHS);
      return DAG.getNode(AArch64ISD::FCMGE, dl, VT, RHS, LHS);
    case AArch64CC::LT:
      if (!NoNans)
        return SDValue();
      // If we ignore NaNs then we can use the MI implementation.
      // Fallthrough.
    case AArch64CC::MI:
      if (IsZero)
        return DAG.getNode(AArch64ISD::FCMLTz, dl, VT, LHS);
      return DAG.getNode(AArch64ISD::FCMGT, dl, VT, RHS, LHS);
    }
  }

  switch (CC) {
  default:
    return SDValue();
  case AArch64CC::NE: {
    SDValue Cmeq;
    if (IsZero)
      Cmeq = DAG.getNode(AArch64ISD::CMEQz, dl, VT, LHS);
    else
      Cmeq = DAG.getNode(AArch64ISD::CMEQ, dl, VT, LHS, RHS);
    return DAG.getNode(AArch64ISD::NOT, dl, VT, Cmeq);
  }
  case AArch64CC::EQ:
    if (IsZero)
      return DAG.getNode(AArch64ISD::CMEQz, dl, VT, LHS);
    return DAG.getNode(AArch64ISD::CMEQ, dl, VT, LHS, RHS);
  case AArch64CC::GE:
    if (IsZero)
      return DAG.getNode(AArch64ISD::CMGEz, dl, VT, LHS);
    return DAG.getNode(AArch64ISD::CMGE, dl, VT, LHS, RHS);
  case AArch64CC::GT:
    if (IsZero)
      return DAG.getNode(AArch64ISD::CMGTz, dl, VT, LHS);
    return DAG.getNode(AArch64ISD::CMGT, dl, VT, LHS, RHS);
  case AArch64CC::LE:
    if (IsZero)
      return DAG.getNode(AArch64ISD::CMLEz, dl, VT, LHS);
    return DAG.getNode(AArch64ISD::CMGE, dl, VT, RHS, LHS);
  case AArch64CC::LS:
    return DAG.getNode(AArch64ISD::CMHS, dl, VT, RHS, LHS);
  case AArch64CC::LO:
    return DAG.getNode(AArch64ISD::CMHI, dl, VT, RHS, LHS);
  case AArch64CC::LT:
    if (IsZero)
      return DAG.getNode(AArch64ISD::CMLTz, dl, VT, LHS);
    return DAG.getNode(AArch64ISD::CMGT, dl, VT, RHS, LHS);
  case AArch64CC::HI:
    return DAG.getNode(AArch64ISD::CMHI, dl, VT, LHS, RHS);
  case AArch64CC::HS:
    return DAG.getNode(AArch64ISD::CMHS, dl, VT, LHS, RHS);
  }
}
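// Example for LowerVSETCC below (illustrative): an FP condition such as
// ordered-not-equal has no single AArch64 vector condition, so
// changeVectorFPCCToAArch64CC may return two conditions (e.g. MI and GT),
// and the two comparison results are ORed together before the optional
// final inversion.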
SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op,
                                           SelectionDAG &DAG) const {
  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  EVT CmpVT = LHS.getValueType().changeVectorElementTypeToInteger();
  SDLoc dl(Op);

  if (LHS.getValueType().getVectorElementType().isInteger()) {
    assert(LHS.getValueType() == RHS.getValueType());
    AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
    SDValue Cmp =
        EmitVectorComparison(LHS, RHS, AArch64CC, false, CmpVT, dl, DAG);
    return DAG.getSExtOrTrunc(Cmp, dl, Op.getValueType());
  }

  assert(LHS.getValueType().getVectorElementType() == MVT::f32 ||
         LHS.getValueType().getVectorElementType() == MVT::f64);

  // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
  // clean. Some of them require two branches to implement.
  AArch64CC::CondCode CC1, CC2;
  bool ShouldInvert;
  changeVectorFPCCToAArch64CC(CC, CC1, CC2, ShouldInvert);

  bool NoNaNs = getTargetMachine().Options.NoNaNsFPMath;
  SDValue Cmp =
      EmitVectorComparison(LHS, RHS, CC1, NoNaNs, CmpVT, dl, DAG);
  if (!Cmp.getNode())
    return SDValue();

  if (CC2 != AArch64CC::AL) {
    SDValue Cmp2 =
        EmitVectorComparison(LHS, RHS, CC2, NoNaNs, CmpVT, dl, DAG);
    if (!Cmp2.getNode())
      return SDValue();

    Cmp = DAG.getNode(ISD::OR, dl, CmpVT, Cmp, Cmp2);
  }

  Cmp = DAG.getSExtOrTrunc(Cmp, dl, Op.getValueType());

  if (ShouldInvert)
    return Cmp = DAG.getNOT(dl, Cmp, Cmp.getValueType());

  return Cmp;
}
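// Example for getTgtMemIntrinsic below (illustrative): for aarch64_neon_ld2
// returning two <4 x i32> vectors, the aggregate's allocation size is 32
// bytes, so memVT is conservatively set to a vector of four i64 (the full
// 256 bits loaded), with the pointer taken from the intrinsic's final
// argument.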
/// getTgtMemIntrinsic - Represent NEON load and store intrinsics as
/// MemIntrinsicNodes. The associated MachineMemOperands record the alignment
/// specified in the intrinsic calls.
bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
                                               const CallInst &I,
                                               unsigned Intrinsic) const {
  switch (Intrinsic) {
  case Intrinsic::aarch64_neon_ld2:
  case Intrinsic::aarch64_neon_ld3:
  case Intrinsic::aarch64_neon_ld4:
  case Intrinsic::aarch64_neon_ld1x2:
  case Intrinsic::aarch64_neon_ld1x3:
  case Intrinsic::aarch64_neon_ld1x4:
  case Intrinsic::aarch64_neon_ld2lane:
  case Intrinsic::aarch64_neon_ld3lane:
  case Intrinsic::aarch64_neon_ld4lane:
  case Intrinsic::aarch64_neon_ld2r:
  case Intrinsic::aarch64_neon_ld3r:
  case Intrinsic::aarch64_neon_ld4r: {
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    // Conservatively set memVT to the entire set of vectors loaded.
    uint64_t NumElts = getDataLayout()->getTypeAllocSize(I.getType()) / 8;
    Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
    Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1);
    Info.offset = 0;
    Info.align = 0;
    Info.vol = false; // volatile loads with NEON intrinsics not supported
    Info.readMem = true;
    Info.writeMem = false;
    return true;
  }
  case Intrinsic::aarch64_neon_st2:
  case Intrinsic::aarch64_neon_st3:
  case Intrinsic::aarch64_neon_st4:
  case Intrinsic::aarch64_neon_st1x2:
  case Intrinsic::aarch64_neon_st1x3:
  case Intrinsic::aarch64_neon_st1x4:
  case Intrinsic::aarch64_neon_st2lane:
  case Intrinsic::aarch64_neon_st3lane:
  case Intrinsic::aarch64_neon_st4lane: {
    Info.opc = ISD::INTRINSIC_VOID;
    // Conservatively set memVT to the entire set of vectors stored.
    unsigned NumElts = 0;
    for (unsigned ArgI = 1, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) {
      Type *ArgTy = I.getArgOperand(ArgI)->getType();
      if (!ArgTy->isVectorTy())
        break;
      NumElts += getDataLayout()->getTypeAllocSize(ArgTy) / 8;
    }
    Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
    Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1);
    Info.offset = 0;
    Info.align = 0;
    Info.vol = false; // volatile stores with NEON intrinsics not supported
    Info.readMem = false;
    Info.writeMem = true;
    return true;
  }
  case Intrinsic::aarch64_ldaxr:
  case Intrinsic::aarch64_ldxr: {
    PointerType *PtrTy = cast<PointerType>(I.getArgOperand(0)->getType());
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::getVT(PtrTy->getElementType());
    Info.ptrVal = I.getArgOperand(0);
    Info.offset = 0;
    Info.align = getDataLayout()->getABITypeAlignment(PtrTy->getElementType());
    Info.vol = true;
    Info.readMem = true;
    Info.writeMem = false;
    return true;
  }
  case Intrinsic::aarch64_stlxr:
  case Intrinsic::aarch64_stxr: {
    PointerType *PtrTy = cast<PointerType>(I.getArgOperand(1)->getType());
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::getVT(PtrTy->getElementType());
    Info.ptrVal = I.getArgOperand(1);
    Info.offset = 0;
    Info.align = getDataLayout()->getABITypeAlignment(PtrTy->getElementType());
    Info.vol = true;
    Info.readMem = false;
    Info.writeMem = true;
    return true;
  }
  case Intrinsic::aarch64_ldaxp:
  case Intrinsic::aarch64_ldxp: {
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::i128;
    Info.ptrVal = I.getArgOperand(0);
    Info.offset = 0;
    Info.align = 16;
    Info.vol = true;
    Info.readMem = true;
    Info.writeMem = false;
    return true;
  }
  case Intrinsic::aarch64_stlxp:
  case Intrinsic::aarch64_stxp: {
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::i128;
    Info.ptrVal = I.getArgOperand(2);
    Info.offset = 0;
    Info.align = 16;
    Info.vol = true;
    Info.readMem = false;
    Info.writeMem = true;
    return true;
  }
  default:
    break;
  }

  return false;
}
// Truncations from 64-bit GPR to 32-bit GPR are free.
bool AArch64TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
  if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
    return false;
  unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
  unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
  return NumBits1 > NumBits2;
}
bool AArch64TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
  if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger())
    return false;
  unsigned NumBits1 = VT1.getSizeInBits();
  unsigned NumBits2 = VT2.getSizeInBits();
  return NumBits1 > NumBits2;
}

// All 32-bit GPR operations implicitly zero the high-half of the corresponding
// 64-bit GPR.
bool AArch64TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
  if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
    return false;
  unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
  unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
  return NumBits1 == 32 && NumBits2 == 64;
}
bool AArch64TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
  if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger())
    return false;
  unsigned NumBits1 = VT1.getSizeInBits();
  unsigned NumBits2 = VT2.getSizeInBits();
  return NumBits1 == 32 && NumBits2 == 64;
}

bool AArch64TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
  EVT VT1 = Val.getValueType();
  if (isZExtFree(VT1, VT2)) {
    return true;
  }

  if (Val.getOpcode() != ISD::LOAD)
    return false;

  // 8-, 16-, and 32-bit integer loads all implicitly zero-extend.
  return (VT1.isSimple() && !VT1.isVector() && VT1.isInteger() &&
          VT2.isSimple() && !VT2.isVector() && VT2.isInteger() &&
          VT1.getSizeInBits() <= 32);
}

bool AArch64TargetLowering::hasPairedLoad(Type *LoadedType,
                                          unsigned &RequiredAligment) const {
  if (!LoadedType->isIntegerTy() && !LoadedType->isFloatTy())
    return false;
  // Cyclone supports unaligned accesses.
  RequiredAligment = 0;
  unsigned NumBits = LoadedType->getPrimitiveSizeInBits();
  return NumBits == 32 || NumBits == 64;
}

bool AArch64TargetLowering::hasPairedLoad(EVT LoadedType,
                                          unsigned &RequiredAligment) const {
  if (!LoadedType.isSimple() ||
      (!LoadedType.isInteger() && !LoadedType.isFloatingPoint()))
    return false;
  // Cyclone supports unaligned accesses.
  RequiredAligment = 0;
  unsigned NumBits = LoadedType.getSizeInBits();
  return NumBits == 32 || NumBits == 64;
}

static bool memOpAlign(unsigned DstAlign, unsigned SrcAlign,
                       unsigned AlignCheck) {
  return ((SrcAlign == 0 || SrcAlign % AlignCheck == 0) &&
          (DstAlign == 0 || DstAlign % AlignCheck == 0));
}

EVT AArch64TargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign,
                                               unsigned SrcAlign, bool IsMemset,
                                               bool ZeroMemset,
                                               bool MemcpyStrSrc,
                                               MachineFunction &MF) const {
  // Don't use AdvSIMD to implement 16-byte memset. It would take one
  // instruction to materialize the v2i64 zero and one store (with a
  // restrictive addressing mode). Just do two i64 stores of the zero register.
  bool Fast;
  const Function *F = MF.getFunction();
  if (Subtarget->hasFPARMv8() && !IsMemset && Size >= 16 &&
      !F->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
                                       Attribute::NoImplicitFloat) &&
      (memOpAlign(SrcAlign, DstAlign, 16) ||
       (allowsMisalignedMemoryAccesses(MVT::f128, 0, 1, &Fast) && Fast)))
    return MVT::f128;

  return Size >= 8 ? MVT::i64 : MVT::i32;
}

// 12-bit optionally shifted immediates are legal for adds.
bool AArch64TargetLowering::isLegalAddImmediate(int64_t Immed) const {
  if ((Immed >> 12) == 0 || ((Immed & 0xfff) == 0 && Immed >> 24 == 0))
    return true;
  return false;
}

// Integer comparisons are implemented with ADDS/SUBS, so the range of valid
// immediates is the same as for an add or a sub.
bool AArch64TargetLowering::isLegalICmpImmediate(int64_t Immed) const {
  if (Immed < 0)
    Immed *= -1;
  return isLegalAddImmediate(Immed);
}
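// Example for isLegalAddressingMode below (illustrative): for an i64 access,
// a base-plus-offset of 4088 is legal (4088 is 511 * 8, a multiple of the
// access size within the scaled 12-bit unsigned range), whereas an offset of
// 4089 is not a multiple of 8 and is too large for the small signed-offset
// form, so it is rejected.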
/// isLegalAddressingMode - Return true if the addressing mode represented
/// by AM is legal for this target, for a load/store of the specified type.
bool AArch64TargetLowering::isLegalAddressingMode(const AddrMode &AM,
                                                  Type *Ty) const {
  // AArch64 has five basic addressing modes:
  //   reg
  //   reg + 9-bit signed offset
  //   reg + SIZE_IN_BYTES * 12-bit unsigned offset
  //   reg1 + reg2
  //   reg + SIZE_IN_BYTES * reg

  // No global is ever allowed as a base.
  if (AM.BaseGV)
    return false;

  // No reg+reg+imm addressing.
  if (AM.HasBaseReg && AM.BaseOffs && AM.Scale)
    return false;

  // Check the reg + imm case:
  // i.e., reg + 0, reg + imm9, reg + SIZE_IN_BYTES * uimm12
  uint64_t NumBytes = 0;
  if (Ty->isSized()) {
    uint64_t NumBits = getDataLayout()->getTypeSizeInBits(Ty);
    NumBytes = NumBits / 8;
    if (!isPowerOf2_64(NumBits))
      NumBytes = 0;
  }

  if (!AM.Scale) {
    int64_t Offset = AM.BaseOffs;

    // 9-bit signed offset
    if (Offset >= -(1LL << 9) && Offset <= (1LL << 9) - 1)
      return true;

    // 12-bit unsigned offset
    unsigned shift = Log2_64(NumBytes);
    if (NumBytes && Offset > 0 && (Offset / NumBytes) <= (1LL << 12) - 1 &&
        // Must be a multiple of NumBytes (NumBytes is a power of 2).
        (Offset >> shift) << shift == Offset)
      return true;
    return false;
  }

  // Check reg1 + SIZE_IN_BYTES * reg2 and reg1 + reg2.

  if (!AM.Scale || AM.Scale == 1 ||
      (AM.Scale > 0 && (uint64_t)AM.Scale == NumBytes))
    return true;
  return false;
}

int AArch64TargetLowering::getScalingFactorCost(const AddrMode &AM,
                                                Type *Ty) const {
  // Scaling factors are not free at all.
  // Operands                     | Rt Latency
  // -------------------------------------------
  // Rt, [Xn, Xm]                 | 4
  // -------------------------------------------
  // Rt, [Xn, Xm, lsl #imm]       | Rn: 4 Rm: 5
  // Rt, [Xn, Wm, <extend> #imm]  |
  if (isLegalAddressingMode(AM, Ty))
    // Scale represents reg2 * scale, thus account for 1 if
    // it is not equal to 0 or 1.
    return AM.Scale != 0 && AM.Scale != 1;
  return -1;
}

bool AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
  VT = VT.getScalarType();

  if (!VT.isSimple())
    return false;

  switch (VT.getSimpleVT().SimpleTy) {
  case MVT::f32:
  case MVT::f64:
    return true;
  default:
    break;
  }

  return false;
}

const MCPhysReg *
AArch64TargetLowering::getScratchRegisters(CallingConv::ID) const {
  // LR is a callee-save register, but we must treat it as clobbered by any call
  // site. Hence we include LR in the scratch registers, which are in turn added
  // as implicit-defs for stackmaps and patchpoints.
  static const MCPhysReg ScratchRegs[] = {
    AArch64::X16, AArch64::X17, AArch64::LR, 0
  };
  return ScratchRegs;
}

bool
AArch64TargetLowering::isDesirableToCommuteWithShift(const SDNode *N) const {
  EVT VT = N->getValueType(0);
  // If N is an unsigned bit extraction: ((x >> C) & mask), then do not combine
  // it with shift to let it be lowered to UBFX.
  if (N->getOpcode() == ISD::AND && (VT == MVT::i32 || VT == MVT::i64) &&
      isa<ConstantSDNode>(N->getOperand(1))) {
    uint64_t TruncMask = N->getConstantOperandVal(1);
    if (isMask_64(TruncMask) &&
        N->getOperand(0).getOpcode() == ISD::SRL &&
        isa<ConstantSDNode>(N->getOperand(0)->getOperand(1)))
      return false;
  }
  return true;
}

bool AArch64TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
                                                              Type *Ty) const {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  if (BitSize == 0)
    return false;

  int64_t Val = Imm.getSExtValue();
  if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, BitSize))
    return true;

  if ((int64_t)Val < 0)
    Val = ~Val;
  if (BitSize == 32)
    Val &= (1LL << 32) - 1;

  unsigned LZ = countLeadingZeros((uint64_t)Val);
  unsigned Shift = (63 - LZ) / 16;
  // MOVZ is free so return true for one or fewer MOVK.
  return Shift < 3;
}
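// Example for performIntegerAbsCombine below (illustrative): the canonical
// integer abs expansion
//   %s = ashr i32 %x, 31
//   %a = add i32 %x, %s
//   %r = xor i32 %a, %s
// matches the XOR(ADD(X,Y), Y) pattern and is rewritten as a SUBS of X
// against zero plus a CSEL on the PL condition (select X when non-negative,
// 0 - X otherwise), avoiding the shift and xor.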
// Generate SUBS and CSEL for integer abs.
static SDValue performIntegerAbsCombine(SDNode *N, SelectionDAG &DAG) {
  EVT VT = N->getValueType(0);

  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  SDLoc DL(N);

  // Check pattern of XOR(ADD(X,Y), Y) where Y is SRA(X, size(X)-1)
  // and change it to SUB and CSEL.
  if (VT.isInteger() && N->getOpcode() == ISD::XOR &&
      N0.getOpcode() == ISD::ADD && N0.getOperand(1) == N1 &&
      N1.getOpcode() == ISD::SRA && N1.getOperand(0) == N0.getOperand(0))
    if (ConstantSDNode *Y1C = dyn_cast<ConstantSDNode>(N1.getOperand(1)))
      if (Y1C->getAPIntValue() == VT.getSizeInBits() - 1) {
        SDValue Neg = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, VT),
                                  N0.getOperand(0));
        // Generate SUBS & CSEL.
        SDValue Cmp =
            DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, MVT::i32),
                        N0.getOperand(0), DAG.getConstant(0, VT));
        return DAG.getNode(AArch64ISD::CSEL, DL, VT, N0.getOperand(0), Neg,
                           DAG.getConstant(AArch64CC::PL, MVT::i32),
                           SDValue(Cmp.getNode(), 1));
      }
  return SDValue();
}

// performXorCombine - Attempts to handle integer ABS.
static SDValue performXorCombine(SDNode *N, SelectionDAG &DAG,
                                 TargetLowering::DAGCombinerInfo &DCI,
                                 const AArch64Subtarget *Subtarget) {
  if (DCI.isBeforeLegalizeOps())
    return SDValue();

  return performIntegerAbsCombine(N, DAG);
}

SDValue
AArch64TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
                                     SelectionDAG &DAG,
                                     std::vector<SDNode *> *Created) const {
  // fold (sdiv X, pow2)
  EVT VT = N->getValueType(0);
  if ((VT != MVT::i32 && VT != MVT::i64) ||
      !(Divisor.isPowerOf2() || (-Divisor).isPowerOf2()))
    return SDValue();

  SDLoc DL(N);
  SDValue N0 = N->getOperand(0);
  unsigned Lg2 = Divisor.countTrailingZeros();
  SDValue Zero = DAG.getConstant(0, VT);
  SDValue Pow2MinusOne = DAG.getConstant((1ULL << Lg2) - 1, VT);

  // Add (N0 < 0) ? Pow2 - 1 : 0;
  SDValue CCVal;
  SDValue Cmp = getAArch64Cmp(N0, Zero, ISD::SETLT, CCVal, DAG, DL);
  SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N0, Pow2MinusOne);
  SDValue CSel = DAG.getNode(AArch64ISD::CSEL, DL, VT, Add, N0, CCVal, Cmp);

  if (Created) {
    Created->push_back(Cmp.getNode());
    Created->push_back(Add.getNode());
    Created->push_back(CSel.getNode());
  }

  // Divide by pow2.
  SDValue SRA =
      DAG.getNode(ISD::SRA, DL, VT, CSel, DAG.getConstant(Lg2, MVT::i64));

  // If we're dividing by a positive value, we're done. Otherwise, we must
  // negate the result.
  if (Divisor.isNonNegative())
    return SRA;

  if (Created)
    Created->push_back(SRA.getNode());
  return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, VT), SRA);
}
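// Examples for performMulCombine below (illustrative): (mul x, 5) rewrites to
// (add (shl x, 2), x) via the 2^N + 1 case, and (mul x, -7), i.e.
// -(2^3 - 1), rewrites to (sub x, (shl x, 3)).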
6844 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1))) { 6845 APInt Value = C->getAPIntValue(); 6846 EVT VT = N->getValueType(0); 6847 if (Value.isNonNegative()) { 6848 // (mul x, 2^N + 1) => (add (shl x, N), x) 6849 APInt VM1 = Value - 1; 6850 if (VM1.isPowerOf2()) { 6851 SDValue ShiftedVal = 6852 DAG.getNode(ISD::SHL, SDLoc(N), VT, N->getOperand(0), 6853 DAG.getConstant(VM1.logBase2(), MVT::i64)); 6854 return DAG.getNode(ISD::ADD, SDLoc(N), VT, ShiftedVal, 6855 N->getOperand(0)); 6856 } 6857 // (mul x, 2^N - 1) => (sub (shl x, N), x) 6858 APInt VP1 = Value + 1; 6859 if (VP1.isPowerOf2()) { 6860 SDValue ShiftedVal = 6861 DAG.getNode(ISD::SHL, SDLoc(N), VT, N->getOperand(0), 6862 DAG.getConstant(VP1.logBase2(), MVT::i64)); 6863 return DAG.getNode(ISD::SUB, SDLoc(N), VT, ShiftedVal, 6864 N->getOperand(0)); 6865 } 6866 } else { 6867 // (mul x, -(2^N + 1)) => - (add (shl x, N), x) 6868 APInt VNM1 = -Value - 1; 6869 if (VNM1.isPowerOf2()) { 6870 SDValue ShiftedVal = 6871 DAG.getNode(ISD::SHL, SDLoc(N), VT, N->getOperand(0), 6872 DAG.getConstant(VNM1.logBase2(), MVT::i64)); 6873 SDValue Add = 6874 DAG.getNode(ISD::ADD, SDLoc(N), VT, ShiftedVal, N->getOperand(0)); 6875 return DAG.getNode(ISD::SUB, SDLoc(N), VT, DAG.getConstant(0, VT), Add); 6876 } 6877 // (mul x, -(2^N - 1)) => (sub x, (shl x, N)) 6878 APInt VNP1 = -Value + 1; 6879 if (VNP1.isPowerOf2()) { 6880 SDValue ShiftedVal = 6881 DAG.getNode(ISD::SHL, SDLoc(N), VT, N->getOperand(0), 6882 DAG.getConstant(VNP1.logBase2(), MVT::i64)); 6883 return DAG.getNode(ISD::SUB, SDLoc(N), VT, N->getOperand(0), 6884 ShiftedVal); 6885 } 6886 } 6887 } 6888 return SDValue(); 6889} 6890 6891static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N, 6892 SelectionDAG &DAG) { 6893 // Take advantage of vector comparisons producing 0 or -1 in each lane to 6894 // optimize away operation when it's from a constant. 6895 // 6896 // The general transformation is: 6897 // UNARYOP(AND(VECTOR_CMP(x,y), constant)) --> 6898 // AND(VECTOR_CMP(x,y), constant2) 6899 // constant2 = UNARYOP(constant) 6900 6901 // Early exit if this isn't a vector operation, the operand of the 6902 // unary operation isn't a bitwise AND, or if the sizes of the operations 6903 // aren't the same. 6904 EVT VT = N->getValueType(0); 6905 if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND || 6906 N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC || 6907 VT.getSizeInBits() != N->getOperand(0)->getValueType(0).getSizeInBits()) 6908 return SDValue(); 6909 6910 // Now check that the other operand of the AND is a constant. We could 6911 // make the transformation for non-constant splats as well, but it's unclear 6912 // that would be a benefit as it would not eliminate any operations, just 6913 // perform one more step in scalar code before moving to the vector unit. 6914 if (BuildVectorSDNode *BV = 6915 dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) { 6916 // Bail out if the vector isn't a constant. 6917 if (!BV->isConstant()) 6918 return SDValue(); 6919 6920 // Everything checks out. Build up the new and improved node. 6921 SDLoc DL(N); 6922 EVT IntVT = BV->getValueType(0); 6923 // Create a new constant of the appropriate type for the transformed 6924 // DAG. 6925 SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0)); 6926 // The AND node needs bitcasts to/from an integer vector type around it. 
6927 SDValue MaskConst = DAG.getNode(ISD::BITCAST, DL, IntVT, SourceConst); 6928 SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT, 6929 N->getOperand(0)->getOperand(0), MaskConst); 6930 SDValue Res = DAG.getNode(ISD::BITCAST, DL, VT, NewAnd); 6931 return Res; 6932 } 6933 6934 return SDValue(); 6935} 6936 6937static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG, 6938 const AArch64Subtarget *Subtarget) { 6939 // First try to optimize away the conversion when it's conditionally from 6940 // a constant. Vectors only. 6941 SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG); 6942 if (Res != SDValue()) 6943 return Res; 6944 6945 EVT VT = N->getValueType(0); 6946 if (VT != MVT::f32 && VT != MVT::f64) 6947 return SDValue(); 6948 6949 // Only optimize when the source and destination types have the same width. 6950 if (VT.getSizeInBits() != N->getOperand(0).getValueType().getSizeInBits()) 6951 return SDValue(); 6952 6953 // If the result of an integer load is only used by an integer-to-float 6954 // conversion, use an fp load and an AdvSIMD scalar {S|U}CVTF instead. 6955 // This eliminates an "integer-to-vector-move" UOP and improves throughput. 6956 SDValue N0 = N->getOperand(0); 6957 if (Subtarget->hasNEON() && ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() && 6958 // Do not change the width of a volatile load. 6959 !cast<LoadSDNode>(N0)->isVolatile()) { 6960 LoadSDNode *LN0 = cast<LoadSDNode>(N0); 6961 SDValue Load = DAG.getLoad(VT, SDLoc(N), LN0->getChain(), LN0->getBasePtr(), 6962 LN0->getPointerInfo(), LN0->isVolatile(), 6963 LN0->isNonTemporal(), LN0->isInvariant(), 6964 LN0->getAlignment()); 6965 6966 // Make sure successors of the original load stay after it by updating them 6967 // to use the new Chain. 6968 DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), Load.getValue(1)); 6969 6970 unsigned Opcode = 6971 (N->getOpcode() == ISD::SINT_TO_FP) ? AArch64ISD::SITOF : AArch64ISD::UITOF; 6972 return DAG.getNode(Opcode, SDLoc(N), VT, Load); 6973 } 6974 6975 return SDValue(); 6976} 6977 6978/// An EXTR instruction is made up of two shifts, ORed together. This helper 6979/// searches for and classifies those shifts. 6980static bool findEXTRHalf(SDValue N, SDValue &Src, uint32_t &ShiftAmount, 6981 bool &FromHi) { 6982 if (N.getOpcode() == ISD::SHL) 6983 FromHi = false; 6984 else if (N.getOpcode() == ISD::SRL) 6985 FromHi = true; 6986 else 6987 return false; 6988 6989 if (!isa<ConstantSDNode>(N.getOperand(1))) 6990 return false; 6991 6992 ShiftAmount = N->getConstantOperandVal(1); 6993 Src = N->getOperand(0); 6994 return true; 6995} 6996 6997/// An EXTR instruction extracts a contiguous chunk of bits from two existing 6998/// registers viewed as a high/low pair. This function looks for the pattern: 6999/// (or (shl VAL1, #N), (srl VAL2, #RegWidth-N)) and replaces it with an 7000/// EXTR. Can't quite be done in TableGen because the two immediates aren't 7001/// independent.
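/// For example, on i64 (an illustrative instance of the pattern above):
///   (or (shl x, #48), (srl y, #16))  -->  (EXTR x, y, #16)
/// where the two shift immediates sum to the register width and #16 is the
/// SRL amount that becomes the EXTR immediate.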
7002static SDValue tryCombineToEXTR(SDNode *N, 7003 TargetLowering::DAGCombinerInfo &DCI) { 7004 SelectionDAG &DAG = DCI.DAG; 7005 SDLoc DL(N); 7006 EVT VT = N->getValueType(0); 7007 7008 assert(N->getOpcode() == ISD::OR && "Unexpected root"); 7009 7010 if (VT != MVT::i32 && VT != MVT::i64) 7011 return SDValue(); 7012 7013 SDValue LHS; 7014 uint32_t ShiftLHS = 0; 7015 bool LHSFromHi = false; 7016 if (!findEXTRHalf(N->getOperand(0), LHS, ShiftLHS, LHSFromHi)) 7017 return SDValue(); 7018 7019 SDValue RHS; 7020 uint32_t ShiftRHS = 0; 7021 bool RHSFromHi = false; 7022 if (!findEXTRHalf(N->getOperand(1), RHS, ShiftRHS, RHSFromHi)) 7023 return SDValue(); 7024 7025 // If they're both trying to come from the high part of the register, they're 7026 // not really an EXTR. 7027 if (LHSFromHi == RHSFromHi) 7028 return SDValue(); 7029 7030 if (ShiftLHS + ShiftRHS != VT.getSizeInBits()) 7031 return SDValue(); 7032 7033 if (LHSFromHi) { 7034 std::swap(LHS, RHS); 7035 std::swap(ShiftLHS, ShiftRHS); 7036 } 7037 7038 return DAG.getNode(AArch64ISD::EXTR, DL, VT, LHS, RHS, 7039 DAG.getConstant(ShiftRHS, MVT::i64)); 7040} 7041 7042static SDValue tryCombineToBSL(SDNode *N, 7043 TargetLowering::DAGCombinerInfo &DCI) { 7044 EVT VT = N->getValueType(0); 7045 SelectionDAG &DAG = DCI.DAG; 7046 SDLoc DL(N); 7047 7048 if (!VT.isVector()) 7049 return SDValue(); 7050 7051 SDValue N0 = N->getOperand(0); 7052 if (N0.getOpcode() != ISD::AND) 7053 return SDValue(); 7054 7055 SDValue N1 = N->getOperand(1); 7056 if (N1.getOpcode() != ISD::AND) 7057 return SDValue(); 7058 7059 // We only have to look for constant vectors here since the general, variable 7060 // case can be handled in TableGen. 7061 unsigned Bits = VT.getVectorElementType().getSizeInBits(); 7062 uint64_t BitMask = Bits == 64 ? -1ULL : ((1ULL << Bits) - 1); 7063 for (int i = 1; i >= 0; --i) 7064 for (int j = 1; j >= 0; --j) { 7065 BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(i)); 7066 BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(j)); 7067 if (!BVN0 || !BVN1) 7068 continue; 7069 7070 bool FoundMatch = true; 7071 for (unsigned k = 0; k < VT.getVectorNumElements(); ++k) { 7072 ConstantSDNode *CN0 = dyn_cast<ConstantSDNode>(BVN0->getOperand(k)); 7073 ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(BVN1->getOperand(k)); 7074 if (!CN0 || !CN1 || 7075 CN0->getZExtValue() != (BitMask & ~CN1->getZExtValue())) { 7076 FoundMatch = false; 7077 break; 7078 } 7079 } 7080 7081 if (FoundMatch) 7082 return DAG.getNode(AArch64ISD::BSL, DL, VT, SDValue(BVN0, 0), 7083 N0->getOperand(1 - i), N1->getOperand(1 - j)); 7084 } 7085 7086 return SDValue(); 7087} 7088 7089static SDValue performORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, 7090 const AArch64Subtarget *Subtarget) { 7091 // Attempt to form an EXTR from (or (shl VAL1, #N), (srl VAL2, #RegWidth-N)) 7092 if (!EnableAArch64ExtrGeneration) 7093 return SDValue(); 7094 SelectionDAG &DAG = DCI.DAG; 7095 EVT VT = N->getValueType(0); 7096 7097 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT)) 7098 return SDValue(); 7099 7100 SDValue Res = tryCombineToEXTR(N, DCI); 7101 if (Res.getNode()) 7102 return Res; 7103 7104 Res = tryCombineToBSL(N, DCI); 7105 if (Res.getNode()) 7106 return Res; 7107 7108 return SDValue(); 7109} 7110 7111static SDValue performBitcastCombine(SDNode *N, 7112 TargetLowering::DAGCombinerInfo &DCI, 7113 SelectionDAG &DAG) { 7114 // Wait 'til after everything is legalized to try this. That way we have 7115 // legal vector types and such.
7116 if (DCI.isBeforeLegalizeOps()) 7117 return SDValue(); 7118 7119 // Remove extraneous bitcasts around an extract_subvector. 7120 // For example, 7121 // (v4i16 (bitconvert 7122 // (extract_subvector (v2i64 (bitconvert (v8i16 ...)), (i64 1))))) 7123 // becomes 7124 // (extract_subvector ((v8i16 ...), (i64 4))) 7125 7126 // Only interested in 64-bit vectors as the ultimate result. 7127 EVT VT = N->getValueType(0); 7128 if (!VT.isVector()) 7129 return SDValue(); 7130 if (VT.getSimpleVT().getSizeInBits() != 64) 7131 return SDValue(); 7132 // Is the operand an extract_subvector starting at the beginning or halfway 7133 // point of the vector? A low half may also come through as an 7134 // EXTRACT_SUBREG, so look for that, too. 7135 SDValue Op0 = N->getOperand(0); 7136 if (Op0->getOpcode() != ISD::EXTRACT_SUBVECTOR && 7137 !(Op0->isMachineOpcode() && 7138 Op0->getMachineOpcode() == AArch64::EXTRACT_SUBREG)) 7139 return SDValue(); 7140 uint64_t idx = cast<ConstantSDNode>(Op0->getOperand(1))->getZExtValue(); 7141 if (Op0->getOpcode() == ISD::EXTRACT_SUBVECTOR) { 7142 if (Op0->getValueType(0).getVectorNumElements() != idx && idx != 0) 7143 return SDValue(); 7144 } else if (Op0->getMachineOpcode() == AArch64::EXTRACT_SUBREG) { 7145 if (idx != AArch64::dsub) 7146 return SDValue(); 7147 // The dsub reference is equivalent to a lane zero subvector reference. 7148 idx = 0; 7149 } 7150 // Look through the bitcast of the input to the extract. 7151 if (Op0->getOperand(0)->getOpcode() != ISD::BITCAST) 7152 return SDValue(); 7153 SDValue Source = Op0->getOperand(0)->getOperand(0); 7154 // If the source type has twice the number of elements as our destination 7155 // type, we know this is an extract of the high or low half of the vector. 7156 EVT SVT = Source->getValueType(0); 7157 if (SVT.getVectorNumElements() != VT.getVectorNumElements() * 2) 7158 return SDValue(); 7159 7160 DEBUG(dbgs() << "aarch64-lower: bitcast extract_subvector simplification\n"); 7161 7162 // Create the simplified form to just extract the low or high half of the 7163 // vector directly rather than bothering with the bitcasts. 7164 SDLoc dl(N); 7165 unsigned NumElements = VT.getVectorNumElements(); 7166 if (idx) { 7167 SDValue HalfIdx = DAG.getConstant(NumElements, MVT::i64); 7168 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Source, HalfIdx); 7169 } else { 7170 SDValue SubReg = DAG.getTargetConstant(AArch64::dsub, MVT::i32); 7171 return SDValue(DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, dl, VT, 7172 Source, SubReg), 7173 0); 7174 } 7175} 7176 7177static SDValue performConcatVectorsCombine(SDNode *N, 7178 TargetLowering::DAGCombinerInfo &DCI, 7179 SelectionDAG &DAG) { 7180 // Wait 'til after everything is legalized to try this. That way we have 7181 // legal vector types and such. 7182 if (DCI.isBeforeLegalizeOps()) 7183 return SDValue(); 7184 7185 SDLoc dl(N); 7186 EVT VT = N->getValueType(0); 7187 7188 // If we see a (concat_vectors (v1x64 A), (v1x64 A)) it's really a vector 7189 // splat. The indexed instructions are going to be expecting a DUPLANE64, so 7190 // canonicalise to that. 7191 if (N->getOperand(0) == N->getOperand(1) && VT.getVectorNumElements() == 2) { 7192 assert(VT.getVectorElementType().getSizeInBits() == 64); 7193 return DAG.getNode(AArch64ISD::DUPLANE64, dl, VT, 7194 WidenVector(N->getOperand(0), DAG), 7195 DAG.getConstant(0, MVT::i64)); 7196 } 7197 7198 // Canonicalise concat_vectors so that the right-hand vector has as few 7199 // bit-casts as possible before its real operation. 
The primary matching 7200 // destination for these operations will be the narrowing "2" instructions, 7201 // which depend on the operation being performed on this right-hand vector. 7202 // For example, 7203 // (concat_vectors LHS, (v1i64 (bitconvert (v4i16 RHS)))) 7204 // becomes 7205 // (bitconvert (concat_vectors (v4i16 (bitconvert LHS)), RHS)) 7206 7207 SDValue Op1 = N->getOperand(1); 7208 if (Op1->getOpcode() != ISD::BITCAST) 7209 return SDValue(); 7210 SDValue RHS = Op1->getOperand(0); 7211 MVT RHSTy = RHS.getValueType().getSimpleVT(); 7212 // If the RHS is not a vector, this is not the pattern we're looking for. 7213 if (!RHSTy.isVector()) 7214 return SDValue(); 7215 7216 DEBUG(dbgs() << "aarch64-lower: concat_vectors bitcast simplification\n"); 7217 7218 MVT ConcatTy = MVT::getVectorVT(RHSTy.getVectorElementType(), 7219 RHSTy.getVectorNumElements() * 2); 7220 return DAG.getNode( 7221 ISD::BITCAST, dl, VT, 7222 DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatTy, 7223 DAG.getNode(ISD::BITCAST, dl, RHSTy, N->getOperand(0)), RHS)); 7224} 7225 7226static SDValue tryCombineFixedPointConvert(SDNode *N, 7227 TargetLowering::DAGCombinerInfo &DCI, 7228 SelectionDAG &DAG) { 7229 // Wait 'til after everything is legalized to try this. That way we have 7230 // legal vector types and such. 7231 if (DCI.isBeforeLegalizeOps()) 7232 return SDValue(); 7233 // Transform a scalar conversion of a value from a lane extract into a 7234 // lane extract of a vector conversion. E.g., from foo1 to foo2: 7235 // double foo1(int64x2_t a) { return vcvtd_n_f64_s64(a[1], 9); } 7236 // double foo2(int64x2_t a) { return vcvtq_n_f64_s64(a, 9)[1]; } 7237 // 7238 // The second form interacts better with instruction selection and the 7239 // register allocator to avoid cross-class register copies that aren't 7240 // coalescable due to a lane reference. 7241 7242 // Check the operand and see if it originates from a lane extract. 7243 SDValue Op1 = N->getOperand(1); 7244 if (Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT) { 7245 // Yep, no additional predication needed. Perform the transform. 7246 SDValue IID = N->getOperand(0); 7247 SDValue Shift = N->getOperand(2); 7248 SDValue Vec = Op1.getOperand(0); 7249 SDValue Lane = Op1.getOperand(1); 7250 EVT ResTy = N->getValueType(0); 7251 EVT VecResTy; 7252 SDLoc DL(N); 7253 7254 // The vector width should be 128 bits by the time we get here, even 7255 // if it started as 64 bits (the extract_vector handling will have 7256 // done so). 7257 assert(Vec.getValueType().getSizeInBits() == 128 && 7258 "unexpected vector size on extract_vector_elt!"); 7259 if (Vec.getValueType() == MVT::v4i32) 7260 VecResTy = MVT::v4f32; 7261 else if (Vec.getValueType() == MVT::v2i64) 7262 VecResTy = MVT::v2f64; 7263 else 7264 llvm_unreachable("unexpected vector type!"); 7265 7266 SDValue Convert = 7267 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VecResTy, IID, Vec, Shift); 7268 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResTy, Convert, Lane); 7269 } 7270 return SDValue(); 7271} 7272 7273// AArch64 high-vector "long" operations are formed by performing the non-high 7274// version on an extract_subvector of each operand which gets the high half: 7275// 7276// (longop2 LHS, RHS) == (longop (extract_high LHS), (extract_high RHS)) 7277// 7278// However, there are cases which don't have an extract_high explicitly, but 7279// have another operation that can be made compatible with one for free. 
For 7280// example: 7281// 7282// (dupv64 scalar) --> (extract_high (dup128 scalar)) 7283// 7284// This routine does the actual conversion of such DUPs, once outer routines 7285// have determined that everything else is in order. 7286static SDValue tryExtendDUPToExtractHigh(SDValue N, SelectionDAG &DAG) { 7287 // We can handle most types of duplicate, but the lane ones have an extra 7288 // operand saying *which* lane, so we need to know. 7289 bool IsDUPLANE; 7290 switch (N.getOpcode()) { 7291 case AArch64ISD::DUP: 7292 IsDUPLANE = false; 7293 break; 7294 case AArch64ISD::DUPLANE8: 7295 case AArch64ISD::DUPLANE16: 7296 case AArch64ISD::DUPLANE32: 7297 case AArch64ISD::DUPLANE64: 7298 IsDUPLANE = true; 7299 break; 7300 default: 7301 return SDValue(); 7302 } 7303 7304 MVT NarrowTy = N.getSimpleValueType(); 7305 if (!NarrowTy.is64BitVector()) 7306 return SDValue(); 7307 7308 MVT ElementTy = NarrowTy.getVectorElementType(); 7309 unsigned NumElems = NarrowTy.getVectorNumElements(); 7310 MVT NewDUPVT = MVT::getVectorVT(ElementTy, NumElems * 2); 7311 7312 SDValue NewDUP; 7313 if (IsDUPLANE) 7314 NewDUP = DAG.getNode(N.getOpcode(), SDLoc(N), NewDUPVT, N.getOperand(0), 7315 N.getOperand(1)); 7316 else 7317 NewDUP = DAG.getNode(AArch64ISD::DUP, SDLoc(N), NewDUPVT, N.getOperand(0)); 7318 7319 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N.getNode()), NarrowTy, 7320 NewDUP, DAG.getConstant(NumElems, MVT::i64)); 7321} 7322 7323static bool isEssentiallyExtractSubvector(SDValue N) { 7324 if (N.getOpcode() == ISD::EXTRACT_SUBVECTOR) 7325 return true; 7326 7327 return N.getOpcode() == ISD::BITCAST && 7328 N.getOperand(0).getOpcode() == ISD::EXTRACT_SUBVECTOR; 7329} 7330 7331/// \brief Helper structure to keep track of ISD::SET_CC operands. 7332struct GenericSetCCInfo { 7333 const SDValue *Opnd0; 7334 const SDValue *Opnd1; 7335 ISD::CondCode CC; 7336}; 7337 7338/// \brief Helper structure to keep track of a SET_CC lowered into AArch64 code. 7339struct AArch64SetCCInfo { 7340 const SDValue *Cmp; 7341 AArch64CC::CondCode CC; 7342}; 7343 7344/// \brief Helper structure to keep track of SetCC information. 7345union SetCCInfo { 7346 GenericSetCCInfo Generic; 7347 AArch64SetCCInfo AArch64; 7348}; 7349 7350/// \brief Helper structure to be able to read SetCC information. If the 7351/// IsAArch64 field is set to true, Info is an AArch64SetCCInfo; otherwise Info 7352/// is a GenericSetCCInfo. 7353struct SetCCInfoAndKind { 7354 SetCCInfo Info; 7355 bool IsAArch64; 7356}; 7357 7358/// \brief Check whether or not \p Op is a SET_CC operation, either a generic 7359/// or an AArch64 lowered one. 7361/// \p SetCCInfo is filled accordingly. 7362/// \post SetCCInfo is meaningful only when this function returns true. 7363/// \return True when Op is a kind of SET_CC operation. 7364static bool isSetCC(SDValue Op, SetCCInfoAndKind &SetCCInfo) { 7365 // If this is a setcc, this is straightforward. 7366 if (Op.getOpcode() == ISD::SETCC) { 7367 SetCCInfo.Info.Generic.Opnd0 = &Op.getOperand(0); 7368 SetCCInfo.Info.Generic.Opnd1 = &Op.getOperand(1); 7369 SetCCInfo.Info.Generic.CC = cast<CondCodeSDNode>(Op.getOperand(2))->get(); 7370 SetCCInfo.IsAArch64 = false; 7371 return true; 7372 } 7373 // Otherwise, check if this is a matching csel instruction. 7374 // In other words: 7375 // - csel 1, 0, cc 7376 // - csel 0, 1, !cc 7377 if (Op.getOpcode() != AArch64ISD::CSEL) 7378 return false; 7379 // Set the information about the operands.
7380 // TODO: we want the operands of the Cmp not the csel 7381 SetCCInfo.Info.AArch64.Cmp = &Op.getOperand(3); 7382 SetCCInfo.IsAArch64 = true; 7383 SetCCInfo.Info.AArch64.CC = static_cast<AArch64CC::CondCode>( 7384 cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue()); 7385 7386 // Check that the operands match the constraints: 7387 // (1) Both operands must be constants. 7388 // (2) One must be 1 and the other must be 0. 7389 ConstantSDNode *TValue = dyn_cast<ConstantSDNode>(Op.getOperand(0)); 7390 ConstantSDNode *FValue = dyn_cast<ConstantSDNode>(Op.getOperand(1)); 7391 7392 // Check (1). 7393 if (!TValue || !FValue) 7394 return false; 7395 7396 // Check (2). 7397 if (!TValue->isOne()) { 7398 // Update the comparison when we are interested in !cc. 7399 std::swap(TValue, FValue); 7400 SetCCInfo.Info.AArch64.CC = 7401 AArch64CC::getInvertedCondCode(SetCCInfo.Info.AArch64.CC); 7402 } 7403 return TValue->isOne() && FValue->isNullValue(); 7404} 7405 7406// Returns true if Op is a setcc or a zext of a setcc. 7407static bool isSetCCOrZExtSetCC(const SDValue& Op, SetCCInfoAndKind &Info) { 7408 if (isSetCC(Op, Info)) 7409 return true; 7410 return ((Op.getOpcode() == ISD::ZERO_EXTEND) && 7411 isSetCC(Op->getOperand(0), Info)); 7412} 7413 7414// The folding we want to perform is: 7415// (add x, [zext] (setcc cc ...) ) 7416// --> 7417// (csel x, (add x, 1), !cc ...) 7418// 7419// The latter will get matched to a CSINC instruction. 7420static SDValue performSetccAddFolding(SDNode *Op, SelectionDAG &DAG) { 7421 assert(Op && Op->getOpcode() == ISD::ADD && "Unexpected operation!"); 7422 SDValue LHS = Op->getOperand(0); 7423 SDValue RHS = Op->getOperand(1); 7424 SetCCInfoAndKind InfoAndKind; 7425 7426 // If neither operand is a SET_CC, give up. 7427 if (!isSetCCOrZExtSetCC(LHS, InfoAndKind)) { 7428 std::swap(LHS, RHS); 7429 if (!isSetCCOrZExtSetCC(LHS, InfoAndKind)) 7430 return SDValue(); 7431 } 7432 7433 // FIXME: This could be generalized to work for FP comparisons. 7434 EVT CmpVT = InfoAndKind.IsAArch64 7435 ? InfoAndKind.Info.AArch64.Cmp->getOperand(0).getValueType() 7436 : InfoAndKind.Info.Generic.Opnd0->getValueType(); 7437 if (CmpVT != MVT::i32 && CmpVT != MVT::i64) 7438 return SDValue(); 7439 7440 SDValue CCVal; 7441 SDValue Cmp; 7442 SDLoc dl(Op); 7443 if (InfoAndKind.IsAArch64) { 7444 CCVal = DAG.getConstant( 7445 AArch64CC::getInvertedCondCode(InfoAndKind.Info.AArch64.CC), MVT::i32); 7446 Cmp = *InfoAndKind.Info.AArch64.Cmp; 7447 } else 7448 Cmp = getAArch64Cmp(*InfoAndKind.Info.Generic.Opnd0, 7449 *InfoAndKind.Info.Generic.Opnd1, 7450 ISD::getSetCCInverse(InfoAndKind.Info.Generic.CC, true), 7451 CCVal, DAG, dl); 7452 7453 EVT VT = Op->getValueType(0); 7454 LHS = DAG.getNode(ISD::ADD, dl, VT, RHS, DAG.getConstant(1, VT)); 7455 return DAG.getNode(AArch64ISD::CSEL, dl, VT, RHS, LHS, CCVal, Cmp); 7456} 7457 7458// The basic add/sub long vector instructions have variants with "2" on the end 7459// which act on the high-half of their inputs. They are normally matched by 7460// patterns like: 7461// 7462// (add (zeroext (extract_high LHS)), 7463// (zeroext (extract_high RHS))) 7464// -> uaddl2 vD, vN, vM 7465// 7466// However, if one of the extracts is something like a duplicate, this 7467// instruction can still be used profitably. This function puts the DAG into a 7468// more appropriate form for those patterns to trigger.
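// For example (an illustrative sketch), with LHS already an extract_high:
//   (add (zext (extract_high LHS)), (zext (dupv64 scalar)))
// is rewritten below into
//   (add (zext (extract_high LHS)),
//        (zext (extract_high (dup128 scalar))))
// so that the existing uaddl2-style patterns can match it.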
7469static SDValue performAddSubLongCombine(SDNode *N, 7470 TargetLowering::DAGCombinerInfo &DCI, 7471 SelectionDAG &DAG) { 7472 if (DCI.isBeforeLegalizeOps()) 7473 return SDValue(); 7474 7475 MVT VT = N->getSimpleValueType(0); 7476 if (!VT.is128BitVector()) { 7477 if (N->getOpcode() == ISD::ADD) 7478 return performSetccAddFolding(N, DAG); 7479 return SDValue(); 7480 } 7481 7482 // Make sure both branches are extended in the same way. 7483 SDValue LHS = N->getOperand(0); 7484 SDValue RHS = N->getOperand(1); 7485 if ((LHS.getOpcode() != ISD::ZERO_EXTEND && 7486 LHS.getOpcode() != ISD::SIGN_EXTEND) || 7487 LHS.getOpcode() != RHS.getOpcode()) 7488 return SDValue(); 7489 7490 unsigned ExtType = LHS.getOpcode(); 7491 7492 // It's not worth doing if at least one of the inputs isn't already an 7493 // extract, but we don't know which it'll be so we have to try both. 7494 if (isEssentiallyExtractSubvector(LHS.getOperand(0))) { 7495 RHS = tryExtendDUPToExtractHigh(RHS.getOperand(0), DAG); 7496 if (!RHS.getNode()) 7497 return SDValue(); 7498 7499 RHS = DAG.getNode(ExtType, SDLoc(N), VT, RHS); 7500 } else if (isEssentiallyExtractSubvector(RHS.getOperand(0))) { 7501 LHS = tryExtendDUPToExtractHigh(LHS.getOperand(0), DAG); 7502 if (!LHS.getNode()) 7503 return SDValue(); 7504 7505 LHS = DAG.getNode(ExtType, SDLoc(N), VT, LHS); 7506 } 7507 7508 return DAG.getNode(N->getOpcode(), SDLoc(N), VT, LHS, RHS); 7509} 7510 7511// Massage DAGs which we can use the high-half "long" operations on into 7512// something isel will recognize better. E.g. 7513// 7514// (aarch64_neon_umull (extract_high vec) (dupv64 scalar)) --> 7515// (aarch64_neon_umull (extract_high (v2i64 vec))) 7516// (extract_high (v2i64 (dup128 scalar))))) 7517// 7518static SDValue tryCombineLongOpWithDup(unsigned IID, SDNode *N, 7519 TargetLowering::DAGCombinerInfo &DCI, 7520 SelectionDAG &DAG) { 7521 if (DCI.isBeforeLegalizeOps()) 7522 return SDValue(); 7523 7524 SDValue LHS = N->getOperand(1); 7525 SDValue RHS = N->getOperand(2); 7526 assert(LHS.getValueType().is64BitVector() && 7527 RHS.getValueType().is64BitVector() && 7528 "unexpected shape for long operation"); 7529 7530 // Either node could be a DUP, but it's not worth doing both of them (you'd 7531 // just as well use the non-high version) so look for a corresponding extract 7532 // operation on the other "wing". 
7533 if (isEssentiallyExtractSubvector(LHS)) { 7534 RHS = tryExtendDUPToExtractHigh(RHS, DAG); 7535 if (!RHS.getNode()) 7536 return SDValue(); 7537 } else if (isEssentiallyExtractSubvector(RHS)) { 7538 LHS = tryExtendDUPToExtractHigh(LHS, DAG); 7539 if (!LHS.getNode()) 7540 return SDValue(); 7541 } 7542 7543 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), N->getValueType(0), 7544 N->getOperand(0), LHS, RHS); 7545} 7546 7547static SDValue tryCombineShiftImm(unsigned IID, SDNode *N, SelectionDAG &DAG) { 7548 MVT ElemTy = N->getSimpleValueType(0).getScalarType(); 7549 unsigned ElemBits = ElemTy.getSizeInBits(); 7550 7551 int64_t ShiftAmount; 7552 if (BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(2))) { 7553 APInt SplatValue, SplatUndef; 7554 unsigned SplatBitSize; 7555 bool HasAnyUndefs; 7556 if (!BVN->isConstantSplat(SplatValue, SplatUndef, SplatBitSize, 7557 HasAnyUndefs, ElemBits) || 7558 SplatBitSize != ElemBits) 7559 return SDValue(); 7560 7561 ShiftAmount = SplatValue.getSExtValue(); 7562 } else if (ConstantSDNode *CVN = dyn_cast<ConstantSDNode>(N->getOperand(2))) { 7563 ShiftAmount = CVN->getSExtValue(); 7564 } else 7565 return SDValue(); 7566 7567 unsigned Opcode; 7568 bool IsRightShift; 7569 switch (IID) { 7570 default: 7571 llvm_unreachable("Unknown shift intrinsic"); 7572 case Intrinsic::aarch64_neon_sqshl: 7573 Opcode = AArch64ISD::SQSHL_I; 7574 IsRightShift = false; 7575 break; 7576 case Intrinsic::aarch64_neon_uqshl: 7577 Opcode = AArch64ISD::UQSHL_I; 7578 IsRightShift = false; 7579 break; 7580 case Intrinsic::aarch64_neon_srshl: 7581 Opcode = AArch64ISD::SRSHR_I; 7582 IsRightShift = true; 7583 break; 7584 case Intrinsic::aarch64_neon_urshl: 7585 Opcode = AArch64ISD::URSHR_I; 7586 IsRightShift = true; 7587 break; 7588 case Intrinsic::aarch64_neon_sqshlu: 7589 Opcode = AArch64ISD::SQSHLU_I; 7590 IsRightShift = false; 7591 break; 7592 } 7593 7594 if (IsRightShift && ShiftAmount <= -1 && ShiftAmount >= -(int)ElemBits) 7595 return DAG.getNode(Opcode, SDLoc(N), N->getValueType(0), N->getOperand(1), 7596 DAG.getConstant(-ShiftAmount, MVT::i32)); 7597 else if (!IsRightShift && ShiftAmount >= 0 && ShiftAmount < ElemBits) 7598 return DAG.getNode(Opcode, SDLoc(N), N->getValueType(0), N->getOperand(1), 7599 DAG.getConstant(ShiftAmount, MVT::i32)); 7600 7601 return SDValue(); 7602} 7603 7604// The CRC32[BH] instructions ignore the high bits of their data operand. Since 7605// the intrinsics must be legal and take an i32, this means there's almost 7606// certainly going to be a zext in the DAG which we can eliminate. 
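// For example (illustrative DAG shapes):
//   (int_aarch64_crc32b crc, (and data, 0xff))
// can drop the mask and become
//   (int_aarch64_crc32b crc, data)
// because crc32b only reads the low 8 bits of its data operand anyway.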
7607static SDValue tryCombineCRC32(unsigned Mask, SDNode *N, SelectionDAG &DAG) { 7608 SDValue AndN = N->getOperand(2); 7609 if (AndN.getOpcode() != ISD::AND) 7610 return SDValue(); 7611 7612 ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(AndN.getOperand(1)); 7613 if (!CMask || CMask->getZExtValue() != Mask) 7614 return SDValue(); 7615 7616 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), MVT::i32, 7617 N->getOperand(0), N->getOperand(1), AndN.getOperand(0)); 7618} 7619 7620static SDValue performIntrinsicCombine(SDNode *N, 7621 TargetLowering::DAGCombinerInfo &DCI, 7622 const AArch64Subtarget *Subtarget) { 7623 SelectionDAG &DAG = DCI.DAG; 7624 unsigned IID = getIntrinsicID(N); 7625 switch (IID) { 7626 default: 7627 break; 7628 case Intrinsic::aarch64_neon_vcvtfxs2fp: 7629 case Intrinsic::aarch64_neon_vcvtfxu2fp: 7630 return tryCombineFixedPointConvert(N, DCI, DAG); 7632 case Intrinsic::aarch64_neon_fmax: 7633 return DAG.getNode(AArch64ISD::FMAX, SDLoc(N), N->getValueType(0), 7634 N->getOperand(1), N->getOperand(2)); 7635 case Intrinsic::aarch64_neon_fmin: 7636 return DAG.getNode(AArch64ISD::FMIN, SDLoc(N), N->getValueType(0), 7637 N->getOperand(1), N->getOperand(2)); 7638 case Intrinsic::aarch64_neon_smull: 7639 case Intrinsic::aarch64_neon_umull: 7640 case Intrinsic::aarch64_neon_pmull: 7641 case Intrinsic::aarch64_neon_sqdmull: 7642 return tryCombineLongOpWithDup(IID, N, DCI, DAG); 7643 case Intrinsic::aarch64_neon_sqshl: 7644 case Intrinsic::aarch64_neon_uqshl: 7645 case Intrinsic::aarch64_neon_sqshlu: 7646 case Intrinsic::aarch64_neon_srshl: 7647 case Intrinsic::aarch64_neon_urshl: 7648 return tryCombineShiftImm(IID, N, DAG); 7649 case Intrinsic::aarch64_crc32b: 7650 case Intrinsic::aarch64_crc32cb: 7651 return tryCombineCRC32(0xff, N, DAG); 7652 case Intrinsic::aarch64_crc32h: 7653 case Intrinsic::aarch64_crc32ch: 7654 return tryCombineCRC32(0xffff, N, DAG); 7655 } 7656 return SDValue(); 7657} 7658 7659static SDValue performExtendCombine(SDNode *N, 7660 TargetLowering::DAGCombinerInfo &DCI, 7661 SelectionDAG &DAG) { 7662 // If we see something like (zext (sabd (extract_high ...), (DUP ...))) then 7663 // we can convert that DUP into another extract_high (of a bigger DUP), which 7664 // helps the backend to decide that an sabdl2 would be useful, saving a real 7665 // extract_high operation. 7666 if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ZERO_EXTEND && 7667 N->getOperand(0).getOpcode() == ISD::INTRINSIC_WO_CHAIN) { 7668 SDNode *ABDNode = N->getOperand(0).getNode(); 7669 unsigned IID = getIntrinsicID(ABDNode); 7670 if (IID == Intrinsic::aarch64_neon_sabd || 7671 IID == Intrinsic::aarch64_neon_uabd) { 7672 SDValue NewABD = tryCombineLongOpWithDup(IID, ABDNode, DCI, DAG); 7673 if (!NewABD.getNode()) 7674 return SDValue(); 7675 7676 return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), 7677 NewABD); 7678 } 7679 } 7680 7681 // This is effectively a custom type legalization for AArch64. 7682 // 7683 // Type legalization will split an extend of a small, legal, type to a larger 7684 // illegal type by first splitting the destination type, often creating 7685 // illegal source types, which then get legalized in isel-confusing ways, 7686 // leading to really terrible codegen. E.g., 7687 // %result = v8i32 sext v8i8 %value 7688 // becomes 7689 // %losrc = extract_subreg %value, ... 7690 // %hisrc = extract_subreg %value, ... 7691 // %lo = v4i32 sext v4i8 %losrc 7692 // %hi = v4i32 sext v4i8 %hisrc 7693 // Things go rapidly downhill from there.
7694 // 7695 // For AArch64, the [sz]ext vector instructions can only go up one element 7696 // size, so we can, e.g., extend from i8 to i16, but going from i8 to i32 7697 // takes two instructions. 7698 // 7699 // This implies that the most efficient way to do the extend from v8i8 7700 // to two v4i32 values is to first extend the v8i8 to v8i16, then let 7701 // the normal splitting happen for the v8i16->v8i32. 7702 7703 // This is pre-legalization to catch some cases where the default 7704 // type legalization will create ill-tempered code. 7705 if (!DCI.isBeforeLegalizeOps()) 7706 return SDValue(); 7707 7708 // We're only interested in cleaning things up for non-legal vector types 7709 // here. If both the source and destination are legal, things will just 7710 // work naturally without any fiddling. 7711 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 7712 EVT ResVT = N->getValueType(0); 7713 if (!ResVT.isVector() || TLI.isTypeLegal(ResVT)) 7714 return SDValue(); 7715 // If the vector type isn't a simple VT, it's beyond the scope of what 7716 // we're worried about here. Let legalization do its thing and hope for 7717 // the best. 7718 SDValue Src = N->getOperand(0); 7719 EVT SrcVT = Src->getValueType(0); 7720 if (!ResVT.isSimple() || !SrcVT.isSimple()) 7721 return SDValue(); 7722 7723 // If the source VT is a 64-bit vector, we can play games and get the 7724 // better results we want. 7725 if (SrcVT.getSizeInBits() != 64) 7726 return SDValue(); 7727 7728 unsigned SrcEltSize = SrcVT.getVectorElementType().getSizeInBits(); 7729 unsigned ElementCount = SrcVT.getVectorNumElements(); 7730 SrcVT = MVT::getVectorVT(MVT::getIntegerVT(SrcEltSize * 2), ElementCount); 7731 SDLoc DL(N); 7732 Src = DAG.getNode(N->getOpcode(), DL, SrcVT, Src); 7733 7734 // Now split the rest of the operation into two halves, each with a 64 7735 // bit source. 7736 EVT LoVT, HiVT; 7737 SDValue Lo, Hi; 7738 unsigned NumElements = ResVT.getVectorNumElements(); 7739 assert(!(NumElements & 1) && "Splitting vector, but not in half!"); 7740 LoVT = HiVT = EVT::getVectorVT(*DAG.getContext(), 7741 ResVT.getVectorElementType(), NumElements / 2); 7742 7743 EVT InNVT = EVT::getVectorVT(*DAG.getContext(), SrcVT.getVectorElementType(), 7744 LoVT.getVectorNumElements()); 7745 Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InNVT, Src, 7746 DAG.getConstant(0, MVT::i64)); 7747 Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InNVT, Src, 7748 DAG.getConstant(InNVT.getVectorNumElements(), MVT::i64)); 7749 Lo = DAG.getNode(N->getOpcode(), DL, LoVT, Lo); 7750 Hi = DAG.getNode(N->getOpcode(), DL, HiVT, Hi); 7751 7752 // Now combine the parts back together so we still have a single result 7753 // like the combiner expects. 7754 return DAG.getNode(ISD::CONCAT_VECTORS, DL, ResVT, Lo, Hi); 7755} 7756 7757/// Replace a store of a splatted vector by scalar stores of the splatted 7758/// scalar value. The load/store optimizer pass will merge them into store 7759/// pairs. This has better performance than a splat of the scalar followed by 7760/// a split vector store. Even if the stores are not merged, it is four stores 7761/// vs. a dup followed by an ext.b and two stores. 7762static SDValue replaceSplatVectorStore(SelectionDAG &DAG, StoreSDNode *St) { 7763 SDValue StVal = St->getValue(); 7764 EVT VT = StVal.getValueType(); 7765 7766 // Don't replace floating point stores; they possibly won't be transformed to 7767 // stp because of the store pair suppress pass.
7768 if (VT.isFloatingPoint()) 7769 return SDValue(); 7770 7771 // Check for insert vector elements. 7772 if (StVal.getOpcode() != ISD::INSERT_VECTOR_ELT) 7773 return SDValue(); 7774 7775 // We can express a splat as store pair(s) for 2 or 4 elements. 7776 unsigned NumVecElts = VT.getVectorNumElements(); 7777 if (NumVecElts != 4 && NumVecElts != 2) 7778 return SDValue(); 7779 SDValue SplatVal = StVal.getOperand(1); 7780 unsigned RemainInsertElts = NumVecElts - 1; 7781 7782 // Check that this is a splat. 7783 while (--RemainInsertElts) { 7784 SDValue NextInsertElt = StVal.getOperand(0); 7785 if (NextInsertElt.getOpcode() != ISD::INSERT_VECTOR_ELT) 7786 return SDValue(); 7787 if (NextInsertElt.getOperand(1) != SplatVal) 7788 return SDValue(); 7789 StVal = NextInsertElt; 7790 } 7791 unsigned OrigAlignment = St->getAlignment(); 7792 unsigned EltOffset = NumVecElts == 4 ? 4 : 8; 7793 unsigned Alignment = std::min(OrigAlignment, EltOffset); 7794 7795 // Create scalar stores. This is at least as good as the code sequence for a 7796 // split unaligned store which is a dup.s, ext.b, and two stores. 7797 // Most of the time the three stores should be replaced by store pair 7798 // instructions (stp). 7799 SDLoc DL(St); 7800 SDValue BasePtr = St->getBasePtr(); 7801 SDValue NewST1 = 7802 DAG.getStore(St->getChain(), DL, SplatVal, BasePtr, St->getPointerInfo(), 7803 St->isVolatile(), St->isNonTemporal(), St->getAlignment()); 7804 7805 unsigned Offset = EltOffset; 7806 while (--NumVecElts) { 7807 SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, 7808 DAG.getConstant(Offset, MVT::i64)); 7809 NewST1 = DAG.getStore(NewST1.getValue(0), DL, SplatVal, OffsetPtr, 7810 St->getPointerInfo(), St->isVolatile(), 7811 St->isNonTemporal(), Alignment); 7812 Offset += EltOffset; 7813 } 7814 return NewST1; 7815} 7816 7817static SDValue performSTORECombine(SDNode *N, 7818 TargetLowering::DAGCombinerInfo &DCI, 7819 SelectionDAG &DAG, 7820 const AArch64Subtarget *Subtarget) { 7821 if (!DCI.isBeforeLegalize()) 7822 return SDValue(); 7823 7824 StoreSDNode *S = cast<StoreSDNode>(N); 7825 if (S->isVolatile()) 7826 return SDValue(); 7827 7828 // Cyclone has bad performance on unaligned 16B stores when crossing line and 7829 // page boundaries. We want to split such stores. 7830 if (!Subtarget->isCyclone()) 7831 return SDValue(); 7832 7833 // Don't split at Oz. 7834 MachineFunction &MF = DAG.getMachineFunction(); 7835 bool IsMinSize = MF.getFunction()->getAttributes().hasAttribute( 7836 AttributeSet::FunctionIndex, Attribute::MinSize); 7837 if (IsMinSize) 7838 return SDValue(); 7839 7840 SDValue StVal = S->getValue(); 7841 EVT VT = StVal.getValueType(); 7842 7843 // Don't split v2i64 vectors. Memcpy lowering produces those and splitting 7844 // those up regresses performance on micro-benchmarks and olden/bh. 7845 if (!VT.isVector() || VT.getVectorNumElements() < 2 || VT == MVT::v2i64) 7846 return SDValue(); 7847 7848 // Split unaligned 16B stores. They are terrible for performance. 7849 // Don't split stores with alignment of 1 or 2. Code that uses clang vector 7850 // extensions can use this to mark that it does not want splitting to happen 7851 // (by underspecifying alignment to be 1 or 2). Furthermore, the chance of 7852 // eliminating alignment hazards is only 1 in 8 for alignment of 2. 7853 if (VT.getSizeInBits() != 128 || S->getAlignment() >= 16 || 7854 S->getAlignment() <= 2) 7855 return SDValue(); 7856 7857 // If we get a splat of a scalar, convert this vector store to a store of 7858 // scalars.
They will be merged into store pairs thereby removing two 7859 // instructions. 7860 SDValue ReplacedSplat = replaceSplatVectorStore(DAG, S); 7861 if (ReplacedSplat != SDValue()) 7862 return ReplacedSplat; 7863 7864 SDLoc DL(S); 7865 unsigned NumElts = VT.getVectorNumElements() / 2; 7866 // Split VT into two. 7867 EVT HalfVT = 7868 EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), NumElts); 7869 SDValue SubVector0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal, 7870 DAG.getConstant(0, MVT::i64)); 7871 SDValue SubVector1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal, 7872 DAG.getConstant(NumElts, MVT::i64)); 7873 SDValue BasePtr = S->getBasePtr(); 7874 SDValue NewST1 = 7875 DAG.getStore(S->getChain(), DL, SubVector0, BasePtr, S->getPointerInfo(), 7876 S->isVolatile(), S->isNonTemporal(), S->getAlignment()); 7877 SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, 7878 DAG.getConstant(8, MVT::i64)); 7879 return DAG.getStore(NewST1.getValue(0), DL, SubVector1, OffsetPtr, 7880 S->getPointerInfo(), S->isVolatile(), S->isNonTemporal(), 7881 S->getAlignment()); 7882} 7883 7884/// Target-specific DAG combine function for post-increment LD1 (lane) and 7885/// post-increment LD1R. 7886static SDValue performPostLD1Combine(SDNode *N, 7887 TargetLowering::DAGCombinerInfo &DCI, 7888 bool IsLaneOp) { 7889 if (DCI.isBeforeLegalizeOps()) 7890 return SDValue(); 7891 7892 SelectionDAG &DAG = DCI.DAG; 7893 EVT VT = N->getValueType(0); 7894 7895 unsigned LoadIdx = IsLaneOp ? 1 : 0; 7896 SDNode *LD = N->getOperand(LoadIdx).getNode(); 7897 // If it is not a LOAD, we cannot do such a combine. 7898 if (LD->getOpcode() != ISD::LOAD) 7899 return SDValue(); 7900 7901 LoadSDNode *LoadSDN = cast<LoadSDNode>(LD); 7902 EVT MemVT = LoadSDN->getMemoryVT(); 7903 // Check if the memory operand is the same type as the vector element. 7904 if (MemVT != VT.getVectorElementType()) 7905 return SDValue(); 7906 7907 // Check if there are other uses. If so, do not combine as it will introduce 7908 // an extra load. 7909 for (SDNode::use_iterator UI = LD->use_begin(), UE = LD->use_end(); UI != UE; 7910 ++UI) { 7911 if (UI.getUse().getResNo() == 1) // Ignore uses of the chain result. 7912 continue; 7913 if (*UI != N) 7914 return SDValue(); 7915 } 7916 7917 SDValue Addr = LD->getOperand(1); 7918 SDValue Vector = N->getOperand(0); 7919 // Search for a use of the address operand that is an increment. 7920 for (SDNode::use_iterator UI = Addr.getNode()->use_begin(), UE = 7921 Addr.getNode()->use_end(); UI != UE; ++UI) { 7922 SDNode *User = *UI; 7923 if (User->getOpcode() != ISD::ADD 7924 || UI.getUse().getResNo() != Addr.getResNo()) 7925 continue; 7926 7927 // Check that the add is independent of the load. Otherwise, folding it 7928 // would create a cycle. 7929 if (User->isPredecessorOf(LD) || LD->isPredecessorOf(User)) 7930 continue; 7931 // Also check that the add is not used in the vector operand. This would 7932 // also create a cycle. 7933 if (User->isPredecessorOf(Vector.getNode())) 7934 continue; 7935 7936 // If the increment is a constant, it must match the memory ref size. 7937 SDValue Inc = User->getOperand(User->getOperand(0) == Addr ?
1 : 0); 7938 if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) { 7939 uint32_t IncVal = CInc->getZExtValue(); 7940 unsigned NumBytes = VT.getScalarSizeInBits() / 8; 7941 if (IncVal != NumBytes) 7942 continue; 7943 Inc = DAG.getRegister(AArch64::XZR, MVT::i64); 7944 } 7945 7946 SmallVector<SDValue, 8> Ops; 7947 Ops.push_back(LD->getOperand(0)); // Chain 7948 if (IsLaneOp) { 7949 Ops.push_back(Vector); // The vector to be inserted 7950 Ops.push_back(N->getOperand(2)); // The lane to be inserted in the vector 7951 } 7952 Ops.push_back(Addr); 7953 Ops.push_back(Inc); 7954 7955 EVT Tys[3] = { VT, MVT::i64, MVT::Other }; 7956 SDVTList SDTys = DAG.getVTList(Tys); 7957 unsigned NewOp = IsLaneOp ? AArch64ISD::LD1LANEpost : AArch64ISD::LD1DUPpost; 7958 SDValue UpdN = DAG.getMemIntrinsicNode(NewOp, SDLoc(N), SDTys, Ops, 7959 MemVT, 7960 LoadSDN->getMemOperand()); 7961 7962 // Update the uses. 7963 std::vector<SDValue> NewResults; 7964 NewResults.push_back(SDValue(LD, 0)); // The result of load 7965 NewResults.push_back(SDValue(UpdN.getNode(), 2)); // Chain 7966 DCI.CombineTo(LD, NewResults); 7967 DCI.CombineTo(N, SDValue(UpdN.getNode(), 0)); // Dup/Inserted Result 7968 DCI.CombineTo(User, SDValue(UpdN.getNode(), 1)); // Write back register 7969 7970 break; 7971 } 7972 return SDValue(); 7973} 7974 7975/// Target-specific DAG combine function for NEON load/store intrinsics 7976/// to merge base address updates. 7977static SDValue performNEONPostLDSTCombine(SDNode *N, 7978 TargetLowering::DAGCombinerInfo &DCI, 7979 SelectionDAG &DAG) { 7980 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) 7981 return SDValue(); 7982 7983 unsigned AddrOpIdx = N->getNumOperands() - 1; 7984 SDValue Addr = N->getOperand(AddrOpIdx); 7985 7986 // Search for a use of the address operand that is an increment. 7987 for (SDNode::use_iterator UI = Addr.getNode()->use_begin(), 7988 UE = Addr.getNode()->use_end(); UI != UE; ++UI) { 7989 SDNode *User = *UI; 7990 if (User->getOpcode() != ISD::ADD || 7991 UI.getUse().getResNo() != Addr.getResNo()) 7992 continue; 7993 7994 // Check that the add is independent of the load/store. Otherwise, folding 7995 // it would create a cycle. 7996 if (User->isPredecessorOf(N) || N->isPredecessorOf(User)) 7997 continue; 7998 7999 // Find the new opcode for the updating load/store. 
8000 bool IsStore = false; 8001 bool IsLaneOp = false; 8002 bool IsDupOp = false; 8003 unsigned NewOpc = 0; 8004 unsigned NumVecs = 0; 8005 unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue(); 8006 switch (IntNo) { 8007 default: llvm_unreachable("unexpected intrinsic for Neon base update"); 8008 case Intrinsic::aarch64_neon_ld2: NewOpc = AArch64ISD::LD2post; 8009 NumVecs = 2; break; 8010 case Intrinsic::aarch64_neon_ld3: NewOpc = AArch64ISD::LD3post; 8011 NumVecs = 3; break; 8012 case Intrinsic::aarch64_neon_ld4: NewOpc = AArch64ISD::LD4post; 8013 NumVecs = 4; break; 8014 case Intrinsic::aarch64_neon_st2: NewOpc = AArch64ISD::ST2post; 8015 NumVecs = 2; IsStore = true; break; 8016 case Intrinsic::aarch64_neon_st3: NewOpc = AArch64ISD::ST3post; 8017 NumVecs = 3; IsStore = true; break; 8018 case Intrinsic::aarch64_neon_st4: NewOpc = AArch64ISD::ST4post; 8019 NumVecs = 4; IsStore = true; break; 8020 case Intrinsic::aarch64_neon_ld1x2: NewOpc = AArch64ISD::LD1x2post; 8021 NumVecs = 2; break; 8022 case Intrinsic::aarch64_neon_ld1x3: NewOpc = AArch64ISD::LD1x3post; 8023 NumVecs = 3; break; 8024 case Intrinsic::aarch64_neon_ld1x4: NewOpc = AArch64ISD::LD1x4post; 8025 NumVecs = 4; break; 8026 case Intrinsic::aarch64_neon_st1x2: NewOpc = AArch64ISD::ST1x2post; 8027 NumVecs = 2; IsStore = true; break; 8028 case Intrinsic::aarch64_neon_st1x3: NewOpc = AArch64ISD::ST1x3post; 8029 NumVecs = 3; IsStore = true; break; 8030 case Intrinsic::aarch64_neon_st1x4: NewOpc = AArch64ISD::ST1x4post; 8031 NumVecs = 4; IsStore = true; break; 8032 case Intrinsic::aarch64_neon_ld2r: NewOpc = AArch64ISD::LD2DUPpost; 8033 NumVecs = 2; IsDupOp = true; break; 8034 case Intrinsic::aarch64_neon_ld3r: NewOpc = AArch64ISD::LD3DUPpost; 8035 NumVecs = 3; IsDupOp = true; break; 8036 case Intrinsic::aarch64_neon_ld4r: NewOpc = AArch64ISD::LD4DUPpost; 8037 NumVecs = 4; IsDupOp = true; break; 8038 case Intrinsic::aarch64_neon_ld2lane: NewOpc = AArch64ISD::LD2LANEpost; 8039 NumVecs = 2; IsLaneOp = true; break; 8040 case Intrinsic::aarch64_neon_ld3lane: NewOpc = AArch64ISD::LD3LANEpost; 8041 NumVecs = 3; IsLaneOp = true; break; 8042 case Intrinsic::aarch64_neon_ld4lane: NewOpc = AArch64ISD::LD4LANEpost; 8043 NumVecs = 4; IsLaneOp = true; break; 8044 case Intrinsic::aarch64_neon_st2lane: NewOpc = AArch64ISD::ST2LANEpost; 8045 NumVecs = 2; IsStore = true; IsLaneOp = true; break; 8046 case Intrinsic::aarch64_neon_st3lane: NewOpc = AArch64ISD::ST3LANEpost; 8047 NumVecs = 3; IsStore = true; IsLaneOp = true; break; 8048 case Intrinsic::aarch64_neon_st4lane: NewOpc = AArch64ISD::ST4LANEpost; 8049 NumVecs = 4; IsStore = true; IsLaneOp = true; break; 8050 } 8051 8052 EVT VecTy; 8053 if (IsStore) 8054 VecTy = N->getOperand(2).getValueType(); 8055 else 8056 VecTy = N->getValueType(0); 8057 8058 // If the increment is a constant, it must match the memory ref size. 8059 SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0); 8060 if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) { 8061 uint32_t IncVal = CInc->getZExtValue(); 8062 unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8; 8063 if (IsLaneOp || IsDupOp) 8064 NumBytes /= VecTy.getVectorNumElements(); 8065 if (IncVal != NumBytes) 8066 continue; 8067 Inc = DAG.getRegister(AArch64::XZR, MVT::i64); 8068 } 8069 SmallVector<SDValue, 8> Ops; 8070 Ops.push_back(N->getOperand(0)); // Incoming chain 8071 // Load lane and store have vector list as input. 
8072 if (IsLaneOp || IsStore) 8073 for (unsigned i = 2; i < AddrOpIdx; ++i) 8074 Ops.push_back(N->getOperand(i)); 8075 Ops.push_back(Addr); // Base register 8076 Ops.push_back(Inc); 8077 8078 // Return Types. 8079 EVT Tys[6]; 8080 unsigned NumResultVecs = (IsStore ? 0 : NumVecs); 8081 unsigned n; 8082 for (n = 0; n < NumResultVecs; ++n) 8083 Tys[n] = VecTy; 8084 Tys[n++] = MVT::i64; // Type of write back register 8085 Tys[n] = MVT::Other; // Type of the chain 8086 SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumResultVecs + 2)); 8087 8088 MemIntrinsicSDNode *MemInt = cast<MemIntrinsicSDNode>(N); 8089 SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, SDLoc(N), SDTys, Ops, 8090 MemInt->getMemoryVT(), 8091 MemInt->getMemOperand()); 8092 8093 // Update the uses. 8094 std::vector<SDValue> NewResults; 8095 for (unsigned i = 0; i < NumResultVecs; ++i) { 8096 NewResults.push_back(SDValue(UpdN.getNode(), i)); 8097 } 8098 NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1)); 8099 DCI.CombineTo(N, NewResults); 8100 DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs)); 8101 8102 break; 8103 } 8104 return SDValue(); 8105} 8106 8107// Checks to see if the value is the prescribed width and returns information 8108// about its extension mode. 8109static 8110bool checkValueWidth(SDValue V, unsigned width, ISD::LoadExtType &ExtType) { 8111 ExtType = ISD::NON_EXTLOAD; 8112 switch(V.getNode()->getOpcode()) { 8113 default: 8114 return false; 8115 case ISD::LOAD: { 8116 LoadSDNode *LoadNode = cast<LoadSDNode>(V.getNode()); 8117 if ((LoadNode->getMemoryVT() == MVT::i8 && width == 8) 8118 || (LoadNode->getMemoryVT() == MVT::i16 && width == 16)) { 8119 ExtType = LoadNode->getExtensionType(); 8120 return true; 8121 } 8122 return false; 8123 } 8124 case ISD::AssertSext: { 8125 VTSDNode *TypeNode = cast<VTSDNode>(V.getNode()->getOperand(1)); 8126 if ((TypeNode->getVT() == MVT::i8 && width == 8) 8127 || (TypeNode->getVT() == MVT::i16 && width == 16)) { 8128 ExtType = ISD::SEXTLOAD; 8129 return true; 8130 } 8131 return false; 8132 } 8133 case ISD::AssertZext: { 8134 VTSDNode *TypeNode = cast<VTSDNode>(V.getNode()->getOperand(1)); 8135 if ((TypeNode->getVT() == MVT::i8 && width == 8) 8136 || (TypeNode->getVT() == MVT::i16 && width == 16)) { 8137 ExtType = ISD::ZEXTLOAD; 8138 return true; 8139 } 8140 return false; 8141 } 8142 case ISD::Constant: 8143 case ISD::TargetConstant: { 8144 if (std::abs(cast<ConstantSDNode>(V.getNode())->getSExtValue()) < 8145 1LL << (width - 1)) 8146 return true; 8147 return false; 8148 } 8149 } 8150 8151 return true; 8152} 8153 8154// This function does a whole lot of voodoo to determine if the tests are 8155// equivalent without and with a mask. Essentially what happens is that given a 8156// DAG resembling: 8157// 8158// +-------------+ +-------------+ +-------------+ +-------------+ 8159// | Input | | AddConstant | | CompConstant| | CC | 8160// +-------------+ +-------------+ +-------------+ +-------------+ 8161// | | | | 8162// V V | +----------+ 8163// +-------------+ +----+ | | 8164// | ADD | |0xff| | | 8165// +-------------+ +----+ | | 8166// | | | | 8167// V V | | 8168// +-------------+ | | 8169// | AND | | | 8170// +-------------+ | | 8171// | | | 8172// +-----+ | | 8173// | | | 8174// V V V 8175// +-------------+ 8176// | CMP | 8177// +-------------+ 8178// 8179// The AND node may be safely removed for some combinations of inputs. 
In 8180// particular we need to take into account the extension type of the Input, 8181// the exact values of AddConstant, CompConstant, and CC, along with the nominal 8182// width of the input (this can work for any width inputs, the above graph is 8183// specific to 8 bits). 8184// 8185// The specific equations were worked out by generating output tables for each 8186// AArch64CC value in terms of the AddConstant (w1) and CompConstant (w2). The 8187// problem was simplified by working with 4 bit inputs, which means we only 8188// needed to reason about 24 distinct bit patterns: 8 patterns unique to zero 8189// extension (8,15), 8 patterns unique to sign extension (-8,-1), and 8 8190// patterns present in both extensions (0,7). For every distinct set of 8191// AddConstant and CompConstant bit patterns we can consider the masked and 8192// unmasked versions to be equivalent if the result of this function is true for 8193// all 16 distinct bit patterns for the current extension type of Input (w0). 8194// 8195// sub w8, w0, w1 8196// and w10, w8, #0x0f 8197// cmp w8, w2 8198// cset w9, AArch64CC 8199// cmp w10, w2 8200// cset w11, AArch64CC 8201// cmp w9, w11 8202// cset w0, eq 8203// ret 8204// 8205// Since the above function shows when the outputs are equivalent, it defines 8206// when it is safe to remove the AND. Unfortunately it only runs on AArch64 and 8207// would be expensive to run during compiles. The equations below were written 8208// in a test harness that confirmed they gave outputs equivalent to the above 8209// function for all inputs, so they can be used to determine if the removal is 8210// legal instead. 8211// 8212// isEquivalentMaskless() is the code for testing if the AND can be removed, 8213// factored out of the DAG recognition as the DAG can take several forms. 8214 8215static 8216bool isEquivalentMaskless(unsigned CC, unsigned width, 8217 ISD::LoadExtType ExtType, signed AddConstant, 8218 signed CompConstant) { 8219 // By being careful about our equations and only writing them in terms of 8220 // symbolic values and well-known constants (0, 1, -1, MaxUInt) we can 8221 // make them generally applicable to all bit widths. 8222 signed MaxUInt = (1 << width); 8223 8224 // For the purposes of these comparisons sign extending the type is 8225 // equivalent to zero extending the add and displacing it by half the integer 8226 // width. Provided we are careful and make sure our equations are valid over 8227 // the whole range we can just adjust the input and avoid writing equations 8228 // for sign extended inputs.
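// For example (illustrative numbers), with width == 8 the adjustment below
// maps a sign-extended AddConstant of 0x70 (112) to 112 - 128 = -16, so the
// per-condition-code equations only ever need to be written for the
// zero-extended range.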
8229 if (ExtType == ISD::SEXTLOAD) 8230 AddConstant -= (1 << (width-1)); 8231 8232 switch (CC) { 8233 case AArch64CC::LE: 8234 case AArch64CC::GT: { 8235 if ((AddConstant == 0) || 8236 (CompConstant == MaxUInt - 1 && AddConstant < 0) || 8237 (AddConstant >= 0 && CompConstant < 0) || 8238 (AddConstant <= 0 && CompConstant <= 0 && CompConstant < AddConstant)) 8239 return true; 8240 } break; 8241 case AArch64CC::LT: 8242 case AArch64CC::GE: { 8243 if ((AddConstant == 0) || 8244 (AddConstant >= 0 && CompConstant <= 0) || 8245 (AddConstant <= 0 && CompConstant <= 0 && CompConstant <= AddConstant)) 8246 return true; 8247 } break; 8248 case AArch64CC::HI: 8249 case AArch64CC::LS: { 8250 if ((AddConstant >= 0 && CompConstant < 0) || 8251 (AddConstant <= 0 && CompConstant >= -1 && 8252 CompConstant < AddConstant + MaxUInt)) 8253 return true; 8254 } break; 8255 case AArch64CC::PL: 8256 case AArch64CC::MI: { 8257 if ((AddConstant == 0) || 8258 (AddConstant > 0 && CompConstant <= 0) || 8259 (AddConstant < 0 && CompConstant <= AddConstant)) 8260 return true; 8261 } break; 8262 case AArch64CC::LO: 8263 case AArch64CC::HS: { 8264 if ((AddConstant >= 0 && CompConstant <= 0) || 8265 (AddConstant <= 0 && CompConstant >= 0 && 8266 CompConstant <= AddConstant + MaxUInt)) 8267 return true; 8268 } break; 8269 case AArch64CC::EQ: 8270 case AArch64CC::NE: { 8271 if ((AddConstant > 0 && CompConstant < 0) || 8272 (AddConstant < 0 && CompConstant >= 0 && 8273 CompConstant < AddConstant + MaxUInt) || 8274 (AddConstant >= 0 && CompConstant >= 0 && 8275 CompConstant >= AddConstant) || 8276 (AddConstant <= 0 && CompConstant < 0 && CompConstant < AddConstant)) 8277 8278 return true; 8279 } break; 8280 case AArch64CC::VS: 8281 case AArch64CC::VC: 8282 case AArch64CC::AL: 8283 case AArch64CC::NV: 8284 return true; 8285 case AArch64CC::Invalid: 8286 break; 8287 } 8288 8289 return false; 8290} 8291 8292static 8293SDValue performCONDCombine(SDNode *N, 8294 TargetLowering::DAGCombinerInfo &DCI, 8295 SelectionDAG &DAG, unsigned CCIndex, 8296 unsigned CmpIndex) { 8297 unsigned CC = cast<ConstantSDNode>(N->getOperand(CCIndex))->getSExtValue(); 8298 SDNode *SubsNode = N->getOperand(CmpIndex).getNode(); 8299 unsigned CondOpcode = SubsNode->getOpcode(); 8300 8301 if (CondOpcode != AArch64ISD::SUBS) 8302 return SDValue(); 8303 8304 // There is a SUBS feeding this condition. Is it fed by a mask we can 8305 // use? 8306 8307 SDNode *AndNode = SubsNode->getOperand(0).getNode(); 8308 unsigned MaskBits = 0; 8309 8310 if (AndNode->getOpcode() != ISD::AND) 8311 return SDValue(); 8312 8313 if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(AndNode->getOperand(1))) { 8314 uint32_t CNV = CN->getZExtValue(); 8315 if (CNV == 255) 8316 MaskBits = 8; 8317 else if (CNV == 65535) 8318 MaskBits = 16; 8319 } 8320 8321 if (!MaskBits) 8322 return SDValue(); 8323 8324 SDValue AddValue = AndNode->getOperand(0); 8325 8326 if (AddValue.getOpcode() != ISD::ADD) 8327 return SDValue(); 8328 8329 // The basic DAG structure is correct; grab the inputs and validate them. 8330 8331 SDValue AddInputValue1 = AddValue.getNode()->getOperand(0); 8332 SDValue AddInputValue2 = AddValue.getNode()->getOperand(1); 8333 SDValue SubsInputValue = SubsNode->getOperand(1); 8334 8335 // The mask is present and the provenance of all the values is a smaller 8336 // type, so let's see if the mask is superfluous.
static
SDValue performCONDCombine(SDNode *N,
                           TargetLowering::DAGCombinerInfo &DCI,
                           SelectionDAG &DAG, unsigned CCIndex,
                           unsigned CmpIndex) {
  unsigned CC = cast<ConstantSDNode>(N->getOperand(CCIndex))->getSExtValue();
  SDNode *SubsNode = N->getOperand(CmpIndex).getNode();
  unsigned CondOpcode = SubsNode->getOpcode();

  if (CondOpcode != AArch64ISD::SUBS)
    return SDValue();

  // There is a SUBS feeding this condition. Is it fed by a mask we can
  // use?

  SDNode *AndNode = SubsNode->getOperand(0).getNode();
  unsigned MaskBits = 0;

  if (AndNode->getOpcode() != ISD::AND)
    return SDValue();

  if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(AndNode->getOperand(1))) {
    uint32_t CNV = CN->getZExtValue();
    if (CNV == 255)
      MaskBits = 8;
    else if (CNV == 65535)
      MaskBits = 16;
  }

  if (!MaskBits)
    return SDValue();

  SDValue AddValue = AndNode->getOperand(0);

  if (AddValue.getOpcode() != ISD::ADD)
    return SDValue();

  // The basic DAG structure is correct; grab the inputs and validate them.

  SDValue AddInputValue1 = AddValue.getNode()->getOperand(0);
  SDValue AddInputValue2 = AddValue.getNode()->getOperand(1);
  SDValue SubsInputValue = SubsNode->getOperand(1);

  // The mask is present and the provenance of all the values is a smaller
  // type; let's see if the mask is superfluous.

  if (!isa<ConstantSDNode>(AddInputValue2.getNode()) ||
      !isa<ConstantSDNode>(SubsInputValue.getNode()))
    return SDValue();

  ISD::LoadExtType ExtType;

  if (!checkValueWidth(SubsInputValue, MaskBits, ExtType) ||
      !checkValueWidth(AddInputValue2, MaskBits, ExtType) ||
      !checkValueWidth(AddInputValue1, MaskBits, ExtType))
    return SDValue();

  if (!isEquivalentMaskless(CC, MaskBits, ExtType,
                cast<ConstantSDNode>(AddInputValue2.getNode())->getSExtValue(),
                cast<ConstantSDNode>(SubsInputValue.getNode())->getSExtValue()))
    return SDValue();

  // The AND is not necessary, remove it.

  SDVTList VTs = DAG.getVTList(SubsNode->getValueType(0),
                               SubsNode->getValueType(1));
  SDValue Ops[] = { AddValue, SubsNode->getOperand(1) };

  SDValue NewValue = DAG.getNode(CondOpcode, SDLoc(SubsNode), VTs, Ops);
  DAG.ReplaceAllUsesWith(SubsNode, NewValue.getNode());

  return SDValue(N, 0);
}

// Optimize compare with zero and branch.
static SDValue performBRCONDCombine(SDNode *N,
                                    TargetLowering::DAGCombinerInfo &DCI,
                                    SelectionDAG &DAG) {
  SDValue NV = performCONDCombine(N, DCI, DAG, 2, 3);
  if (NV.getNode())
    N = NV.getNode();
  SDValue Chain = N->getOperand(0);
  SDValue Dest = N->getOperand(1);
  SDValue CCVal = N->getOperand(2);
  SDValue Cmp = N->getOperand(3);

  assert(isa<ConstantSDNode>(CCVal) && "Expected a ConstantSDNode here!");
  unsigned CC = cast<ConstantSDNode>(CCVal)->getZExtValue();
  if (CC != AArch64CC::EQ && CC != AArch64CC::NE)
    return SDValue();

  unsigned CmpOpc = Cmp.getOpcode();
  if (CmpOpc != AArch64ISD::ADDS && CmpOpc != AArch64ISD::SUBS)
    return SDValue();

  // Only attempt folding if there is only one use of the flag and no use of
  // the value.
  if (!Cmp->hasNUsesOfValue(0, 0) || !Cmp->hasNUsesOfValue(1, 1))
    return SDValue();

  SDValue LHS = Cmp.getOperand(0);
  SDValue RHS = Cmp.getOperand(1);

  assert(LHS.getValueType() == RHS.getValueType() &&
         "Expected the value type to be the same for both operands!");
  if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
    return SDValue();

  if (isa<ConstantSDNode>(LHS) && cast<ConstantSDNode>(LHS)->isNullValue())
    std::swap(LHS, RHS);

  if (!isa<ConstantSDNode>(RHS) || !cast<ConstantSDNode>(RHS)->isNullValue())
    return SDValue();

  if (LHS.getOpcode() == ISD::SHL || LHS.getOpcode() == ISD::SRA ||
      LHS.getOpcode() == ISD::SRL)
    return SDValue();

  // Fold the compare into the branch instruction.
  SDValue BR;
  if (CC == AArch64CC::EQ)
    BR = DAG.getNode(AArch64ISD::CBZ, SDLoc(N), MVT::Other, Chain, LHS, Dest);
  else
    BR = DAG.getNode(AArch64ISD::CBNZ, SDLoc(N), MVT::Other, Chain, LHS, Dest);

  // Do not add new nodes to DAG combiner worklist.
  DCI.CombineTo(N, BR, false);

  return SDValue();
}
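// For example (illustrative): a "SUBS(x, 0)" feeding a BRCOND with an NE
// condition, as produced for "if (x != 0) goto dest", matches the pattern
// above and is folded to the single instruction "cbnz x, dest"; the EQ form
// becomes "cbz x, dest".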
// vselect (v1i1 setcc) ->
//     vselect (v1iXX setcc)  (XX is the size of the compared operand type)
// FIXME: Currently the type legalizer can't handle VSELECT having v1i1 as
// condition. If it can legalize "VSELECT v1i1" correctly, no need to combine
// such VSELECT.
static SDValue performVSelectCombine(SDNode *N, SelectionDAG &DAG) {
  SDValue N0 = N->getOperand(0);
  EVT CCVT = N0.getValueType();

  if (N0.getOpcode() != ISD::SETCC || CCVT.getVectorNumElements() != 1 ||
      CCVT.getVectorElementType() != MVT::i1)
    return SDValue();

  EVT ResVT = N->getValueType(0);
  EVT CmpVT = N0.getOperand(0).getValueType();
  // Only combine when the result type is of the same size as the compared
  // operands.
  if (ResVT.getSizeInBits() != CmpVT.getSizeInBits())
    return SDValue();

  SDValue IfTrue = N->getOperand(1);
  SDValue IfFalse = N->getOperand(2);
  SDValue SetCC =
      DAG.getSetCC(SDLoc(N), CmpVT.changeVectorElementTypeToInteger(),
                   N0.getOperand(0), N0.getOperand(1),
                   cast<CondCodeSDNode>(N0.getOperand(2))->get());
  return DAG.getNode(ISD::VSELECT, SDLoc(N), ResVT, SetCC,
                     IfTrue, IfFalse);
}
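// For example (a sketch of performVSelectCombine's effect):
//   (vselect (v1i1 (setcc v1f64 %a, %b)), %x, %y)
// is rewritten as
//   (vselect (v1i64 (setcc v1f64 %a, %b)), %x, %y)
// so the condition has a type the legalizer can cope with.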
/// A vector select: "(select vL, vR, (setcc LHS, RHS))" is best performed with
/// the compare-mask instructions rather than going via NZCV, even if LHS and
/// RHS are really scalar. This replaces any scalar setcc in the above pattern
/// with a vector one followed by a DUP shuffle on the result.
static SDValue performSelectCombine(SDNode *N, SelectionDAG &DAG) {
  SDValue N0 = N->getOperand(0);
  EVT ResVT = N->getValueType(0);

  if (N0.getOpcode() != ISD::SETCC || N0.getValueType() != MVT::i1)
    return SDValue();

  // If NumMaskElts == 0, the comparison is larger than the select result. The
  // largest real NEON comparison is 64-bits per lane, which means the result
  // is at most 32-bits and an illegal vector. Just bail out for now.
  EVT SrcVT = N0.getOperand(0).getValueType();

  // Don't try to do this optimization when the setcc itself has i1 operands.
  // There are no legal vectors of i1, so this would be pointless.
  if (SrcVT == MVT::i1)
    return SDValue();

  int NumMaskElts = ResVT.getSizeInBits() / SrcVT.getSizeInBits();
  if (!ResVT.isVector() || NumMaskElts == 0)
    return SDValue();

  SrcVT = EVT::getVectorVT(*DAG.getContext(), SrcVT, NumMaskElts);
  EVT CCVT = SrcVT.changeVectorElementTypeToInteger();

  // First perform a vector comparison, where lane 0 is the one we're
  // interested in.
  SDLoc DL(N0);
  SDValue LHS =
      DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(0));
  SDValue RHS =
      DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(1));
  SDValue SetCC = DAG.getNode(ISD::SETCC, DL, CCVT, LHS, RHS, N0.getOperand(2));

  // Now duplicate the comparison mask we want across all other lanes.
  SmallVector<int, 8> DUPMask(CCVT.getVectorNumElements(), 0);
  SDValue Mask = DAG.getVectorShuffle(CCVT, DL, SetCC, SetCC, DUPMask.data());
  Mask = DAG.getNode(ISD::BITCAST, DL,
                     ResVT.changeVectorElementTypeToInteger(), Mask);

  return DAG.getSelect(DL, ResVT, Mask, N->getOperand(1), N->getOperand(2));
}
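// For example (illustrative of performSelectCombine): for
// "select (setcc olt f32 %a, %b), v4i32 %x, v4i32 %y" the scalar compare is
// replaced by a v4f32 setcc whose lane 0 holds the result, the DUP shuffle
// broadcasts lane 0 across all four lanes, and the select becomes a vector
// bit-select, avoiding an FCMP/CSEL round trip through NZCV.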
SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
                                                 DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  switch (N->getOpcode()) {
  default:
    break;
  case ISD::ADD:
  case ISD::SUB:
    return performAddSubLongCombine(N, DCI, DAG);
  case ISD::XOR:
    return performXorCombine(N, DAG, DCI, Subtarget);
  case ISD::MUL:
    return performMulCombine(N, DAG, DCI, Subtarget);
  case ISD::SINT_TO_FP:
  case ISD::UINT_TO_FP:
    return performIntToFpCombine(N, DAG, Subtarget);
  case ISD::OR:
    return performORCombine(N, DCI, Subtarget);
  case ISD::INTRINSIC_WO_CHAIN:
    return performIntrinsicCombine(N, DCI, Subtarget);
  case ISD::ANY_EXTEND:
  case ISD::ZERO_EXTEND:
  case ISD::SIGN_EXTEND:
    return performExtendCombine(N, DCI, DAG);
  case ISD::BITCAST:
    return performBitcastCombine(N, DCI, DAG);
  case ISD::CONCAT_VECTORS:
    return performConcatVectorsCombine(N, DCI, DAG);
  case ISD::SELECT:
    return performSelectCombine(N, DAG);
  case ISD::VSELECT:
    return performVSelectCombine(N, DCI.DAG);
  case ISD::STORE:
    return performSTORECombine(N, DCI, DAG, Subtarget);
  case AArch64ISD::BRCOND:
    return performBRCONDCombine(N, DCI, DAG);
  case AArch64ISD::CSEL:
    return performCONDCombine(N, DCI, DAG, 2, 3);
  case AArch64ISD::DUP:
    return performPostLD1Combine(N, DCI, false);
  case ISD::INSERT_VECTOR_ELT:
    return performPostLD1Combine(N, DCI, true);
  case ISD::INTRINSIC_VOID:
  case ISD::INTRINSIC_W_CHAIN:
    switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
    case Intrinsic::aarch64_neon_ld2:
    case Intrinsic::aarch64_neon_ld3:
    case Intrinsic::aarch64_neon_ld4:
    case Intrinsic::aarch64_neon_ld1x2:
    case Intrinsic::aarch64_neon_ld1x3:
    case Intrinsic::aarch64_neon_ld1x4:
    case Intrinsic::aarch64_neon_ld2lane:
    case Intrinsic::aarch64_neon_ld3lane:
    case Intrinsic::aarch64_neon_ld4lane:
    case Intrinsic::aarch64_neon_ld2r:
    case Intrinsic::aarch64_neon_ld3r:
    case Intrinsic::aarch64_neon_ld4r:
    case Intrinsic::aarch64_neon_st2:
    case Intrinsic::aarch64_neon_st3:
    case Intrinsic::aarch64_neon_st4:
    case Intrinsic::aarch64_neon_st1x2:
    case Intrinsic::aarch64_neon_st1x3:
    case Intrinsic::aarch64_neon_st1x4:
    case Intrinsic::aarch64_neon_st2lane:
    case Intrinsic::aarch64_neon_st3lane:
    case Intrinsic::aarch64_neon_st4lane:
      return performNEONPostLDSTCombine(N, DCI, DAG);
    default:
      break;
    }
  }
  return SDValue();
}

// Check if the return value is used as only a return value, as otherwise
// we can't perform a tail-call. In particular, we need to check for
// target ISD nodes that are returns and any other "odd" constructs
// that the generic analysis code won't necessarily catch.
bool AArch64TargetLowering::isUsedByReturnOnly(SDNode *N,
                                               SDValue &Chain) const {
  if (N->getNumValues() != 1)
    return false;
  if (!N->hasNUsesOfValue(1, 0))
    return false;

  SDValue TCChain = Chain;
  SDNode *Copy = *N->use_begin();
  if (Copy->getOpcode() == ISD::CopyToReg) {
    // If the copy has a glue operand, we conservatively assume it isn't safe
    // to perform a tail call.
    if (Copy->getOperand(Copy->getNumOperands() - 1).getValueType() ==
        MVT::Glue)
      return false;
    TCChain = Copy->getOperand(0);
  } else if (Copy->getOpcode() != ISD::FP_EXTEND)
    return false;

  bool HasRet = false;
  for (SDNode *Node : Copy->uses()) {
    if (Node->getOpcode() != AArch64ISD::RET_FLAG)
      return false;
    HasRet = true;
  }

  if (!HasRet)
    return false;

  Chain = TCChain;
  return true;
}

// Return whether an instruction can potentially be optimized to a tail
// call. This will cause the optimizers to attempt to move, or duplicate,
// return instructions to help enable tail call optimizations for this
// instruction.
bool AArch64TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
  if (!CI->isTailCall())
    return false;

  return true;
}

bool AArch64TargetLowering::getIndexedAddressParts(SDNode *Op, SDValue &Base,
                                                   SDValue &Offset,
                                                   ISD::MemIndexedMode &AM,
                                                   bool &IsInc,
                                                   SelectionDAG &DAG) const {
  if (Op->getOpcode() != ISD::ADD && Op->getOpcode() != ISD::SUB)
    return false;

  Base = Op->getOperand(0);
  // All of the indexed addressing mode instructions take a signed
  // 9 bit immediate offset.
  if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Op->getOperand(1))) {
    int64_t RHSC = (int64_t)RHS->getZExtValue();
    if (RHSC >= 256 || RHSC <= -256)
      return false;
    IsInc = (Op->getOpcode() == ISD::ADD);
    Offset = Op->getOperand(1);
    return true;
  }
  return false;
}

bool AArch64TargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
                                                      SDValue &Offset,
                                                      ISD::MemIndexedMode &AM,
                                                      SelectionDAG &DAG) const {
  EVT VT;
  SDValue Ptr;
  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
    VT = LD->getMemoryVT();
    Ptr = LD->getBasePtr();
  } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
    VT = ST->getMemoryVT();
    Ptr = ST->getBasePtr();
  } else
    return false;

  bool IsInc;
  if (!getIndexedAddressParts(Ptr.getNode(), Base, Offset, AM, IsInc, DAG))
    return false;
  AM = IsInc ? ISD::PRE_INC : ISD::PRE_DEC;
  return true;
}

bool AArch64TargetLowering::getPostIndexedAddressParts(
    SDNode *N, SDNode *Op, SDValue &Base, SDValue &Offset,
    ISD::MemIndexedMode &AM, SelectionDAG &DAG) const {
  EVT VT;
  SDValue Ptr;
  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
    VT = LD->getMemoryVT();
    Ptr = LD->getBasePtr();
  } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
    VT = ST->getMemoryVT();
    Ptr = ST->getBasePtr();
  } else
    return false;

  bool IsInc;
  if (!getIndexedAddressParts(Op, Base, Offset, AM, IsInc, DAG))
    return false;
  // Post-indexing updates the base, so it's not a valid transform
  // if that's not the same as the load's pointer.
  if (Ptr != Base)
    return false;
  AM = IsInc ? ISD::POST_INC : ISD::POST_DEC;
  return true;
}
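// Note on the indexed forms selected above (illustrative): a pre-indexed
// access "ldr x0, [x1, #8]!" updates x1 before the load, while the
// post-indexed "ldr x0, [x1], #8" loads through x1 and then advances it;
// both forms carry the signed 9-bit immediate screened for in
// getIndexedAddressParts.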
static void ReplaceBITCASTResults(SDNode *N, SmallVectorImpl<SDValue> &Results,
                                  SelectionDAG &DAG) {
  SDLoc DL(N);
  SDValue Op = N->getOperand(0);

  if (N->getValueType(0) != MVT::i16 || Op.getValueType() != MVT::f16)
    return;

  Op = SDValue(
      DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, DL, MVT::f32,
                         DAG.getUNDEF(MVT::i32), Op,
                         DAG.getTargetConstant(AArch64::hsub, MVT::i32)),
      0);
  Op = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Op);
  Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Op));
}

void AArch64TargetLowering::ReplaceNodeResults(
    SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
  switch (N->getOpcode()) {
  default:
    llvm_unreachable("Don't know how to custom expand this");
  case ISD::BITCAST:
    ReplaceBITCASTResults(N, Results, DAG);
    return;
  case ISD::FP_TO_UINT:
  case ISD::FP_TO_SINT:
    assert(N->getValueType(0) == MVT::i128 && "unexpected illegal conversion");
    // Let normal code take care of it by not adding anything to Results.
    return;
  }
}

bool AArch64TargetLowering::useLoadStackGuardNode() const {
  return true;
}

bool AArch64TargetLowering::combineRepeatedFPDivisors(unsigned NumUsers) const {
  // Combine multiple FDIVs with the same divisor into multiple FMULs by the
  // reciprocal if there are three or more FDIVs.
  return NumUsers > 2;
}

TargetLoweringBase::LegalizeTypeAction
AArch64TargetLowering::getPreferredVectorAction(EVT VT) const {
  MVT SVT = VT.getSimpleVT();
  // During type legalization, we prefer to widen v1i8, v1i16, v1i32 to v8i8,
  // v4i16, v2i32 instead of to promote.
  if (SVT == MVT::v1i8 || SVT == MVT::v1i16 || SVT == MVT::v1i32 ||
      SVT == MVT::v1f32)
    return TypeWidenVector;

  return TargetLoweringBase::getPreferredVectorAction(VT);
}

// Loads and stores less than 128-bits are already atomic; ones above that
// are doomed anyway, so defer to the default libcall and blame the OS when
// things go wrong.
bool AArch64TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
  unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits();
  return Size == 128;
}
// Loads and stores less than 128-bits are already atomic; ones above that
// are doomed anyway, so defer to the default libcall and blame the OS when
// things go wrong.
bool AArch64TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
  unsigned Size = LI->getType()->getPrimitiveSizeInBits();
  return Size == 128;
}

// For the real atomic operations, we have ldxr/stxr up to 128 bits.
bool AArch64TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
  unsigned Size = AI->getType()->getPrimitiveSizeInBits();
  return Size <= 128;
}

bool AArch64TargetLowering::hasLoadLinkedStoreConditional() const {
  return true;
}

Value *AArch64TargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr,
                                             AtomicOrdering Ord) const {
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  Type *ValTy = cast<PointerType>(Addr->getType())->getElementType();
  bool IsAcquire = isAtLeastAcquire(Ord);

  // Since i128 isn't legal and intrinsics don't get type-lowered, the ldxp
  // intrinsic must return {i64, i64} and we have to recombine them into a
  // single i128 here.
  if (ValTy->getPrimitiveSizeInBits() == 128) {
    Intrinsic::ID Int =
        IsAcquire ? Intrinsic::aarch64_ldaxp : Intrinsic::aarch64_ldxp;
    Function *Ldxr = llvm::Intrinsic::getDeclaration(M, Int);

    Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext()));
    Value *LoHi = Builder.CreateCall(Ldxr, Addr, "lohi");

    Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo");
    Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi");
    Lo = Builder.CreateZExt(Lo, ValTy, "lo64");
    Hi = Builder.CreateZExt(Hi, ValTy, "hi64");
    return Builder.CreateOr(
        Lo, Builder.CreateShl(Hi, ConstantInt::get(ValTy, 64)), "val64");
  }

  Type *Tys[] = { Addr->getType() };
  Intrinsic::ID Int =
      IsAcquire ? Intrinsic::aarch64_ldaxr : Intrinsic::aarch64_ldxr;
  Function *Ldxr = llvm::Intrinsic::getDeclaration(M, Int, Tys);

  return Builder.CreateTruncOrBitCast(
      Builder.CreateCall(Ldxr, Addr),
      cast<PointerType>(Addr->getType())->getElementType());
}
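// For a 128-bit acquire load, the IR built by emitLoadLinked above is roughly
// (a sketch, with value names invented for illustration):
//   %lohi = call { i64, i64 } @llvm.aarch64.ldaxp(i8* %addr)
//   %lo   = extractvalue { i64, i64 } %lohi, 0    ; zext'd to i128
//   %hi   = extractvalue { i64, i64 } %lohi, 1    ; zext'd, shifted left 64
//   %val  = or i128 %lo64, %hi.shifted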
Value *AArch64TargetLowering::emitStoreConditional(IRBuilder<> &Builder,
                                                   Value *Val, Value *Addr,
                                                   AtomicOrdering Ord) const {
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  bool IsRelease = isAtLeastRelease(Ord);

  // Since the intrinsics must have legal type, the i128 intrinsics take two
  // parameters: "i64, i64". We must marshal Val into the appropriate form
  // before the call.
  if (Val->getType()->getPrimitiveSizeInBits() == 128) {
    Intrinsic::ID Int =
        IsRelease ? Intrinsic::aarch64_stlxp : Intrinsic::aarch64_stxp;
    Function *Stxr = Intrinsic::getDeclaration(M, Int);
    Type *Int64Ty = Type::getInt64Ty(M->getContext());

    Value *Lo = Builder.CreateTrunc(Val, Int64Ty, "lo");
    Value *Hi = Builder.CreateTrunc(Builder.CreateLShr(Val, 64), Int64Ty, "hi");
    Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext()));
    return Builder.CreateCall3(Stxr, Lo, Hi, Addr);
  }

  Intrinsic::ID Int =
      IsRelease ? Intrinsic::aarch64_stlxr : Intrinsic::aarch64_stxr;
  Type *Tys[] = { Addr->getType() };
  Function *Stxr = Intrinsic::getDeclaration(M, Int, Tys);

  return Builder.CreateCall2(
      Stxr, Builder.CreateZExtOrBitCast(
                Val, Stxr->getFunctionType()->getParamType(0)),
      Addr);
}

bool AArch64TargetLowering::functionArgumentNeedsConsecutiveRegisters(
    Type *Ty, CallingConv::ID CallConv, bool isVarArg) const {
  return Ty->isArrayTy();
}
    CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, /*IsVarArg=*/false);
    bool Res =
        AssignFn(i, ValVT, ValVT, CCValAssign::Full, Ins[i].Flags, CCInfo);
    assert(!Res && "Call operand has unhandled type");
    (void)Res;
  }
  assert(ArgLocs.size() == Ins.size());
  SmallVector<SDValue, 16> ArgValues;
  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
    CCValAssign &VA = ArgLocs[i];

    if (Ins[i].Flags.isByVal()) {
      // Byval is used for HFAs in the PCS, but the system should work in a
      // non-compliant manner for larger structs.
      EVT PtrTy = getPointerTy();
      int Size = Ins[i].Flags.getByValSize();
      unsigned NumRegs = (Size + 7) / 8;

      // FIXME: This works on big-endian for composite byvals, which are the
      // common case. It should also work for fundamental types too.
      unsigned FrameIdx =
          MFI->CreateFixedObject(8 * NumRegs, VA.getLocMemOffset(), false);
      SDValue FrameIdxN = DAG.getFrameIndex(FrameIdx, PtrTy);
      InVals.push_back(FrameIdxN);

      continue;
    }

    if (VA.isRegLoc()) {
      // Arguments stored in registers.
      EVT RegVT = VA.getLocVT();

      SDValue ArgValue;
      const TargetRegisterClass *RC;

      if (RegVT == MVT::i32)
        RC = &AArch64::GPR32RegClass;
      else if (RegVT == MVT::i64)
        RC = &AArch64::GPR64RegClass;
      else if (RegVT == MVT::f16)
        RC = &AArch64::FPR16RegClass;
      else if (RegVT == MVT::f32)
        RC = &AArch64::FPR32RegClass;
      else if (RegVT == MVT::f64 || RegVT.is64BitVector())
        RC = &AArch64::FPR64RegClass;
      else if (RegVT == MVT::f128 || RegVT.is128BitVector())
        RC = &AArch64::FPR128RegClass;
      else
        llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering");

      // Transform the arguments in physical registers into virtual ones.
      unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
      ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT);

      // If this is an 8, 16 or 32-bit value, it is really passed promoted
      // to 64 bits. Insert an assert[sz]ext to capture this, then
      // truncate to the right size.
      switch (VA.getLocInfo()) {
      default:
        llvm_unreachable("Unknown loc info!");
      case CCValAssign::Full:
        break;
      case CCValAssign::BCvt:
        ArgValue = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), ArgValue);
        break;
      case CCValAssign::AExt:
      case CCValAssign::SExt:
      case CCValAssign::ZExt:
        // SelectionDAGBuilder will insert appropriate AssertZExt & AssertSExt
        // nodes after our lowering.
        assert(RegVT == Ins[i].VT && "incorrect register location selected");
        break;
      }

      InVals.push_back(ArgValue);

    } else { // VA.isRegLoc()
      assert(VA.isMemLoc() && "CCValAssign is neither reg nor mem");
      unsigned ArgOffset = VA.getLocMemOffset();
      unsigned ArgSize = VA.getValVT().getSizeInBits() / 8;

      uint32_t BEAlign = 0;
      if (!Subtarget->isLittleEndian() && ArgSize < 8 &&
          !Ins[i].Flags.isInConsecutiveRegs())
        BEAlign = 8 - ArgSize;

      int FI = MFI->CreateFixedObject(ArgSize, ArgOffset + BEAlign, true);

      // Create load nodes to retrieve arguments from the stack.
      SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
      SDValue ArgValue;

      // For NON_EXTLOAD, generic code in getLoad asserts that ValVT == MemVT.
      ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;
      MVT MemVT = VA.getValVT();

      switch (VA.getLocInfo()) {
      default:
        break;
      case CCValAssign::BCvt:
        MemVT = VA.getLocVT();
        break;
      case CCValAssign::SExt:
        ExtType = ISD::SEXTLOAD;
        break;
      case CCValAssign::ZExt:
        ExtType = ISD::ZEXTLOAD;
        break;
      case CCValAssign::AExt:
        ExtType = ISD::EXTLOAD;
        break;
      }

      ArgValue = DAG.getExtLoad(ExtType, DL, VA.getLocVT(), Chain, FIN,
                                MachinePointerInfo::getFixedStack(FI),
                                MemVT, false, false, false, 0);

      InVals.push_back(ArgValue);
    }
  }

  // varargs
  if (isVarArg) {
    if (!Subtarget->isTargetDarwin()) {
      // The AAPCS variadic function ABI is identical to the non-variadic
      // one. As a result there may be more arguments in registers and we
      // should save them for future reference.
      saveVarArgRegisters(CCInfo, DAG, DL, Chain);
    }

    AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
    // This will point to the next argument passed via stack.
    unsigned StackOffset = CCInfo.getNextStackOffset();
    // We currently pass all varargs at 8-byte alignment.
    StackOffset = ((StackOffset + 7) & ~7);
    AFI->setVarArgsStackIndex(MFI->CreateFixedObject(4, StackOffset, true));
  }

  AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
  unsigned StackArgSize = CCInfo.getNextStackOffset();
  bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
  if (DoesCalleeRestoreStack(CallConv, TailCallOpt)) {
    // This is a non-standard ABI so by fiat I say we're allowed to make full
    // use of the stack area to be popped, which must be aligned to 16 bytes
    // in any case:
    StackArgSize = RoundUpToAlignment(StackArgSize, 16);

    // If we're expected to restore the stack (e.g. fastcc) then we'll be
    // adding a multiple of 16.
    FuncInfo->setArgumentStackToRestore(StackArgSize);

    // This realignment carries over to the available bytes below. Our own
    // callers will guarantee the space is free by giving an aligned value to
    // CALLSEQ_START.
  }
  // Even if we're not expected to free up the space, it's useful to know how
  // much is there while considering tail calls (because we can reuse it).
  FuncInfo->setBytesInStackArgArea(StackArgSize);

  return Chain;
}
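// A note on BEAlign above (illustrative): on a big-endian target a 1-byte
// stack argument occupies the low-order end of its 8-byte slot, which is the
// byte at the highest address, so the fixed object is placed at
// ArgOffset + (8 - 1) = ArgOffset + 7 and the load reads exactly that byte.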
void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo,
                                                SelectionDAG &DAG, SDLoc DL,
                                                SDValue &Chain) const {
  MachineFunction &MF = DAG.getMachineFunction();
  MachineFrameInfo *MFI = MF.getFrameInfo();
  AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();

  SmallVector<SDValue, 8> MemOps;

  static const MCPhysReg GPRArgRegs[] = { AArch64::X0, AArch64::X1, AArch64::X2,
                                          AArch64::X3, AArch64::X4, AArch64::X5,
                                          AArch64::X6, AArch64::X7 };
  static const unsigned NumGPRArgRegs = array_lengthof(GPRArgRegs);
  unsigned FirstVariadicGPR =
      CCInfo.getFirstUnallocated(GPRArgRegs, NumGPRArgRegs);

  unsigned GPRSaveSize = 8 * (NumGPRArgRegs - FirstVariadicGPR);
  int GPRIdx = 0;
  if (GPRSaveSize != 0) {
    GPRIdx = MFI->CreateStackObject(GPRSaveSize, 8, false);

    SDValue FIN = DAG.getFrameIndex(GPRIdx, getPointerTy());

    for (unsigned i = FirstVariadicGPR; i < NumGPRArgRegs; ++i) {
      unsigned VReg = MF.addLiveIn(GPRArgRegs[i], &AArch64::GPR64RegClass);
      SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
      SDValue Store =
          DAG.getStore(Val.getValue(1), DL, Val, FIN,
                       MachinePointerInfo::getStack(i * 8), false, false, 0);
      MemOps.push_back(Store);
      FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), FIN,
                        DAG.getConstant(8, getPointerTy()));
    }
  }
  FuncInfo->setVarArgsGPRIndex(GPRIdx);
  FuncInfo->setVarArgsGPRSize(GPRSaveSize);

  if (Subtarget->hasFPARMv8()) {
    static const MCPhysReg FPRArgRegs[] = {
        AArch64::Q0, AArch64::Q1, AArch64::Q2, AArch64::Q3,
        AArch64::Q4, AArch64::Q5, AArch64::Q6, AArch64::Q7};
    static const unsigned NumFPRArgRegs = array_lengthof(FPRArgRegs);
    unsigned FirstVariadicFPR =
        CCInfo.getFirstUnallocated(FPRArgRegs, NumFPRArgRegs);

    unsigned FPRSaveSize = 16 * (NumFPRArgRegs - FirstVariadicFPR);
    int FPRIdx = 0;
    if (FPRSaveSize != 0) {
      FPRIdx = MFI->CreateStackObject(FPRSaveSize, 16, false);

      SDValue FIN = DAG.getFrameIndex(FPRIdx, getPointerTy());

      for (unsigned i = FirstVariadicFPR; i < NumFPRArgRegs; ++i) {
        unsigned VReg = MF.addLiveIn(FPRArgRegs[i], &AArch64::FPR128RegClass);
        SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f128);

        SDValue Store =
            DAG.getStore(Val.getValue(1), DL, Val, FIN,
                         MachinePointerInfo::getStack(i * 16), false, false, 0);
        MemOps.push_back(Store);
        FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), FIN,
                          DAG.getConstant(16, getPointerTy()));
      }
    }
    FuncInfo->setVarArgsFPRIndex(FPRIdx);
    FuncInfo->setVarArgsFPRSize(FPRSaveSize);
  }

  if (!MemOps.empty()) {
    Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
  }
}
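// For example (illustrative of saveVarArgRegisters): in "int f(int n, ...)"
// only x0 is taken by a fixed argument, so x1-x7 are marked live-in and
// spilled (GPRSaveSize == 8 * 7 == 56 bytes) and, when FP/SIMD is available,
// q0-q7 as well (FPRSaveSize == 16 * 8 == 128 bytes).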
/// LowerCallResult - Lower the result values of a call into the
/// appropriate copies out of appropriate physical registers.
SDValue AArch64TargetLowering::LowerCallResult(
    SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
    const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc DL, SelectionDAG &DAG,
    SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
    SDValue ThisVal) const {
  CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS
                          ? RetCC_AArch64_WebKit_JS
                          : RetCC_AArch64_AAPCS;
  // Assign locations to each value returned by this call.
  SmallVector<CCValAssign, 16> RVLocs;
  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
                 *DAG.getContext());
  CCInfo.AnalyzeCallResult(Ins, RetCC);

  // Copy all of the result registers out of their specified physreg.
  for (unsigned i = 0; i != RVLocs.size(); ++i) {
    CCValAssign VA = RVLocs[i];

    // Pass 'this' value directly from the argument to return value, to avoid
    // reg unit interference.
    if (i == 0 && isThisReturn) {
      assert(!VA.needsCustom() && VA.getLocVT() == MVT::i64 &&
             "unexpected return calling convention register assignment");
      InVals.push_back(ThisVal);
      continue;
    }

    SDValue Val =
        DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InFlag);
    Chain = Val.getValue(1);
    InFlag = Val.getValue(2);

    switch (VA.getLocInfo()) {
    default:
      llvm_unreachable("Unknown loc info!");
    case CCValAssign::Full:
      break;
    case CCValAssign::BCvt:
      Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
      break;
    }

    InVals.push_back(Val);
  }

  return Chain;
}
bool AArch64TargetLowering::isEligibleForTailCallOptimization(
    SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
    bool isCalleeStructRet, bool isCallerStructRet,
    const SmallVectorImpl<ISD::OutputArg> &Outs,
    const SmallVectorImpl<SDValue> &OutVals,
    const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
  // For CallingConv::C this function knows whether the ABI needs
  // changing. That's not true for other conventions so they will have to opt
  // in manually.
  if (!IsTailCallConvention(CalleeCC) && CalleeCC != CallingConv::C)
    return false;

  const MachineFunction &MF = DAG.getMachineFunction();
  const Function *CallerF = MF.getFunction();
  CallingConv::ID CallerCC = CallerF->getCallingConv();
  bool CCMatch = CallerCC == CalleeCC;

  // Byval parameters hand the function a pointer directly into the stack area
  // we want to reuse during a tail call. Working around this *is* possible
  // (see X86) but less efficient and uglier in LowerCall.
  for (Function::const_arg_iterator i = CallerF->arg_begin(),
                                    e = CallerF->arg_end();
       i != e; ++i)
    if (i->hasByValAttr())
      return false;

  if (getTargetMachine().Options.GuaranteedTailCallOpt) {
    if (IsTailCallConvention(CalleeCC) && CCMatch)
      return true;
    return false;
  }

  // Externally-defined functions with weak linkage should not be
  // tail-called on AArch64 when the OS does not support dynamic
  // pre-emption of symbols, as the AAELF spec requires normal calls
  // to undefined weak functions to be replaced with a NOP or jump to the
  // next instruction. The behaviour of branch instructions in this
  // situation (as used for tail calls) is implementation-defined, so we
  // cannot rely on the linker replacing the tail call with a return.
  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
    const GlobalValue *GV = G->getGlobal();
    const Triple TT(getTargetMachine().getTargetTriple());
    if (GV->hasExternalWeakLinkage() &&
        (!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO()))
      return false;
  }

  // Now we search for cases where we can use a tail call without changing the
  // ABI. Sibcall is used in some places (particularly gcc) to refer to this
  // concept.

  // I want anyone implementing a new calling convention to think long and hard
  // about this assert.
  assert((!isVarArg || CalleeCC == CallingConv::C) &&
         "Unexpected variadic calling convention");

  if (isVarArg && !Outs.empty()) {
    // At least two cases here: if caller is fastcc then we can't have any
    // memory arguments (we'd be expected to clean up the stack afterwards). If
    // caller is C then we could potentially use its argument area.

    // FIXME: for now we take the most conservative of these in both cases:
    // disallow all variadic memory operands.
    SmallVector<CCValAssign, 16> ArgLocs;
    CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), ArgLocs,
                   *DAG.getContext());

    CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, true));
    for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
      if (!ArgLocs[i].isRegLoc())
        return false;
  }

  // If the calling conventions do not match, then we'd better make sure the
  // results are returned in the same way as what the caller expects.
  if (!CCMatch) {
    SmallVector<CCValAssign, 16> RVLocs1;
    CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(), RVLocs1,
                    *DAG.getContext());
    CCInfo1.AnalyzeCallResult(Ins, CCAssignFnForCall(CalleeCC, isVarArg));

    SmallVector<CCValAssign, 16> RVLocs2;
    CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(), RVLocs2,
                    *DAG.getContext());
    CCInfo2.AnalyzeCallResult(Ins, CCAssignFnForCall(CallerCC, isVarArg));

    if (RVLocs1.size() != RVLocs2.size())
      return false;
    for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) {
      if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc())
        return false;
      if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo())
        return false;
      if (RVLocs1[i].isRegLoc()) {
        if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg())
          return false;
      } else {
        if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset())
          return false;
      }
    }
  }

  // Nothing more to check if the callee is taking no arguments.
  if (Outs.empty())
    return true;

  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), ArgLocs,
                 *DAG.getContext());

  CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, isVarArg));

  const AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();

  // If the stack arguments for this call would fit into our own save area then
  // the call can be made tail.
  return CCInfo.getNextStackOffset() <= FuncInfo->getBytesInStackArgArea();
}
SDValue AArch64TargetLowering::addTokenForArgument(SDValue Chain,
                                                   SelectionDAG &DAG,
                                                   MachineFrameInfo *MFI,
                                                   int ClobberedFI) const {
  SmallVector<SDValue, 8> ArgChains;
  int64_t FirstByte = MFI->getObjectOffset(ClobberedFI);
  int64_t LastByte = FirstByte + MFI->getObjectSize(ClobberedFI) - 1;

  // Include the original chain at the beginning of the list. When this is
  // used by target LowerCall hooks, this helps legalize find the
  // CALLSEQ_BEGIN node.
  ArgChains.push_back(Chain);

  // Add a chain value for each stack argument that overlaps the area being
  // clobbered.
  for (SDNode::use_iterator U = DAG.getEntryNode().getNode()->use_begin(),
                            UE = DAG.getEntryNode().getNode()->use_end();
       U != UE; ++U)
    if (LoadSDNode *L = dyn_cast<LoadSDNode>(*U))
      if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr()))
        if (FI->getIndex() < 0) {
          int64_t InFirstByte = MFI->getObjectOffset(FI->getIndex());
          int64_t InLastByte = InFirstByte;
          InLastByte += MFI->getObjectSize(FI->getIndex()) - 1;

          if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
              (FirstByte <= InFirstByte && InFirstByte <= LastByte))
            ArgChains.push_back(SDValue(L, 1));
        }

  // Build a tokenfactor for all the chains.
  return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
}

bool AArch64TargetLowering::DoesCalleeRestoreStack(CallingConv::ID CallCC,
                                                   bool TailCallOpt) const {
  return CallCC == CallingConv::Fast && TailCallOpt;
}

bool AArch64TargetLowering::IsTailCallConvention(CallingConv::ID CallCC) const {
  return CallCC == CallingConv::Fast;
}

/// LowerCall - Lower a call to a callseq_start + CALL + callseq_end chain,
/// and add input and output parameter nodes.
SDValue
AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
                                 SmallVectorImpl<SDValue> &InVals) const {
  SelectionDAG &DAG = CLI.DAG;
  SDLoc &DL = CLI.DL;
  SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
  SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
  SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
  SDValue Chain = CLI.Chain;
  SDValue Callee = CLI.Callee;
  bool &IsTailCall = CLI.IsTailCall;
  CallingConv::ID CallConv = CLI.CallConv;
  bool IsVarArg = CLI.IsVarArg;

  MachineFunction &MF = DAG.getMachineFunction();
  bool IsStructRet = (Outs.empty()) ? false : Outs[0].Flags.isSRet();
  bool IsThisReturn = false;

  AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
  bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
  bool IsSibCall = false;

  if (IsTailCall) {
    // Check if it's really possible to do a tail call.
    IsTailCall = isEligibleForTailCallOptimization(
        Callee, CallConv, IsVarArg, IsStructRet,
        MF.getFunction()->hasStructRetAttr(), Outs, OutVals, Ins, DAG);
    if (!IsTailCall && CLI.CS && CLI.CS->isMustTailCall())
      report_fatal_error("failed to perform tail call elimination on a call "
                         "site marked musttail");

    // A sibling call is one where we're under the usual C ABI and not planning
    // to change that but can still do a tail call:
    if (!TailCallOpt && IsTailCall)
      IsSibCall = true;

    if (IsTailCall)
      ++NumTailCalls;
  }

  // Analyze operands of the call, assigning locations to each operand.
  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), ArgLocs,
                 *DAG.getContext());

  if (IsVarArg) {
    // Handle fixed and variable vector arguments differently.
    // Variable vector arguments always go into memory.
    unsigned NumArgs = Outs.size();

    for (unsigned i = 0; i != NumArgs; ++i) {
      MVT ArgVT = Outs[i].VT;
      ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
      CCAssignFn *AssignFn = CCAssignFnForCall(CallConv,
                                               /*IsVarArg=*/ !Outs[i].IsFixed);
      bool Res = AssignFn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo);
      assert(!Res && "Call operand has unhandled type");
      (void)Res;
    }
  } else {
    // At this point, Outs[].VT may already be promoted to i32. To correctly
    // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and
    // i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT.
    // Since AnalyzeCallOperands uses Ins[].VT for both ValVT and LocVT, here
    // we use a special version of AnalyzeCallOperands to pass in ValVT and
    // LocVT.
    unsigned NumArgs = Outs.size();
    for (unsigned i = 0; i != NumArgs; ++i) {
      MVT ValVT = Outs[i].VT;
      // Get type of the original argument.
      EVT ActualVT = getValueType(CLI.getArgs()[Outs[i].OrigArgIndex].Ty,
                                  /*AllowUnknown*/ true);
      MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : ValVT;
      ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
      // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
      if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
        ValVT = MVT::i8;
      else if (ActualMVT == MVT::i16)
        ValVT = MVT::i16;

      CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, /*IsVarArg=*/false);
      bool Res = AssignFn(i, ValVT, ValVT, CCValAssign::Full, ArgFlags, CCInfo);
      assert(!Res && "Call operand has unhandled type");
      (void)Res;
    }
  }

  // Get a count of how many bytes are to be pushed on the stack.
  unsigned NumBytes = CCInfo.getNextStackOffset();

  if (IsSibCall) {
    // Since we're not changing the ABI to make this a tail call, the memory
    // operands are already available in the caller's incoming argument space.
    NumBytes = 0;
  }

  // FPDiff is the byte offset of the call's argument area from the callee's.
  // Stores to callee stack arguments will be placed in FixedStackSlots offset
  // by this amount for a tail call. In a sibling call it must be 0 because the
  // caller will deallocate the entire stack and the callee still expects its
  // arguments to begin at SP+0. Completely unused for non-tail calls.
  int FPDiff = 0;

  if (IsTailCall && !IsSibCall) {
    unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea();

    // Since the callee will pop the argument stack as a tail call, we must
    // keep the popped size 16-byte aligned.
    NumBytes = RoundUpToAlignment(NumBytes, 16);

    // FPDiff will be negative if this tail call requires more space than we
    // would automatically have in our incoming argument space. Positive if we
    // can actually shrink the stack.
    FPDiff = NumReusableBytes - NumBytes;

    // The stack pointer must be 16-byte aligned at all times it's used for a
    // memory operation, which in practice means at *all* times and in
    // particular across call boundaries. Therefore our own arguments started
    // at a 16-byte aligned SP and the delta applied for the tail call should
    // satisfy the same constraint.
    assert(FPDiff % 16 == 0 && "unaligned stack on tail call");
  }
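  // For example (illustrative): with 32 reusable bytes of incoming argument
  // space and a tail call needing 48 bytes, FPDiff == 32 - 48 == -16 and the
  // callee's stack arguments are written 16 bytes below our own; with only 16
  // bytes needed, FPDiff == +16 and the stack effectively shrinks.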
  // Adjust the stack pointer for the new arguments...
  // These operations are automatically eliminated by the prolog/epilog pass.
  if (!IsSibCall)
    Chain =
        DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true), DL);

  SDValue StackPtr = DAG.getCopyFromReg(Chain, DL, AArch64::SP, getPointerTy());

  SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
  SmallVector<SDValue, 8> MemOpChains;

  // Walk the register/memloc assignments, inserting copies/loads.
  for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); i != e;
       ++i, ++realArgIdx) {
    CCValAssign &VA = ArgLocs[i];
    SDValue Arg = OutVals[realArgIdx];
    ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;

    // Promote the value if needed.
    switch (VA.getLocInfo()) {
    default:
      llvm_unreachable("Unknown loc info!");
    case CCValAssign::Full:
      break;
    case CCValAssign::SExt:
      Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
      break;
    case CCValAssign::ZExt:
      Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
      break;
    case CCValAssign::AExt:
      if (Outs[realArgIdx].ArgVT == MVT::i1) {
        // AAPCS requires i1 to be zero-extended to 8-bits by the caller.
        Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
        Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i8, Arg);
      }
      Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
      break;
    case CCValAssign::BCvt:
      Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
      break;
    case CCValAssign::FPExt:
      Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
      break;
    }

    if (VA.isRegLoc()) {
      if (realArgIdx == 0 && Flags.isReturned() && Outs[0].VT == MVT::i64) {
        assert(VA.getLocVT() == MVT::i64 &&
               "unexpected calling convention register assignment");
        assert(!Ins.empty() && Ins[0].VT == MVT::i64 &&
               "unexpected use of 'returned'");
        IsThisReturn = true;
      }
      RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
    } else {
      assert(VA.isMemLoc());

      SDValue DstAddr;
      MachinePointerInfo DstInfo;

      // FIXME: This works on big-endian for composite byvals, which are the
      // common case. It should also work for fundamental types too.
      uint32_t BEAlign = 0;
      unsigned OpSize = Flags.isByVal() ? Flags.getByValSize() * 8
                                        : VA.getValVT().getSizeInBits();
      OpSize = (OpSize + 7) / 8;
      if (!Subtarget->isLittleEndian() && !Flags.isByVal() &&
          !Flags.isInConsecutiveRegs()) {
        if (OpSize < 8)
          BEAlign = 8 - OpSize;
      }
      unsigned LocMemOffset = VA.getLocMemOffset();
      int32_t Offset = LocMemOffset + BEAlign;
      SDValue PtrOff = DAG.getIntPtrConstant(Offset);
      PtrOff = DAG.getNode(ISD::ADD, DL, getPointerTy(), StackPtr, PtrOff);

      if (IsTailCall) {
        Offset = Offset + FPDiff;
        int FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true);

        DstAddr = DAG.getFrameIndex(FI, getPointerTy());
        DstInfo = MachinePointerInfo::getFixedStack(FI);

        // Make sure any stack arguments overlapping with where we're storing
        // are loaded before this eventual operation. Otherwise they'll be
        // clobbered.
        Chain = addTokenForArgument(Chain, DAG, MF.getFrameInfo(), FI);
      } else {
        SDValue PtrOff = DAG.getIntPtrConstant(Offset);

        DstAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), StackPtr, PtrOff);
        DstInfo = MachinePointerInfo::getStack(LocMemOffset);
      }

      if (Outs[i].Flags.isByVal()) {
        SDValue SizeNode =
            DAG.getConstant(Outs[i].Flags.getByValSize(), MVT::i64);
        SDValue Cpy = DAG.getMemcpy(
            Chain, DL, DstAddr, Arg, SizeNode, Outs[i].Flags.getByValAlign(),
            /*isVol = */ false,
            /*AlwaysInline = */ false, DstInfo, MachinePointerInfo());

        MemOpChains.push_back(Cpy);
      } else {
        // Since we pass i1/i8/i16 as i1/i8/i16 on stack and Arg is already
        // promoted to a legal register type i32, we should truncate Arg back
        // to i1/i8/i16.
        if (VA.getValVT() == MVT::i1 || VA.getValVT() == MVT::i8 ||
            VA.getValVT() == MVT::i16)
          Arg = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Arg);

        SDValue Store =
            DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, false, false, 0);
        MemOpChains.push_back(Store);
      }
    }
  }

  if (!MemOpChains.empty())
    Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);

  // Build a sequence of copy-to-reg nodes chained together with token chain
  // and flag operands which copy the outgoing args into the appropriate regs.
  SDValue InFlag;
  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
    Chain = DAG.getCopyToReg(Chain, DL, RegsToPass[i].first,
                             RegsToPass[i].second, InFlag);
    InFlag = Chain.getValue(1);
  }

  // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
  // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
  // node so that legalize doesn't hack it.
  if (getTargetMachine().getCodeModel() == CodeModel::Large &&
      Subtarget->isTargetMachO()) {
    if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
      const GlobalValue *GV = G->getGlobal();
      bool InternalLinkage = GV->hasInternalLinkage();
      if (InternalLinkage)
        Callee = DAG.getTargetGlobalAddress(GV, DL, getPointerTy(), 0, 0);
      else {
        Callee = DAG.getTargetGlobalAddress(GV, DL, getPointerTy(), 0,
                                            AArch64II::MO_GOT);
        Callee = DAG.getNode(AArch64ISD::LOADgot, DL, getPointerTy(), Callee);
      }
    } else if (ExternalSymbolSDNode *S =
                   dyn_cast<ExternalSymbolSDNode>(Callee)) {
      const char *Sym = S->getSymbol();
      Callee =
          DAG.getTargetExternalSymbol(Sym, getPointerTy(), AArch64II::MO_GOT);
      Callee = DAG.getNode(AArch64ISD::LOADgot, DL, getPointerTy(), Callee);
    }
  } else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
    const GlobalValue *GV = G->getGlobal();
    Callee = DAG.getTargetGlobalAddress(GV, DL, getPointerTy(), 0, 0);
  } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
    const char *Sym = S->getSymbol();
    Callee = DAG.getTargetExternalSymbol(Sym, getPointerTy(), 0);
  }

  // We don't usually want to end the call-sequence here because we would tidy
  // the frame up *after* the call, however in the ABI-changing tail-call case
  // we've carefully laid out the parameters so that when sp is reset they'll
  // be in the correct location.
  if (IsTailCall && !IsSibCall) {
    Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
                               DAG.getIntPtrConstant(0, true), InFlag, DL);
    InFlag = Chain.getValue(1);
  }

  std::vector<SDValue> Ops;
  Ops.push_back(Chain);
  Ops.push_back(Callee);

  if (IsTailCall) {
    // Each tail call may have to adjust the stack by a different amount, so
    // this information must travel along with the operation for eventual
    // consumption by emitEpilogue.
    Ops.push_back(DAG.getTargetConstant(FPDiff, MVT::i32));
  }

  // Add argument registers to the end of the list so that they are known live
  // into the call.
  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
    Ops.push_back(DAG.getRegister(RegsToPass[i].first,
                                  RegsToPass[i].second.getValueType()));

  // Add a register mask operand representing the call-preserved registers.
  const uint32_t *Mask;
  const TargetRegisterInfo *TRI =
      getTargetMachine().getSubtargetImpl()->getRegisterInfo();
  const AArch64RegisterInfo *ARI =
      static_cast<const AArch64RegisterInfo *>(TRI);
  if (IsThisReturn) {
    // For 'this' returns, use the X0-preserving mask if applicable.
    Mask = ARI->getThisReturnPreservedMask(CallConv);
    if (!Mask) {
      IsThisReturn = false;
      Mask = ARI->getCallPreservedMask(CallConv);
    }
  } else
    Mask = ARI->getCallPreservedMask(CallConv);

  assert(Mask && "Missing call preserved mask for calling convention");
  Ops.push_back(DAG.getRegisterMask(Mask));

  if (InFlag.getNode())
    Ops.push_back(InFlag);

  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);

  // If we're doing a tail call, use a TC_RETURN here rather than an
  // actual call instruction.
  if (IsTailCall)
    return DAG.getNode(AArch64ISD::TC_RETURN, DL, NodeTys, Ops);

  // Returns a chain and a flag for retval copy to use.
  Chain = DAG.getNode(AArch64ISD::CALL, DL, NodeTys, Ops);
  InFlag = Chain.getValue(1);

  uint64_t CalleePopBytes = DoesCalleeRestoreStack(CallConv, TailCallOpt)
                                ? RoundUpToAlignment(NumBytes, 16)
                                : 0;

  Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
                             DAG.getIntPtrConstant(CalleePopBytes, true),
                             InFlag, DL);
  if (!Ins.empty())
    InFlag = Chain.getValue(1);

  // Handle result values, copying them out of physregs into vregs that we
  // return.
  return LowerCallResult(Chain, InFlag, CallConv, IsVarArg, Ins, DL, DAG,
                         InVals, IsThisReturn,
                         IsThisReturn ? OutVals[0] : SDValue());
}

bool AArch64TargetLowering::CanLowerReturn(
    CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
    const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
  CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS
                          ? RetCC_AArch64_WebKit_JS
                          : RetCC_AArch64_AAPCS;
  SmallVector<CCValAssign, 16> RVLocs;
  CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
  return CCInfo.CheckReturn(Outs, RetCC);
}
SDValue
AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
                                   bool isVarArg,
                                   const SmallVectorImpl<ISD::OutputArg> &Outs,
                                   const SmallVectorImpl<SDValue> &OutVals,
                                   SDLoc DL, SelectionDAG &DAG) const {
  CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS
                          ? RetCC_AArch64_WebKit_JS
                          : RetCC_AArch64_AAPCS;
  SmallVector<CCValAssign, 16> RVLocs;
  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
                 *DAG.getContext());
  CCInfo.AnalyzeReturn(Outs, RetCC);

  // Copy the result values into the output registers.
  SDValue Flag;
  SmallVector<SDValue, 4> RetOps(1, Chain);
  for (unsigned i = 0, realRVLocIdx = 0; i != RVLocs.size();
       ++i, ++realRVLocIdx) {
    CCValAssign &VA = RVLocs[i];
    assert(VA.isRegLoc() && "Can only return in registers!");
    SDValue Arg = OutVals[realRVLocIdx];

    switch (VA.getLocInfo()) {
    default:
      llvm_unreachable("Unknown loc info!");
    case CCValAssign::Full:
      if (Outs[i].ArgVT == MVT::i1) {
        // AAPCS requires i1 to be zero-extended to i8 by the producer of the
        // value. This is strictly redundant on Darwin (which uses "zeroext
        // i1"), but will be optimised out before ISel.
        Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
        Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
      }
      break;
    case CCValAssign::BCvt:
      Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
      break;
    }

    Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Flag);
    Flag = Chain.getValue(1);
    RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
  }

  RetOps[0] = Chain; // Update chain.

  // Add the flag if we have it.
  if (Flag.getNode())
    RetOps.push_back(Flag);

  return DAG.getNode(AArch64ISD::RET_FLAG, DL, MVT::Other, RetOps);
}

//===----------------------------------------------------------------------===//
// Other Lowering Code
//===----------------------------------------------------------------------===//
SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op,
                                                  SelectionDAG &DAG) const {
  EVT PtrVT = getPointerTy();
  SDLoc DL(Op);
  const GlobalAddressSDNode *GN = cast<GlobalAddressSDNode>(Op);
  const GlobalValue *GV = GN->getGlobal();
  unsigned char OpFlags =
      Subtarget->ClassifyGlobalReference(GV, getTargetMachine());

  assert(cast<GlobalAddressSDNode>(Op)->getOffset() == 0 &&
         "unexpected offset in global node");

  // This also catches the large code model case for Darwin.
  if ((OpFlags & AArch64II::MO_GOT) != 0) {
    SDValue GotAddr = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, OpFlags);
    // FIXME: Once remat is capable of dealing with instructions with register
    // operands, expand this into two nodes instead of using a wrapper node.
    return DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, GotAddr);
  }

  if ((OpFlags & AArch64II::MO_CONSTPOOL) != 0) {
    assert(getTargetMachine().getCodeModel() == CodeModel::Small &&
           "use of MO_CONSTPOOL only supported on small model");
    SDValue Hi = DAG.getTargetConstantPool(GV, PtrVT, 0, 0, AArch64II::MO_PAGE);
    SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, Hi);
    unsigned char LoFlags = AArch64II::MO_PAGEOFF | AArch64II::MO_NC;
    SDValue Lo = DAG.getTargetConstantPool(GV, PtrVT, 0, 0, LoFlags);
    SDValue PoolAddr = DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, Lo);
    SDValue GlobalAddr = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), PoolAddr,
                                     MachinePointerInfo::getConstantPool(),
                                     /*isVolatile=*/ false,
                                     /*isNonTemporal=*/ true,
                                     /*isInvariant=*/ true, 8);
    if (GN->getOffset() != 0)
      return DAG.getNode(ISD::ADD, DL, PtrVT, GlobalAddr,
                         DAG.getConstant(GN->getOffset(), PtrVT));
    return GlobalAddr;
  }

  if (getTargetMachine().getCodeModel() == CodeModel::Large) {
    const unsigned char MO_NC = AArch64II::MO_NC;
    return DAG.getNode(
        AArch64ISD::WrapperLarge, DL, PtrVT,
        DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_G3),
        DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_G2 | MO_NC),
        DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_G1 | MO_NC),
        DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_G0 | MO_NC));
  } else {
    // Use ADRP/ADD or ADRP/LDR for everything else: the small model on ELF and
    // the only correct model on Darwin.
    SDValue Hi = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0,
                                            OpFlags | AArch64II::MO_PAGE);
    unsigned char LoFlags = OpFlags | AArch64II::MO_PAGEOFF | AArch64II::MO_NC;
    SDValue Lo = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, LoFlags);

    SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, Hi);
    return DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, Lo);
  }
}
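// For the small code model, the ADRP/ADDlow pair built by LowerGlobalAddress
// above emits, illustratively:
//     adrp x0, var              ; address of the 4KB page containing var
//     add  x0, x0, :lo12:var    ; add the low 12 bits (page offset)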
///
/// If the address of _var's descriptor *is* known to the linker, then it can
/// change the first "ldr" instruction to an appropriate "add x0, x0, #imm" for
/// a slight efficiency gain.
SDValue
AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op,
                                                   SelectionDAG &DAG) const {
  assert(Subtarget->isTargetDarwin() && "TLS only supported on Darwin");

  SDLoc DL(Op);
  MVT PtrVT = getPointerTy();
  const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();

  SDValue TLVPAddr =
      DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
  SDValue DescAddr = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TLVPAddr);

  // The first entry in the descriptor is a function pointer that we must call
  // to obtain the address of the variable.
  SDValue Chain = DAG.getEntryNode();
  SDValue FuncTLVGet =
      DAG.getLoad(MVT::i64, DL, Chain, DescAddr, MachinePointerInfo::getGOT(),
                  false, true, true, 8);
  Chain = FuncTLVGet.getValue(1);

  MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
  MFI->setAdjustsStack(true);

  // TLS calls preserve all registers except those that absolutely must be
  // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be
  // silly).
  const TargetRegisterInfo *TRI =
      getTargetMachine().getSubtargetImpl()->getRegisterInfo();
  const AArch64RegisterInfo *ARI =
      static_cast<const AArch64RegisterInfo *>(TRI);
  const uint32_t *Mask = ARI->getTLSCallPreservedMask();

  // Finally, we can make the call. This is just a degenerate version of a
  // normal AArch64 call node: x0 takes the address of the descriptor, and
  // returns the address of the variable in this thread.
  Chain = DAG.getCopyToReg(Chain, DL, AArch64::X0, DescAddr, SDValue());
  Chain =
      DAG.getNode(AArch64ISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue),
                  Chain, FuncTLVGet, DAG.getRegister(AArch64::X0, MVT::i64),
                  DAG.getRegisterMask(Mask), Chain.getValue(1));
  return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Chain.getValue(1));
}

/// When accessing thread-local variables under either the general-dynamic or
/// local-dynamic system, we make a "TLS-descriptor" call. The variable will
/// have a descriptor, accessible via a PC-relative ADRP, and whose first entry
/// is a function pointer to carry out the resolution.
///
/// The sequence is:
///     adrp x0, :tlsdesc:var
///     ldr x1, [x0, #:tlsdesc_lo12:var]
///     add x0, x0, #:tlsdesc_lo12:var
///     .tlsdesccall var
///     blr x1
///     (TPIDR_EL0 offset now in x0)
///
/// The above sequence must be produced unscheduled, to enable the linker to
/// optimize/relax this sequence.
/// Therefore, a pseudo-instruction (TLSDESC_CALLSEQ) is used to represent the
/// above sequence, and expanded really late in the compilation flow, to ensure
/// the sequence is produced as per above.
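///
/// As an illustrative example (not taken from this repository's tests), IR
/// such as:
///     @x = thread_local global i32 0
///     %val = load i32* @x
/// reaches this lowering via ISD::GlobalTLSAddress once a general-dynamic or
/// local-dynamic access model has been chosen for @x.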
SDValue AArch64TargetLowering::LowerELFTLSDescCallSeq(SDValue SymAddr, SDLoc DL,
                                                      SelectionDAG &DAG) const {
  EVT PtrVT = getPointerTy();

  SDValue Chain = DAG.getEntryNode();
  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);

  SmallVector<SDValue, 2> Ops;
  Ops.push_back(Chain);
  Ops.push_back(SymAddr);

  Chain = DAG.getNode(AArch64ISD::TLSDESC_CALLSEQ, DL, NodeTys, Ops);
  SDValue Glue = Chain.getValue(1);

  return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Glue);
}

SDValue
AArch64TargetLowering::LowerELFGlobalTLSAddress(SDValue Op,
                                                SelectionDAG &DAG) const {
  assert(Subtarget->isTargetELF() && "This function expects an ELF target");
  assert(getTargetMachine().getCodeModel() == CodeModel::Small &&
         "ELF TLS only supported in small memory model");
  // Different choices can be made for the maximum size of the TLS area for a
  // module. For the small address model, the default TLS size is 16MiB and the
  // maximum TLS size is 4GiB.
  // FIXME: add -mtls-size command line option and make it control the 16MiB
  // vs. 4GiB code sequence generation.
  const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);

  TLSModel::Model Model = getTargetMachine().getTLSModel(GA->getGlobal());
  if (!EnableAArch64ELFLocalDynamicTLSGeneration) {
    if (Model == TLSModel::LocalDynamic)
      Model = TLSModel::GeneralDynamic;
  }

  SDValue TPOff;
  EVT PtrVT = getPointerTy();
  SDLoc DL(Op);
  const GlobalValue *GV = GA->getGlobal();

  SDValue ThreadBase = DAG.getNode(AArch64ISD::THREAD_POINTER, DL, PtrVT);

  if (Model == TLSModel::LocalExec) {
    SDValue HiVar = DAG.getTargetGlobalAddress(
        GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
    SDValue LoVar = DAG.getTargetGlobalAddress(
        GV, DL, PtrVT, 0,
        AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);

    SDValue TPWithOff_lo =
        SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, ThreadBase,
                                   HiVar, DAG.getTargetConstant(0, MVT::i32)),
                0);
    SDValue TPWithOff =
        SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPWithOff_lo,
                                   LoVar, DAG.getTargetConstant(0, MVT::i32)),
                0);
    return TPWithOff;
  } else if (Model == TLSModel::InitialExec) {
    TPOff = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
    TPOff = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TPOff);
  } else if (Model == TLSModel::LocalDynamic) {
    // Local-dynamic accesses proceed in two phases. A general-dynamic TLS
    // descriptor call against the special symbol _TLS_MODULE_BASE_ to calculate
    // the beginning of the module's TLS region, followed by a DTPREL offset
    // calculation.

    // These accesses will need deduplicating if there's more than one.
    AArch64FunctionInfo *MFI =
        DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
    MFI->incNumLocalDynamicTLSAccesses();

    // The call needs a relocation too for linker relaxation. It doesn't make
    // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
    // the address.
    SDValue SymAddr = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT,
                                                  AArch64II::MO_TLS);

    // Now we can calculate the offset from TPIDR_EL0 to this module's
    // thread-local area.
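    // Conceptually (illustrative; this is not emitted literally):
    //   TPOff = tlsdesc_resolve(_TLS_MODULE_BASE_)  // module base, TP-relative
    //         + dtprel(var)                         // offset within the module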
    TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG);

    // Now use :dtprel_whatever: operations to calculate this variable's offset
    // in its thread-storage area.
    SDValue HiVar = DAG.getTargetGlobalAddress(
        GV, DL, MVT::i64, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
    SDValue LoVar = DAG.getTargetGlobalAddress(
        GV, DL, MVT::i64, 0,
        AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);

    TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, HiVar,
                                       DAG.getTargetConstant(0, MVT::i32)),
                    0);
    TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, LoVar,
                                       DAG.getTargetConstant(0, MVT::i32)),
                    0);
  } else if (Model == TLSModel::GeneralDynamic) {
    // The call needs a relocation too for linker relaxation. It doesn't make
    // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
    // the address.
    SDValue SymAddr =
        DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);

    // Finally we can make a call to calculate the offset from tpidr_el0.
    TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG);
  } else
    llvm_unreachable("Unsupported ELF TLS access model");

  return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
}

SDValue AArch64TargetLowering::LowerGlobalTLSAddress(SDValue Op,
                                                     SelectionDAG &DAG) const {
  if (Subtarget->isTargetDarwin())
    return LowerDarwinGlobalTLSAddress(Op, DAG);
  else if (Subtarget->isTargetELF())
    return LowerELFGlobalTLSAddress(Op, DAG);

  llvm_unreachable("Unexpected platform trying to use TLS");
}

SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
  SDValue Chain = Op.getOperand(0);
  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
  SDValue LHS = Op.getOperand(2);
  SDValue RHS = Op.getOperand(3);
  SDValue Dest = Op.getOperand(4);
  SDLoc dl(Op);

  // Handle f128 first, since lowering it will result in comparing the return
  // value of a libcall against zero, which is just what the rest of LowerBR_CC
  // is expecting to deal with.
  if (LHS.getValueType() == MVT::f128) {
    softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl);

    // If softenSetCCOperands returned a scalar, we need to compare the result
    // against zero to select between true and false values.
    if (!RHS.getNode()) {
      RHS = DAG.getConstant(0, LHS.getValueType());
      CC = ISD::SETNE;
    }
  }

  // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
  // instruction.
  unsigned Opc = LHS.getOpcode();
  if (LHS.getResNo() == 1 && isa<ConstantSDNode>(RHS) &&
      cast<ConstantSDNode>(RHS)->isOne() &&
      (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
       Opc == ISD::USUBO || Opc == ISD::SMULO || Opc == ISD::UMULO)) {
    assert((CC == ISD::SETEQ || CC == ISD::SETNE) &&
           "Unexpected condition code.");
    // Only lower legal XALUO ops.
    if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS->getValueType(0)))
      return SDValue();

    // The actual operation with overflow check.
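    // For example (illustrative), IR of the form:
    //   %s = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %a, i32 %b)
    //   %o = extractvalue { i32, i1 } %s, 1
    //   br i1 %o, label %ovf, label %cont
    // reaches here with LHS being result 1 of the SADDO node, and lowers to
    // an ADDS followed by a conditional branch on the overflow flag.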
    AArch64CC::CondCode OFCC;
    SDValue Value, Overflow;
    std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, LHS.getValue(0), DAG);

    if (CC == ISD::SETNE)
      OFCC = getInvertedCondCode(OFCC);
    SDValue CCVal = DAG.getConstant(OFCC, MVT::i32);

    return DAG.getNode(AArch64ISD::BRCOND, SDLoc(LHS), MVT::Other, Chain, Dest,
                       CCVal, Overflow);
  }

  if (LHS.getValueType().isInteger()) {
    assert((LHS.getValueType() == RHS.getValueType()) &&
           (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));

    // If the RHS of the comparison is zero, we can potentially fold this
    // to a specialized branch.
    const ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS);
    if (RHSC && RHSC->getZExtValue() == 0) {
      if (CC == ISD::SETEQ) {
        // See if we can use a TBZ to fold in an AND as well.
        // TBZ has a smaller branch displacement than CBZ. If the offset is
        // out of bounds, a late MI-layer pass rewrites branches.
        // 403.gcc is an example that hits this case.
        if (LHS.getOpcode() == ISD::AND &&
            isa<ConstantSDNode>(LHS.getOperand(1)) &&
            isPowerOf2_64(LHS.getConstantOperandVal(1))) {
          SDValue Test = LHS.getOperand(0);
          uint64_t Mask = LHS.getConstantOperandVal(1);
          return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, Test,
                             DAG.getConstant(Log2_64(Mask), MVT::i64), Dest);
        }

        return DAG.getNode(AArch64ISD::CBZ, dl, MVT::Other, Chain, LHS, Dest);
      } else if (CC == ISD::SETNE) {
        // See if we can use a TBZ to fold in an AND as well.
        // TBZ has a smaller branch displacement than CBZ. If the offset is
        // out of bounds, a late MI-layer pass rewrites branches.
        // 403.gcc is an example that hits this case.
        if (LHS.getOpcode() == ISD::AND &&
            isa<ConstantSDNode>(LHS.getOperand(1)) &&
            isPowerOf2_64(LHS.getConstantOperandVal(1))) {
          SDValue Test = LHS.getOperand(0);
          uint64_t Mask = LHS.getConstantOperandVal(1);
          return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, Test,
                             DAG.getConstant(Log2_64(Mask), MVT::i64), Dest);
        }

        return DAG.getNode(AArch64ISD::CBNZ, dl, MVT::Other, Chain, LHS, Dest);
      } else if (CC == ISD::SETLT && LHS.getOpcode() != ISD::AND) {
        // Don't combine AND since emitComparison converts the AND to an ANDS
        // (a.k.a. TST) and the test in the test bit and branch instruction
        // becomes redundant. This would also increase register pressure.
        uint64_t Mask = LHS.getValueType().getSizeInBits() - 1;
        return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, LHS,
                           DAG.getConstant(Mask, MVT::i64), Dest);
      }
    }
    if (RHSC && RHSC->getSExtValue() == -1 && CC == ISD::SETGT &&
        LHS.getOpcode() != ISD::AND) {
      // Don't combine AND since emitComparison converts the AND to an ANDS
      // (a.k.a. TST) and the test in the test bit and branch instruction
      // becomes redundant. This would also increase register pressure.
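      // e.g. (illustrative) "br (setgt x, -1)" only tests the sign bit, so
      // for i64 it becomes "tbz x, #63, dest" (bit 31 for i32).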
      uint64_t Mask = LHS.getValueType().getSizeInBits() - 1;
      return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, LHS,
                         DAG.getConstant(Mask, MVT::i64), Dest);
    }

    SDValue CCVal;
    SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
    return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
                       Cmp);
  }

  assert(LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);

  // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
  // clean. Some of them require two branches to implement.
  SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
  AArch64CC::CondCode CC1, CC2;
  changeFPCCToAArch64CC(CC, CC1, CC2);
  SDValue CC1Val = DAG.getConstant(CC1, MVT::i32);
  SDValue BR1 =
      DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CC1Val, Cmp);
  if (CC2 != AArch64CC::AL) {
    SDValue CC2Val = DAG.getConstant(CC2, MVT::i32);
    return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, BR1, Dest, CC2Val,
                       Cmp);
  }

  return BR1;
}

SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op,
                                              SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  SDLoc DL(Op);

  SDValue In1 = Op.getOperand(0);
  SDValue In2 = Op.getOperand(1);
  EVT SrcVT = In2.getValueType();
  if (SrcVT != VT) {
    if (SrcVT == MVT::f32 && VT == MVT::f64)
      In2 = DAG.getNode(ISD::FP_EXTEND, DL, VT, In2);
    else if (SrcVT == MVT::f64 && VT == MVT::f32)
      In2 = DAG.getNode(ISD::FP_ROUND, DL, VT, In2, DAG.getIntPtrConstant(0));
    else
      // FIXME: Src type is different, bail out for now. Can VT really be a
      // vector type?
      return SDValue();
  }

  EVT VecVT;
  EVT EltVT;
  SDValue EltMask, VecVal1, VecVal2;
  if (VT == MVT::f32 || VT == MVT::v2f32 || VT == MVT::v4f32) {
    EltVT = MVT::i32;
    VecVT = MVT::v4i32;
    EltMask = DAG.getConstant(0x80000000ULL, EltVT);

    if (!VT.isVector()) {
      VecVal1 = DAG.getTargetInsertSubreg(AArch64::ssub, DL, VecVT,
                                          DAG.getUNDEF(VecVT), In1);
      VecVal2 = DAG.getTargetInsertSubreg(AArch64::ssub, DL, VecVT,
                                          DAG.getUNDEF(VecVT), In2);
    } else {
      VecVal1 = DAG.getNode(ISD::BITCAST, DL, VecVT, In1);
      VecVal2 = DAG.getNode(ISD::BITCAST, DL, VecVT, In2);
    }
  } else if (VT == MVT::f64 || VT == MVT::v2f64) {
    EltVT = MVT::i64;
    VecVT = MVT::v2i64;

    // We want to materialize a mask with the high bit set, but the AdvSIMD
    // immediate moves cannot materialize that in a single instruction for
    // 64-bit elements. Instead, materialize zero and then negate it.
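    // (Illustrative) FNEG of +0.0 flips only the sign bit, so negating a zero
    // vector produces 0x8000000000000000 in each 64-bit lane, which is exactly
    // the sign mask we need.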
    EltMask = DAG.getConstant(0, EltVT);

    if (!VT.isVector()) {
      VecVal1 = DAG.getTargetInsertSubreg(AArch64::dsub, DL, VecVT,
                                          DAG.getUNDEF(VecVT), In1);
      VecVal2 = DAG.getTargetInsertSubreg(AArch64::dsub, DL, VecVT,
                                          DAG.getUNDEF(VecVT), In2);
    } else {
      VecVal1 = DAG.getNode(ISD::BITCAST, DL, VecVT, In1);
      VecVal2 = DAG.getNode(ISD::BITCAST, DL, VecVT, In2);
    }
  } else {
    llvm_unreachable("Invalid type for copysign!");
  }

  std::vector<SDValue> BuildVectorOps;
  for (unsigned i = 0; i < VecVT.getVectorNumElements(); ++i)
    BuildVectorOps.push_back(EltMask);

  SDValue BuildVec = DAG.getNode(ISD::BUILD_VECTOR, DL, VecVT, BuildVectorOps);

  // If we couldn't materialize the mask above, then the mask vector will be
  // the zero vector, and we need to negate it here.
  if (VT == MVT::f64 || VT == MVT::v2f64) {
    BuildVec = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, BuildVec);
    BuildVec = DAG.getNode(ISD::FNEG, DL, MVT::v2f64, BuildVec);
    BuildVec = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, BuildVec);
  }

  SDValue Sel =
      DAG.getNode(AArch64ISD::BIT, DL, VecVT, VecVal1, VecVal2, BuildVec);

  if (VT == MVT::f32)
    return DAG.getTargetExtractSubreg(AArch64::ssub, DL, VT, Sel);
  else if (VT == MVT::f64)
    return DAG.getTargetExtractSubreg(AArch64::dsub, DL, VT, Sel);
  else
    return DAG.getNode(ISD::BITCAST, DL, VT, Sel);
}

SDValue AArch64TargetLowering::LowerCTPOP(SDValue Op, SelectionDAG &DAG) const {
  if (DAG.getMachineFunction().getFunction()->getAttributes().hasAttribute(
          AttributeSet::FunctionIndex, Attribute::NoImplicitFloat))
    return SDValue();

  if (!Subtarget->hasNEON())
    return SDValue();

  // While there is no integer popcount instruction, it can
  // be more efficiently lowered to the following sequence that uses
  // AdvSIMD registers/instructions as long as the copies to/from
  // the AdvSIMD registers are cheap.
  //  FMOV D0, X0        // copy 64-bit int to vector, high bits zero'd
  //  CNT V0.8B, V0.8B   // 8xbyte pop-counts
  //  ADDV B0, V0.8B     // sum 8xbyte pop-counts
  //  UMOV X0, V0.B[0]   // copy byte result back to integer reg
  SDValue Val = Op.getOperand(0);
  SDLoc DL(Op);
  EVT VT = Op.getValueType();
  SDValue ZeroVec = DAG.getUNDEF(MVT::v8i8);

  SDValue VecVal;
  if (VT == MVT::i32) {
    VecVal = DAG.getNode(ISD::BITCAST, DL, MVT::f32, Val);
    VecVal = DAG.getTargetInsertSubreg(AArch64::ssub, DL, MVT::v8i8, ZeroVec,
                                       VecVal);
  } else {
    VecVal = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Val);
  }

  SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v8i8, VecVal);
  SDValue UaddLV = DAG.getNode(
      ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32,
      DAG.getConstant(Intrinsic::aarch64_neon_uaddlv, MVT::i32), CtPop);

  if (VT == MVT::i64)
    UaddLV = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, UaddLV);
  return UaddLV;
}

SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {

  if (Op.getValueType().isVector())
    return LowerVSETCC(Op, DAG);

  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
  SDLoc dl(Op);

  // We chose ZeroOrOneBooleanContents, so use zero and one.
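  // e.g. (illustrative) an i32 "setcc eq" here typically ends up as:
  //   cmp  w0, w1
  //   cset w8, eq        // CSET is an alias of the WZR-based CSINC form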
  EVT VT = Op.getValueType();
  SDValue TVal = DAG.getConstant(1, VT);
  SDValue FVal = DAG.getConstant(0, VT);

  // Handle f128 first, since one possible outcome is a normal integer
  // comparison which gets picked up by the next if statement.
  if (LHS.getValueType() == MVT::f128) {
    softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl);

    // If softenSetCCOperands returned a scalar, use it.
    if (!RHS.getNode()) {
      assert(LHS.getValueType() == Op.getValueType() &&
             "Unexpected setcc expansion!");
      return LHS;
    }
  }

  if (LHS.getValueType().isInteger()) {
    SDValue CCVal;
    SDValue Cmp =
        getAArch64Cmp(LHS, RHS, ISD::getSetCCInverse(CC, true), CCVal, DAG, dl);

    // Note that we inverted the condition above, so we reverse the order of
    // the true and false operands here. This will allow the setcc to be
    // matched to a single CSINC instruction.
    return DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CCVal, Cmp);
  }

  // Now we know we're dealing with FP values.
  assert(LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);

  // If that fails, we'll need to perform an FCMP + CSEL sequence. Go ahead
  // and do the comparison.
  SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);

  AArch64CC::CondCode CC1, CC2;
  changeFPCCToAArch64CC(CC, CC1, CC2);
  if (CC2 == AArch64CC::AL) {
    changeFPCCToAArch64CC(ISD::getSetCCInverse(CC, false), CC1, CC2);
    SDValue CC1Val = DAG.getConstant(CC1, MVT::i32);

    // Note that we inverted the condition above, so we reverse the order of
    // the true and false operands here. This will allow the setcc to be
    // matched to a single CSINC instruction.
    return DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CC1Val, Cmp);
  } else {
    // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't
    // totally clean. Some of them require two CSELs to implement. As is the
    // case here, we emit the first CSEL and then emit a second one using the
    // output of the first as the RHS. We're effectively OR'ing the two CC's
    // together.

    // FIXME: It would be nice if we could match the two CSELs to two CSINCs.
    SDValue CC1Val = DAG.getConstant(CC1, MVT::i32);
    SDValue CS1 =
        DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp);

    SDValue CC2Val = DAG.getConstant(CC2, MVT::i32);
    return DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp);
  }
}

/// A SELECT_CC operation is really some kind of max or min if both values being
/// compared are, in some sense, equal to the results in either case. However,
/// it is permissible to compare f32 values and produce directly extended f64
/// values.
///
/// Extending the comparison operands would also be allowed, but is less likely
/// to happen in practice since their use is right here. Note that truncate
/// operations would *not* be semantically equivalent.
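///
/// For example (illustrative): in
///   select (fcmp ogt float %a, %b), (fpext %a to double), (fpext %b to double)
/// both results are the directly-extended comparison operands, so the whole
/// select can become an f64 FMAX of the extended values.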
static bool selectCCOpsAreFMaxCompatible(SDValue Cmp, SDValue Result) {
  if (Cmp == Result)
    return true;

  ConstantFPSDNode *CCmp = dyn_cast<ConstantFPSDNode>(Cmp);
  ConstantFPSDNode *CResult = dyn_cast<ConstantFPSDNode>(Result);
  if (CCmp && CResult && Cmp.getValueType() == MVT::f32 &&
      Result.getValueType() == MVT::f64) {
    bool Lossy;
    APFloat CmpVal = CCmp->getValueAPF();
    CmpVal.convert(APFloat::IEEEdouble, APFloat::rmNearestTiesToEven, &Lossy);
    return CResult->getValueAPF().bitwiseIsEqual(CmpVal);
  }

  return Result->getOpcode() == ISD::FP_EXTEND && Result->getOperand(0) == Cmp;
}

SDValue AArch64TargetLowering::LowerSELECT(SDValue Op,
                                           SelectionDAG &DAG) const {
  SDValue CC = Op->getOperand(0);
  SDValue TVal = Op->getOperand(1);
  SDValue FVal = Op->getOperand(2);
  SDLoc DL(Op);

  unsigned Opc = CC.getOpcode();
  // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a select
  // instruction.
  if (CC.getResNo() == 1 &&
      (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
       Opc == ISD::USUBO || Opc == ISD::SMULO || Opc == ISD::UMULO)) {
    // Only lower legal XALUO ops.
    if (!DAG.getTargetLoweringInfo().isTypeLegal(CC->getValueType(0)))
      return SDValue();

    AArch64CC::CondCode OFCC;
    SDValue Value, Overflow;
    std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, CC.getValue(0), DAG);
    SDValue CCVal = DAG.getConstant(OFCC, MVT::i32);

    return DAG.getNode(AArch64ISD::CSEL, DL, Op.getValueType(), TVal, FVal,
                       CCVal, Overflow);
  }

  if (CC.getOpcode() == ISD::SETCC)
    return DAG.getSelectCC(DL, CC.getOperand(0), CC.getOperand(1), TVal, FVal,
                           cast<CondCodeSDNode>(CC.getOperand(2))->get());
  else
    return DAG.getSelectCC(DL, CC, DAG.getConstant(0, CC.getValueType()), TVal,
                           FVal, ISD::SETNE);
}

SDValue AArch64TargetLowering::LowerSELECT_CC(SDValue Op,
                                              SelectionDAG &DAG) const {
  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  SDValue TVal = Op.getOperand(2);
  SDValue FVal = Op.getOperand(3);
  SDLoc dl(Op);

  // Handle f128 first, because it will result in a comparison of some RTLIB
  // call result against zero.
  if (LHS.getValueType() == MVT::f128) {
    softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl);

    // If softenSetCCOperands returned a scalar, we need to compare the result
    // against zero to select between true and false values.
    if (!RHS.getNode()) {
      RHS = DAG.getConstant(0, LHS.getValueType());
      CC = ISD::SETNE;
    }
  }

  // Handle integers first.
  if (LHS.getValueType().isInteger()) {
    assert((LHS.getValueType() == RHS.getValueType()) &&
           (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));

    unsigned Opcode = AArch64ISD::CSEL;

    // If both the TVal and the FVal are constants, see if we can swap them in
    // order to form a CSINV or CSINC out of them.
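    // e.g. (illustrative) "c ? -1 : 0" can be swapped to "!c ? 0 : -1" and
    // then matched as CSETM, an alias of CSINV with both sources being the
    // zero register.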
    ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
    ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);

    if (CTVal && CFVal && CTVal->isAllOnesValue() && CFVal->isNullValue()) {
      std::swap(TVal, FVal);
      std::swap(CTVal, CFVal);
      CC = ISD::getSetCCInverse(CC, true);
    } else if (CTVal && CFVal && CTVal->isOne() && CFVal->isNullValue()) {
      std::swap(TVal, FVal);
      std::swap(CTVal, CFVal);
      CC = ISD::getSetCCInverse(CC, true);
    } else if (TVal.getOpcode() == ISD::XOR) {
      // If TVal is a NOT we want to swap TVal and FVal so that we can match
      // with a CSINV rather than a CSEL.
      ConstantSDNode *CVal = dyn_cast<ConstantSDNode>(TVal.getOperand(1));

      if (CVal && CVal->isAllOnesValue()) {
        std::swap(TVal, FVal);
        std::swap(CTVal, CFVal);
        CC = ISD::getSetCCInverse(CC, true);
      }
    } else if (TVal.getOpcode() == ISD::SUB) {
      // If TVal is a negation (SUB from 0) we want to swap TVal and FVal so
      // that we can match with a CSNEG rather than a CSEL.
      ConstantSDNode *CVal = dyn_cast<ConstantSDNode>(TVal.getOperand(0));

      if (CVal && CVal->isNullValue()) {
        std::swap(TVal, FVal);
        std::swap(CTVal, CFVal);
        CC = ISD::getSetCCInverse(CC, true);
      }
    } else if (CTVal && CFVal) {
      const int64_t TrueVal = CTVal->getSExtValue();
      const int64_t FalseVal = CFVal->getSExtValue();
      bool Swap = false;

      // If both TVal and FVal are constants, see if FVal is the
      // inverse/negation/increment of TVal and generate a CSINV/CSNEG/CSINC
      // instead of a CSEL in that case.
      if (TrueVal == ~FalseVal) {
        Opcode = AArch64ISD::CSINV;
      } else if (TrueVal == -FalseVal) {
        Opcode = AArch64ISD::CSNEG;
      } else if (TVal.getValueType() == MVT::i32) {
        // If our operands are only 32-bit wide, make sure we use 32-bit
        // arithmetic for the check whether we can use CSINC. This ensures that
        // the addition in the check will wrap around properly in case there is
        // an overflow (which would not be the case if we do the check with
        // 64-bit arithmetic).
        const uint32_t TrueVal32 = CTVal->getZExtValue();
        const uint32_t FalseVal32 = CFVal->getZExtValue();

        if ((TrueVal32 == FalseVal32 + 1) || (TrueVal32 + 1 == FalseVal32)) {
          Opcode = AArch64ISD::CSINC;

          if (TrueVal32 > FalseVal32) {
            Swap = true;
          }
        }
        // 64-bit check whether we can use CSINC.
      } else if ((TrueVal == FalseVal + 1) || (TrueVal + 1 == FalseVal)) {
        Opcode = AArch64ISD::CSINC;

        if (TrueVal > FalseVal) {
          Swap = true;
        }
      }

      // Swap TVal and FVal if necessary.
      if (Swap) {
        std::swap(TVal, FVal);
        std::swap(CTVal, CFVal);
        CC = ISD::getSetCCInverse(CC, true);
      }

      if (Opcode != AArch64ISD::CSEL) {
        // Drop FVal since we can get its value by simply inverting/negating
        // TVal.
        FVal = TVal;
      }
    }

    SDValue CCVal;
    SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);

    EVT VT = Op.getValueType();
    return DAG.getNode(Opcode, dl, VT, TVal, FVal, CCVal, Cmp);
  }

  // Now we know we're dealing with FP values.
  assert(LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
  assert(LHS.getValueType() == RHS.getValueType());
  EVT VT = Op.getValueType();

  // Try to match this select into a max/min operation, which has a dedicated
  // opcode in the instruction set.
  // FIXME: This is not correct in the presence of NaNs, so we only enable this
  // in no-NaNs mode.
  if (getTargetMachine().Options.NoNaNsFPMath) {
    SDValue MinMaxLHS = TVal, MinMaxRHS = FVal;
    if (selectCCOpsAreFMaxCompatible(LHS, MinMaxRHS) &&
        selectCCOpsAreFMaxCompatible(RHS, MinMaxLHS)) {
      CC = ISD::getSetCCSwappedOperands(CC);
      std::swap(MinMaxLHS, MinMaxRHS);
    }

    if (selectCCOpsAreFMaxCompatible(LHS, MinMaxLHS) &&
        selectCCOpsAreFMaxCompatible(RHS, MinMaxRHS)) {
      switch (CC) {
      default:
        break;
      case ISD::SETGT:
      case ISD::SETGE:
      case ISD::SETUGT:
      case ISD::SETUGE:
      case ISD::SETOGT:
      case ISD::SETOGE:
        return DAG.getNode(AArch64ISD::FMAX, dl, VT, MinMaxLHS, MinMaxRHS);
      case ISD::SETLT:
      case ISD::SETLE:
      case ISD::SETULT:
      case ISD::SETULE:
      case ISD::SETOLT:
      case ISD::SETOLE:
        return DAG.getNode(AArch64ISD::FMIN, dl, VT, MinMaxLHS, MinMaxRHS);
      }
    }
  }

  // If that fails, we'll need to perform an FCMP + CSEL sequence. Go ahead
  // and do the comparison.
  SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);

  // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
  // clean. Some of them require two CSELs to implement.
  AArch64CC::CondCode CC1, CC2;
  changeFPCCToAArch64CC(CC, CC1, CC2);
  SDValue CC1Val = DAG.getConstant(CC1, MVT::i32);
  SDValue CS1 = DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp);

  // If we need a second CSEL, emit it, using the output of the first as the
  // RHS. We're effectively OR'ing the two CC's together.
  if (CC2 != AArch64CC::AL) {
    SDValue CC2Val = DAG.getConstant(CC2, MVT::i32);
    return DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp);
  }

  // Otherwise, return the output of the first CSEL.
  return CS1;
}

SDValue AArch64TargetLowering::LowerJumpTable(SDValue Op,
                                              SelectionDAG &DAG) const {
  // Jump table entries are PC-relative offsets. No additional tweaking
  // is necessary here. Just get the address of the jump table.
  JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
  EVT PtrVT = getPointerTy();
  SDLoc DL(Op);

  if (getTargetMachine().getCodeModel() == CodeModel::Large &&
      !Subtarget->isTargetMachO()) {
    const unsigned char MO_NC = AArch64II::MO_NC;
    return DAG.getNode(
        AArch64ISD::WrapperLarge, DL, PtrVT,
        DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_G3),
        DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_G2 | MO_NC),
        DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_G1 | MO_NC),
        DAG.getTargetJumpTable(JT->getIndex(), PtrVT,
                               AArch64II::MO_G0 | MO_NC));
  }

  SDValue Hi =
      DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_PAGE);
  SDValue Lo = DAG.getTargetJumpTable(JT->getIndex(), PtrVT,
                                      AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
  SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, Hi);
  return DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, Lo);
}

SDValue AArch64TargetLowering::LowerConstantPool(SDValue Op,
                                                 SelectionDAG &DAG) const {
  ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
  EVT PtrVT = getPointerTy();
  SDLoc DL(Op);

  if (getTargetMachine().getCodeModel() == CodeModel::Large) {
    // Use the GOT for the large code model on iOS.
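    // (Illustrative) this produces something along the lines of:
    //   adrp x8, lCPI0_0@GOTPAGE
    //   ldr  x8, [x8, lCPI0_0@GOTPAGEOFF]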
    if (Subtarget->isTargetMachO()) {
      SDValue GotAddr = DAG.getTargetConstantPool(
          CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(),
          AArch64II::MO_GOT);
      return DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, GotAddr);
    }

    const unsigned char MO_NC = AArch64II::MO_NC;
    return DAG.getNode(
        AArch64ISD::WrapperLarge, DL, PtrVT,
        DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(),
                                  CP->getOffset(), AArch64II::MO_G3),
        DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(),
                                  CP->getOffset(), AArch64II::MO_G2 | MO_NC),
        DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(),
                                  CP->getOffset(), AArch64II::MO_G1 | MO_NC),
        DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(),
                                  CP->getOffset(), AArch64II::MO_G0 | MO_NC));
  } else {
    // Use ADRP/ADD or ADRP/LDR for everything else: the small memory model on
    // ELF, the only valid one on Darwin.
    SDValue Hi =
        DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(),
                                  CP->getOffset(), AArch64II::MO_PAGE);
    SDValue Lo = DAG.getTargetConstantPool(
        CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(),
        AArch64II::MO_PAGEOFF | AArch64II::MO_NC);

    SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, Hi);
    return DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, Lo);
  }
}

SDValue AArch64TargetLowering::LowerBlockAddress(SDValue Op,
                                                 SelectionDAG &DAG) const {
  const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
  EVT PtrVT = getPointerTy();
  SDLoc DL(Op);
  if (getTargetMachine().getCodeModel() == CodeModel::Large &&
      !Subtarget->isTargetMachO()) {
    const unsigned char MO_NC = AArch64II::MO_NC;
    return DAG.getNode(
        AArch64ISD::WrapperLarge, DL, PtrVT,
        DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_G3),
        DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_G2 | MO_NC),
        DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_G1 | MO_NC),
        DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_G0 | MO_NC));
  } else {
    SDValue Hi = DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_PAGE);
    SDValue Lo = DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_PAGEOFF |
                                                             AArch64II::MO_NC);
    SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, Hi);
    return DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, Lo);
  }
}

SDValue AArch64TargetLowering::LowerDarwin_VASTART(SDValue Op,
                                                   SelectionDAG &DAG) const {
  AArch64FunctionInfo *FuncInfo =
      DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();

  SDLoc DL(Op);
  SDValue FR =
      DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), getPointerTy());
  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
  return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
                      MachinePointerInfo(SV), false, false, 0);
}

SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op,
                                                  SelectionDAG &DAG) const {
  // The layout of the va_list struct is specified in the AArch64 Procedure Call
  // Standard, section B.3.
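  // In C terms, the structure initialized below matches the AAPCS definition:
  //   struct va_list {
  //     void *__stack;   // offset 0:  next stacked-argument slot
  //     void *__gr_top;  // offset 8:  end of the saved GP register area
  //     void *__vr_top;  // offset 16: end of the saved FP/SIMD register area
  //     int __gr_offs;   // offset 24: negative offset to next saved GP reg
  //     int __vr_offs;   // offset 28: negative offset to next saved FP reg
  //   };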
  MachineFunction &MF = DAG.getMachineFunction();
  AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
  SDLoc DL(Op);

  SDValue Chain = Op.getOperand(0);
  SDValue VAList = Op.getOperand(1);
  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
  SmallVector<SDValue, 4> MemOps;

  // void *__stack at offset 0
  SDValue Stack =
      DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), getPointerTy());
  MemOps.push_back(DAG.getStore(Chain, DL, Stack, VAList,
                                MachinePointerInfo(SV), false, false, 8));

  // void *__gr_top at offset 8
  int GPRSize = FuncInfo->getVarArgsGPRSize();
  if (GPRSize > 0) {
    SDValue GRTop, GRTopAddr;

    GRTopAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList,
                            DAG.getConstant(8, getPointerTy()));

    GRTop = DAG.getFrameIndex(FuncInfo->getVarArgsGPRIndex(), getPointerTy());
    GRTop = DAG.getNode(ISD::ADD, DL, getPointerTy(), GRTop,
                        DAG.getConstant(GPRSize, getPointerTy()));

    MemOps.push_back(DAG.getStore(Chain, DL, GRTop, GRTopAddr,
                                  MachinePointerInfo(SV, 8), false, false, 8));
  }

  // void *__vr_top at offset 16
  int FPRSize = FuncInfo->getVarArgsFPRSize();
  if (FPRSize > 0) {
    SDValue VRTop, VRTopAddr;
    VRTopAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList,
                            DAG.getConstant(16, getPointerTy()));

    VRTop = DAG.getFrameIndex(FuncInfo->getVarArgsFPRIndex(), getPointerTy());
    VRTop = DAG.getNode(ISD::ADD, DL, getPointerTy(), VRTop,
                        DAG.getConstant(FPRSize, getPointerTy()));

    MemOps.push_back(DAG.getStore(Chain, DL, VRTop, VRTopAddr,
                                  MachinePointerInfo(SV, 16), false, false, 8));
  }

  // int __gr_offs at offset 24
  SDValue GROffsAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList,
                                   DAG.getConstant(24, getPointerTy()));
  MemOps.push_back(DAG.getStore(Chain, DL, DAG.getConstant(-GPRSize, MVT::i32),
                                GROffsAddr, MachinePointerInfo(SV, 24), false,
                                false, 4));

  // int __vr_offs at offset 28
  SDValue VROffsAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList,
                                   DAG.getConstant(28, getPointerTy()));
  MemOps.push_back(DAG.getStore(Chain, DL, DAG.getConstant(-FPRSize, MVT::i32),
                                VROffsAddr, MachinePointerInfo(SV, 28), false,
                                false, 4));

  return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
}

SDValue AArch64TargetLowering::LowerVASTART(SDValue Op,
                                            SelectionDAG &DAG) const {
  return Subtarget->isTargetDarwin() ? LowerDarwin_VASTART(Op, DAG)
                                     : LowerAAPCS_VASTART(Op, DAG);
}

SDValue AArch64TargetLowering::LowerVACOPY(SDValue Op,
                                           SelectionDAG &DAG) const {
  // AAPCS has three pointers and two ints (= 32 bytes), Darwin has a single
  // pointer.
  unsigned VaListSize = Subtarget->isTargetDarwin() ?
                            8 : 32;
  const Value *DestSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
  const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();

  return DAG.getMemcpy(Op.getOperand(0), SDLoc(Op), Op.getOperand(1),
                       Op.getOperand(2), DAG.getConstant(VaListSize, MVT::i32),
                       8, false, false, MachinePointerInfo(DestSV),
                       MachinePointerInfo(SrcSV));
}

SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
  assert(Subtarget->isTargetDarwin() &&
         "automatic va_arg instruction only works on Darwin");

  const Value *V = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
  EVT VT = Op.getValueType();
  SDLoc DL(Op);
  SDValue Chain = Op.getOperand(0);
  SDValue Addr = Op.getOperand(1);
  unsigned Align = Op.getConstantOperandVal(3);

  SDValue VAList = DAG.getLoad(getPointerTy(), DL, Chain, Addr,
                               MachinePointerInfo(V), false, false, false, 0);
  Chain = VAList.getValue(1);

  if (Align > 8) {
    assert(((Align & (Align - 1)) == 0) && "Expected Align to be a power of 2");
    VAList = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList,
                         DAG.getConstant(Align - 1, getPointerTy()));
    VAList = DAG.getNode(ISD::AND, DL, getPointerTy(), VAList,
                         DAG.getConstant(-(int64_t)Align, getPointerTy()));
  }

  Type *ArgTy = VT.getTypeForEVT(*DAG.getContext());
  uint64_t ArgSize = getDataLayout()->getTypeAllocSize(ArgTy);

  // Scalar integer and FP values smaller than 64 bits are implicitly extended
  // up to 64 bits. At the very least, we have to increase the striding of the
  // vaargs list to match this, and for FP values we need to introduce
  // FP_ROUND nodes as well.
  if (VT.isInteger() && !VT.isVector())
    ArgSize = 8;
  bool NeedFPTrunc = false;
  if (VT.isFloatingPoint() && !VT.isVector() && VT != MVT::f64) {
    ArgSize = 8;
    NeedFPTrunc = true;
  }

  // Increment the pointer, VAList, to the next vaarg
  SDValue VANext = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList,
                               DAG.getConstant(ArgSize, getPointerTy()));
  // Store the incremented VAList to the legalized pointer
  SDValue APStore = DAG.getStore(Chain, DL, VANext, Addr, MachinePointerInfo(V),
                                 false, false, 0);

  // Load the actual argument out of the pointer VAList
  if (NeedFPTrunc) {
    // Load the value as an f64.
    SDValue WideFP = DAG.getLoad(MVT::f64, DL, APStore, VAList,
                                 MachinePointerInfo(), false, false, false, 0);
    // Round the value down to an f32.
    SDValue NarrowFP = DAG.getNode(ISD::FP_ROUND, DL, VT, WideFP.getValue(0),
                                   DAG.getIntPtrConstant(1));
    SDValue Ops[] = { NarrowFP, WideFP.getValue(1) };
    // Merge the rounded value with the chain output of the load.
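    // (C's default argument promotions mean a float vararg was passed as a
    // double, which is why an 8-byte slot is loaded and then rounded here.)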
    return DAG.getMergeValues(Ops, DL);
  }

  return DAG.getLoad(VT, DL, APStore, VAList, MachinePointerInfo(), false,
                     false, false, 0);
}

SDValue AArch64TargetLowering::LowerFRAMEADDR(SDValue Op,
                                              SelectionDAG &DAG) const {
  MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
  MFI->setFrameAddressIsTaken(true);

  EVT VT = Op.getValueType();
  SDLoc DL(Op);
  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
  SDValue FrameAddr =
      DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, VT);
  while (Depth--)
    FrameAddr = DAG.getLoad(VT, DL, DAG.getEntryNode(), FrameAddr,
                            MachinePointerInfo(), false, false, false, 0);
  return FrameAddr;
}

// FIXME? Maybe this could be a TableGen attribute on some registers and
// this table could be generated automatically from RegInfo.
unsigned AArch64TargetLowering::getRegisterByName(const char* RegName,
                                                  EVT VT) const {
  unsigned Reg = StringSwitch<unsigned>(RegName)
                     .Case("sp", AArch64::SP)
                     .Default(0);
  if (Reg)
    return Reg;
  report_fatal_error("Invalid register name for global variable");
}

SDValue AArch64TargetLowering::LowerRETURNADDR(SDValue Op,
                                               SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  MachineFrameInfo *MFI = MF.getFrameInfo();
  MFI->setReturnAddressIsTaken(true);

  EVT VT = Op.getValueType();
  SDLoc DL(Op);
  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
  if (Depth) {
    SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
    SDValue Offset = DAG.getConstant(8, getPointerTy());
    return DAG.getLoad(VT, DL, DAG.getEntryNode(),
                       DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset),
                       MachinePointerInfo(), false, false, false, 0);
  }

  // Return LR, which contains the return address. Mark it an implicit live-in.
  unsigned Reg = MF.addLiveIn(AArch64::LR, &AArch64::GPR64RegClass);
  return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT);
}

/// LowerShiftRightParts - Lower SRA_PARTS and SRL_PARTS, which return two
/// i64 values and take a 2 x i64 value to shift plus a shift amount.
SDValue AArch64TargetLowering::LowerShiftRightParts(SDValue Op,
                                                    SelectionDAG &DAG) const {
  assert(Op.getNumOperands() == 3 && "Not a double-shift!");
  EVT VT = Op.getValueType();
  unsigned VTBits = VT.getSizeInBits();
  SDLoc dl(Op);
  SDValue ShOpLo = Op.getOperand(0);
  SDValue ShOpHi = Op.getOperand(1);
  SDValue ShAmt = Op.getOperand(2);
  unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ?
                     ISD::SRA : ISD::SRL;

  assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS);

  SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64,
                                 DAG.getConstant(VTBits, MVT::i64), ShAmt);
  SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt);
  SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, ShAmt,
                                   DAG.getConstant(VTBits, MVT::i64));
  SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt);

  SDValue Cmp = emitComparison(ExtraShAmt, DAG.getConstant(0, MVT::i64),
                               ISD::SETGE, dl, DAG);
  SDValue CCVal = DAG.getConstant(AArch64CC::GE, MVT::i32);

  SDValue FalseValLo = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
  SDValue TrueValLo = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt);
  SDValue Lo =
      DAG.getNode(AArch64ISD::CSEL, dl, VT, TrueValLo, FalseValLo, CCVal, Cmp);

  // AArch64 shifts larger than the register width are wrapped rather than
  // clamped, so we can't just emit "hi >> x".
  SDValue FalseValHi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
  SDValue TrueValHi = Opc == ISD::SRA
                          ? DAG.getNode(Opc, dl, VT, ShOpHi,
                                        DAG.getConstant(VTBits - 1, MVT::i64))
                          : DAG.getConstant(0, VT);
  SDValue Hi =
      DAG.getNode(AArch64ISD::CSEL, dl, VT, TrueValHi, FalseValHi, CCVal, Cmp);

  SDValue Ops[2] = { Lo, Hi };
  return DAG.getMergeValues(Ops, dl);
}

/// LowerShiftLeftParts - Lower SHL_PARTS, which returns two
/// i64 values and takes a 2 x i64 value to shift plus a shift amount.
SDValue AArch64TargetLowering::LowerShiftLeftParts(SDValue Op,
                                                   SelectionDAG &DAG) const {
  assert(Op.getNumOperands() == 3 && "Not a double-shift!");
  EVT VT = Op.getValueType();
  unsigned VTBits = VT.getSizeInBits();
  SDLoc dl(Op);
  SDValue ShOpLo = Op.getOperand(0);
  SDValue ShOpHi = Op.getOperand(1);
  SDValue ShAmt = Op.getOperand(2);

  assert(Op.getOpcode() == ISD::SHL_PARTS);
  SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64,
                                 DAG.getConstant(VTBits, MVT::i64), ShAmt);
  SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt);
  SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, ShAmt,
                                   DAG.getConstant(VTBits, MVT::i64));
  SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt);
  SDValue Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt);

  SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);

  SDValue Cmp = emitComparison(ExtraShAmt, DAG.getConstant(0, MVT::i64),
                               ISD::SETGE, dl, DAG);
  SDValue CCVal = DAG.getConstant(AArch64CC::GE, MVT::i32);
  SDValue Hi =
      DAG.getNode(AArch64ISD::CSEL, dl, VT, Tmp3, FalseVal, CCVal, Cmp);

  // AArch64 shifts of larger than register sizes are wrapped rather than
  // clamped, so we can't just emit "lo << a" if a is too big.
  SDValue TrueValLo = DAG.getConstant(0, VT);
  SDValue FalseValLo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
  SDValue Lo =
      DAG.getNode(AArch64ISD::CSEL, dl, VT, TrueValLo, FalseValLo, CCVal, Cmp);

  SDValue Ops[2] = { Lo, Hi };
  return DAG.getMergeValues(Ops, dl);
}

bool AArch64TargetLowering::isOffsetFoldingLegal(
    const GlobalAddressSDNode *GA) const {
  // The AArch64 target doesn't support folding offsets into global addresses.
  return false;
}

bool AArch64TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
  // We can materialize #0.0 as fmov $Rd, XZR for 64-bit and 32-bit cases.
  // FIXME: We should be able to handle f128 as well with a clever lowering.
  if (Imm.isPosZero() && (VT == MVT::f64 || VT == MVT::f32))
    return true;

  if (VT == MVT::f64)
    return AArch64_AM::getFP64Imm(Imm) != -1;
  else if (VT == MVT::f32)
    return AArch64_AM::getFP32Imm(Imm) != -1;
  return false;
}

//===----------------------------------------------------------------------===//
// AArch64 Optimization Hooks
//===----------------------------------------------------------------------===//

//===----------------------------------------------------------------------===//
// AArch64 Inline Assembly Support
//===----------------------------------------------------------------------===//

// Table of Constraints
// TODO: This is the current set of constraints supported by ARM for the
// compiler; not all of them may make sense, e.g. S may be difficult to support.
//
// r - A general register
// w - An FP/SIMD register of some size in the range v0-v31
// x - An FP/SIMD register of some size in the range v0-v15
// I - Constant that can be used with an ADD instruction
// J - Constant that can be used with a SUB instruction
// K - Constant that can be used with a 32-bit logical instruction
// L - Constant that can be used with a 64-bit logical instruction
// M - Constant that can be used as a 32-bit MOV immediate
// N - Constant that can be used as a 64-bit MOV immediate
// Q - A memory reference with base register and no offset
// S - A symbolic address
// Y - Floating point constant zero
// Z - Integer constant zero
//
// Note that general register operands will be output using their 64-bit x
// register name, whatever the size of the variable, unless the asm operand
// is prefixed by the %w modifier. Floating-point and SIMD register operands
// will be output with the v prefix unless prefixed by the %b, %h, %s, %d or
// %q modifier.

/// getConstraintType - Given a constraint letter, return the type of
/// constraint it is for this target.
AArch64TargetLowering::ConstraintType
AArch64TargetLowering::getConstraintType(const std::string &Constraint) const {
  if (Constraint.size() == 1) {
    switch (Constraint[0]) {
    default:
      break;
    case 'z':
      return C_Other;
    case 'x':
    case 'w':
      return C_RegisterClass;
    // An address with a single base register. Due to the way we
    // currently handle addresses it is the same as 'r'.
    case 'Q':
      return C_Memory;
    }
  }
  return TargetLowering::getConstraintType(Constraint);
}

/// Examine constraint type and operand type and determine a weight value.
/// This object must already have been set up with the operand type
/// and the current alternative constraint selected.
TargetLowering::ConstraintWeight
AArch64TargetLowering::getSingleConstraintMatchWeight(
    AsmOperandInfo &info, const char *constraint) const {
  ConstraintWeight weight = CW_Invalid;
  Value *CallOperandVal = info.CallOperandVal;
  // If we don't have a value, we can't do a match,
  // but allow it at the lowest weight.
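  // A typical use of the 'w' constraint weighted here (illustrative):
  //   asm("fadd %s0, %s1, %s2" : "=w"(res) : "w"(a), "w"(b));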
  if (!CallOperandVal)
    return CW_Default;
  Type *type = CallOperandVal->getType();
  // Look at the constraint type.
  switch (*constraint) {
  default:
    weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
    break;
  case 'x':
  case 'w':
    if (type->isFloatingPointTy() || type->isVectorTy())
      weight = CW_Register;
    break;
  case 'z':
    weight = CW_Constant;
    break;
  }
  return weight;
}

std::pair<unsigned, const TargetRegisterClass *>
AArch64TargetLowering::getRegForInlineAsmConstraint(
    const std::string &Constraint, MVT VT) const {
  if (Constraint.size() == 1) {
    switch (Constraint[0]) {
    case 'r':
      if (VT.getSizeInBits() == 64)
        return std::make_pair(0U, &AArch64::GPR64commonRegClass);
      return std::make_pair(0U, &AArch64::GPR32commonRegClass);
    case 'w':
      if (VT == MVT::f32)
        return std::make_pair(0U, &AArch64::FPR32RegClass);
      if (VT.getSizeInBits() == 64)
        return std::make_pair(0U, &AArch64::FPR64RegClass);
      if (VT.getSizeInBits() == 128)
        return std::make_pair(0U, &AArch64::FPR128RegClass);
      break;
    // The instructions that this constraint is designed for can
    // only take 128-bit registers so just use that regclass.
    case 'x':
      if (VT.getSizeInBits() == 128)
        return std::make_pair(0U, &AArch64::FPR128_loRegClass);
      break;
    }
  }
  if (StringRef("{cc}").equals_lower(Constraint))
    return std::make_pair(unsigned(AArch64::NZCV), &AArch64::CCRRegClass);

  // Use the default implementation in TargetLowering to convert the register
  // constraint into a member of a register class.
  std::pair<unsigned, const TargetRegisterClass *> Res;
  Res = TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);

  // Not found as a standard register?
  if (!Res.second) {
    unsigned Size = Constraint.size();
    if ((Size == 4 || Size == 5) && Constraint[0] == '{' &&
        tolower(Constraint[1]) == 'v' && Constraint[Size - 1] == '}') {
      const std::string Reg =
          std::string(&Constraint[2], &Constraint[Size - 1]);
      int RegNo = atoi(Reg.c_str());
      if (RegNo >= 0 && RegNo <= 31) {
        // v0 - v31 are aliases of q0 - q31.
        // By default we'll emit v0-v31 for this unless there's a modifier where
        // we'll emit the correct register as well.
        Res.first = AArch64::FPR128RegClass.getRegister(RegNo);
        Res.second = &AArch64::FPR128RegClass;
      }
    }
  }

  return Res;
}

/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
/// vector. If it is invalid, don't add anything to Ops.
void AArch64TargetLowering::LowerAsmOperandForConstraint(
    SDValue Op, std::string &Constraint, std::vector<SDValue> &Ops,
    SelectionDAG &DAG) const {
  SDValue Result;

  // Currently only support length 1 constraints.
  if (Constraint.length() != 1)
    return;

  char ConstraintLetter = Constraint[0];
  switch (ConstraintLetter) {
  default:
    break;

  // This set of constraints deals with valid constants for various
  // instructions. Validate and return a target constant for them if we can.
  case 'z': {
    // 'z' maps to xzr or wzr so it needs an input of 0.
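    // e.g. (illustrative)
    //   asm volatile("str %x0, [%1]" :: "z"(0LL), "r"(ptr));
    // lets the compiler substitute xzr for the zero operand.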
    ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
    if (!C || C->getZExtValue() != 0)
      return;

    if (Op.getValueType() == MVT::i64)
      Result = DAG.getRegister(AArch64::XZR, MVT::i64);
    else
      Result = DAG.getRegister(AArch64::WZR, MVT::i32);
    break;
  }

  case 'I':
  case 'J':
  case 'K':
  case 'L':
  case 'M':
  case 'N':
    ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
    if (!C)
      return;

    // Grab the value and do some validation.
    uint64_t CVal = C->getZExtValue();
    switch (ConstraintLetter) {
    // The I constraint applies only to simple ADD or SUB immediate operands:
    // i.e. 0 to 4095 with optional shift by 12
    // The J constraint applies only to ADD or SUB immediates that would be
    // valid when negated, i.e. if [an add pattern] were to be output as a SUB
    // instruction [or vice versa], in other words -1 to -4095 with optional
    // left shift by 12.
    case 'I':
      if (isUInt<12>(CVal) || isShiftedUInt<12, 12>(CVal))
        break;
      return;
    case 'J': {
      uint64_t NVal = -C->getSExtValue();
      if (isUInt<12>(NVal) || isShiftedUInt<12, 12>(NVal)) {
        CVal = C->getSExtValue();
        break;
      }
      return;
    }
    // The K and L constraints apply *only* to logical immediates, including
    // what used to be the MOVI alias for ORR (though the MOVI alias has now
    // been removed and MOV should be used). So these constraints have to
    // distinguish between bit patterns that are valid 32-bit or 64-bit
    // "bitmask immediates": for example 0xaaaaaaaa is a valid bimm32 (K), but
    // not a valid bimm64 (L) where 0xaaaaaaaaaaaaaaaa would be valid, and vice
    // versa.
    case 'K':
      if (AArch64_AM::isLogicalImmediate(CVal, 32))
        break;
      return;
    case 'L':
      if (AArch64_AM::isLogicalImmediate(CVal, 64))
        break;
      return;
    // The M and N constraints are a superset of K and L respectively, for use
    // with the MOV (immediate) alias. As well as the logical immediates they
    // also match 32 or 64-bit immediates that can be loaded either using a
    // *single* MOVZ or MOVN, such as 32-bit 0x12340000, 0x00001234, 0xffffedca
    // (M) or 64-bit 0x1234000000000000 (N) etc.
    // As a note some of this code is liberally stolen from the asm parser.
    case 'M': {
      if (!isUInt<32>(CVal))
        return;
      if (AArch64_AM::isLogicalImmediate(CVal, 32))
        break;
      if ((CVal & 0xFFFF) == CVal)
        break;
      if ((CVal & 0xFFFF0000ULL) == CVal)
        break;
      uint64_t NCVal = ~(uint32_t)CVal;
      if ((NCVal & 0xFFFFULL) == NCVal)
        break;
      if ((NCVal & 0xFFFF0000ULL) == NCVal)
        break;
      return;
    }
    case 'N': {
      if (AArch64_AM::isLogicalImmediate(CVal, 64))
        break;
      if ((CVal & 0xFFFFULL) == CVal)
        break;
      if ((CVal & 0xFFFF0000ULL) == CVal)
        break;
      if ((CVal & 0xFFFF00000000ULL) == CVal)
        break;
      if ((CVal & 0xFFFF000000000000ULL) == CVal)
        break;
      uint64_t NCVal = ~CVal;
      if ((NCVal & 0xFFFFULL) == NCVal)
        break;
      if ((NCVal & 0xFFFF0000ULL) == NCVal)
        break;
      if ((NCVal & 0xFFFF00000000ULL) == NCVal)
        break;
      if ((NCVal & 0xFFFF000000000000ULL) == NCVal)
        break;
      return;
    }
    default:
      return;
    }

    // All assembler immediates are 64-bit integers.
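    // e.g. (illustrative) "I"(4095) and "I"(4095 << 12) are accepted above,
    // while "I"(4097) fails the checks and is rejected.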
    Result = DAG.getTargetConstant(CVal, MVT::i64);
    break;
  }

  if (Result.getNode()) {
    Ops.push_back(Result);
    return;
  }

  return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
}

//===----------------------------------------------------------------------===//
// AArch64 Advanced SIMD Support
//===----------------------------------------------------------------------===//

/// WidenVector - Given a value in the V64 register class, produce the
/// equivalent value in the V128 register class.
static SDValue WidenVector(SDValue V64Reg, SelectionDAG &DAG) {
  EVT VT = V64Reg.getValueType();
  unsigned NarrowSize = VT.getVectorNumElements();
  MVT EltTy = VT.getVectorElementType().getSimpleVT();
  MVT WideTy = MVT::getVectorVT(EltTy, 2 * NarrowSize);
  SDLoc DL(V64Reg);

  return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideTy, DAG.getUNDEF(WideTy),
                     V64Reg, DAG.getConstant(0, MVT::i32));
}

/// getExtFactor - Determine the adjustment factor for the position when
/// generating an "extract from vector registers" instruction.
static unsigned getExtFactor(SDValue &V) {
  EVT EltType = V.getValueType().getVectorElementType();
  return EltType.getSizeInBits() / 8;
}

/// NarrowVector - Given a value in the V128 register class, produce the
/// equivalent value in the V64 register class.
static SDValue NarrowVector(SDValue V128Reg, SelectionDAG &DAG) {
  EVT VT = V128Reg.getValueType();
  unsigned WideSize = VT.getVectorNumElements();
  MVT EltTy = VT.getVectorElementType().getSimpleVT();
  MVT NarrowTy = MVT::getVectorVT(EltTy, WideSize / 2);
  SDLoc DL(V128Reg);

  return DAG.getTargetExtractSubreg(AArch64::dsub, DL, NarrowTy, V128Reg);
}

// Gather data to see if the operation can be modelled as a
// shuffle in combination with VEXTs.
SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op,
                                                  SelectionDAG &DAG) const {
  assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
  SDLoc dl(Op);
  EVT VT = Op.getValueType();
  unsigned NumElts = VT.getVectorNumElements();

  struct ShuffleSourceInfo {
    SDValue Vec;
    unsigned MinElt;
    unsigned MaxElt;

    // We may insert some combination of BITCASTs and VEXT nodes to force Vec
    // to be compatible with the shuffle we intend to construct. As a result
    // ShuffleVec will be some sliding window into the original Vec.
    SDValue ShuffleVec;

    // Code should guarantee that element i in Vec starts at element
    // "WindowBase + i * WindowScale" in ShuffleVec.
    int WindowBase;
    int WindowScale;

    bool operator ==(SDValue OtherVec) { return Vec == OtherVec; }
    ShuffleSourceInfo(SDValue Vec)
        : Vec(Vec), MinElt(UINT_MAX), MaxElt(0), ShuffleVec(Vec), WindowBase(0),
          WindowScale(1) {}
  };

  // First gather all vectors used as an immediate source for this BUILD_VECTOR
  // node.
  SmallVector<ShuffleSourceInfo, 2> Sources;
  for (unsigned i = 0; i < NumElts; ++i) {
    SDValue V = Op.getOperand(i);
    if (V.getOpcode() == ISD::UNDEF)
      continue;
    else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT) {
      // A shuffle can only come from building a vector from various
      // elements of other vectors.
      return SDValue();
    }

    // Add this element source to the list if it's not already there.
    SDValue SourceVec = V.getOperand(0);
    auto Source = std::find(Sources.begin(), Sources.end(), SourceVec);
    if (Source == Sources.end())
      Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec));

    // Update the minimum and maximum lane number seen.
    unsigned EltNo = cast<ConstantSDNode>(V.getOperand(1))->getZExtValue();
    Source->MinElt = std::min(Source->MinElt, EltNo);
    Source->MaxElt = std::max(Source->MaxElt, EltNo);
  }

  // Currently only do something sane when at most two source vectors
  // are involved.
  if (Sources.size() > 2)
    return SDValue();

  // Find the smallest element size among the result and the two sources, and
  // use it as the element size of the shuffle_vector we build.
  EVT SmallestEltTy = VT.getVectorElementType();
  for (auto &Source : Sources) {
    EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType();
    if (SrcEltTy.bitsLT(SmallestEltTy))
      SmallestEltTy = SrcEltTy;
  }
  unsigned ResMultiplier =
      VT.getVectorElementType().getSizeInBits() / SmallestEltTy.getSizeInBits();
  NumElts = VT.getSizeInBits() / SmallestEltTy.getSizeInBits();
  EVT ShuffleVT = EVT::getVectorVT(*DAG.getContext(), SmallestEltTy, NumElts);

  // If the source vector is too wide or too narrow, we may nevertheless be
  // able to construct a compatible shuffle either by concatenating it with
  // UNDEF or extracting a suitable range of elements.
  for (auto &Src : Sources) {
    EVT SrcVT = Src.ShuffleVec.getValueType();

    if (SrcVT.getSizeInBits() == VT.getSizeInBits())
      continue;

    // This stage of the search produces a source with the same element type as
    // the original, but with a total width matching the BUILD_VECTOR output.
    EVT EltVT = SrcVT.getVectorElementType();
    unsigned NumSrcElts = VT.getSizeInBits() / EltVT.getSizeInBits();
    EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts);

    if (SrcVT.getSizeInBits() < VT.getSizeInBits()) {
      assert(2 * SrcVT.getSizeInBits() == VT.getSizeInBits());
      // We can pad out the smaller vector for free, so if it's part of a
      // shuffle...
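      // (Free because a 64-bit V register is architecturally the low half of
      // the corresponding 128-bit Q register, so the CONCAT with UNDEF below
      // needs no instruction.)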
      Src.ShuffleVec =
          DAG.getNode(ISD::CONCAT_VECTORS, dl, DestVT, Src.ShuffleVec,
                      DAG.getUNDEF(Src.ShuffleVec.getValueType()));
      continue;
    }

    assert(SrcVT.getSizeInBits() == 2 * VT.getSizeInBits());

    if (Src.MaxElt - Src.MinElt >= NumSrcElts) {
      // Span too large for a VEXT to cope.
      return SDValue();
    }

    if (Src.MinElt >= NumSrcElts) {
      // The extraction can just take the second half.
      Src.ShuffleVec =
          DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
                      DAG.getConstant(NumSrcElts, MVT::i64));
      Src.WindowBase = -NumSrcElts;
    } else if (Src.MaxElt < NumSrcElts) {
      // The extraction can just take the first half.
      Src.ShuffleVec =
          DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
                      DAG.getConstant(0, MVT::i64));
    } else {
      // An actual VEXT is needed.
      SDValue VEXTSrc1 =
          DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
                      DAG.getConstant(0, MVT::i64));
      SDValue VEXTSrc2 =
          DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
                      DAG.getConstant(NumSrcElts, MVT::i64));
      unsigned Imm = Src.MinElt * getExtFactor(VEXTSrc1);

      Src.ShuffleVec = DAG.getNode(AArch64ISD::EXT, dl, DestVT, VEXTSrc1,
                                   VEXTSrc2, DAG.getConstant(Imm, MVT::i32));
      Src.WindowBase = -Src.MinElt;
    }
  }

  // Another possible incompatibility occurs from the vector element types. We
  // can fix this by bitcasting the source vectors to the same type we intend
  // for the shuffle.
  for (auto &Src : Sources) {
    EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType();
    if (SrcEltTy == SmallestEltTy)
      continue;
    assert(ShuffleVT.getVectorElementType() == SmallestEltTy);
    Src.ShuffleVec = DAG.getNode(ISD::BITCAST, dl, ShuffleVT, Src.ShuffleVec);
    Src.WindowScale = SrcEltTy.getSizeInBits() / SmallestEltTy.getSizeInBits();
    Src.WindowBase *= Src.WindowScale;
  }

  // Final sanity check before we try to actually produce a shuffle.
  DEBUG(
    for (auto Src : Sources)
      assert(Src.ShuffleVec.getValueType() == ShuffleVT);
  );

  // The stars all align; our next step is to produce the mask for the shuffle.
  SmallVector<int, 8> Mask(ShuffleVT.getVectorNumElements(), -1);
  int BitsPerShuffleLane = ShuffleVT.getVectorElementType().getSizeInBits();
  for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) {
    SDValue Entry = Op.getOperand(i);
    if (Entry.getOpcode() == ISD::UNDEF)
      continue;

    auto Src = std::find(Sources.begin(), Sources.end(), Entry.getOperand(0));
    int EltNo = cast<ConstantSDNode>(Entry.getOperand(1))->getSExtValue();

    // EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an
    // implicit trunc. So only std::min(SrcBits, DestBits) actually get defined
    // in this segment.
    EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType();
    int BitsDefined = std::min(OrigEltTy.getSizeInBits(),
                               VT.getVectorElementType().getSizeInBits());
    int LanesDefined = BitsDefined / BitsPerShuffleLane;

    // This source is expected to fill ResMultiplier lanes of the final
    // shuffle, starting at the appropriate offset.
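    // Illustrative example (not from the original source): pulling i32
    // element 3 of a source into an i16 result lane gives WindowScale == 2
    // once the source has been bitcast to i16 lanes above, so the element
    // starts at shuffle lane 3 * 2 == 6; with BitsDefined ==
    // min(32, 16) == 16, exactly one mask entry is written for it.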
    int *LaneMask = &Mask[i * ResMultiplier];

    int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase;
    ExtractBase += NumElts * (Src - Sources.begin());
    for (int j = 0; j < LanesDefined; ++j)
      LaneMask[j] = ExtractBase + j;
  }

  // Final check before we try to produce nonsense...
  if (!isShuffleMaskLegal(Mask, ShuffleVT))
    return SDValue();

  SDValue ShuffleOps[] = { DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT) };
  for (unsigned i = 0; i < Sources.size(); ++i)
    ShuffleOps[i] = Sources[i].ShuffleVec;

  SDValue Shuffle = DAG.getVectorShuffle(ShuffleVT, dl, ShuffleOps[0],
                                         ShuffleOps[1], &Mask[0]);
  return DAG.getNode(ISD::BITCAST, dl, VT, Shuffle);
}

// Check if an EXT instruction can handle the shuffle mask when the
// vector sources of the shuffle are the same.
static bool isSingletonEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) {
  unsigned NumElts = VT.getVectorNumElements();

  // Assume that the first shuffle index is not UNDEF. Fail if it is.
  if (M[0] < 0)
    return false;

  Imm = M[0];

  // If this is a VEXT shuffle, the immediate value is the index of the first
  // element. The other shuffle indices must be the successive elements after
  // the first one.
  unsigned ExpectedElt = Imm;
  for (unsigned i = 1; i < NumElts; ++i) {
    // Increment the expected index. If it wraps around, just follow it
    // back to index zero and keep going.
    ++ExpectedElt;
    if (ExpectedElt == NumElts)
      ExpectedElt = 0;

    if (M[i] < 0)
      continue; // Ignore UNDEF indices.
    if (ExpectedElt != static_cast<unsigned>(M[i]))
      return false;
  }

  return true;
}

// Check if an EXT instruction can handle the shuffle mask when the
// vector sources of the shuffle are different.
static bool isEXTMask(ArrayRef<int> M, EVT VT, bool &ReverseEXT,
                      unsigned &Imm) {
  // Look for the first non-undef element.
  const int *FirstRealElt = std::find_if(M.begin(), M.end(),
      [](int Elt) { return Elt >= 0; });

  // Benefit from APInt to handle overflow when calculating the expected
  // element.
  unsigned NumElts = VT.getVectorNumElements();
  unsigned MaskBits = APInt(32, NumElts * 2).logBase2();
  APInt ExpectedElt = APInt(MaskBits, *FirstRealElt + 1);
  // The following shuffle indices must be the successive elements after the
  // first real element.
  const int *FirstWrongElt = std::find_if(FirstRealElt + 1, M.end(),
      [&](int Elt) { return Elt != ExpectedElt++ && Elt != -1; });
  if (FirstWrongElt != M.end())
    return false;

  // The index of an EXT is the first element if it is not UNDEF.
  // Watch out for the beginning UNDEFs. The EXT index should be the expected
  // value of the first element. E.g.
  // <-1, -1, 3, ...> is treated as <1, 2, 3, ...>.
  // <-1, -1, 0, 1, ...> is treated as <2*NumElts-2, 2*NumElts-1, 0, 1, ...>.
  // ExpectedElt is the last mask index plus 1.
  Imm = ExpectedElt.getZExtValue();

  // There are two different cases that require reversing the input vectors.
  // For example, for vector <4 x i32> we have the following cases:
  //   Case 1: shufflevector(<4 x i32>, <4 x i32>, <-1, -1, -1, 0>)
  //   Case 2: shufflevector(<4 x i32>, <4 x i32>, <-1, -1, 7, 0>)
  // For both cases, we finally use mask <5, 6, 7, 0>, which requires
  // reversing the two input vectors.
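  // Walking through the two cases above: ExpectedElt wraps modulo
  // 2 * NumElts (the reason APInt with MaskBits bits is used), ending at 1
  // in both, so Imm == 1; after the caller swaps the operands, EXT #1 reads
  // concat(V2, V1)[1..4], i.e. lanes <5, 6, 7, 0> of the original pair.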
  if (Imm < NumElts)
    ReverseEXT = true;
  else
    Imm -= NumElts;

  return true;
}

/// isREVMask - Check if a vector shuffle corresponds to a REV
/// instruction with the specified blocksize. (The order of the elements
/// within each block of the vector is reversed.)
static bool isREVMask(ArrayRef<int> M, EVT VT, unsigned BlockSize) {
  assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64) &&
         "Only possible block sizes for REV are: 16, 32, 64");

  unsigned EltSz = VT.getVectorElementType().getSizeInBits();
  if (EltSz == 64)
    return false;

  unsigned NumElts = VT.getVectorNumElements();
  unsigned BlockElts = M[0] + 1;
  // If the first shuffle index is UNDEF, be optimistic.
  if (M[0] < 0)
    BlockElts = BlockSize / EltSz;

  if (BlockSize <= EltSz || BlockSize != BlockElts * EltSz)
    return false;

  for (unsigned i = 0; i < NumElts; ++i) {
    if (M[i] < 0)
      continue; // Ignore UNDEF indices.
    if ((unsigned)M[i] != (i - i % BlockElts) + (BlockElts - 1 - i % BlockElts))
      return false;
  }

  return true;
}

static bool isZIPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
  unsigned NumElts = VT.getVectorNumElements();
  WhichResult = (M[0] == 0 ? 0 : 1);
  unsigned Idx = WhichResult * NumElts / 2;
  for (unsigned i = 0; i != NumElts; i += 2) {
    if ((M[i] >= 0 && (unsigned)M[i] != Idx) ||
        (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx + NumElts))
      return false;
    Idx += 1;
  }

  return true;
}

static bool isUZPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
  unsigned NumElts = VT.getVectorNumElements();
  WhichResult = (M[0] == 0 ? 0 : 1);
  for (unsigned i = 0; i != NumElts; ++i) {
    if (M[i] < 0)
      continue; // Ignore UNDEF indices.
    if ((unsigned)M[i] != 2 * i + WhichResult)
      return false;
  }

  return true;
}

static bool isTRNMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
  unsigned NumElts = VT.getVectorNumElements();
  WhichResult = (M[0] == 0 ? 0 : 1);
  for (unsigned i = 0; i < NumElts; i += 2) {
    if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) ||
        (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + NumElts + WhichResult))
      return false;
  }
  return true;
}

/// isZIP_v_undef_Mask - Special case of isZIPMask for canonical form of
/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
/// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>.
static bool isZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
  unsigned NumElts = VT.getVectorNumElements();
  WhichResult = (M[0] == 0 ? 0 : 1);
  unsigned Idx = WhichResult * NumElts / 2;
  for (unsigned i = 0; i != NumElts; i += 2) {
    if ((M[i] >= 0 && (unsigned)M[i] != Idx) ||
        (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx))
      return false;
    Idx += 1;
  }

  return true;
}

/// isUZP_v_undef_Mask - Special case of isUZPMask for canonical form of
/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
/// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>.
static bool isUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
  unsigned Half = VT.getVectorNumElements() / 2;
  WhichResult = (M[0] == 0 ? 0 : 1);
  for (unsigned j = 0; j != 2; ++j) {
    unsigned Idx = WhichResult;
    for (unsigned i = 0; i != Half; ++i) {
      int MIdx = M[i + j * Half];
      if (MIdx >= 0 && (unsigned)MIdx != Idx)
        return false;
      Idx += 2;
    }
  }

  return true;
}

/// isTRN_v_undef_Mask - Special case of isTRNMask for canonical form of
/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
/// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>.
static bool isTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
  unsigned NumElts = VT.getVectorNumElements();
  WhichResult = (M[0] == 0 ? 0 : 1);
  for (unsigned i = 0; i < NumElts; i += 2) {
    if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) ||
        (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + WhichResult))
      return false;
  }
  return true;
}

static bool isINSMask(ArrayRef<int> M, int NumInputElements,
                      bool &DstIsLeft, int &Anomaly) {
  if (M.size() != static_cast<size_t>(NumInputElements))
    return false;

  int NumLHSMatch = 0, NumRHSMatch = 0;
  int LastLHSMismatch = -1, LastRHSMismatch = -1;

  for (int i = 0; i < NumInputElements; ++i) {
    if (M[i] == -1) {
      ++NumLHSMatch;
      ++NumRHSMatch;
      continue;
    }

    if (M[i] == i)
      ++NumLHSMatch;
    else
      LastLHSMismatch = i;

    if (M[i] == i + NumInputElements)
      ++NumRHSMatch;
    else
      LastRHSMismatch = i;
  }

  if (NumLHSMatch == NumInputElements - 1) {
    DstIsLeft = true;
    Anomaly = LastLHSMismatch;
    return true;
  } else if (NumRHSMatch == NumInputElements - 1) {
    DstIsLeft = false;
    Anomaly = LastRHSMismatch;
    return true;
  }

  return false;
}

static bool isConcatMask(ArrayRef<int> Mask, EVT VT, bool SplitLHS) {
  if (VT.getSizeInBits() != 128)
    return false;

  unsigned NumElts = VT.getVectorNumElements();

  for (int I = 0, E = NumElts / 2; I != E; I++) {
    if (Mask[I] != I)
      return false;
  }

  int Offset = NumElts / 2;
  for (int I = NumElts / 2, E = NumElts; I != E; I++) {
    if (Mask[I] != I + SplitLHS * Offset)
      return false;
  }

  return true;
}

static SDValue tryFormConcatFromShuffle(SDValue Op, SelectionDAG &DAG) {
  SDLoc DL(Op);
  EVT VT = Op.getValueType();
  SDValue V0 = Op.getOperand(0);
  SDValue V1 = Op.getOperand(1);
  ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask();

  if (VT.getVectorElementType() != V0.getValueType().getVectorElementType() ||
      VT.getVectorElementType() != V1.getValueType().getVectorElementType())
    return SDValue();

  bool SplitV0 = V0.getValueType().getSizeInBits() == 128;

  if (!isConcatMask(Mask, VT, SplitV0))
    return SDValue();

  EVT CastVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(),
                                VT.getVectorNumElements() / 2);
  if (SplitV0) {
    V0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V0,
                     DAG.getConstant(0, MVT::i64));
  }
  if (V1.getValueType().getSizeInBits() == 128) {
    V1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V1,
                     DAG.getConstant(0, MVT::i64));
  }
  return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, V0, V1);
}

/// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
/// the specified operations to build the shuffle.
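/// The 32-bit table entries are packed as bits [31:30] cost, [29:26] opcode,
/// [25:13] LHS operand id, [12:0] RHS operand id, matching the shifts and
/// masks decoded below; each 13-bit id encodes four lane numbers (0-7, or 8
/// for undef) as base-9 digits, so the identity <0,1,2,3> is
/// ((0*9+1)*9+2)*9+3, as checked in the OP_COPY case.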
static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS,
                                      SDValue RHS, SelectionDAG &DAG,
                                      SDLoc dl) {
  unsigned OpNum = (PFEntry >> 26) & 0x0F;
  unsigned LHSID = (PFEntry >> 13) & ((1 << 13) - 1);
  unsigned RHSID = (PFEntry >> 0) & ((1 << 13) - 1);

  enum {
    OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
    OP_VREV,
    OP_VDUP0,
    OP_VDUP1,
    OP_VDUP2,
    OP_VDUP3,
    OP_VEXT1,
    OP_VEXT2,
    OP_VEXT3,
    OP_VUZPL, // VUZP, left result
    OP_VUZPR, // VUZP, right result
    OP_VZIPL, // VZIP, left result
    OP_VZIPR, // VZIP, right result
    OP_VTRNL, // VTRN, left result
    OP_VTRNR  // VTRN, right result
  };

  if (OpNum == OP_COPY) {
    if (LHSID == (1 * 9 + 2) * 9 + 3)
      return LHS;
    assert(LHSID == ((4 * 9 + 5) * 9 + 6) * 9 + 7 && "Illegal OP_COPY!");
    return RHS;
  }

  SDValue OpLHS, OpRHS;
  OpLHS = GeneratePerfectShuffle(PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl);
  OpRHS = GeneratePerfectShuffle(PerfectShuffleTable[RHSID], LHS, RHS, DAG, dl);
  EVT VT = OpLHS.getValueType();

  switch (OpNum) {
  default:
    llvm_unreachable("Unknown shuffle opcode!");
  case OP_VREV:
    // VREV divides the vector in half and swaps within the half.
    if (VT.getVectorElementType() == MVT::i32 ||
        VT.getVectorElementType() == MVT::f32)
      return DAG.getNode(AArch64ISD::REV64, dl, VT, OpLHS);
    // vrev <4 x i16> -> REV32
    if (VT.getVectorElementType() == MVT::i16 ||
        VT.getVectorElementType() == MVT::f16)
      return DAG.getNode(AArch64ISD::REV32, dl, VT, OpLHS);
    // vrev <4 x i8> -> REV16
    assert(VT.getVectorElementType() == MVT::i8);
    return DAG.getNode(AArch64ISD::REV16, dl, VT, OpLHS);
  case OP_VDUP0:
  case OP_VDUP1:
  case OP_VDUP2:
  case OP_VDUP3: {
    EVT EltTy = VT.getVectorElementType();
    unsigned Opcode;
    if (EltTy == MVT::i8)
      Opcode = AArch64ISD::DUPLANE8;
    else if (EltTy == MVT::i16)
      Opcode = AArch64ISD::DUPLANE16;
    else if (EltTy == MVT::i32 || EltTy == MVT::f32)
      Opcode = AArch64ISD::DUPLANE32;
    else if (EltTy == MVT::i64 || EltTy == MVT::f64)
      Opcode = AArch64ISD::DUPLANE64;
    else
      llvm_unreachable("Invalid vector element type?");

    if (VT.getSizeInBits() == 64)
      OpLHS = WidenVector(OpLHS, DAG);
    SDValue Lane = DAG.getConstant(OpNum - OP_VDUP0, MVT::i64);
    return DAG.getNode(Opcode, dl, VT, OpLHS, Lane);
  }
  case OP_VEXT1:
  case OP_VEXT2:
  case OP_VEXT3: {
    unsigned Imm = (OpNum - OP_VEXT1 + 1) * getExtFactor(OpLHS);
    return DAG.getNode(AArch64ISD::EXT, dl, VT, OpLHS, OpRHS,
                       DAG.getConstant(Imm, MVT::i32));
  }
  case OP_VUZPL:
    return DAG.getNode(AArch64ISD::UZP1, dl, DAG.getVTList(VT, VT), OpLHS,
                       OpRHS);
  case OP_VUZPR:
    return DAG.getNode(AArch64ISD::UZP2, dl, DAG.getVTList(VT, VT), OpLHS,
                       OpRHS);
  case OP_VZIPL:
    return DAG.getNode(AArch64ISD::ZIP1, dl, DAG.getVTList(VT, VT), OpLHS,
                       OpRHS);
  case OP_VZIPR:
    return DAG.getNode(AArch64ISD::ZIP2, dl, DAG.getVTList(VT, VT), OpLHS,
                       OpRHS);
  case OP_VTRNL:
    return DAG.getNode(AArch64ISD::TRN1, dl, DAG.getVTList(VT, VT), OpLHS,
                       OpRHS);
  case OP_VTRNR:
    return DAG.getNode(AArch64ISD::TRN2, dl, DAG.getVTList(VT, VT), OpLHS,
                       OpRHS);
  }
}

static SDValue GenerateTBL(SDValue Op, ArrayRef<int> ShuffleMask,
                           SelectionDAG &DAG) {
  // Check to see if we can use the TBL instruction.
  SDValue V1 = Op.getOperand(0);
  SDValue V2 = Op.getOperand(1);
  SDLoc DL(Op);

  EVT EltVT = Op.getValueType().getVectorElementType();
  unsigned BytesPerElt = EltVT.getSizeInBits() / 8;

  SmallVector<SDValue, 8> TBLMask;
  for (int Val : ShuffleMask) {
    for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte) {
      unsigned Offset = Byte + Val * BytesPerElt;
      TBLMask.push_back(DAG.getConstant(Offset, MVT::i32));
    }
  }

  MVT IndexVT = MVT::v8i8;
  unsigned IndexLen = 8;
  if (Op.getValueType().getSizeInBits() == 128) {
    IndexVT = MVT::v16i8;
    IndexLen = 16;
  }

  SDValue V1Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V1);
  SDValue V2Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V2);

  SDValue Shuffle;
  if (V2.getNode()->getOpcode() == ISD::UNDEF) {
    if (IndexLen == 8)
      V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V1Cst);
    Shuffle = DAG.getNode(
        ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
        DAG.getConstant(Intrinsic::aarch64_neon_tbl1, MVT::i32), V1Cst,
        DAG.getNode(ISD::BUILD_VECTOR, DL, IndexVT,
                    makeArrayRef(TBLMask.data(), IndexLen)));
  } else {
    if (IndexLen == 8) {
      V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V2Cst);
      Shuffle = DAG.getNode(
          ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
          DAG.getConstant(Intrinsic::aarch64_neon_tbl1, MVT::i32), V1Cst,
          DAG.getNode(ISD::BUILD_VECTOR, DL, IndexVT,
                      makeArrayRef(TBLMask.data(), IndexLen)));
    } else {
      // FIXME: We cannot, for the moment, emit a TBL2 instruction because we
      // cannot currently represent the register constraints on the input
      // table registers.
      //  Shuffle = DAG.getNode(AArch64ISD::TBL2, DL, IndexVT, V1Cst, V2Cst,
      //                        DAG.getNode(ISD::BUILD_VECTOR, DL, IndexVT,
      //                                    &TBLMask[0], IndexLen));
      Shuffle = DAG.getNode(
          ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
          DAG.getConstant(Intrinsic::aarch64_neon_tbl2, MVT::i32), V1Cst, V2Cst,
          DAG.getNode(ISD::BUILD_VECTOR, DL, IndexVT,
                      makeArrayRef(TBLMask.data(), IndexLen)));
    }
  }
  return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Shuffle);
}

static unsigned getDUPLANEOp(EVT EltType) {
  if (EltType == MVT::i8)
    return AArch64ISD::DUPLANE8;
  if (EltType == MVT::i16 || EltType == MVT::f16)
    return AArch64ISD::DUPLANE16;
  if (EltType == MVT::i32 || EltType == MVT::f32)
    return AArch64ISD::DUPLANE32;
  if (EltType == MVT::i64 || EltType == MVT::f64)
    return AArch64ISD::DUPLANE64;

  llvm_unreachable("Invalid vector element type?");
}

SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
                                                   SelectionDAG &DAG) const {
  SDLoc dl(Op);
  EVT VT = Op.getValueType();

  ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());

  // Convert shuffles that are directly supported on NEON to target-specific
  // DAG nodes, instead of keeping them as shuffles and matching them again
  // during code selection. This is more efficient and avoids the possibility
  // of inconsistencies between legalization and selection.
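  // The matchers below are tried roughly from cheapest to most general:
  // splats (DUP/DUPLANE), element reversal (REV), sliding windows (EXT),
  // interleave/deinterleave patterns (ZIP/UZP/TRN and their single-source
  // forms), concatenations, single-element inserts (INS), the perfect
  // shuffle table for 4-element vectors, and finally a TBL lookup.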
  ArrayRef<int> ShuffleMask = SVN->getMask();

  SDValue V1 = Op.getOperand(0);
  SDValue V2 = Op.getOperand(1);

  if (ShuffleVectorSDNode::isSplatMask(&ShuffleMask[0],
                                       V1.getValueType().getSimpleVT())) {
    int Lane = SVN->getSplatIndex();
    // If this is an undef splat, generate it via "just" vdup, if possible.
    if (Lane == -1)
      Lane = 0;

    if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR)
      return DAG.getNode(AArch64ISD::DUP, dl, V1.getValueType(),
                         V1.getOperand(0));
    // Test if V1 is a BUILD_VECTOR and the lane being referenced is a non-
    // constant. If so, we can just reference the lane's definition directly.
    if (V1.getOpcode() == ISD::BUILD_VECTOR &&
        !isa<ConstantSDNode>(V1.getOperand(Lane)))
      return DAG.getNode(AArch64ISD::DUP, dl, VT, V1.getOperand(Lane));

    // Otherwise, duplicate from the lane of the input vector.
    unsigned Opcode = getDUPLANEOp(V1.getValueType().getVectorElementType());

    // SelectionDAGBuilder may have "helpfully" already extracted or
    // concatenated to make a vector of the same size as this SHUFFLE. We can
    // ignore the extract entirely, and canonicalise the concat using
    // WidenVector.
    if (V1.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
      Lane += cast<ConstantSDNode>(V1.getOperand(1))->getZExtValue();
      V1 = V1.getOperand(0);
    } else if (V1.getOpcode() == ISD::CONCAT_VECTORS) {
      unsigned Idx = Lane >= (int)VT.getVectorNumElements() / 2;
      Lane -= Idx * VT.getVectorNumElements() / 2;
      V1 = WidenVector(V1.getOperand(Idx), DAG);
    } else if (VT.getSizeInBits() == 64)
      V1 = WidenVector(V1, DAG);

    return DAG.getNode(Opcode, dl, VT, V1, DAG.getConstant(Lane, MVT::i64));
  }

  if (isREVMask(ShuffleMask, VT, 64))
    return DAG.getNode(AArch64ISD::REV64, dl, V1.getValueType(), V1, V2);
  if (isREVMask(ShuffleMask, VT, 32))
    return DAG.getNode(AArch64ISD::REV32, dl, V1.getValueType(), V1, V2);
  if (isREVMask(ShuffleMask, VT, 16))
    return DAG.getNode(AArch64ISD::REV16, dl, V1.getValueType(), V1, V2);

  bool ReverseEXT = false;
  unsigned Imm;
  if (isEXTMask(ShuffleMask, VT, ReverseEXT, Imm)) {
    if (ReverseEXT)
      std::swap(V1, V2);
    Imm *= getExtFactor(V1);
    return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V2,
                       DAG.getConstant(Imm, MVT::i32));
  } else if (V2->getOpcode() == ISD::UNDEF &&
             isSingletonEXTMask(ShuffleMask, VT, Imm)) {
    Imm *= getExtFactor(V1);
    return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V1,
                       DAG.getConstant(Imm, MVT::i32));
  }

  unsigned WhichResult;
  if (isZIPMask(ShuffleMask, VT, WhichResult)) {
    unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2;
    return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
  }
  if (isUZPMask(ShuffleMask, VT, WhichResult)) {
    unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
    return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
  }
  if (isTRNMask(ShuffleMask, VT, WhichResult)) {
    unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
    return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
  }

  if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
    unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2;
    return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
  }
  if (isUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
    unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
    return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
  }
  if (isTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
    unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
    return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
  }

  SDValue Concat = tryFormConcatFromShuffle(Op, DAG);
  if (Concat.getNode())
    return Concat;

  bool DstIsLeft;
  int Anomaly;
  int NumInputElements = V1.getValueType().getVectorNumElements();
  if (isINSMask(ShuffleMask, NumInputElements, DstIsLeft, Anomaly)) {
    SDValue DstVec = DstIsLeft ? V1 : V2;
    SDValue DstLaneV = DAG.getConstant(Anomaly, MVT::i64);

    SDValue SrcVec = V1;
    int SrcLane = ShuffleMask[Anomaly];
    if (SrcLane >= NumInputElements) {
      SrcVec = V2;
      SrcLane -= VT.getVectorNumElements();
    }
    SDValue SrcLaneV = DAG.getConstant(SrcLane, MVT::i64);

    EVT ScalarVT = VT.getVectorElementType();

    if (ScalarVT.getSizeInBits() < 32 && ScalarVT.isInteger())
      ScalarVT = MVT::i32;

    return DAG.getNode(
        ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ScalarVT, SrcVec, SrcLaneV),
        DstLaneV);
  }

  // If the shuffle is not directly supported and it has 4 elements, use
  // the PerfectShuffle-generated table to synthesize it from other shuffles.
  unsigned NumElts = VT.getVectorNumElements();
  if (NumElts == 4) {
    unsigned PFIndexes[4];
    for (unsigned i = 0; i != 4; ++i) {
      if (ShuffleMask[i] < 0)
        PFIndexes[i] = 8;
      else
        PFIndexes[i] = ShuffleMask[i];
    }

    // Compute the index in the perfect shuffle table.
    unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 +
                            PFIndexes[2] * 9 + PFIndexes[3];
    unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
    unsigned Cost = (PFEntry >> 30);

    if (Cost <= 4)
      return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl);
  }

  return GenerateTBL(Op, ShuffleMask, DAG);
}

static bool resolveBuildVector(BuildVectorSDNode *BVN, APInt &CnstBits,
                               APInt &UndefBits) {
  EVT VT = BVN->getValueType(0);
  APInt SplatBits, SplatUndef;
  unsigned SplatBitSize;
  bool HasAnyUndefs;
  if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
    unsigned NumSplats = VT.getSizeInBits() / SplatBitSize;

    for (unsigned i = 0; i < NumSplats; ++i) {
      CnstBits <<= SplatBitSize;
      UndefBits <<= SplatBitSize;
      CnstBits |= SplatBits.zextOrTrunc(VT.getSizeInBits());
      UndefBits |= (SplatBits ^ SplatUndef).zextOrTrunc(VT.getSizeInBits());
    }

    return true;
  }

  return false;
}

SDValue AArch64TargetLowering::LowerVectorAND(SDValue Op,
                                              SelectionDAG &DAG) const {
  BuildVectorSDNode *BVN =
      dyn_cast<BuildVectorSDNode>(Op.getOperand(1).getNode());
  SDValue LHS = Op.getOperand(0);
  SDLoc dl(Op);
  EVT VT = Op.getValueType();

  if (!BVN)
    return Op;

  APInt CnstBits(VT.getSizeInBits(), 0);
  APInt UndefBits(VT.getSizeInBits(), 0);
  if (resolveBuildVector(BVN, CnstBits, UndefBits)) {
    // We only have a BIC vector immediate instruction, which is and-not.
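    // E.g. (illustrative): (and v4i32 X, splat(0xFFFFFF00)) inverts to
    // 0x000000FF here, which matches AdvSIMD modified-immediate type 1
    // below and becomes BIC Vd.4S, #0xff.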
    CnstBits = ~CnstBits;

    // We make use of a little bit of goto ickiness in order to avoid having to
    // duplicate the immediate matching logic for the undef toggled case.
    bool SecondTry = false;
  AttemptModImm:

    if (CnstBits.getHiBits(64) == CnstBits.getLoBits(64)) {
      CnstBits = CnstBits.zextOrTrunc(64);
      uint64_t CnstVal = CnstBits.getZExtValue();

      if (AArch64_AM::isAdvSIMDModImmType1(CnstVal)) {
        CnstVal = AArch64_AM::encodeAdvSIMDModImmType1(CnstVal);
        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
        SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS,
                                  DAG.getConstant(CnstVal, MVT::i32),
                                  DAG.getConstant(0, MVT::i32));
        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
      }

      if (AArch64_AM::isAdvSIMDModImmType2(CnstVal)) {
        CnstVal = AArch64_AM::encodeAdvSIMDModImmType2(CnstVal);
        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
        SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS,
                                  DAG.getConstant(CnstVal, MVT::i32),
                                  DAG.getConstant(8, MVT::i32));
        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
      }

      if (AArch64_AM::isAdvSIMDModImmType3(CnstVal)) {
        CnstVal = AArch64_AM::encodeAdvSIMDModImmType3(CnstVal);
        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
        SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS,
                                  DAG.getConstant(CnstVal, MVT::i32),
                                  DAG.getConstant(16, MVT::i32));
        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
      }

      if (AArch64_AM::isAdvSIMDModImmType4(CnstVal)) {
        CnstVal = AArch64_AM::encodeAdvSIMDModImmType4(CnstVal);
        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
        SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS,
                                  DAG.getConstant(CnstVal, MVT::i32),
                                  DAG.getConstant(24, MVT::i32));
        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
      }

      if (AArch64_AM::isAdvSIMDModImmType5(CnstVal)) {
        CnstVal = AArch64_AM::encodeAdvSIMDModImmType5(CnstVal);
        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
        SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS,
                                  DAG.getConstant(CnstVal, MVT::i32),
                                  DAG.getConstant(0, MVT::i32));
        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
      }

      if (AArch64_AM::isAdvSIMDModImmType6(CnstVal)) {
        CnstVal = AArch64_AM::encodeAdvSIMDModImmType6(CnstVal);
        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
        SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS,
                                  DAG.getConstant(CnstVal, MVT::i32),
                                  DAG.getConstant(8, MVT::i32));
        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
      }
    }

    if (SecondTry)
      goto FailedModImm;
    SecondTry = true;
    CnstBits = ~UndefBits;
    goto AttemptModImm;
  }

// We can always fall back to a non-immediate AND.
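// (Both attempts above funnel through here: once with the constant as
// written and once with the undef bits toggled, since undef lanes may be
// materialized as whatever pattern makes the immediate encodable.)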
FailedModImm:
  return Op;
}

// Specialized code to quickly find if PotentialBVec is a BuildVector that
// consists of only the same constant int value, returned in reference arg
// ConstVal.
static bool isAllConstantBuildVector(const SDValue &PotentialBVec,
                                     uint64_t &ConstVal) {
  BuildVectorSDNode *Bvec = dyn_cast<BuildVectorSDNode>(PotentialBVec);
  if (!Bvec)
    return false;
  ConstantSDNode *FirstElt = dyn_cast<ConstantSDNode>(Bvec->getOperand(0));
  if (!FirstElt)
    return false;
  EVT VT = Bvec->getValueType(0);
  unsigned NumElts = VT.getVectorNumElements();
  for (unsigned i = 1; i < NumElts; ++i)
    if (dyn_cast<ConstantSDNode>(Bvec->getOperand(i)) != FirstElt)
      return false;
  ConstVal = FirstElt->getZExtValue();
  return true;
}

static unsigned getIntrinsicID(const SDNode *N) {
  unsigned Opcode = N->getOpcode();
  switch (Opcode) {
  default:
    return Intrinsic::not_intrinsic;
  case ISD::INTRINSIC_WO_CHAIN: {
    unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
    if (IID < Intrinsic::num_intrinsics)
      return IID;
    return Intrinsic::not_intrinsic;
  }
  }
}

// Attempt to form a vector S[LR]I from (or (and X, BvecC1), (lsl Y, C2)),
// to (SLI X, Y, C2), where X and Y have matching vector types, BvecC1 is a
// BUILD_VECTOR with constant element C1 keeping exactly the low bits that
// (lsl Y, C2) zeroes, i.e. C1 == (1 << C2) - 1.
// Also, logical shift right -> sri, with the same structure, except that
// the AND must then keep the high C2 bits instead.
static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG) {
  EVT VT = N->getValueType(0);

  if (!VT.isVector())
    return SDValue();

  SDLoc DL(N);

  // Is the first op an AND?
  const SDValue And = N->getOperand(0);
  if (And.getOpcode() != ISD::AND)
    return SDValue();

  // Is the second op a shl or lshr?
  SDValue Shift = N->getOperand(1);
  // This will have been turned into: AArch64ISD::VSHL vector, #shift
  // or AArch64ISD::VLSHR vector, #shift.
  unsigned ShiftOpc = Shift.getOpcode();
  if ((ShiftOpc != AArch64ISD::VSHL && ShiftOpc != AArch64ISD::VLSHR))
    return SDValue();
  bool IsShiftRight = ShiftOpc == AArch64ISD::VLSHR;

  // Is the shift amount constant?
  ConstantSDNode *C2node = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
  if (!C2node)
    return SDValue();

  // Is the and mask vector all constant?
  uint64_t C1;
  if (!isAllConstantBuildVector(And.getOperand(1), C1))
    return SDValue();

  // Does the AND mask keep exactly the element bits the shifted value leaves
  // alone (the low C2 bits for SLI, the high C2 bits for SRI), taking into
  // account how much one can shift elements of a particular size?
  uint64_t C2 = C2node->getZExtValue();
  unsigned ElemSizeInBits = VT.getVectorElementType().getSizeInBits();
  if (C2 > ElemSizeInBits)
    return SDValue();
  APInt C1AsAPInt(ElemSizeInBits, C1);
  APInt RequiredC1 = IsShiftRight ? APInt::getHighBitsSet(ElemSizeInBits, C2)
                                  : APInt::getLowBitsSet(ElemSizeInBits, C2);
  if (C1AsAPInt != RequiredC1)
    return SDValue();

  SDValue X = And.getOperand(0);
  SDValue Y = Shift.getOperand(0);

  unsigned Intrin =
      IsShiftRight ? Intrinsic::aarch64_neon_vsri : Intrinsic::aarch64_neon_vsli;
  SDValue ResultSLI =
      DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
                  DAG.getConstant(Intrin, MVT::i32), X, Y, Shift.getOperand(1));

  DEBUG(dbgs() << "aarch64-lower: transformed: \n");
  DEBUG(N->dump(&DAG));
  DEBUG(dbgs() << "into: \n");
  DEBUG(ResultSLI->dump(&DAG));

  ++NumShiftInserts;
  return ResultSLI;
}

SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op,
                                             SelectionDAG &DAG) const {
  // Attempt to form a vector S[LR]I from (or (and X, C1), (lsl Y, C2)).
  if (EnableAArch64SlrGeneration) {
    SDValue Res = tryLowerToSLI(Op.getNode(), DAG);
    if (Res.getNode())
      return Res;
  }

  BuildVectorSDNode *BVN =
      dyn_cast<BuildVectorSDNode>(Op.getOperand(0).getNode());
  SDValue LHS = Op.getOperand(1);
  SDLoc dl(Op);
  EVT VT = Op.getValueType();

  // OR commutes, so try swapping the operands.
  if (!BVN) {
    LHS = Op.getOperand(0);
    BVN = dyn_cast<BuildVectorSDNode>(Op.getOperand(1).getNode());
  }
  if (!BVN)
    return Op;

  APInt CnstBits(VT.getSizeInBits(), 0);
  APInt UndefBits(VT.getSizeInBits(), 0);
  if (resolveBuildVector(BVN, CnstBits, UndefBits)) {
    // We make use of a little bit of goto ickiness in order to avoid having to
    // duplicate the immediate matching logic for the undef toggled case.
    bool SecondTry = false;
  AttemptModImm:

    if (CnstBits.getHiBits(64) == CnstBits.getLoBits(64)) {
      CnstBits = CnstBits.zextOrTrunc(64);
      uint64_t CnstVal = CnstBits.getZExtValue();

      if (AArch64_AM::isAdvSIMDModImmType1(CnstVal)) {
        CnstVal = AArch64_AM::encodeAdvSIMDModImmType1(CnstVal);
        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
        SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS,
                                  DAG.getConstant(CnstVal, MVT::i32),
                                  DAG.getConstant(0, MVT::i32));
        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
      }

      if (AArch64_AM::isAdvSIMDModImmType2(CnstVal)) {
        CnstVal = AArch64_AM::encodeAdvSIMDModImmType2(CnstVal);
        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
        SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS,
                                  DAG.getConstant(CnstVal, MVT::i32),
                                  DAG.getConstant(8, MVT::i32));
        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
      }

      if (AArch64_AM::isAdvSIMDModImmType3(CnstVal)) {
        CnstVal = AArch64_AM::encodeAdvSIMDModImmType3(CnstVal);
        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
        SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS,
                                  DAG.getConstant(CnstVal, MVT::i32),
                                  DAG.getConstant(16, MVT::i32));
        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
      }

      if (AArch64_AM::isAdvSIMDModImmType4(CnstVal)) {
        CnstVal = AArch64_AM::encodeAdvSIMDModImmType4(CnstVal);
        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
        SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS,
                                  DAG.getConstant(CnstVal, MVT::i32),
                                  DAG.getConstant(24, MVT::i32));
        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
      }

      if (AArch64_AM::isAdvSIMDModImmType5(CnstVal)) {
        CnstVal = AArch64_AM::encodeAdvSIMDModImmType5(CnstVal);
        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
        SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS,
                                  DAG.getConstant(CnstVal, MVT::i32),
                                  DAG.getConstant(0, MVT::i32));
        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
      }

      if (AArch64_AM::isAdvSIMDModImmType6(CnstVal)) {
        CnstVal = AArch64_AM::encodeAdvSIMDModImmType6(CnstVal);
        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
        SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS,
                                  DAG.getConstant(CnstVal, MVT::i32),
                                  DAG.getConstant(8, MVT::i32));
        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
      }
    }

    if (SecondTry)
      goto FailedModImm;
    SecondTry = true;
    CnstBits = UndefBits;
    goto AttemptModImm;
  }

// We can always fall back to a non-immediate OR.
FailedModImm:
  return Op;
}

// Normalize the operands of BUILD_VECTOR. The value of constant operands will
// be truncated to fit the element width.
static SDValue NormalizeBuildVector(SDValue Op,
                                    SelectionDAG &DAG) {
  assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
  SDLoc dl(Op);
  EVT VT = Op.getValueType();
  EVT EltTy = VT.getVectorElementType();

  if (EltTy.isFloatingPoint() || EltTy.getSizeInBits() > 16)
    return Op;

  SmallVector<SDValue, 16> Ops;
  for (unsigned I = 0, E = VT.getVectorNumElements(); I != E; ++I) {
    SDValue Lane = Op.getOperand(I);
    if (Lane.getOpcode() == ISD::Constant) {
      APInt LowBits(EltTy.getSizeInBits(),
                    cast<ConstantSDNode>(Lane)->getZExtValue());
      Lane = DAG.getConstant(LowBits.getZExtValue(), MVT::i32);
    }
    Ops.push_back(Lane);
  }
  return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
}

SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
                                                 SelectionDAG &DAG) const {
  SDLoc dl(Op);
  EVT VT = Op.getValueType();
  Op = NormalizeBuildVector(Op, DAG);
  BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());

  APInt CnstBits(VT.getSizeInBits(), 0);
  APInt UndefBits(VT.getSizeInBits(), 0);
  if (resolveBuildVector(BVN, CnstBits, UndefBits)) {
    // We make use of a little bit of goto ickiness in order to avoid having to
    // duplicate the immediate matching logic for the undef toggled case.
    bool SecondTry = false;
  AttemptModImm:

    if (CnstBits.getHiBits(64) == CnstBits.getLoBits(64)) {
      CnstBits = CnstBits.zextOrTrunc(64);
      uint64_t CnstVal = CnstBits.getZExtValue();

      // Certain magic vector constants (used to express things like NOT
      // and NEG) are passed through unmodified. This allows codegen patterns
      // for these operations to match. Special-purpose patterns will lower
      // these immediates to MOVIs if it proves necessary.
      if (VT.isInteger() && (CnstVal == 0 || CnstVal == ~0ULL))
        return Op;

      // The many faces of MOVI...
      if (AArch64_AM::isAdvSIMDModImmType10(CnstVal)) {
        CnstVal = AArch64_AM::encodeAdvSIMDModImmType10(CnstVal);
        if (VT.getSizeInBits() == 128) {
          SDValue Mov = DAG.getNode(AArch64ISD::MOVIedit, dl, MVT::v2i64,
                                    DAG.getConstant(CnstVal, MVT::i32));
          return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
        }

        // Support the V64 version via subregister insertion.
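        // (MOVIedit on MVT::f64 materializes the immediate into the whole
        // D register; the NVCAST below is a no-op reinterpretation to the
        // requested 64-bit vector type, so no further code is generated.)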
        SDValue Mov = DAG.getNode(AArch64ISD::MOVIedit, dl, MVT::f64,
                                  DAG.getConstant(CnstVal, MVT::i32));
        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
      }

      if (AArch64_AM::isAdvSIMDModImmType1(CnstVal)) {
        CnstVal = AArch64_AM::encodeAdvSIMDModImmType1(CnstVal);
        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
        SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy,
                                  DAG.getConstant(CnstVal, MVT::i32),
                                  DAG.getConstant(0, MVT::i32));
        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
      }

      if (AArch64_AM::isAdvSIMDModImmType2(CnstVal)) {
        CnstVal = AArch64_AM::encodeAdvSIMDModImmType2(CnstVal);
        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
        SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy,
                                  DAG.getConstant(CnstVal, MVT::i32),
                                  DAG.getConstant(8, MVT::i32));
        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
      }

      if (AArch64_AM::isAdvSIMDModImmType3(CnstVal)) {
        CnstVal = AArch64_AM::encodeAdvSIMDModImmType3(CnstVal);
        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
        SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy,
                                  DAG.getConstant(CnstVal, MVT::i32),
                                  DAG.getConstant(16, MVT::i32));
        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
      }

      if (AArch64_AM::isAdvSIMDModImmType4(CnstVal)) {
        CnstVal = AArch64_AM::encodeAdvSIMDModImmType4(CnstVal);
        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
        SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy,
                                  DAG.getConstant(CnstVal, MVT::i32),
                                  DAG.getConstant(24, MVT::i32));
        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
      }

      if (AArch64_AM::isAdvSIMDModImmType5(CnstVal)) {
        CnstVal = AArch64_AM::encodeAdvSIMDModImmType5(CnstVal);
        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
        SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy,
                                  DAG.getConstant(CnstVal, MVT::i32),
                                  DAG.getConstant(0, MVT::i32));
        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
      }

      if (AArch64_AM::isAdvSIMDModImmType6(CnstVal)) {
        CnstVal = AArch64_AM::encodeAdvSIMDModImmType6(CnstVal);
        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
        SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy,
                                  DAG.getConstant(CnstVal, MVT::i32),
                                  DAG.getConstant(8, MVT::i32));
        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
      }

      if (AArch64_AM::isAdvSIMDModImmType7(CnstVal)) {
        CnstVal = AArch64_AM::encodeAdvSIMDModImmType7(CnstVal);
        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
        SDValue Mov = DAG.getNode(AArch64ISD::MOVImsl, dl, MovTy,
                                  DAG.getConstant(CnstVal, MVT::i32),
                                  DAG.getConstant(264, MVT::i32));
        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
      }

      if (AArch64_AM::isAdvSIMDModImmType8(CnstVal)) {
        CnstVal = AArch64_AM::encodeAdvSIMDModImmType8(CnstVal);
        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
        SDValue Mov = DAG.getNode(AArch64ISD::MOVImsl, dl, MovTy,
                                  DAG.getConstant(CnstVal, MVT::i32),
                                  DAG.getConstant(272, MVT::i32));
        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
      }

      if (AArch64_AM::isAdvSIMDModImmType9(CnstVal)) {
        CnstVal = AArch64_AM::encodeAdvSIMDModImmType9(CnstVal);
        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v16i8 : MVT::v8i8;
        SDValue Mov = DAG.getNode(AArch64ISD::MOVI, dl, MovTy,
                                  DAG.getConstant(CnstVal, MVT::i32));
        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
      }

      // The few faces of FMOV...
      if (AArch64_AM::isAdvSIMDModImmType11(CnstVal)) {
        CnstVal = AArch64_AM::encodeAdvSIMDModImmType11(CnstVal);
        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4f32 : MVT::v2f32;
        SDValue Mov = DAG.getNode(AArch64ISD::FMOV, dl, MovTy,
                                  DAG.getConstant(CnstVal, MVT::i32));
        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
      }

      if (AArch64_AM::isAdvSIMDModImmType12(CnstVal) &&
          VT.getSizeInBits() == 128) {
        CnstVal = AArch64_AM::encodeAdvSIMDModImmType12(CnstVal);
        SDValue Mov = DAG.getNode(AArch64ISD::FMOV, dl, MVT::v2f64,
                                  DAG.getConstant(CnstVal, MVT::i32));
        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
      }

      // The many faces of MVNI...
      CnstVal = ~CnstVal;
      if (AArch64_AM::isAdvSIMDModImmType1(CnstVal)) {
        CnstVal = AArch64_AM::encodeAdvSIMDModImmType1(CnstVal);
        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
        SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy,
                                  DAG.getConstant(CnstVal, MVT::i32),
                                  DAG.getConstant(0, MVT::i32));
        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
      }

      if (AArch64_AM::isAdvSIMDModImmType2(CnstVal)) {
        CnstVal = AArch64_AM::encodeAdvSIMDModImmType2(CnstVal);
        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
        SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy,
                                  DAG.getConstant(CnstVal, MVT::i32),
                                  DAG.getConstant(8, MVT::i32));
        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
      }

      if (AArch64_AM::isAdvSIMDModImmType3(CnstVal)) {
        CnstVal = AArch64_AM::encodeAdvSIMDModImmType3(CnstVal);
        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
        SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy,
                                  DAG.getConstant(CnstVal, MVT::i32),
                                  DAG.getConstant(16, MVT::i32));
        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
      }

      if (AArch64_AM::isAdvSIMDModImmType4(CnstVal)) {
        CnstVal = AArch64_AM::encodeAdvSIMDModImmType4(CnstVal);
        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
        SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy,
                                  DAG.getConstant(CnstVal, MVT::i32),
                                  DAG.getConstant(24, MVT::i32));
        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
      }

      if (AArch64_AM::isAdvSIMDModImmType5(CnstVal)) {
        CnstVal = AArch64_AM::encodeAdvSIMDModImmType5(CnstVal);
        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
        SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy,
                                  DAG.getConstant(CnstVal, MVT::i32),
                                  DAG.getConstant(0, MVT::i32));
        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
      }

      if (AArch64_AM::isAdvSIMDModImmType6(CnstVal)) {
        CnstVal = AArch64_AM::encodeAdvSIMDModImmType6(CnstVal);
        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
        SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy,
                                  DAG.getConstant(CnstVal, MVT::i32),
                                  DAG.getConstant(8, MVT::i32));
        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
      }

      if (AArch64_AM::isAdvSIMDModImmType7(CnstVal)) {
        CnstVal = AArch64_AM::encodeAdvSIMDModImmType7(CnstVal);
        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
        SDValue Mov = DAG.getNode(AArch64ISD::MVNImsl, dl, MovTy,
                                  DAG.getConstant(CnstVal, MVT::i32),
                                  DAG.getConstant(264, MVT::i32));
        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
      }

      if (AArch64_AM::isAdvSIMDModImmType8(CnstVal)) {
        CnstVal = AArch64_AM::encodeAdvSIMDModImmType8(CnstVal);
        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
        SDValue Mov = DAG.getNode(AArch64ISD::MVNImsl, dl, MovTy,
                                  DAG.getConstant(CnstVal, MVT::i32),
                                  DAG.getConstant(272, MVT::i32));
        return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
      }
    }

    if (SecondTry)
      goto FailedModImm;
    SecondTry = true;
    CnstBits = UndefBits;
    goto AttemptModImm;
  }
FailedModImm:

  // Scan through the operands to find some interesting properties we can
  // exploit:
  //   1) If only one value is used, we can use a DUP, or
  //   2) if only the low element is not undef, we can just insert that, or
  //   3) if only one constant value is used (w/ some non-constant lanes),
  //      we can splat the constant value into the whole vector then fill
  //      in the non-constant lanes.
  //   4) FIXME: If different constant values are used, but we can
  //      intelligently select the values we'll be overwriting for the
  //      non-constant lanes such that we can directly materialize the
  //      vector some other way (MOVI, e.g.), we can be sneaky.
  unsigned NumElts = VT.getVectorNumElements();
  bool isOnlyLowElement = true;
  bool usesOnlyOneValue = true;
  bool usesOnlyOneConstantValue = true;
  bool isConstant = true;
  unsigned NumConstantLanes = 0;
  SDValue Value;
  SDValue ConstantValue;
  for (unsigned i = 0; i < NumElts; ++i) {
    SDValue V = Op.getOperand(i);
    if (V.getOpcode() == ISD::UNDEF)
      continue;
    if (i > 0)
      isOnlyLowElement = false;
    if (!isa<ConstantFPSDNode>(V) && !isa<ConstantSDNode>(V))
      isConstant = false;

    if (isa<ConstantSDNode>(V) || isa<ConstantFPSDNode>(V)) {
      ++NumConstantLanes;
      if (!ConstantValue.getNode())
        ConstantValue = V;
      else if (ConstantValue != V)
        usesOnlyOneConstantValue = false;
    }

    if (!Value.getNode())
      Value = V;
    else if (V != Value)
      usesOnlyOneValue = false;
  }

  if (!Value.getNode())
    return DAG.getUNDEF(VT);

  if (isOnlyLowElement)
    return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value);

  // Use DUP for non-constant splats. For f32 constant splats, reduce to
  // i32 and try again.
  if (usesOnlyOneValue) {
    if (!isConstant) {
      if (Value.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
          Value.getValueType() != VT)
        return DAG.getNode(AArch64ISD::DUP, dl, VT, Value);

      // This is actually a DUPLANExx operation, which keeps everything vectory.

      // DUPLANE works on 128-bit vectors; widen it if necessary.
      SDValue Lane = Value.getOperand(1);
      Value = Value.getOperand(0);
      if (Value.getValueType().getSizeInBits() == 64)
        Value = WidenVector(Value, DAG);

      unsigned Opcode = getDUPLANEOp(VT.getVectorElementType());
      return DAG.getNode(Opcode, dl, VT, Value, Lane);
    }

    if (VT.getVectorElementType().isFloatingPoint()) {
      SmallVector<SDValue, 8> Ops;
      MVT NewType =
          (VT.getVectorElementType() == MVT::f32) ? MVT::i32 : MVT::i64;
      for (unsigned i = 0; i < NumElts; ++i)
        Ops.push_back(DAG.getNode(ISD::BITCAST, dl, NewType, Op.getOperand(i)));
      EVT VecVT = EVT::getVectorVT(*DAG.getContext(), NewType, NumElts);
      SDValue Val = DAG.getNode(ISD::BUILD_VECTOR, dl, VecVT, Ops);
      Val = LowerBUILD_VECTOR(Val, DAG);
      if (Val.getNode())
        return DAG.getNode(ISD::BITCAST, dl, VT, Val);
    }
  }

  // If only one constant value was used, but for more than one lane, start by
  // splatting that value, then replace the non-constant lanes. This is better
  // than the default, which will perform a separate initialization for each
  // lane.
  if (NumConstantLanes > 0 && usesOnlyOneConstantValue) {
    SDValue Val = DAG.getNode(AArch64ISD::DUP, dl, VT, ConstantValue);
    // Now insert the non-constant lanes.
    for (unsigned i = 0; i < NumElts; ++i) {
      SDValue V = Op.getOperand(i);
      SDValue LaneIdx = DAG.getConstant(i, MVT::i64);
      if (!isa<ConstantSDNode>(V) && !isa<ConstantFPSDNode>(V)) {
        // Note that type legalization likely mucked about with the VT of the
        // source operand, so we may have to convert it here before inserting.
        Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Val, V, LaneIdx);
      }
    }
    return Val;
  }

  // If all elements are constants and the case above didn't get hit, fall back
  // to the default expansion, which will generate a load from the constant
  // pool.
  if (isConstant)
    return SDValue();

  // Empirical tests suggest this is rarely worth it for vectors of length <= 2.
  if (NumElts >= 4) {
    SDValue shuffle = ReconstructShuffle(Op, DAG);
    if (shuffle != SDValue())
      return shuffle;
  }

  // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we
  // know the default expansion would otherwise fall back on something even
  // worse. For a vector with one or two non-undef values, that's
  // scalar_to_vector for the elements followed by a shuffle (provided the
  // shuffle is valid for the target) and materialization element by element
  // on the stack followed by a load for everything else.
  if (!isConstant && !usesOnlyOneValue) {
    SDValue Vec = DAG.getUNDEF(VT);
    SDValue Op0 = Op.getOperand(0);
    unsigned ElemSize = VT.getVectorElementType().getSizeInBits();
    unsigned i = 0;
    // For 32- and 64-bit types, use INSERT_SUBREG for lane zero to
    //   a) avoid a RMW dependency on the full vector register, and
    //   b) allow the register coalescer to fold away the copy if the
    //      value is already in an S or D register.
    if (Op0.getOpcode() != ISD::UNDEF && (ElemSize == 32 || ElemSize == 64)) {
      unsigned SubIdx = ElemSize == 32 ? AArch64::ssub : AArch64::dsub;
      MachineSDNode *N =
          DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, dl, VT, Vec, Op0,
                             DAG.getTargetConstant(SubIdx, MVT::i32));
      Vec = SDValue(N, 0);
      ++i;
    }
    for (; i < NumElts; ++i) {
      SDValue V = Op.getOperand(i);
      if (V.getOpcode() == ISD::UNDEF)
        continue;
      SDValue LaneIdx = DAG.getConstant(i, MVT::i64);
      Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx);
    }
    return Vec;
  }

  // Just use the default expansion. We failed to find a better alternative.
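  // (Returning SDValue() tells the caller to expand: as noted above, the
  // generic code typically materializes the elements on the stack and then
  // reloads the whole vector.)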
  return SDValue();
}

SDValue AArch64TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
                                                      SelectionDAG &DAG) const {
  assert(Op.getOpcode() == ISD::INSERT_VECTOR_ELT && "Unknown opcode!");

  // Check for non-constant or out of range lane.
  EVT VT = Op.getOperand(0).getValueType();
  ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Op.getOperand(2));
  if (!CI || CI->getZExtValue() >= VT.getVectorNumElements())
    return SDValue();

  // Insertion/extraction are legal for V128 types.
  if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 ||
      VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
      VT == MVT::v8f16)
    return Op;

  if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 &&
      VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16)
    return SDValue();

  // For V64 types, we perform insertion by expanding the value
  // to a V128 type and performing the insertion on that.
  SDLoc DL(Op);
  SDValue WideVec = WidenVector(Op.getOperand(0), DAG);
  EVT WideTy = WideVec.getValueType();

  SDValue Node = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, WideTy, WideVec,
                             Op.getOperand(1), Op.getOperand(2));
  // Re-narrow the resultant vector.
  return NarrowVector(Node, DAG);
}

SDValue
AArch64TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
                                               SelectionDAG &DAG) const {
  assert(Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unknown opcode!");

  // Check for non-constant or out of range lane.
  EVT VT = Op.getOperand(0).getValueType();
  ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Op.getOperand(1));
  if (!CI || CI->getZExtValue() >= VT.getVectorNumElements())
    return SDValue();

  // Insertion/extraction are legal for V128 types.
  if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 ||
      VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
      VT == MVT::v8f16)
    return Op;

  if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 &&
      VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16)
    return SDValue();

  // For V64 types, we perform extraction by expanding the value
  // to a V128 type and performing the extraction on that.
  SDLoc DL(Op);
  SDValue WideVec = WidenVector(Op.getOperand(0), DAG);
  EVT WideTy = WideVec.getValueType();

  EVT ExtrTy = WideTy.getVectorElementType();
  if (ExtrTy == MVT::i16 || ExtrTy == MVT::i8)
    ExtrTy = MVT::i32;

  // For extractions, we just return the result directly.
  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtrTy, WideVec,
                     Op.getOperand(1));
}

SDValue AArch64TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
                                                      SelectionDAG &DAG) const {
  EVT VT = Op.getOperand(0).getValueType();
  SDLoc dl(Op);
  // Just in case...
  if (!VT.isVector())
    return SDValue();

  ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op.getOperand(1));
  if (!Cst)
    return SDValue();
  unsigned Val = Cst->getZExtValue();

  unsigned Size = Op.getValueType().getSizeInBits();
  if (Val == 0) {
    switch (Size) {
    case 8:
      return DAG.getTargetExtractSubreg(AArch64::bsub, dl, Op.getValueType(),
                                        Op.getOperand(0));
    case 16:
      return DAG.getTargetExtractSubreg(AArch64::hsub, dl, Op.getValueType(),
                                        Op.getOperand(0));
    case 32:
      return DAG.getTargetExtractSubreg(AArch64::ssub, dl, Op.getValueType(),
                                        Op.getOperand(0));
    case 64:
      return DAG.getTargetExtractSubreg(AArch64::dsub, dl, Op.getValueType(),
                                        Op.getOperand(0));
    default:
      llvm_unreachable("Unexpected vector type in extract_subvector!");
    }
  }
  // If this is extracting the upper 64-bits of a 128-bit vector, we match
  // that directly.
  if (Size == 64 && Val * VT.getVectorElementType().getSizeInBits() == 64)
    return Op;

  return SDValue();
}

bool AArch64TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
                                               EVT VT) const {
  if (VT.getVectorNumElements() == 4 &&
      (VT.is128BitVector() || VT.is64BitVector())) {
    unsigned PFIndexes[4];
    for (unsigned i = 0; i != 4; ++i) {
      if (M[i] < 0)
        PFIndexes[i] = 8;
      else
        PFIndexes[i] = M[i];
    }

    // Compute the index in the perfect shuffle table.
    unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 +
                            PFIndexes[2] * 9 + PFIndexes[3];
    unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
    unsigned Cost = (PFEntry >> 30);

    if (Cost <= 4)
      return true;
  }

  bool DummyBool;
  int DummyInt;
  unsigned DummyUnsigned;

  return (ShuffleVectorSDNode::isSplatMask(&M[0], VT) || isREVMask(M, VT, 64) ||
          isREVMask(M, VT, 32) || isREVMask(M, VT, 16) ||
          isEXTMask(M, VT, DummyBool, DummyUnsigned) ||
          // isTBLMask(M, VT) || // FIXME: Port TBL support from ARM.
          isTRNMask(M, VT, DummyUnsigned) || isUZPMask(M, VT, DummyUnsigned) ||
          isZIPMask(M, VT, DummyUnsigned) ||
          isTRN_v_undef_Mask(M, VT, DummyUnsigned) ||
          isUZP_v_undef_Mask(M, VT, DummyUnsigned) ||
          isZIP_v_undef_Mask(M, VT, DummyUnsigned) ||
          isINSMask(M, VT.getVectorNumElements(), DummyBool, DummyInt) ||
          isConcatMask(M, VT, VT.getSizeInBits() == 128));
}

/// getVShiftImm - Check if this is a valid build_vector for the immediate
/// operand of a vector shift operation, where all the elements of the
/// build_vector must have the same constant integer value.
static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
  // Ignore bit_converts.
  while (Op.getOpcode() == ISD::BITCAST)
    Op = Op.getOperand(0);
  BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
  APInt SplatBits, SplatUndef;
  unsigned SplatBitSize;
  bool HasAnyUndefs;
  if (!BVN || !BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize,
                                    HasAnyUndefs, ElementBits) ||
      SplatBitSize > ElementBits)
    return false;
  Cnt = SplatBits.getSExtValue();
  return true;
}

/// isVShiftLImm - Check if this is a valid build_vector for the immediate
/// operand of a vector shift left operation. That value must be in the range:
///   0 <= Value < ElementBits for a left shift; or
///   0 <= Value <= ElementBits for a long left shift.
static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) {
  assert(VT.isVector() && "vector shift count is not a vector type");
  unsigned ElementBits = VT.getVectorElementType().getSizeInBits();
  if (!getVShiftImm(Op, ElementBits, Cnt))
    return false;
  return (Cnt >= 0 && (isLong ? Cnt - 1 : Cnt) < ElementBits);
}

/// isVShiftRImm - Check if this is a valid build_vector for the immediate
/// operand of a vector shift right operation. For a shift opcode, the value
/// is positive, but for an intrinsic the value count must be negative. The
/// absolute value must be in the range:
///   1 <= |Value| <= ElementBits for a right shift; or
///   1 <= |Value| <= ElementBits/2 for a narrow right shift.
static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, bool isIntrinsic,
                         int64_t &Cnt) {
  assert(VT.isVector() && "vector shift count is not a vector type");
  unsigned ElementBits = VT.getVectorElementType().getSizeInBits();
  if (!getVShiftImm(Op, ElementBits, Cnt))
    return false;
  if (isIntrinsic)
    Cnt = -Cnt;
  return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits));
}

SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op,
                                                      SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  SDLoc DL(Op);
  int64_t Cnt;

  if (!Op.getOperand(1).getValueType().isVector())
    return Op;
  unsigned EltSize = VT.getVectorElementType().getSizeInBits();

  switch (Op.getOpcode()) {
  default:
    llvm_unreachable("unexpected shift opcode");

  case ISD::SHL:
    if (isVShiftLImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize)
      return DAG.getNode(AArch64ISD::VSHL, SDLoc(Op), VT, Op.getOperand(0),
                         DAG.getConstant(Cnt, MVT::i32));
    return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
                       DAG.getConstant(Intrinsic::aarch64_neon_ushl, MVT::i32),
                       Op.getOperand(0), Op.getOperand(1));
  case ISD::SRA:
  case ISD::SRL:
    // Right shift immediate
    if (isVShiftRImm(Op.getOperand(1), VT, false, false, Cnt) &&
        Cnt < EltSize) {
      unsigned Opc =
          (Op.getOpcode() == ISD::SRA) ? AArch64ISD::VASHR : AArch64ISD::VLSHR;
      return DAG.getNode(Opc, SDLoc(Op), VT, Op.getOperand(0),
                         DAG.getConstant(Cnt, MVT::i32));
    }

    // Right shift register. Note, there is not a shift right register
    // instruction, but the shift left register instruction takes a signed
    // value, where negative numbers specify a right shift.
    unsigned Opc = (Op.getOpcode() == ISD::SRA) ? Intrinsic::aarch64_neon_sshl
                                                : Intrinsic::aarch64_neon_ushl;
    // Negate the shift amount.
    SDValue NegShift = DAG.getNode(AArch64ISD::NEG, DL, VT, Op.getOperand(1));
    SDValue NegShiftLeft =
        DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
                    DAG.getConstant(Opc, MVT::i32), Op.getOperand(0), NegShift);
    return NegShiftLeft;
  }

  return SDValue();
}

static SDValue EmitVectorComparison(SDValue LHS, SDValue RHS,
                                    AArch64CC::CondCode CC, bool NoNans, EVT VT,
                                    SDLoc dl, SelectionDAG &DAG) {
  EVT SrcVT = LHS.getValueType();
  assert(VT.getSizeInBits() == SrcVT.getSizeInBits() &&
         "function only supposed to emit natural comparisons");

  BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(RHS.getNode());
  APInt CnstBits(VT.getSizeInBits(), 0);
  APInt UndefBits(VT.getSizeInBits(), 0);
  bool IsCnst = BVN && resolveBuildVector(BVN, CnstBits, UndefBits);
  bool IsZero = IsCnst && (CnstBits == 0);

  if (SrcVT.getVectorElementType().isFloatingPoint()) {
    switch (CC) {
    default:
      return SDValue();
    case AArch64CC::NE: {
      SDValue Fcmeq;
      if (IsZero)
        Fcmeq = DAG.getNode(AArch64ISD::FCMEQz, dl, VT, LHS);
      else
        Fcmeq = DAG.getNode(AArch64ISD::FCMEQ, dl, VT, LHS, RHS);
      return DAG.getNode(AArch64ISD::NOT, dl, VT, Fcmeq);
    }
    case AArch64CC::EQ:
      if (IsZero)
        return DAG.getNode(AArch64ISD::FCMEQz, dl, VT, LHS);
      return DAG.getNode(AArch64ISD::FCMEQ, dl, VT, LHS, RHS);
    case AArch64CC::GE:
      if (IsZero)
        return DAG.getNode(AArch64ISD::FCMGEz, dl, VT, LHS);
      return DAG.getNode(AArch64ISD::FCMGE, dl, VT, LHS, RHS);
    case AArch64CC::GT:
      if (IsZero)
        return DAG.getNode(AArch64ISD::FCMGTz, dl, VT, LHS);
      return DAG.getNode(AArch64ISD::FCMGT, dl, VT, LHS, RHS);
    case AArch64CC::LS:
      if (IsZero)
        return DAG.getNode(AArch64ISD::FCMLEz, dl, VT, LHS);
      return DAG.getNode(AArch64ISD::FCMGE, dl, VT, RHS, LHS);
    case AArch64CC::LT:
      if (!NoNans)
        return SDValue();
      // If we ignore NaNs then we can use the MI implementation.
      // Fallthrough.
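      // (AArch64's LT condition is also true for unordered operands, whereas
      // the MI sequence below, based on FCMGT, is false whenever an input is
      // NaN; the two only agree when NaNs can be ignored.)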
    case AArch64CC::MI:
      if (IsZero)
        return DAG.getNode(AArch64ISD::FCMLTz, dl, VT, LHS);
      return DAG.getNode(AArch64ISD::FCMGT, dl, VT, RHS, LHS);
    }
  }

  switch (CC) {
  default:
    return SDValue();
  case AArch64CC::NE: {
    SDValue Cmeq;
    if (IsZero)
      Cmeq = DAG.getNode(AArch64ISD::CMEQz, dl, VT, LHS);
    else
      Cmeq = DAG.getNode(AArch64ISD::CMEQ, dl, VT, LHS, RHS);
    return DAG.getNode(AArch64ISD::NOT, dl, VT, Cmeq);
  }
  case AArch64CC::EQ:
    if (IsZero)
      return DAG.getNode(AArch64ISD::CMEQz, dl, VT, LHS);
    return DAG.getNode(AArch64ISD::CMEQ, dl, VT, LHS, RHS);
  case AArch64CC::GE:
    if (IsZero)
      return DAG.getNode(AArch64ISD::CMGEz, dl, VT, LHS);
    return DAG.getNode(AArch64ISD::CMGE, dl, VT, LHS, RHS);
  case AArch64CC::GT:
    if (IsZero)
      return DAG.getNode(AArch64ISD::CMGTz, dl, VT, LHS);
    return DAG.getNode(AArch64ISD::CMGT, dl, VT, LHS, RHS);
  case AArch64CC::LE:
    if (IsZero)
      return DAG.getNode(AArch64ISD::CMLEz, dl, VT, LHS);
    return DAG.getNode(AArch64ISD::CMGE, dl, VT, RHS, LHS);
  case AArch64CC::LS:
    return DAG.getNode(AArch64ISD::CMHS, dl, VT, RHS, LHS);
  case AArch64CC::LO:
    return DAG.getNode(AArch64ISD::CMHI, dl, VT, RHS, LHS);
  case AArch64CC::LT:
    if (IsZero)
      return DAG.getNode(AArch64ISD::CMLTz, dl, VT, LHS);
    return DAG.getNode(AArch64ISD::CMGT, dl, VT, RHS, LHS);
  case AArch64CC::HI:
    return DAG.getNode(AArch64ISD::CMHI, dl, VT, LHS, RHS);
  case AArch64CC::HS:
    return DAG.getNode(AArch64ISD::CMHS, dl, VT, LHS, RHS);
  }
}

SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op,
                                           SelectionDAG &DAG) const {
  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  EVT CmpVT = LHS.getValueType().changeVectorElementTypeToInteger();
  SDLoc dl(Op);

  if (LHS.getValueType().getVectorElementType().isInteger()) {
    assert(LHS.getValueType() == RHS.getValueType());
    AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
    SDValue Cmp =
        EmitVectorComparison(LHS, RHS, AArch64CC, false, CmpVT, dl, DAG);
    return DAG.getSExtOrTrunc(Cmp, dl, Op.getValueType());
  }

  assert(LHS.getValueType().getVectorElementType() == MVT::f32 ||
         LHS.getValueType().getVectorElementType() == MVT::f64);

  // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
  // clean. Some of them require two branches to implement.
  AArch64CC::CondCode CC1, CC2;
  bool ShouldInvert;
  changeVectorFPCCToAArch64CC(CC, CC1, CC2, ShouldInvert);

  bool NoNaNs = getTargetMachine().Options.NoNaNsFPMath;
  SDValue Cmp =
      EmitVectorComparison(LHS, RHS, CC1, NoNaNs, CmpVT, dl, DAG);
  if (!Cmp.getNode())
    return SDValue();

  if (CC2 != AArch64CC::AL) {
    SDValue Cmp2 =
        EmitVectorComparison(LHS, RHS, CC2, NoNaNs, CmpVT, dl, DAG);
    if (!Cmp2.getNode())
      return SDValue();

    Cmp = DAG.getNode(ISD::OR, dl, CmpVT, Cmp, Cmp2);
  }

  Cmp = DAG.getSExtOrTrunc(Cmp, dl, Op.getValueType());

  if (ShouldInvert)
    return DAG.getNOT(dl, Cmp, Cmp.getValueType());

  return Cmp;
}

/// getTgtMemIntrinsic - Represent NEON load and store intrinsics as
/// MemIntrinsicNodes. The associated MachineMemOperands record the alignment
/// specified in the intrinsic calls.
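/// For instance, an @llvm.aarch64.neon.ld2.v4i32 call (loading 32 bytes in
/// total) is conservatively described below as a v4i64 (256-bit) read from
/// its pointer argument.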
bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
                                               const CallInst &I,
                                               unsigned Intrinsic) const {
  switch (Intrinsic) {
  case Intrinsic::aarch64_neon_ld2:
  case Intrinsic::aarch64_neon_ld3:
  case Intrinsic::aarch64_neon_ld4:
  case Intrinsic::aarch64_neon_ld1x2:
  case Intrinsic::aarch64_neon_ld1x3:
  case Intrinsic::aarch64_neon_ld1x4:
  case Intrinsic::aarch64_neon_ld2lane:
  case Intrinsic::aarch64_neon_ld3lane:
  case Intrinsic::aarch64_neon_ld4lane:
  case Intrinsic::aarch64_neon_ld2r:
  case Intrinsic::aarch64_neon_ld3r:
  case Intrinsic::aarch64_neon_ld4r: {
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    // Conservatively set memVT to the entire set of vectors loaded.
    uint64_t NumElts = getDataLayout()->getTypeAllocSize(I.getType()) / 8;
    Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
    Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1);
    Info.offset = 0;
    Info.align = 0;
    Info.vol = false; // volatile loads with NEON intrinsics not supported
    Info.readMem = true;
    Info.writeMem = false;
    return true;
  }
  case Intrinsic::aarch64_neon_st2:
  case Intrinsic::aarch64_neon_st3:
  case Intrinsic::aarch64_neon_st4:
  case Intrinsic::aarch64_neon_st1x2:
  case Intrinsic::aarch64_neon_st1x3:
  case Intrinsic::aarch64_neon_st1x4:
  case Intrinsic::aarch64_neon_st2lane:
  case Intrinsic::aarch64_neon_st3lane:
  case Intrinsic::aarch64_neon_st4lane: {
    Info.opc = ISD::INTRINSIC_VOID;
    // Conservatively set memVT to the entire set of vectors stored.
    unsigned NumElts = 0;
    for (unsigned ArgI = 1, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) {
      Type *ArgTy = I.getArgOperand(ArgI)->getType();
      if (!ArgTy->isVectorTy())
        break;
      NumElts += getDataLayout()->getTypeAllocSize(ArgTy) / 8;
    }
    Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
    Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1);
    Info.offset = 0;
    Info.align = 0;
    Info.vol = false; // volatile stores with NEON intrinsics not supported
    Info.readMem = false;
    Info.writeMem = true;
    return true;
  }
  case Intrinsic::aarch64_ldaxr:
  case Intrinsic::aarch64_ldxr: {
    PointerType *PtrTy = cast<PointerType>(I.getArgOperand(0)->getType());
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::getVT(PtrTy->getElementType());
    Info.ptrVal = I.getArgOperand(0);
    Info.offset = 0;
    Info.align = getDataLayout()->getABITypeAlignment(PtrTy->getElementType());
    Info.vol = true;
    Info.readMem = true;
    Info.writeMem = false;
    return true;
  }
  case Intrinsic::aarch64_stlxr:
  case Intrinsic::aarch64_stxr: {
    PointerType *PtrTy = cast<PointerType>(I.getArgOperand(1)->getType());
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::getVT(PtrTy->getElementType());
    Info.ptrVal = I.getArgOperand(1);
    Info.offset = 0;
    Info.align = getDataLayout()->getABITypeAlignment(PtrTy->getElementType());
    Info.vol = true;
    Info.readMem = false;
    Info.writeMem = true;
    return true;
  }
  case Intrinsic::aarch64_ldaxp:
  case Intrinsic::aarch64_ldxp: {
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::i128;
    Info.ptrVal = I.getArgOperand(0);
    Info.offset = 0;
    Info.align = 16;
    Info.vol = true;
    Info.readMem = true;
    Info.writeMem = false;
    return true;
  }
  case Intrinsic::aarch64_stlxp:
  case Intrinsic::aarch64_stxp: {
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::i128;
    Info.ptrVal = I.getArgOperand(2);
    Info.offset = 0;
    Info.align = 16;
    Info.vol = true;
    Info.readMem = false;
    Info.writeMem = true;
    return true;
  }
  default:
    break;
  }

  return false;
}

// Truncations from a 64-bit GPR to a 32-bit GPR are free.
bool AArch64TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
  if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
    return false;
  unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
  unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
  return NumBits1 > NumBits2;
}
bool AArch64TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
  if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger())
    return false;
  unsigned NumBits1 = VT1.getSizeInBits();
  unsigned NumBits2 = VT2.getSizeInBits();
  return NumBits1 > NumBits2;
}

// All 32-bit GPR operations implicitly zero the high-half of the corresponding
// 64-bit GPR.
bool AArch64TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
  if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
    return false;
  unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
  unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
  return NumBits1 == 32 && NumBits2 == 64;
}
bool AArch64TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
  if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger())
    return false;
  unsigned NumBits1 = VT1.getSizeInBits();
  unsigned NumBits2 = VT2.getSizeInBits();
  return NumBits1 == 32 && NumBits2 == 64;
}

bool AArch64TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
  EVT VT1 = Val.getValueType();
  if (isZExtFree(VT1, VT2))
    return true;

  if (Val.getOpcode() != ISD::LOAD)
    return false;

  // 8-, 16-, and 32-bit integer loads all implicitly zero-extend.
  return (VT1.isSimple() && !VT1.isVector() && VT1.isInteger() &&
          VT2.isSimple() && !VT2.isVector() && VT2.isInteger() &&
          VT1.getSizeInBits() <= 32);
}

bool AArch64TargetLowering::hasPairedLoad(Type *LoadedType,
                                          unsigned &RequiredAligment) const {
  if (!LoadedType->isIntegerTy() && !LoadedType->isFloatTy())
    return false;
  // Cyclone supports unaligned accesses.
  RequiredAligment = 0;
  unsigned NumBits = LoadedType->getPrimitiveSizeInBits();
  return NumBits == 32 || NumBits == 64;
}

bool AArch64TargetLowering::hasPairedLoad(EVT LoadedType,
                                          unsigned &RequiredAligment) const {
  if (!LoadedType.isSimple() ||
      (!LoadedType.isInteger() && !LoadedType.isFloatingPoint()))
    return false;
  // Cyclone supports unaligned accesses.
  RequiredAligment = 0;
  unsigned NumBits = LoadedType.getSizeInBits();
  return NumBits == 32 || NumBits == 64;
}

static bool memOpAlign(unsigned DstAlign, unsigned SrcAlign,
                       unsigned AlignCheck) {
  return ((SrcAlign == 0 || SrcAlign % AlignCheck == 0) &&
          (DstAlign == 0 || DstAlign % AlignCheck == 0));
}

EVT AArch64TargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign,
                                               unsigned SrcAlign, bool IsMemset,
                                               bool ZeroMemset,
                                               bool MemcpyStrSrc,
                                               MachineFunction &MF) const {
  // Don't use AdvSIMD to implement 16-byte memset. It would take one
  // instruction to materialize the v2i64 zero and one store (with a
  // restrictive addressing mode); just do two i64 stores of the zero register
  // instead.
  bool Fast;
  const Function *F = MF.getFunction();
  if (Subtarget->hasFPARMv8() && !IsMemset && Size >= 16 &&
      !F->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
                                       Attribute::NoImplicitFloat) &&
      (memOpAlign(SrcAlign, DstAlign, 16) ||
       (allowsMisalignedMemoryAccesses(MVT::f128, 0, 1, &Fast) && Fast)))
    return MVT::f128;

  return Size >= 8 ? MVT::i64 : MVT::i32;
}

// 12-bit optionally shifted immediates are legal for adds.
bool AArch64TargetLowering::isLegalAddImmediate(int64_t Immed) const {
  if ((Immed >> 12) == 0 || ((Immed & 0xfff) == 0 && Immed >> 24 == 0))
    return true;
  return false;
}

// Integer comparisons are implemented with ADDS/SUBS, so the range of valid
// immediates is the same as for an add or a sub.
bool AArch64TargetLowering::isLegalICmpImmediate(int64_t Immed) const {
  if (Immed < 0)
    Immed *= -1;
  return isLegalAddImmediate(Immed);
}

/// isLegalAddressingMode - Return true if the addressing mode represented
/// by AM is legal for this target, for a load/store of the specified type.
bool AArch64TargetLowering::isLegalAddressingMode(const AddrMode &AM,
                                                  Type *Ty) const {
  // AArch64 has five basic addressing modes:
  //   reg
  //   reg + 9-bit signed offset
  //   reg + SIZE_IN_BYTES * 12-bit unsigned offset
  //   reg1 + reg2
  //   reg + SIZE_IN_BYTES * reg

  // No global is ever allowed as a base.
  if (AM.BaseGV)
    return false;

  // No reg+reg+imm addressing.
  if (AM.HasBaseReg && AM.BaseOffs && AM.Scale)
    return false;

  // Check the reg + imm case:
  // i.e., reg + 0, reg + imm9, reg + SIZE_IN_BYTES * uimm12
  uint64_t NumBytes = 0;
  if (Ty->isSized()) {
    uint64_t NumBits = getDataLayout()->getTypeSizeInBits(Ty);
    NumBytes = NumBits / 8;
    if (!isPowerOf2_64(NumBits))
      NumBytes = 0;
  }

  if (!AM.Scale) {
    int64_t Offset = AM.BaseOffs;

    // 9-bit signed offset
    if (Offset >= -(1LL << 9) && Offset <= (1LL << 9) - 1)
      return true;

    // 12-bit unsigned offset
    unsigned shift = Log2_64(NumBytes);
    if (NumBytes && Offset > 0 && (Offset / NumBytes) <= (1LL << 12) - 1 &&
        // Must be a multiple of NumBytes (NumBytes is a power of 2)
        (Offset >> shift) << shift == Offset)
      return true;
    return false;
  }

  // Check reg1 + SIZE_IN_BYTES * reg2 and reg1 + reg2

  if (!AM.Scale || AM.Scale == 1 ||
      (AM.Scale > 0 && (uint64_t)AM.Scale == NumBytes))
    return true;
  return false;
}

int AArch64TargetLowering::getScalingFactorCost(const AddrMode &AM,
                                                Type *Ty) const {
  // Scaling factors are not free at all.
  // Operands                     | Rt Latency
  // -------------------------------------------
  // Rt, [Xn, Xm]                 | 4
  // -------------------------------------------
  // Rt, [Xn, Xm, lsl #imm]       | Rn: 4 Rm: 5
  // Rt, [Xn, Wm, <extend> #imm]  |
  if (isLegalAddressingMode(AM, Ty))
    // Scale represents reg2 * scale, thus account for 1 if
    // it is not equal to 0 or 1.
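    // (So, roughly: [Xn] and [Xn, #imm] report 0, a legal [Xn, Xm, lsl #imm]
    // reports 1, and an illegal mode reports -1.)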
    return AM.Scale != 0 && AM.Scale != 1;
  return -1;
}

bool AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
  VT = VT.getScalarType();

  if (!VT.isSimple())
    return false;

  switch (VT.getSimpleVT().SimpleTy) {
  case MVT::f32:
  case MVT::f64:
    return true;
  default:
    break;
  }

  return false;
}

const MCPhysReg *
AArch64TargetLowering::getScratchRegisters(CallingConv::ID) const {
  // LR is a callee-save register, but we must treat it as clobbered by any call
  // site. Hence we include LR in the scratch registers, which are in turn added
  // as implicit-defs for stackmaps and patchpoints.
  static const MCPhysReg ScratchRegs[] = {
    AArch64::X16, AArch64::X17, AArch64::LR, 0
  };
  return ScratchRegs;
}

bool
AArch64TargetLowering::isDesirableToCommuteWithShift(const SDNode *N) const {
  EVT VT = N->getValueType(0);
  // If N is an unsigned bit extraction ((x >> C) & mask), do not combine it
  // with a shift; leave it to be lowered to UBFX instead.
  if (N->getOpcode() == ISD::AND && (VT == MVT::i32 || VT == MVT::i64) &&
      isa<ConstantSDNode>(N->getOperand(1))) {
    uint64_t TruncMask = N->getConstantOperandVal(1);
    if (isMask_64(TruncMask) &&
        N->getOperand(0).getOpcode() == ISD::SRL &&
        isa<ConstantSDNode>(N->getOperand(0)->getOperand(1)))
      return false;
  }
  return true;
}

bool AArch64TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
                                                              Type *Ty) const {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  if (BitSize == 0)
    return false;

  int64_t Val = Imm.getSExtValue();
  if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, BitSize))
    return true;

  if ((int64_t)Val < 0)
    Val = ~Val;
  if (BitSize == 32)
    Val &= (1LL << 32) - 1;

  unsigned LZ = countLeadingZeros((uint64_t)Val);
  unsigned Shift = (63 - LZ) / 16;
  // MOVZ is free so return true for one or fewer MOVK.
  return Shift < 3;
}

// Generate SUBS and CSEL for integer abs.
static SDValue performIntegerAbsCombine(SDNode *N, SelectionDAG &DAG) {
  EVT VT = N->getValueType(0);

  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  SDLoc DL(N);

  // Check for the pattern XOR(ADD(X,Y), Y) where Y is SRA(X, size(X)-1),
  // and change it to SUB and CSEL.
  if (VT.isInteger() && N->getOpcode() == ISD::XOR &&
      N0.getOpcode() == ISD::ADD && N0.getOperand(1) == N1 &&
      N1.getOpcode() == ISD::SRA && N1.getOperand(0) == N0.getOperand(0))
    if (ConstantSDNode *Y1C = dyn_cast<ConstantSDNode>(N1.getOperand(1)))
      if (Y1C->getAPIntValue() == VT.getSizeInBits() - 1) {
        SDValue Neg = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, VT),
                                  N0.getOperand(0));
        // Generate SUBS & CSEL.
        SDValue Cmp =
            DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, MVT::i32),
                        N0.getOperand(0), DAG.getConstant(0, VT));
        return DAG.getNode(AArch64ISD::CSEL, DL, VT, N0.getOperand(0), Neg,
                           DAG.getConstant(AArch64CC::PL, MVT::i32),
                           SDValue(Cmp.getNode(), 1));
      }
  return SDValue();
}

// performXorCombine - Attempts to handle integer ABS.
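// A sketch of the kind of source that produces this pattern:
//   int64_t abs64(int64_t x) {
//     int64_t y = x >> 63;     // Y = SRA(X, size(X)-1)
//     return (x + y) ^ y;      // XOR(ADD(X, Y), Y)
//   }
// which performIntegerAbsCombine above turns into SUBS plus a CSEL on PL.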
static SDValue performXorCombine(SDNode *N, SelectionDAG &DAG,
                                 TargetLowering::DAGCombinerInfo &DCI,
                                 const AArch64Subtarget *Subtarget) {
  if (DCI.isBeforeLegalizeOps())
    return SDValue();

  return performIntegerAbsCombine(N, DAG);
}

SDValue
AArch64TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
                                     SelectionDAG &DAG,
                                     std::vector<SDNode *> *Created) const {
  // fold (sdiv X, pow2)
  EVT VT = N->getValueType(0);
  if ((VT != MVT::i32 && VT != MVT::i64) ||
      !(Divisor.isPowerOf2() || (-Divisor).isPowerOf2()))
    return SDValue();

  SDLoc DL(N);
  SDValue N0 = N->getOperand(0);
  unsigned Lg2 = Divisor.countTrailingZeros();
  SDValue Zero = DAG.getConstant(0, VT);
  SDValue Pow2MinusOne = DAG.getConstant((1ULL << Lg2) - 1, VT);

  // Add (N0 < 0) ? Pow2 - 1 : 0;
  SDValue CCVal;
  SDValue Cmp = getAArch64Cmp(N0, Zero, ISD::SETLT, CCVal, DAG, DL);
  SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N0, Pow2MinusOne);
  SDValue CSel = DAG.getNode(AArch64ISD::CSEL, DL, VT, Add, N0, CCVal, Cmp);

  if (Created) {
    Created->push_back(Cmp.getNode());
    Created->push_back(Add.getNode());
    Created->push_back(CSel.getNode());
  }

  // Divide by pow2.
  SDValue SRA =
      DAG.getNode(ISD::SRA, DL, VT, CSel, DAG.getConstant(Lg2, MVT::i64));

  // If we're dividing by a positive value, we're done. Otherwise, we must
  // negate the result.
  if (Divisor.isNonNegative())
    return SRA;

  if (Created)
    Created->push_back(SRA.getNode());
  return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, VT), SRA);
}

static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG,
                                 TargetLowering::DAGCombinerInfo &DCI,
                                 const AArch64Subtarget *Subtarget) {
  if (DCI.isBeforeLegalizeOps())
    return SDValue();

  // Multiplication by a power of two plus/minus one can be done more cheaply
  // as a shift+add/sub. For now, this is true unconditionally. If future CPUs
  // have a cheaper MADD instruction, this may need to be gated on a subtarget
  // feature. For Cyclone, 32-bit MADD is 4 cycles and 64-bit is 5 cycles, so
  // this is always a win.
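  // Illustrative instances of the four rewrites below:
  //   mul x, 3  ==> add (shl x, 1), x
  //   mul x, 7  ==> sub (shl x, 3), x
  //   mul x, -5 ==> sub 0, (add (shl x, 2), x)
  //   mul x, -7 ==> sub x, (shl x, 3)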
  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
    APInt Value = C->getAPIntValue();
    EVT VT = N->getValueType(0);
    if (Value.isNonNegative()) {
      // (mul x, 2^N + 1) => (add (shl x, N), x)
      APInt VM1 = Value - 1;
      if (VM1.isPowerOf2()) {
        SDValue ShiftedVal =
            DAG.getNode(ISD::SHL, SDLoc(N), VT, N->getOperand(0),
                        DAG.getConstant(VM1.logBase2(), MVT::i64));
        return DAG.getNode(ISD::ADD, SDLoc(N), VT, ShiftedVal,
                           N->getOperand(0));
      }
      // (mul x, 2^N - 1) => (sub (shl x, N), x)
      APInt VP1 = Value + 1;
      if (VP1.isPowerOf2()) {
        SDValue ShiftedVal =
            DAG.getNode(ISD::SHL, SDLoc(N), VT, N->getOperand(0),
                        DAG.getConstant(VP1.logBase2(), MVT::i64));
        return DAG.getNode(ISD::SUB, SDLoc(N), VT, ShiftedVal,
                           N->getOperand(0));
      }
    } else {
      // (mul x, -(2^N + 1)) => - (add (shl x, N), x)
      APInt VNM1 = -Value - 1;
      if (VNM1.isPowerOf2()) {
        SDValue ShiftedVal =
            DAG.getNode(ISD::SHL, SDLoc(N), VT, N->getOperand(0),
                        DAG.getConstant(VNM1.logBase2(), MVT::i64));
        SDValue Add =
            DAG.getNode(ISD::ADD, SDLoc(N), VT, ShiftedVal, N->getOperand(0));
        return DAG.getNode(ISD::SUB, SDLoc(N), VT, DAG.getConstant(0, VT), Add);
      }
      // (mul x, -(2^N - 1)) => (sub x, (shl x, N))
      APInt VNP1 = -Value + 1;
      if (VNP1.isPowerOf2()) {
        SDValue ShiftedVal =
            DAG.getNode(ISD::SHL, SDLoc(N), VT, N->getOperand(0),
                        DAG.getConstant(VNP1.logBase2(), MVT::i64));
        return DAG.getNode(ISD::SUB, SDLoc(N), VT, N->getOperand(0),
                           ShiftedVal);
      }
    }
  }
  return SDValue();
}

static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N,
                                                         SelectionDAG &DAG) {
  // Take advantage of vector comparisons producing 0 or -1 in each lane to
  // optimize away the operation when it is applied to a constant.
  //
  // The general transformation is:
  //   UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
  //   AND(VECTOR_CMP(x,y), constant2)
  //   constant2 = UNARYOP(constant)

  // Early exit if this isn't a vector operation, the operand of the
  // unary operation isn't a bitwise AND, or if the sizes of the operations
  // aren't the same.
  EVT VT = N->getValueType(0);
  if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND ||
      N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC ||
      VT.getSizeInBits() != N->getOperand(0)->getValueType(0).getSizeInBits())
    return SDValue();

  // Now check that the other operand of the AND is a constant. We could
  // make the transformation for non-constant splats as well, but it's unclear
  // that would be a benefit as it would not eliminate any operations, just
  // perform one more step in scalar code before moving to the vector unit.
  if (BuildVectorSDNode *BV =
          dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) {
    // Bail out if the vector isn't a constant.
    if (!BV->isConstant())
      return SDValue();

    // Everything checks out. Build up the new and improved node.
    SDLoc DL(N);
    EVT IntVT = BV->getValueType(0);
    // Create a new constant of the appropriate type for the transformed
    // DAG.
    SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
    // The AND node needs bitcasts to/from an integer vector type around it.
    SDValue MaskConst = DAG.getNode(ISD::BITCAST, DL, IntVT, SourceConst);
    SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT,
                                 N->getOperand(0)->getOperand(0), MaskConst);
    SDValue Res = DAG.getNode(ISD::BITCAST, DL, VT, NewAnd);
    return Res;
  }

  return SDValue();
}

static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG,
                                     const AArch64Subtarget *Subtarget) {
  // First try to optimize away the conversion when it's conditionally from
  // a constant. Vectors only.
  SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG);
  if (Res != SDValue())
    return Res;

  EVT VT = N->getValueType(0);
  if (VT != MVT::f32 && VT != MVT::f64)
    return SDValue();

  // Only optimize when the source and destination types have the same width.
  if (VT.getSizeInBits() != N->getOperand(0).getValueType().getSizeInBits())
    return SDValue();

  // If the result of an integer load is only used by an integer-to-float
  // conversion, use an FP load and an AdvSIMD scalar {S|U}CVTF instead. This
  // eliminates an "integer-to-vector-move" UOP and improves throughput.
  SDValue N0 = N->getOperand(0);
  if (Subtarget->hasNEON() && ISD::isNormalLoad(N0.getNode()) &&
      N0.hasOneUse() &&
      // Do not change the width of a volatile load.
      !cast<LoadSDNode>(N0)->isVolatile()) {
    LoadSDNode *LN0 = cast<LoadSDNode>(N0);
    SDValue Load = DAG.getLoad(VT, SDLoc(N), LN0->getChain(), LN0->getBasePtr(),
                               LN0->getPointerInfo(), LN0->isVolatile(),
                               LN0->isNonTemporal(), LN0->isInvariant(),
                               LN0->getAlignment());

    // Make sure successors of the original load stay after it by updating them
    // to use the new Chain.
    DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), Load.getValue(1));

    unsigned Opcode = (N->getOpcode() == ISD::SINT_TO_FP) ? AArch64ISD::SITOF
                                                          : AArch64ISD::UITOF;
    return DAG.getNode(Opcode, SDLoc(N), VT, Load);
  }

  return SDValue();
}

/// An EXTR instruction is made up of two shifts, ORed together. This helper
/// searches for and classifies those shifts.
static bool findEXTRHalf(SDValue N, SDValue &Src, uint32_t &ShiftAmount,
                         bool &FromHi) {
  if (N.getOpcode() == ISD::SHL)
    FromHi = false;
  else if (N.getOpcode() == ISD::SRL)
    FromHi = true;
  else
    return false;

  if (!isa<ConstantSDNode>(N.getOperand(1)))
    return false;

  ShiftAmount = N->getConstantOperandVal(1);
  Src = N->getOperand(0);
  return true;
}

/// An EXTR instruction extracts a contiguous chunk of bits from two existing
/// registers viewed as a high/low pair. This function looks for the pattern:
/// (or (shl VAL1, #N), (srl VAL2, #RegWidth-N)) and replaces it with an
/// EXTR. Can't quite be done in TableGen because the two immediates aren't
/// independent.
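/// For example, with i64 operands, (or (shl x, #16), (srl y, #48)) becomes
/// roughly EXTR x, y, #48.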
static SDValue tryCombineToEXTR(SDNode *N,
                                TargetLowering::DAGCombinerInfo &DCI) {
  SelectionDAG &DAG = DCI.DAG;
  SDLoc DL(N);
  EVT VT = N->getValueType(0);

  assert(N->getOpcode() == ISD::OR && "Unexpected root");

  if (VT != MVT::i32 && VT != MVT::i64)
    return SDValue();

  SDValue LHS;
  uint32_t ShiftLHS = 0;
  bool LHSFromHi = false;
  if (!findEXTRHalf(N->getOperand(0), LHS, ShiftLHS, LHSFromHi))
    return SDValue();

  SDValue RHS;
  uint32_t ShiftRHS = 0;
  bool RHSFromHi = false;
  if (!findEXTRHalf(N->getOperand(1), RHS, ShiftRHS, RHSFromHi))
    return SDValue();

  // If they're both trying to come from the high part of the register, they're
  // not really an EXTR.
  if (LHSFromHi == RHSFromHi)
    return SDValue();

  if (ShiftLHS + ShiftRHS != VT.getSizeInBits())
    return SDValue();

  if (LHSFromHi) {
    std::swap(LHS, RHS);
    std::swap(ShiftLHS, ShiftRHS);
  }

  return DAG.getNode(AArch64ISD::EXTR, DL, VT, LHS, RHS,
                     DAG.getConstant(ShiftRHS, MVT::i64));
}

static SDValue tryCombineToBSL(SDNode *N,
                               TargetLowering::DAGCombinerInfo &DCI) {
  EVT VT = N->getValueType(0);
  SelectionDAG &DAG = DCI.DAG;
  SDLoc DL(N);

  if (!VT.isVector())
    return SDValue();

  SDValue N0 = N->getOperand(0);
  if (N0.getOpcode() != ISD::AND)
    return SDValue();

  SDValue N1 = N->getOperand(1);
  if (N1.getOpcode() != ISD::AND)
    return SDValue();

  // We only have to look for constant vectors here since the general, variable
  // case can be handled in TableGen.
  unsigned Bits = VT.getVectorElementType().getSizeInBits();
  uint64_t BitMask = Bits == 64 ? -1ULL : ((1ULL << Bits) - 1);
  for (int i = 1; i >= 0; --i)
    for (int j = 1; j >= 0; --j) {
      BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(i));
      BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(j));
      if (!BVN0 || !BVN1)
        continue;

      bool FoundMatch = true;
      for (unsigned k = 0; k < VT.getVectorNumElements(); ++k) {
        ConstantSDNode *CN0 = dyn_cast<ConstantSDNode>(BVN0->getOperand(k));
        ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(BVN1->getOperand(k));
        if (!CN0 || !CN1 ||
            CN0->getZExtValue() != (BitMask & ~CN1->getZExtValue())) {
          FoundMatch = false;
          break;
        }
      }

      if (FoundMatch)
        return DAG.getNode(AArch64ISD::BSL, DL, VT, SDValue(BVN0, 0),
                           N0->getOperand(1 - i), N1->getOperand(1 - j));
    }

  return SDValue();
}

static SDValue performORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
                                const AArch64Subtarget *Subtarget) {
  // Attempt to form an EXTR from (or (shl VAL1, #N), (srl VAL2, #RegWidth-N))
  if (!EnableAArch64ExtrGeneration)
    return SDValue();
  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(0);

  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
    return SDValue();

  SDValue Res = tryCombineToEXTR(N, DCI);
  if (Res.getNode())
    return Res;

  Res = tryCombineToBSL(N, DCI);
  if (Res.getNode())
    return Res;

  return SDValue();
}

static SDValue performBitcastCombine(SDNode *N,
                                     TargetLowering::DAGCombinerInfo &DCI,
                                     SelectionDAG &DAG) {
  // Wait 'til after everything is legalized to try this. That way we have
  // legal vector types and such.
  if (DCI.isBeforeLegalizeOps())
    return SDValue();

  // Remove extraneous bitcasts around an extract_subvector.
  // For example,
  //   (v4i16 (bitconvert
  //            (extract_subvector (v2i64 (bitconvert (v8i16 ...)), (i64 1)))))
  // becomes
  //   (extract_subvector ((v8i16 ...), (i64 4)))

  // Only interested in 64-bit vectors as the ultimate result.
  EVT VT = N->getValueType(0);
  if (!VT.isVector())
    return SDValue();
  if (VT.getSimpleVT().getSizeInBits() != 64)
    return SDValue();
  // Is the operand an extract_subvector starting at the beginning or halfway
  // point of the vector? A low half may also come through as an
  // EXTRACT_SUBREG, so look for that, too.
  SDValue Op0 = N->getOperand(0);
  if (Op0->getOpcode() != ISD::EXTRACT_SUBVECTOR &&
      !(Op0->isMachineOpcode() &&
        Op0->getMachineOpcode() == AArch64::EXTRACT_SUBREG))
    return SDValue();
  uint64_t idx = cast<ConstantSDNode>(Op0->getOperand(1))->getZExtValue();
  if (Op0->getOpcode() == ISD::EXTRACT_SUBVECTOR) {
    if (Op0->getValueType(0).getVectorNumElements() != idx && idx != 0)
      return SDValue();
  } else if (Op0->getMachineOpcode() == AArch64::EXTRACT_SUBREG) {
    if (idx != AArch64::dsub)
      return SDValue();
    // The dsub reference is equivalent to a lane zero subvector reference.
    idx = 0;
  }
  // Look through the bitcast of the input to the extract.
  if (Op0->getOperand(0)->getOpcode() != ISD::BITCAST)
    return SDValue();
  SDValue Source = Op0->getOperand(0)->getOperand(0);
  // If the source type has twice the number of elements as our destination
  // type, we know this is an extract of the high or low half of the vector.
  EVT SVT = Source->getValueType(0);
  if (SVT.getVectorNumElements() != VT.getVectorNumElements() * 2)
    return SDValue();

  DEBUG(dbgs() << "aarch64-lower: bitcast extract_subvector simplification\n");

  // Create the simplified form to just extract the low or high half of the
  // vector directly rather than bothering with the bitcasts.
  SDLoc dl(N);
  unsigned NumElements = VT.getVectorNumElements();
  if (idx) {
    SDValue HalfIdx = DAG.getConstant(NumElements, MVT::i64);
    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Source, HalfIdx);
  } else {
    SDValue SubReg = DAG.getTargetConstant(AArch64::dsub, MVT::i32);
    return SDValue(DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, dl, VT,
                                      Source, SubReg),
                   0);
  }
}

static SDValue performConcatVectorsCombine(SDNode *N,
                                           TargetLowering::DAGCombinerInfo &DCI,
                                           SelectionDAG &DAG) {
  // Wait 'til after everything is legalized to try this. That way we have
  // legal vector types and such.
  if (DCI.isBeforeLegalizeOps())
    return SDValue();

  SDLoc dl(N);
  EVT VT = N->getValueType(0);

  // If we see a (concat_vectors (v1x64 A), (v1x64 A)) it's really a vector
  // splat. The indexed instructions are going to be expecting a DUPLANE64, so
  // canonicalise to that.
  if (N->getOperand(0) == N->getOperand(1) && VT.getVectorNumElements() == 2) {
    assert(VT.getVectorElementType().getSizeInBits() == 64);
    return DAG.getNode(AArch64ISD::DUPLANE64, dl, VT,
                       WidenVector(N->getOperand(0), DAG),
                       DAG.getConstant(0, MVT::i64));
  }

  // Canonicalise concat_vectors so that the right-hand vector has as few
  // bit-casts as possible before its real operation. The primary matching
  // destination for these operations will be the narrowing "2" instructions,
  // which depend on the operation being performed on this right-hand vector.
  // For example,
  //   (concat_vectors LHS, (v1i64 (bitconvert (v4i16 RHS))))
  // becomes
  //   (bitconvert (concat_vectors (v4i16 (bitconvert LHS)), RHS))

  SDValue Op1 = N->getOperand(1);
  if (Op1->getOpcode() != ISD::BITCAST)
    return SDValue();
  SDValue RHS = Op1->getOperand(0);
  MVT RHSTy = RHS.getValueType().getSimpleVT();
  // If the RHS is not a vector, this is not the pattern we're looking for.
  if (!RHSTy.isVector())
    return SDValue();

  DEBUG(dbgs() << "aarch64-lower: concat_vectors bitcast simplification\n");

  MVT ConcatTy = MVT::getVectorVT(RHSTy.getVectorElementType(),
                                  RHSTy.getVectorNumElements() * 2);
  return DAG.getNode(
      ISD::BITCAST, dl, VT,
      DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatTy,
                  DAG.getNode(ISD::BITCAST, dl, RHSTy, N->getOperand(0)), RHS));
}

static SDValue tryCombineFixedPointConvert(SDNode *N,
                                           TargetLowering::DAGCombinerInfo &DCI,
                                           SelectionDAG &DAG) {
  // Wait 'til after everything is legalized to try this. That way we have
  // legal vector types and such.
  if (DCI.isBeforeLegalizeOps())
    return SDValue();
  // Transform a scalar conversion of a value from a lane extract into a
  // lane extract of a vector conversion. E.g., from foo1 to foo2:
  //   double foo1(int64x2_t a) { return vcvtd_n_f64_s64(a[1], 9); }
  //   double foo2(int64x2_t a) { return vcvtq_n_f64_s64(a, 9)[1]; }
  //
  // The second form interacts better with instruction selection and the
  // register allocator to avoid cross-class register copies that aren't
  // coalescable due to a lane reference.

  // Check the operand and see if it originates from a lane extract.
  SDValue Op1 = N->getOperand(1);
  if (Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
    // Yep, no additional predication needed. Perform the transform.
    SDValue IID = N->getOperand(0);
    SDValue Shift = N->getOperand(2);
    SDValue Vec = Op1.getOperand(0);
    SDValue Lane = Op1.getOperand(1);
    EVT ResTy = N->getValueType(0);
    EVT VecResTy;
    SDLoc DL(N);

    // The vector width should be 128 bits by the time we get here, even
    // if it started as 64 bits (the extract_vector handling will have
    // done so).
    assert(Vec.getValueType().getSizeInBits() == 128 &&
           "unexpected vector size on extract_vector_elt!");
    if (Vec.getValueType() == MVT::v4i32)
      VecResTy = MVT::v4f32;
    else if (Vec.getValueType() == MVT::v2i64)
      VecResTy = MVT::v2f64;
    else
      llvm_unreachable("unexpected vector type!");

    SDValue Convert =
        DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VecResTy, IID, Vec, Shift);
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResTy, Convert, Lane);
  }
  return SDValue();
}

// AArch64 high-vector "long" operations are formed by performing the non-high
// version on an extract_subvector of each operand which gets the high half:
//
//   (longop2 LHS, RHS) == (longop (extract_high LHS), (extract_high RHS))
//
// However, there are cases which don't have an extract_high explicitly, but
// have another operation that can be made compatible with one for free. For
// example:
//
//   (dupv64 scalar) --> (extract_high (dup128 scalar))
//
// This routine does the actual conversion of such DUPs, once outer routines
// have determined that everything else is in order.
static SDValue tryExtendDUPToExtractHigh(SDValue N, SelectionDAG &DAG) {
  // We can handle most types of duplicate, but the lane ones have an extra
  // operand saying *which* lane, so we need to know.
  bool IsDUPLANE;
  switch (N.getOpcode()) {
  case AArch64ISD::DUP:
    IsDUPLANE = false;
    break;
  case AArch64ISD::DUPLANE8:
  case AArch64ISD::DUPLANE16:
  case AArch64ISD::DUPLANE32:
  case AArch64ISD::DUPLANE64:
    IsDUPLANE = true;
    break;
  default:
    return SDValue();
  }

  MVT NarrowTy = N.getSimpleValueType();
  if (!NarrowTy.is64BitVector())
    return SDValue();

  MVT ElementTy = NarrowTy.getVectorElementType();
  unsigned NumElems = NarrowTy.getVectorNumElements();
  MVT NewDUPVT = MVT::getVectorVT(ElementTy, NumElems * 2);

  SDValue NewDUP;
  if (IsDUPLANE)
    NewDUP = DAG.getNode(N.getOpcode(), SDLoc(N), NewDUPVT, N.getOperand(0),
                         N.getOperand(1));
  else
    NewDUP = DAG.getNode(AArch64ISD::DUP, SDLoc(N), NewDUPVT, N.getOperand(0));

  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N.getNode()), NarrowTy,
                     NewDUP, DAG.getConstant(NumElems, MVT::i64));
}

static bool isEssentiallyExtractSubvector(SDValue N) {
  if (N.getOpcode() == ISD::EXTRACT_SUBVECTOR)
    return true;

  return N.getOpcode() == ISD::BITCAST &&
         N.getOperand(0).getOpcode() == ISD::EXTRACT_SUBVECTOR;
}

/// \brief Helper structure to keep track of ISD::SET_CC operands.
struct GenericSetCCInfo {
  const SDValue *Opnd0;
  const SDValue *Opnd1;
  ISD::CondCode CC;
};

/// \brief Helper structure to keep track of a SET_CC lowered into AArch64 code.
struct AArch64SetCCInfo {
  const SDValue *Cmp;
  AArch64CC::CondCode CC;
};

/// \brief Helper structure to keep track of SetCC information.
union SetCCInfo {
  GenericSetCCInfo Generic;
  AArch64SetCCInfo AArch64;
};

/// \brief Helper structure to be able to read SetCC information. If the
/// IsAArch64 field is set to true, Info is an AArch64SetCCInfo; otherwise
/// Info is a GenericSetCCInfo.
struct SetCCInfoAndKind {
  SetCCInfo Info;
  bool IsAArch64;
};

/// \brief Check whether or not \p Op is a SET_CC operation, either a generic
/// or an AArch64 lowered one.
/// \p SetCCInfo is filled accordingly.
/// \post SetCCInfo is meaningful only when this function returns true.
/// \return True when Op is a kind of SET_CC operation.
static bool isSetCC(SDValue Op, SetCCInfoAndKind &SetCCInfo) {
  // If this is a setcc, this is straightforward.
  if (Op.getOpcode() == ISD::SETCC) {
    SetCCInfo.Info.Generic.Opnd0 = &Op.getOperand(0);
    SetCCInfo.Info.Generic.Opnd1 = &Op.getOperand(1);
    SetCCInfo.Info.Generic.CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
    SetCCInfo.IsAArch64 = false;
    return true;
  }
  // Otherwise, check if this is a matching csel instruction.
  // In other words:
  //   - csel 1, 0, cc
  //   - csel 0, 1, !cc
  if (Op.getOpcode() != AArch64ISD::CSEL)
    return false;
  // Set the information about the operands.
  // TODO: we want the operands of the Cmp not the csel.
  SetCCInfo.Info.AArch64.Cmp = &Op.getOperand(3);
  SetCCInfo.IsAArch64 = true;
  SetCCInfo.Info.AArch64.CC = static_cast<AArch64CC::CondCode>(
      cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());

  // Check that the operands match the constraints:
  // (1) Both operands must be constants.
  // (2) One must be 1 and the other must be 0.
  ConstantSDNode *TValue = dyn_cast<ConstantSDNode>(Op.getOperand(0));
  ConstantSDNode *FValue = dyn_cast<ConstantSDNode>(Op.getOperand(1));

  // Check (1).
  if (!TValue || !FValue)
    return false;

  // Check (2).
  if (!TValue->isOne()) {
    // Update the comparison when we are interested in !cc.
    std::swap(TValue, FValue);
    SetCCInfo.Info.AArch64.CC =
        AArch64CC::getInvertedCondCode(SetCCInfo.Info.AArch64.CC);
  }
  return TValue->isOne() && FValue->isNullValue();
}

// Returns true if Op is setcc or zext of setcc.
static bool isSetCCOrZExtSetCC(const SDValue &Op, SetCCInfoAndKind &Info) {
  if (isSetCC(Op, Info))
    return true;
  return ((Op.getOpcode() == ISD::ZERO_EXTEND) &&
          isSetCC(Op->getOperand(0), Info));
}

// The folding we want to perform is:
//   (add x, [zext] (setcc cc ...) )
//   -->
//   (csel x, (add x, 1), !cc ...)
//
// The latter will get matched to a CSINC instruction.
static SDValue performSetccAddFolding(SDNode *Op, SelectionDAG &DAG) {
  assert(Op && Op->getOpcode() == ISD::ADD && "Unexpected operation!");
  SDValue LHS = Op->getOperand(0);
  SDValue RHS = Op->getOperand(1);
  SetCCInfoAndKind InfoAndKind;

  // If neither operand is a SET_CC, give up.
  if (!isSetCCOrZExtSetCC(LHS, InfoAndKind)) {
    std::swap(LHS, RHS);
    if (!isSetCCOrZExtSetCC(LHS, InfoAndKind))
      return SDValue();
  }

  // FIXME: This could be generalized to work for FP comparisons.
  EVT CmpVT = InfoAndKind.IsAArch64
                  ? InfoAndKind.Info.AArch64.Cmp->getOperand(0).getValueType()
                  : InfoAndKind.Info.Generic.Opnd0->getValueType();
  if (CmpVT != MVT::i32 && CmpVT != MVT::i64)
    return SDValue();

  SDValue CCVal;
  SDValue Cmp;
  SDLoc dl(Op);
  if (InfoAndKind.IsAArch64) {
    CCVal = DAG.getConstant(
        AArch64CC::getInvertedCondCode(InfoAndKind.Info.AArch64.CC), MVT::i32);
    Cmp = *InfoAndKind.Info.AArch64.Cmp;
  } else
    Cmp = getAArch64Cmp(*InfoAndKind.Info.Generic.Opnd0,
                        *InfoAndKind.Info.Generic.Opnd1,
                        ISD::getSetCCInverse(InfoAndKind.Info.Generic.CC, true),
                        CCVal, DAG, dl);

  EVT VT = Op->getValueType(0);
  LHS = DAG.getNode(ISD::ADD, dl, VT, RHS, DAG.getConstant(1, VT));
  return DAG.getNode(AArch64ISD::CSEL, dl, VT, RHS, LHS, CCVal, Cmp);
}

// The basic add/sub long vector instructions have variants with "2" on the end
// which act on the high-half of their inputs. They are normally matched by
// patterns like:
//
//   (add (zeroext (extract_high LHS)),
//        (zeroext (extract_high RHS)))
//   -> uaddl2 vD, vN, vM
//
// However, if one of the extracts is something like a duplicate, this
// instruction can still be used profitably. This function puts the DAG into a
// more appropriate form for those patterns to trigger.
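// For instance, (add (zext (extract_high L)), (zext (dup scalar))) has no
// second extract_high, but rewriting the DUP as
// (extract_high (dup128 scalar)) lets the uaddl2 pattern above match.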
7470static SDValue performAddSubLongCombine(SDNode *N, 7471 TargetLowering::DAGCombinerInfo &DCI, 7472 SelectionDAG &DAG) { 7473 if (DCI.isBeforeLegalizeOps()) 7474 return SDValue(); 7475 7476 MVT VT = N->getSimpleValueType(0); 7477 if (!VT.is128BitVector()) { 7478 if (N->getOpcode() == ISD::ADD) 7479 return performSetccAddFolding(N, DAG); 7480 return SDValue(); 7481 } 7482 7483 // Make sure both branches are extended in the same way. 7484 SDValue LHS = N->getOperand(0); 7485 SDValue RHS = N->getOperand(1); 7486 if ((LHS.getOpcode() != ISD::ZERO_EXTEND && 7487 LHS.getOpcode() != ISD::SIGN_EXTEND) || 7488 LHS.getOpcode() != RHS.getOpcode()) 7489 return SDValue(); 7490 7491 unsigned ExtType = LHS.getOpcode(); 7492 7493 // It's not worth doing if at least one of the inputs isn't already an 7494 // extract, but we don't know which it'll be so we have to try both. 7495 if (isEssentiallyExtractSubvector(LHS.getOperand(0))) { 7496 RHS = tryExtendDUPToExtractHigh(RHS.getOperand(0), DAG); 7497 if (!RHS.getNode()) 7498 return SDValue(); 7499 7500 RHS = DAG.getNode(ExtType, SDLoc(N), VT, RHS); 7501 } else if (isEssentiallyExtractSubvector(RHS.getOperand(0))) { 7502 LHS = tryExtendDUPToExtractHigh(LHS.getOperand(0), DAG); 7503 if (!LHS.getNode()) 7504 return SDValue(); 7505 7506 LHS = DAG.getNode(ExtType, SDLoc(N), VT, LHS); 7507 } 7508 7509 return DAG.getNode(N->getOpcode(), SDLoc(N), VT, LHS, RHS); 7510} 7511 7512// Massage DAGs which we can use the high-half "long" operations on into 7513// something isel will recognize better. E.g. 7514// 7515// (aarch64_neon_umull (extract_high vec) (dupv64 scalar)) --> 7516// (aarch64_neon_umull (extract_high (v2i64 vec))) 7517// (extract_high (v2i64 (dup128 scalar))))) 7518// 7519static SDValue tryCombineLongOpWithDup(unsigned IID, SDNode *N, 7520 TargetLowering::DAGCombinerInfo &DCI, 7521 SelectionDAG &DAG) { 7522 if (DCI.isBeforeLegalizeOps()) 7523 return SDValue(); 7524 7525 SDValue LHS = N->getOperand(1); 7526 SDValue RHS = N->getOperand(2); 7527 assert(LHS.getValueType().is64BitVector() && 7528 RHS.getValueType().is64BitVector() && 7529 "unexpected shape for long operation"); 7530 7531 // Either node could be a DUP, but it's not worth doing both of them (you'd 7532 // just as well use the non-high version) so look for a corresponding extract 7533 // operation on the other "wing". 
7534 if (isEssentiallyExtractSubvector(LHS)) { 7535 RHS = tryExtendDUPToExtractHigh(RHS, DAG); 7536 if (!RHS.getNode()) 7537 return SDValue(); 7538 } else if (isEssentiallyExtractSubvector(RHS)) { 7539 LHS = tryExtendDUPToExtractHigh(LHS, DAG); 7540 if (!LHS.getNode()) 7541 return SDValue(); 7542 } 7543 7544 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), N->getValueType(0), 7545 N->getOperand(0), LHS, RHS); 7546} 7547 7548static SDValue tryCombineShiftImm(unsigned IID, SDNode *N, SelectionDAG &DAG) { 7549 MVT ElemTy = N->getSimpleValueType(0).getScalarType(); 7550 unsigned ElemBits = ElemTy.getSizeInBits(); 7551 7552 int64_t ShiftAmount; 7553 if (BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(2))) { 7554 APInt SplatValue, SplatUndef; 7555 unsigned SplatBitSize; 7556 bool HasAnyUndefs; 7557 if (!BVN->isConstantSplat(SplatValue, SplatUndef, SplatBitSize, 7558 HasAnyUndefs, ElemBits) || 7559 SplatBitSize != ElemBits) 7560 return SDValue(); 7561 7562 ShiftAmount = SplatValue.getSExtValue(); 7563 } else if (ConstantSDNode *CVN = dyn_cast<ConstantSDNode>(N->getOperand(2))) { 7564 ShiftAmount = CVN->getSExtValue(); 7565 } else 7566 return SDValue(); 7567 7568 unsigned Opcode; 7569 bool IsRightShift; 7570 switch (IID) { 7571 default: 7572 llvm_unreachable("Unknown shift intrinsic"); 7573 case Intrinsic::aarch64_neon_sqshl: 7574 Opcode = AArch64ISD::SQSHL_I; 7575 IsRightShift = false; 7576 break; 7577 case Intrinsic::aarch64_neon_uqshl: 7578 Opcode = AArch64ISD::UQSHL_I; 7579 IsRightShift = false; 7580 break; 7581 case Intrinsic::aarch64_neon_srshl: 7582 Opcode = AArch64ISD::SRSHR_I; 7583 IsRightShift = true; 7584 break; 7585 case Intrinsic::aarch64_neon_urshl: 7586 Opcode = AArch64ISD::URSHR_I; 7587 IsRightShift = true; 7588 break; 7589 case Intrinsic::aarch64_neon_sqshlu: 7590 Opcode = AArch64ISD::SQSHLU_I; 7591 IsRightShift = false; 7592 break; 7593 } 7594 7595 if (IsRightShift && ShiftAmount <= -1 && ShiftAmount >= -(int)ElemBits) 7596 return DAG.getNode(Opcode, SDLoc(N), N->getValueType(0), N->getOperand(1), 7597 DAG.getConstant(-ShiftAmount, MVT::i32)); 7598 else if (!IsRightShift && ShiftAmount >= 0 && ShiftAmount < ElemBits) 7599 return DAG.getNode(Opcode, SDLoc(N), N->getValueType(0), N->getOperand(1), 7600 DAG.getConstant(ShiftAmount, MVT::i32)); 7601 7602 return SDValue(); 7603} 7604 7605// The CRC32[BH] instructions ignore the high bits of their data operand. Since 7606// the intrinsics must be legal and take an i32, this means there's almost 7607// certainly going to be a zext in the DAG which we can eliminate. 
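// For instance (informally), a DAG such as
//
//   (aarch64_crc32b crc, (and data, 0xff))
//
// can have the AND dropped, because CRC32B only reads the low 8 bits of its
// data operand:
//
//   (aarch64_crc32b crc, data)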
7608static SDValue tryCombineCRC32(unsigned Mask, SDNode *N, SelectionDAG &DAG) { 7609 SDValue AndN = N->getOperand(2); 7610 if (AndN.getOpcode() != ISD::AND) 7611 return SDValue(); 7612 7613 ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(AndN.getOperand(1)); 7614 if (!CMask || CMask->getZExtValue() != Mask) 7615 return SDValue(); 7616 7617 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), MVT::i32, 7618 N->getOperand(0), N->getOperand(1), AndN.getOperand(0)); 7619} 7620 7621static SDValue performIntrinsicCombine(SDNode *N, 7622 TargetLowering::DAGCombinerInfo &DCI, 7623 const AArch64Subtarget *Subtarget) { 7624 SelectionDAG &DAG = DCI.DAG; 7625 unsigned IID = getIntrinsicID(N); 7626 switch (IID) { 7627 default: 7628 break; 7629 case Intrinsic::aarch64_neon_vcvtfxs2fp: 7630 case Intrinsic::aarch64_neon_vcvtfxu2fp: 7631 return tryCombineFixedPointConvert(N, DCI, DAG); 7632 break; 7633 case Intrinsic::aarch64_neon_fmax: 7634 return DAG.getNode(AArch64ISD::FMAX, SDLoc(N), N->getValueType(0), 7635 N->getOperand(1), N->getOperand(2)); 7636 case Intrinsic::aarch64_neon_fmin: 7637 return DAG.getNode(AArch64ISD::FMIN, SDLoc(N), N->getValueType(0), 7638 N->getOperand(1), N->getOperand(2)); 7639 case Intrinsic::aarch64_neon_smull: 7640 case Intrinsic::aarch64_neon_umull: 7641 case Intrinsic::aarch64_neon_pmull: 7642 case Intrinsic::aarch64_neon_sqdmull: 7643 return tryCombineLongOpWithDup(IID, N, DCI, DAG); 7644 case Intrinsic::aarch64_neon_sqshl: 7645 case Intrinsic::aarch64_neon_uqshl: 7646 case Intrinsic::aarch64_neon_sqshlu: 7647 case Intrinsic::aarch64_neon_srshl: 7648 case Intrinsic::aarch64_neon_urshl: 7649 return tryCombineShiftImm(IID, N, DAG); 7650 case Intrinsic::aarch64_crc32b: 7651 case Intrinsic::aarch64_crc32cb: 7652 return tryCombineCRC32(0xff, N, DAG); 7653 case Intrinsic::aarch64_crc32h: 7654 case Intrinsic::aarch64_crc32ch: 7655 return tryCombineCRC32(0xffff, N, DAG); 7656 } 7657 return SDValue(); 7658} 7659 7660static SDValue performExtendCombine(SDNode *N, 7661 TargetLowering::DAGCombinerInfo &DCI, 7662 SelectionDAG &DAG) { 7663 // If we see something like (zext (sabd (extract_high ...), (DUP ...))) then 7664 // we can convert that DUP into another extract_high (of a bigger DUP), which 7665 // helps the backend to decide that an sabdl2 would be useful, saving a real 7666 // extract_high operation. 7667 if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ZERO_EXTEND && 7668 N->getOperand(0).getOpcode() == ISD::INTRINSIC_WO_CHAIN) { 7669 SDNode *ABDNode = N->getOperand(0).getNode(); 7670 unsigned IID = getIntrinsicID(ABDNode); 7671 if (IID == Intrinsic::aarch64_neon_sabd || 7672 IID == Intrinsic::aarch64_neon_uabd) { 7673 SDValue NewABD = tryCombineLongOpWithDup(IID, ABDNode, DCI, DAG); 7674 if (!NewABD.getNode()) 7675 return SDValue(); 7676 7677 return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), 7678 NewABD); 7679 } 7680 } 7681 7682 // This is effectively a custom type legalization for AArch64. 7683 // 7684 // Type legalization will split an extend of a small, legal, type to a larger 7685 // illegal type by first splitting the destination type, often creating 7686 // illegal source types, which then get legalized in isel-confusing ways, 7687 // leading to really terrible codegen. E.g., 7688 // %result = v8i32 sext v8i8 %value 7689 // becomes 7690 // %losrc = extract_subreg %value, ... 7691 // %hisrc = extract_subreg %value, ... 7692 // %lo = v4i32 sext v4i8 %losrc 7693 // %hi = v4i32 sext v4i8 %hisrc 7694 // Things go rapidly downhill from there. 
7695 //
7696 // For AArch64, the [sz]ext vector instructions can only go up one element
7697 // size, so we can, e.g., extend from i8 to i16, but to go from i8 to i32
7698 // takes two instructions.
7699 //
7700 // This implies that the most efficient way to do the extend from v8i8
7701 // to two v4i32 values is to first extend the v8i8 to v8i16, then let
7702 // the normal splitting happen for the v8i16->v8i32.
7703
7704   // This is pre-legalization to catch some cases where the default
7705   // type legalization will create ill-tempered code.
7706   if (!DCI.isBeforeLegalizeOps())
7707     return SDValue();
7708
7709   // We're only interested in cleaning things up for non-legal vector types
7710   // here. If both the source and destination are legal, things will just
7711   // work naturally without any fiddling.
7712   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
7713   EVT ResVT = N->getValueType(0);
7714   if (!ResVT.isVector() || TLI.isTypeLegal(ResVT))
7715     return SDValue();
7716   // If the vector type isn't a simple VT, it's beyond the scope of what
7717   // we're worried about here. Let legalization do its thing and hope for
7718   // the best.
7719   SDValue Src = N->getOperand(0);
7720   EVT SrcVT = Src->getValueType(0);
7721   if (!ResVT.isSimple() || !SrcVT.isSimple())
7722     return SDValue();
7723
7724   // If the source VT is a 64-bit vector, we can play games and get the
7725   // better results we want.
7726   if (SrcVT.getSizeInBits() != 64)
7727     return SDValue();
7728
7729   unsigned SrcEltSize = SrcVT.getVectorElementType().getSizeInBits();
7730   unsigned ElementCount = SrcVT.getVectorNumElements();
7731   SrcVT = MVT::getVectorVT(MVT::getIntegerVT(SrcEltSize * 2), ElementCount);
7732   SDLoc DL(N);
7733   Src = DAG.getNode(N->getOpcode(), DL, SrcVT, Src);
7734
7735   // Now split the rest of the operation into two halves, each with a 64
7736   // bit source.
7737   EVT LoVT, HiVT;
7738   SDValue Lo, Hi;
7739   unsigned NumElements = ResVT.getVectorNumElements();
7740   assert(!(NumElements & 1) && "Splitting vector, but not in half!");
7741   LoVT = HiVT = EVT::getVectorVT(*DAG.getContext(),
7742                                  ResVT.getVectorElementType(), NumElements / 2);
7743
7744   EVT InNVT = EVT::getVectorVT(*DAG.getContext(), SrcVT.getVectorElementType(),
7745                                LoVT.getVectorNumElements());
7746   Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InNVT, Src,
7747                    DAG.getConstant(0, MVT::i64));
7748   Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InNVT, Src,
7749                    DAG.getConstant(InNVT.getVectorNumElements(), MVT::i64));
7750   Lo = DAG.getNode(N->getOpcode(), DL, LoVT, Lo);
7751   Hi = DAG.getNode(N->getOpcode(), DL, HiVT, Hi);
7752
7753   // Now combine the parts back together so we still have a single result
7754   // like the combiner expects.
7755   return DAG.getNode(ISD::CONCAT_VECTORS, DL, ResVT, Lo, Hi);
7756 }
7757
7758 /// Replace a splat of a scalar to a vector store by scalar stores of the scalar
7759 /// value. The load store optimizer pass will merge them to store pair stores.
7760 /// This has better performance than a splat of the scalar followed by a split
7761 /// vector store. Even if the stores are not merged it is four stores vs a dup,
7762 /// followed by an ext.b and two stores.
7763 static SDValue replaceSplatVectorStore(SelectionDAG &DAG, StoreSDNode *St) {
7764   SDValue StVal = St->getValue();
7765   EVT VT = StVal.getValueType();
7766
7767   // Don't replace floating point stores, they possibly won't be transformed to
7768   // stp because of the store pair suppress pass.
7769   if (VT.isFloatingPoint())
7770     return SDValue();
7771
7772   // Check for insert vector elements.
7773   if (StVal.getOpcode() != ISD::INSERT_VECTOR_ELT)
7774     return SDValue();
7775
7776   // We can express a splat as store pair(s) for 2 or 4 elements.
7777   unsigned NumVecElts = VT.getVectorNumElements();
7778   if (NumVecElts != 4 && NumVecElts != 2)
7779     return SDValue();
7780   SDValue SplatVal = StVal.getOperand(1);
7781   unsigned RemainInsertElts = NumVecElts - 1;
7782
7783   // Check that this is a splat.
7784   while (--RemainInsertElts) {
7785     SDValue NextInsertElt = StVal.getOperand(0);
7786     if (NextInsertElt.getOpcode() != ISD::INSERT_VECTOR_ELT)
7787       return SDValue();
7788     if (NextInsertElt.getOperand(1) != SplatVal)
7789       return SDValue();
7790     StVal = NextInsertElt;
7791   }
7792   unsigned OrigAlignment = St->getAlignment();
7793   unsigned EltOffset = NumVecElts == 4 ? 4 : 8;
7794   unsigned Alignment = std::min(OrigAlignment, EltOffset);
7795
7796   // Create scalar stores. This is at least as good as the code sequence for a
7797   // split unaligned store which is a dup.s, ext.b, and two stores.
7798   // Most of the time the three stores should be replaced by store pair
7799   // instructions (stp).
7800   SDLoc DL(St);
7801   SDValue BasePtr = St->getBasePtr();
7802   SDValue NewST1 =
7803       DAG.getStore(St->getChain(), DL, SplatVal, BasePtr, St->getPointerInfo(),
7804                    St->isVolatile(), St->isNonTemporal(), St->getAlignment());
7805
7806   unsigned Offset = EltOffset;
7807   while (--NumVecElts) {
7808     SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
7809                                     DAG.getConstant(Offset, MVT::i64));
7810     NewST1 = DAG.getStore(NewST1.getValue(0), DL, SplatVal, OffsetPtr,
7811                           St->getPointerInfo(), St->isVolatile(),
7812                           St->isNonTemporal(), Alignment);
7813     Offset += EltOffset;
7814   }
7815   return NewST1;
7816 }
7817
7818 static SDValue performSTORECombine(SDNode *N,
7819                                    TargetLowering::DAGCombinerInfo &DCI,
7820                                    SelectionDAG &DAG,
7821                                    const AArch64Subtarget *Subtarget) {
7822   if (!DCI.isBeforeLegalize())
7823     return SDValue();
7824
7825   StoreSDNode *S = cast<StoreSDNode>(N);
7826   if (S->isVolatile())
7827     return SDValue();
7828
7829   // Cyclone has bad performance on unaligned 16B stores when crossing line and
7830   // page boundaries. We want to split such stores.
7831   if (!Subtarget->isCyclone())
7832     return SDValue();
7833
7834   // Don't split at Oz.
7835   MachineFunction &MF = DAG.getMachineFunction();
7836   bool IsMinSize = MF.getFunction()->getAttributes().hasAttribute(
7837       AttributeSet::FunctionIndex, Attribute::MinSize);
7838   if (IsMinSize)
7839     return SDValue();
7840
7841   SDValue StVal = S->getValue();
7842   EVT VT = StVal.getValueType();
7843
7844   // Don't split v2i64 vectors. Memcpy lowering produces those and splitting
7845   // those up regresses performance on micro-benchmarks and olden/bh.
7846   if (!VT.isVector() || VT.getVectorNumElements() < 2 || VT == MVT::v2i64)
7847     return SDValue();
7848
7849   // Split unaligned 16B stores. They are terrible for performance.
7850   // Don't split stores with alignment of 1 or 2. Code that uses clang vector
7851   // extensions can use this to mark that it does not want splitting to happen
7852   // (by underspecifying alignment to be 1 or 2). Furthermore, the chance of
7853   // eliminating alignment hazards is only 1 in 8 for alignment of 2.
7854   if (VT.getSizeInBits() != 128 || S->getAlignment() >= 16 ||
7855       S->getAlignment() <= 2)
7856     return SDValue();
7857
7858   // If we get a splat of a scalar convert this vector store to a store of
7859   // scalars.
They will be merged into store pairs thereby removing two 7860 // instructions. 7861 SDValue ReplacedSplat = replaceSplatVectorStore(DAG, S); 7862 if (ReplacedSplat != SDValue()) 7863 return ReplacedSplat; 7864 7865 SDLoc DL(S); 7866 unsigned NumElts = VT.getVectorNumElements() / 2; 7867 // Split VT into two. 7868 EVT HalfVT = 7869 EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), NumElts); 7870 SDValue SubVector0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal, 7871 DAG.getConstant(0, MVT::i64)); 7872 SDValue SubVector1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal, 7873 DAG.getConstant(NumElts, MVT::i64)); 7874 SDValue BasePtr = S->getBasePtr(); 7875 SDValue NewST1 = 7876 DAG.getStore(S->getChain(), DL, SubVector0, BasePtr, S->getPointerInfo(), 7877 S->isVolatile(), S->isNonTemporal(), S->getAlignment()); 7878 SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, 7879 DAG.getConstant(8, MVT::i64)); 7880 return DAG.getStore(NewST1.getValue(0), DL, SubVector1, OffsetPtr, 7881 S->getPointerInfo(), S->isVolatile(), S->isNonTemporal(), 7882 S->getAlignment()); 7883} 7884 7885/// Target-specific DAG combine function for post-increment LD1 (lane) and 7886/// post-increment LD1R. 7887static SDValue performPostLD1Combine(SDNode *N, 7888 TargetLowering::DAGCombinerInfo &DCI, 7889 bool IsLaneOp) { 7890 if (DCI.isBeforeLegalizeOps()) 7891 return SDValue(); 7892 7893 SelectionDAG &DAG = DCI.DAG; 7894 EVT VT = N->getValueType(0); 7895 7896 unsigned LoadIdx = IsLaneOp ? 1 : 0; 7897 SDNode *LD = N->getOperand(LoadIdx).getNode(); 7898 // If it is not LOAD, can not do such combine. 7899 if (LD->getOpcode() != ISD::LOAD) 7900 return SDValue(); 7901 7902 LoadSDNode *LoadSDN = cast<LoadSDNode>(LD); 7903 EVT MemVT = LoadSDN->getMemoryVT(); 7904 // Check if memory operand is the same type as the vector element. 7905 if (MemVT != VT.getVectorElementType()) 7906 return SDValue(); 7907 7908 // Check if there are other uses. If so, do not combine as it will introduce 7909 // an extra load. 7910 for (SDNode::use_iterator UI = LD->use_begin(), UE = LD->use_end(); UI != UE; 7911 ++UI) { 7912 if (UI.getUse().getResNo() == 1) // Ignore uses of the chain result. 7913 continue; 7914 if (*UI != N) 7915 return SDValue(); 7916 } 7917 7918 SDValue Addr = LD->getOperand(1); 7919 SDValue Vector = N->getOperand(0); 7920 // Search for a use of the address operand that is an increment. 7921 for (SDNode::use_iterator UI = Addr.getNode()->use_begin(), UE = 7922 Addr.getNode()->use_end(); UI != UE; ++UI) { 7923 SDNode *User = *UI; 7924 if (User->getOpcode() != ISD::ADD 7925 || UI.getUse().getResNo() != Addr.getResNo()) 7926 continue; 7927 7928 // Check that the add is independent of the load. Otherwise, folding it 7929 // would create a cycle. 7930 if (User->isPredecessorOf(LD) || LD->isPredecessorOf(User)) 7931 continue; 7932 // Also check that add is not used in the vector operand. This would also 7933 // create a cycle. 7934 if (User->isPredecessorOf(Vector.getNode())) 7935 continue; 7936 7937 // If the increment is a constant, it must match the memory ref size. 7938 SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 
1 : 0); 7939 if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) { 7940 uint32_t IncVal = CInc->getZExtValue(); 7941 unsigned NumBytes = VT.getScalarSizeInBits() / 8; 7942 if (IncVal != NumBytes) 7943 continue; 7944 Inc = DAG.getRegister(AArch64::XZR, MVT::i64); 7945 } 7946 7947 SmallVector<SDValue, 8> Ops; 7948 Ops.push_back(LD->getOperand(0)); // Chain 7949 if (IsLaneOp) { 7950 Ops.push_back(Vector); // The vector to be inserted 7951 Ops.push_back(N->getOperand(2)); // The lane to be inserted in the vector 7952 } 7953 Ops.push_back(Addr); 7954 Ops.push_back(Inc); 7955 7956 EVT Tys[3] = { VT, MVT::i64, MVT::Other }; 7957 SDVTList SDTys = DAG.getVTList(Tys); 7958 unsigned NewOp = IsLaneOp ? AArch64ISD::LD1LANEpost : AArch64ISD::LD1DUPpost; 7959 SDValue UpdN = DAG.getMemIntrinsicNode(NewOp, SDLoc(N), SDTys, Ops, 7960 MemVT, 7961 LoadSDN->getMemOperand()); 7962 7963 // Update the uses. 7964 std::vector<SDValue> NewResults; 7965 NewResults.push_back(SDValue(LD, 0)); // The result of load 7966 NewResults.push_back(SDValue(UpdN.getNode(), 2)); // Chain 7967 DCI.CombineTo(LD, NewResults); 7968 DCI.CombineTo(N, SDValue(UpdN.getNode(), 0)); // Dup/Inserted Result 7969 DCI.CombineTo(User, SDValue(UpdN.getNode(), 1)); // Write back register 7970 7971 break; 7972 } 7973 return SDValue(); 7974} 7975 7976/// Target-specific DAG combine function for NEON load/store intrinsics 7977/// to merge base address updates. 7978static SDValue performNEONPostLDSTCombine(SDNode *N, 7979 TargetLowering::DAGCombinerInfo &DCI, 7980 SelectionDAG &DAG) { 7981 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) 7982 return SDValue(); 7983 7984 unsigned AddrOpIdx = N->getNumOperands() - 1; 7985 SDValue Addr = N->getOperand(AddrOpIdx); 7986 7987 // Search for a use of the address operand that is an increment. 7988 for (SDNode::use_iterator UI = Addr.getNode()->use_begin(), 7989 UE = Addr.getNode()->use_end(); UI != UE; ++UI) { 7990 SDNode *User = *UI; 7991 if (User->getOpcode() != ISD::ADD || 7992 UI.getUse().getResNo() != Addr.getResNo()) 7993 continue; 7994 7995 // Check that the add is independent of the load/store. Otherwise, folding 7996 // it would create a cycle. 7997 if (User->isPredecessorOf(N) || N->isPredecessorOf(User)) 7998 continue; 7999 8000 // Find the new opcode for the updating load/store. 
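    // For example, a plain aarch64_neon_ld2 whose address is post-incremented
    // by the total size of the two vectors it loads is mapped to
    // AArch64ISD::LD2post; the store, lane and dup variants below are handled
    // the same way. (Informal summary of the switch that follows.)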
8001 bool IsStore = false; 8002 bool IsLaneOp = false; 8003 bool IsDupOp = false; 8004 unsigned NewOpc = 0; 8005 unsigned NumVecs = 0; 8006 unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue(); 8007 switch (IntNo) { 8008 default: llvm_unreachable("unexpected intrinsic for Neon base update"); 8009 case Intrinsic::aarch64_neon_ld2: NewOpc = AArch64ISD::LD2post; 8010 NumVecs = 2; break; 8011 case Intrinsic::aarch64_neon_ld3: NewOpc = AArch64ISD::LD3post; 8012 NumVecs = 3; break; 8013 case Intrinsic::aarch64_neon_ld4: NewOpc = AArch64ISD::LD4post; 8014 NumVecs = 4; break; 8015 case Intrinsic::aarch64_neon_st2: NewOpc = AArch64ISD::ST2post; 8016 NumVecs = 2; IsStore = true; break; 8017 case Intrinsic::aarch64_neon_st3: NewOpc = AArch64ISD::ST3post; 8018 NumVecs = 3; IsStore = true; break; 8019 case Intrinsic::aarch64_neon_st4: NewOpc = AArch64ISD::ST4post; 8020 NumVecs = 4; IsStore = true; break; 8021 case Intrinsic::aarch64_neon_ld1x2: NewOpc = AArch64ISD::LD1x2post; 8022 NumVecs = 2; break; 8023 case Intrinsic::aarch64_neon_ld1x3: NewOpc = AArch64ISD::LD1x3post; 8024 NumVecs = 3; break; 8025 case Intrinsic::aarch64_neon_ld1x4: NewOpc = AArch64ISD::LD1x4post; 8026 NumVecs = 4; break; 8027 case Intrinsic::aarch64_neon_st1x2: NewOpc = AArch64ISD::ST1x2post; 8028 NumVecs = 2; IsStore = true; break; 8029 case Intrinsic::aarch64_neon_st1x3: NewOpc = AArch64ISD::ST1x3post; 8030 NumVecs = 3; IsStore = true; break; 8031 case Intrinsic::aarch64_neon_st1x4: NewOpc = AArch64ISD::ST1x4post; 8032 NumVecs = 4; IsStore = true; break; 8033 case Intrinsic::aarch64_neon_ld2r: NewOpc = AArch64ISD::LD2DUPpost; 8034 NumVecs = 2; IsDupOp = true; break; 8035 case Intrinsic::aarch64_neon_ld3r: NewOpc = AArch64ISD::LD3DUPpost; 8036 NumVecs = 3; IsDupOp = true; break; 8037 case Intrinsic::aarch64_neon_ld4r: NewOpc = AArch64ISD::LD4DUPpost; 8038 NumVecs = 4; IsDupOp = true; break; 8039 case Intrinsic::aarch64_neon_ld2lane: NewOpc = AArch64ISD::LD2LANEpost; 8040 NumVecs = 2; IsLaneOp = true; break; 8041 case Intrinsic::aarch64_neon_ld3lane: NewOpc = AArch64ISD::LD3LANEpost; 8042 NumVecs = 3; IsLaneOp = true; break; 8043 case Intrinsic::aarch64_neon_ld4lane: NewOpc = AArch64ISD::LD4LANEpost; 8044 NumVecs = 4; IsLaneOp = true; break; 8045 case Intrinsic::aarch64_neon_st2lane: NewOpc = AArch64ISD::ST2LANEpost; 8046 NumVecs = 2; IsStore = true; IsLaneOp = true; break; 8047 case Intrinsic::aarch64_neon_st3lane: NewOpc = AArch64ISD::ST3LANEpost; 8048 NumVecs = 3; IsStore = true; IsLaneOp = true; break; 8049 case Intrinsic::aarch64_neon_st4lane: NewOpc = AArch64ISD::ST4LANEpost; 8050 NumVecs = 4; IsStore = true; IsLaneOp = true; break; 8051 } 8052 8053 EVT VecTy; 8054 if (IsStore) 8055 VecTy = N->getOperand(2).getValueType(); 8056 else 8057 VecTy = N->getValueType(0); 8058 8059 // If the increment is a constant, it must match the memory ref size. 8060 SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0); 8061 if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) { 8062 uint32_t IncVal = CInc->getZExtValue(); 8063 unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8; 8064 if (IsLaneOp || IsDupOp) 8065 NumBytes /= VecTy.getVectorNumElements(); 8066 if (IncVal != NumBytes) 8067 continue; 8068 Inc = DAG.getRegister(AArch64::XZR, MVT::i64); 8069 } 8070 SmallVector<SDValue, 8> Ops; 8071 Ops.push_back(N->getOperand(0)); // Incoming chain 8072 // Load lane and store have vector list as input. 
8073 if (IsLaneOp || IsStore) 8074 for (unsigned i = 2; i < AddrOpIdx; ++i) 8075 Ops.push_back(N->getOperand(i)); 8076 Ops.push_back(Addr); // Base register 8077 Ops.push_back(Inc); 8078 8079 // Return Types. 8080 EVT Tys[6]; 8081 unsigned NumResultVecs = (IsStore ? 0 : NumVecs); 8082 unsigned n; 8083 for (n = 0; n < NumResultVecs; ++n) 8084 Tys[n] = VecTy; 8085 Tys[n++] = MVT::i64; // Type of write back register 8086 Tys[n] = MVT::Other; // Type of the chain 8087 SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumResultVecs + 2)); 8088 8089 MemIntrinsicSDNode *MemInt = cast<MemIntrinsicSDNode>(N); 8090 SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, SDLoc(N), SDTys, Ops, 8091 MemInt->getMemoryVT(), 8092 MemInt->getMemOperand()); 8093 8094 // Update the uses. 8095 std::vector<SDValue> NewResults; 8096 for (unsigned i = 0; i < NumResultVecs; ++i) { 8097 NewResults.push_back(SDValue(UpdN.getNode(), i)); 8098 } 8099 NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1)); 8100 DCI.CombineTo(N, NewResults); 8101 DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs)); 8102 8103 break; 8104 } 8105 return SDValue(); 8106} 8107 8108// Checks to see if the value is the prescribed width and returns information 8109// about its extension mode. 8110static 8111bool checkValueWidth(SDValue V, unsigned width, ISD::LoadExtType &ExtType) { 8112 ExtType = ISD::NON_EXTLOAD; 8113 switch(V.getNode()->getOpcode()) { 8114 default: 8115 return false; 8116 case ISD::LOAD: { 8117 LoadSDNode *LoadNode = cast<LoadSDNode>(V.getNode()); 8118 if ((LoadNode->getMemoryVT() == MVT::i8 && width == 8) 8119 || (LoadNode->getMemoryVT() == MVT::i16 && width == 16)) { 8120 ExtType = LoadNode->getExtensionType(); 8121 return true; 8122 } 8123 return false; 8124 } 8125 case ISD::AssertSext: { 8126 VTSDNode *TypeNode = cast<VTSDNode>(V.getNode()->getOperand(1)); 8127 if ((TypeNode->getVT() == MVT::i8 && width == 8) 8128 || (TypeNode->getVT() == MVT::i16 && width == 16)) { 8129 ExtType = ISD::SEXTLOAD; 8130 return true; 8131 } 8132 return false; 8133 } 8134 case ISD::AssertZext: { 8135 VTSDNode *TypeNode = cast<VTSDNode>(V.getNode()->getOperand(1)); 8136 if ((TypeNode->getVT() == MVT::i8 && width == 8) 8137 || (TypeNode->getVT() == MVT::i16 && width == 16)) { 8138 ExtType = ISD::ZEXTLOAD; 8139 return true; 8140 } 8141 return false; 8142 } 8143 case ISD::Constant: 8144 case ISD::TargetConstant: { 8145 if (std::abs(cast<ConstantSDNode>(V.getNode())->getSExtValue()) < 8146 1LL << (width - 1)) 8147 return true; 8148 return false; 8149 } 8150 } 8151 8152 return true; 8153} 8154 8155// This function does a whole lot of voodoo to determine if the tests are 8156// equivalent without and with a mask. Essentially what happens is that given a 8157// DAG resembling: 8158// 8159// +-------------+ +-------------+ +-------------+ +-------------+ 8160// | Input | | AddConstant | | CompConstant| | CC | 8161// +-------------+ +-------------+ +-------------+ +-------------+ 8162// | | | | 8163// V V | +----------+ 8164// +-------------+ +----+ | | 8165// | ADD | |0xff| | | 8166// +-------------+ +----+ | | 8167// | | | | 8168// V V | | 8169// +-------------+ | | 8170// | AND | | | 8171// +-------------+ | | 8172// | | | 8173// +-----+ | | 8174// | | | 8175// V V V 8176// +-------------+ 8177// | CMP | 8178// +-------------+ 8179// 8180// The AND node may be safely removed for some combinations of inputs. 
In
8181 // particular we need to take into account the extension type of the Input,
8182 // the exact values of AddConstant, CompConstant, and CC, along with the nominal
8183 // width of the input (this can work for any width of input; the above graph is
8184 // specific to 8 bits).
8185 //
8186 // The specific equations were worked out by generating output tables for each
8187 // AArch64CC value in terms of AddConstant (w1) and CompConstant (w2). The
8188 // problem was simplified by working with 4 bit inputs, which means we only
8189 // needed to reason about 24 distinct bit patterns: 8 patterns unique to zero
8190 // extension (8,15), 8 patterns unique to sign extensions (-8,-1), and 8
8191 // patterns present in both extensions (0,7). For every distinct set of
8192 // AddConstant and CompConstant bit patterns we can consider the masked and
8193 // unmasked versions to be equivalent if the result of this function is true for
8194 // all 16 distinct bit patterns of the current extension type of Input (w0).
8195 //
8196 // sub w8, w0, w1
8197 // and w10, w8, #0x0f
8198 // cmp w8, w2
8199 // cset w9, AArch64CC
8200 // cmp w10, w2
8201 // cset w11, AArch64CC
8202 // cmp w9, w11
8203 // cset w0, eq
8204 // ret
8205 //
8206 // Since the above function shows when the outputs are equivalent it defines
8207 // when it is safe to remove the AND. Unfortunately it only runs on AArch64 and
8208 // would be expensive to run during compiles. The equations below were written
8209 // in a test harness that confirmed they gave equivalent outputs to the above
8210 // function for all inputs, so they can be used to determine if the removal is
8211 // legal instead.
8212 //
8213 // isEquivalentMaskless() is the code for testing if the AND can be removed,
8214 // factored out of the DAG recognition, as the DAG can take several forms.
8215
8216 static
8217 bool isEquivalentMaskless(unsigned CC, unsigned width,
8218                           ISD::LoadExtType ExtType, signed AddConstant,
8219                           signed CompConstant) {
8220   // By being careful about our equations and only writing them in terms of
8221   // symbolic values and well known constants (0, 1, -1, MaxUInt) we can
8222   // make them generally applicable to all bit widths.
8223   signed MaxUInt = (1 << width);
8224
8225   // For the purposes of these comparisons sign extending the type is
8226   // equivalent to zero extending the add and displacing it by half the integer
8227   // width. Provided we are careful and make sure our equations are valid over
8228   // the whole range we can just adjust the input and avoid writing equations
8229   // for sign extended inputs.
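  // E.g. for width == 8, a sign-extended input covers [-128, 127]; shifting
  // AddConstant down by 128 (half of the 256-value range, as done just below)
  // lets the zero-extension equations cover that case too. (Illustrative
  // numbers only.)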
8230 if (ExtType == ISD::SEXTLOAD) 8231 AddConstant -= (1 << (width-1)); 8232 8233 switch(CC) { 8234 case AArch64CC::LE: 8235 case AArch64CC::GT: { 8236 if ((AddConstant == 0) || 8237 (CompConstant == MaxUInt - 1 && AddConstant < 0) || 8238 (AddConstant >= 0 && CompConstant < 0) || 8239 (AddConstant <= 0 && CompConstant <= 0 && CompConstant < AddConstant)) 8240 return true; 8241 } break; 8242 case AArch64CC::LT: 8243 case AArch64CC::GE: { 8244 if ((AddConstant == 0) || 8245 (AddConstant >= 0 && CompConstant <= 0) || 8246 (AddConstant <= 0 && CompConstant <= 0 && CompConstant <= AddConstant)) 8247 return true; 8248 } break; 8249 case AArch64CC::HI: 8250 case AArch64CC::LS: { 8251 if ((AddConstant >= 0 && CompConstant < 0) || 8252 (AddConstant <= 0 && CompConstant >= -1 && 8253 CompConstant < AddConstant + MaxUInt)) 8254 return true; 8255 } break; 8256 case AArch64CC::PL: 8257 case AArch64CC::MI: { 8258 if ((AddConstant == 0) || 8259 (AddConstant > 0 && CompConstant <= 0) || 8260 (AddConstant < 0 && CompConstant <= AddConstant)) 8261 return true; 8262 } break; 8263 case AArch64CC::LO: 8264 case AArch64CC::HS: { 8265 if ((AddConstant >= 0 && CompConstant <= 0) || 8266 (AddConstant <= 0 && CompConstant >= 0 && 8267 CompConstant <= AddConstant + MaxUInt)) 8268 return true; 8269 } break; 8270 case AArch64CC::EQ: 8271 case AArch64CC::NE: { 8272 if ((AddConstant > 0 && CompConstant < 0) || 8273 (AddConstant < 0 && CompConstant >= 0 && 8274 CompConstant < AddConstant + MaxUInt) || 8275 (AddConstant >= 0 && CompConstant >= 0 && 8276 CompConstant >= AddConstant) || 8277 (AddConstant <= 0 && CompConstant < 0 && CompConstant < AddConstant)) 8278 8279 return true; 8280 } break; 8281 case AArch64CC::VS: 8282 case AArch64CC::VC: 8283 case AArch64CC::AL: 8284 case AArch64CC::NV: 8285 return true; 8286 case AArch64CC::Invalid: 8287 break; 8288 } 8289 8290 return false; 8291} 8292 8293static 8294SDValue performCONDCombine(SDNode *N, 8295 TargetLowering::DAGCombinerInfo &DCI, 8296 SelectionDAG &DAG, unsigned CCIndex, 8297 unsigned CmpIndex) { 8298 unsigned CC = cast<ConstantSDNode>(N->getOperand(CCIndex))->getSExtValue(); 8299 SDNode *SubsNode = N->getOperand(CmpIndex).getNode(); 8300 unsigned CondOpcode = SubsNode->getOpcode(); 8301 8302 if (CondOpcode != AArch64ISD::SUBS) 8303 return SDValue(); 8304 8305 // There is a SUBS feeding this condition. Is it fed by a mask we can 8306 // use? 8307 8308 SDNode *AndNode = SubsNode->getOperand(0).getNode(); 8309 unsigned MaskBits = 0; 8310 8311 if (AndNode->getOpcode() != ISD::AND) 8312 return SDValue(); 8313 8314 if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(AndNode->getOperand(1))) { 8315 uint32_t CNV = CN->getZExtValue(); 8316 if (CNV == 255) 8317 MaskBits = 8; 8318 else if (CNV == 65535) 8319 MaskBits = 16; 8320 } 8321 8322 if (!MaskBits) 8323 return SDValue(); 8324 8325 SDValue AddValue = AndNode->getOperand(0); 8326 8327 if (AddValue.getOpcode() != ISD::ADD) 8328 return SDValue(); 8329 8330 // The basic dag structure is correct, grab the inputs and validate them. 8331 8332 SDValue AddInputValue1 = AddValue.getNode()->getOperand(0); 8333 SDValue AddInputValue2 = AddValue.getNode()->getOperand(1); 8334 SDValue SubsInputValue = SubsNode->getOperand(1); 8335 8336 // The mask is present and the provenance of all the values is a smaller type, 8337 // lets see if the mask is superfluous. 
8338 8339 if (!isa<ConstantSDNode>(AddInputValue2.getNode()) || 8340 !isa<ConstantSDNode>(SubsInputValue.getNode())) 8341 return SDValue(); 8342 8343 ISD::LoadExtType ExtType; 8344 8345 if (!checkValueWidth(SubsInputValue, MaskBits, ExtType) || 8346 !checkValueWidth(AddInputValue2, MaskBits, ExtType) || 8347 !checkValueWidth(AddInputValue1, MaskBits, ExtType) ) 8348 return SDValue(); 8349 8350 if(!isEquivalentMaskless(CC, MaskBits, ExtType, 8351 cast<ConstantSDNode>(AddInputValue2.getNode())->getSExtValue(), 8352 cast<ConstantSDNode>(SubsInputValue.getNode())->getSExtValue())) 8353 return SDValue(); 8354 8355 // The AND is not necessary, remove it. 8356 8357 SDVTList VTs = DAG.getVTList(SubsNode->getValueType(0), 8358 SubsNode->getValueType(1)); 8359 SDValue Ops[] = { AddValue, SubsNode->getOperand(1) }; 8360 8361 SDValue NewValue = DAG.getNode(CondOpcode, SDLoc(SubsNode), VTs, Ops); 8362 DAG.ReplaceAllUsesWith(SubsNode, NewValue.getNode()); 8363 8364 return SDValue(N, 0); 8365} 8366 8367// Optimize compare with zero and branch. 8368static SDValue performBRCONDCombine(SDNode *N, 8369 TargetLowering::DAGCombinerInfo &DCI, 8370 SelectionDAG &DAG) { 8371 SDValue NV = performCONDCombine(N, DCI, DAG, 2, 3); 8372 if (NV.getNode()) 8373 N = NV.getNode(); 8374 SDValue Chain = N->getOperand(0); 8375 SDValue Dest = N->getOperand(1); 8376 SDValue CCVal = N->getOperand(2); 8377 SDValue Cmp = N->getOperand(3); 8378 8379 assert(isa<ConstantSDNode>(CCVal) && "Expected a ConstantSDNode here!"); 8380 unsigned CC = cast<ConstantSDNode>(CCVal)->getZExtValue(); 8381 if (CC != AArch64CC::EQ && CC != AArch64CC::NE) 8382 return SDValue(); 8383 8384 unsigned CmpOpc = Cmp.getOpcode(); 8385 if (CmpOpc != AArch64ISD::ADDS && CmpOpc != AArch64ISD::SUBS) 8386 return SDValue(); 8387 8388 // Only attempt folding if there is only one use of the flag and no use of the 8389 // value. 8390 if (!Cmp->hasNUsesOfValue(0, 0) || !Cmp->hasNUsesOfValue(1, 1)) 8391 return SDValue(); 8392 8393 SDValue LHS = Cmp.getOperand(0); 8394 SDValue RHS = Cmp.getOperand(1); 8395 8396 assert(LHS.getValueType() == RHS.getValueType() && 8397 "Expected the value type to be the same for both operands!"); 8398 if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64) 8399 return SDValue(); 8400 8401 if (isa<ConstantSDNode>(LHS) && cast<ConstantSDNode>(LHS)->isNullValue()) 8402 std::swap(LHS, RHS); 8403 8404 if (!isa<ConstantSDNode>(RHS) || !cast<ConstantSDNode>(RHS)->isNullValue()) 8405 return SDValue(); 8406 8407 if (LHS.getOpcode() == ISD::SHL || LHS.getOpcode() == ISD::SRA || 8408 LHS.getOpcode() == ISD::SRL) 8409 return SDValue(); 8410 8411 // Fold the compare into the branch instruction. 8412 SDValue BR; 8413 if (CC == AArch64CC::EQ) 8414 BR = DAG.getNode(AArch64ISD::CBZ, SDLoc(N), MVT::Other, Chain, LHS, Dest); 8415 else 8416 BR = DAG.getNode(AArch64ISD::CBNZ, SDLoc(N), MVT::Other, Chain, LHS, Dest); 8417 8418 // Do not add new nodes to DAG combiner worklist. 8419 DCI.CombineTo(N, BR, false); 8420 8421 return SDValue(); 8422} 8423 8424// vselect (v1i1 setcc) -> 8425// vselect (v1iXX setcc) (XX is the size of the compared operand type) 8426// FIXME: Currently the type legalizer can't handle VSELECT having v1i1 as 8427// condition. If it can legalize "VSELECT v1i1" correctly, no need to combine 8428// such VSELECT. 
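// As a concrete (hypothetical) instance of the rewrite below:
//
//   (vselect (v1i1 (setcc (v1f64 a), (v1f64 b), cc)), t, f)
//
// becomes
//
//   (vselect (v1i64 (setcc (v1f64 a), (v1f64 b), cc)), t, f)
//
// i.e. the condition is recreated with the same element size as the compared
// operands (v1i64 here), which the type legalizer can cope with.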
8429static SDValue performVSelectCombine(SDNode *N, SelectionDAG &DAG) { 8430 SDValue N0 = N->getOperand(0); 8431 EVT CCVT = N0.getValueType(); 8432 8433 if (N0.getOpcode() != ISD::SETCC || CCVT.getVectorNumElements() != 1 || 8434 CCVT.getVectorElementType() != MVT::i1) 8435 return SDValue(); 8436 8437 EVT ResVT = N->getValueType(0); 8438 EVT CmpVT = N0.getOperand(0).getValueType(); 8439 // Only combine when the result type is of the same size as the compared 8440 // operands. 8441 if (ResVT.getSizeInBits() != CmpVT.getSizeInBits()) 8442 return SDValue(); 8443 8444 SDValue IfTrue = N->getOperand(1); 8445 SDValue IfFalse = N->getOperand(2); 8446 SDValue SetCC = 8447 DAG.getSetCC(SDLoc(N), CmpVT.changeVectorElementTypeToInteger(), 8448 N0.getOperand(0), N0.getOperand(1), 8449 cast<CondCodeSDNode>(N0.getOperand(2))->get()); 8450 return DAG.getNode(ISD::VSELECT, SDLoc(N), ResVT, SetCC, 8451 IfTrue, IfFalse); 8452} 8453 8454/// A vector select: "(select vL, vR, (setcc LHS, RHS))" is best performed with 8455/// the compare-mask instructions rather than going via NZCV, even if LHS and 8456/// RHS are really scalar. This replaces any scalar setcc in the above pattern 8457/// with a vector one followed by a DUP shuffle on the result. 8458static SDValue performSelectCombine(SDNode *N, SelectionDAG &DAG) { 8459 SDValue N0 = N->getOperand(0); 8460 EVT ResVT = N->getValueType(0); 8461 8462 if (N0.getOpcode() != ISD::SETCC || N0.getValueType() != MVT::i1) 8463 return SDValue(); 8464 8465 // If NumMaskElts == 0, the comparison is larger than select result. The 8466 // largest real NEON comparison is 64-bits per lane, which means the result is 8467 // at most 32-bits and an illegal vector. Just bail out for now. 8468 EVT SrcVT = N0.getOperand(0).getValueType(); 8469 8470 // Don't try to do this optimization when the setcc itself has i1 operands. 8471 // There are no legal vectors of i1, so this would be pointless. 8472 if (SrcVT == MVT::i1) 8473 return SDValue(); 8474 8475 int NumMaskElts = ResVT.getSizeInBits() / SrcVT.getSizeInBits(); 8476 if (!ResVT.isVector() || NumMaskElts == 0) 8477 return SDValue(); 8478 8479 SrcVT = EVT::getVectorVT(*DAG.getContext(), SrcVT, NumMaskElts); 8480 EVT CCVT = SrcVT.changeVectorElementTypeToInteger(); 8481 8482 // First perform a vector comparison, where lane 0 is the one we're interested 8483 // in. 8484 SDLoc DL(N0); 8485 SDValue LHS = 8486 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(0)); 8487 SDValue RHS = 8488 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(1)); 8489 SDValue SetCC = DAG.getNode(ISD::SETCC, DL, CCVT, LHS, RHS, N0.getOperand(2)); 8490 8491 // Now duplicate the comparison mask we want across all other lanes. 
8492 SmallVector<int, 8> DUPMask(CCVT.getVectorNumElements(), 0); 8493 SDValue Mask = DAG.getVectorShuffle(CCVT, DL, SetCC, SetCC, DUPMask.data()); 8494 Mask = DAG.getNode(ISD::BITCAST, DL, 8495 ResVT.changeVectorElementTypeToInteger(), Mask); 8496 8497 return DAG.getSelect(DL, ResVT, Mask, N->getOperand(1), N->getOperand(2)); 8498} 8499 8500SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, 8501 DAGCombinerInfo &DCI) const { 8502 SelectionDAG &DAG = DCI.DAG; 8503 switch (N->getOpcode()) { 8504 default: 8505 break; 8506 case ISD::ADD: 8507 case ISD::SUB: 8508 return performAddSubLongCombine(N, DCI, DAG); 8509 case ISD::XOR: 8510 return performXorCombine(N, DAG, DCI, Subtarget); 8511 case ISD::MUL: 8512 return performMulCombine(N, DAG, DCI, Subtarget); 8513 case ISD::SINT_TO_FP: 8514 case ISD::UINT_TO_FP: 8515 return performIntToFpCombine(N, DAG, Subtarget); 8516 case ISD::OR: 8517 return performORCombine(N, DCI, Subtarget); 8518 case ISD::INTRINSIC_WO_CHAIN: 8519 return performIntrinsicCombine(N, DCI, Subtarget); 8520 case ISD::ANY_EXTEND: 8521 case ISD::ZERO_EXTEND: 8522 case ISD::SIGN_EXTEND: 8523 return performExtendCombine(N, DCI, DAG); 8524 case ISD::BITCAST: 8525 return performBitcastCombine(N, DCI, DAG); 8526 case ISD::CONCAT_VECTORS: 8527 return performConcatVectorsCombine(N, DCI, DAG); 8528 case ISD::SELECT: 8529 return performSelectCombine(N, DAG); 8530 case ISD::VSELECT: 8531 return performVSelectCombine(N, DCI.DAG); 8532 case ISD::STORE: 8533 return performSTORECombine(N, DCI, DAG, Subtarget); 8534 case AArch64ISD::BRCOND: 8535 return performBRCONDCombine(N, DCI, DAG); 8536 case AArch64ISD::CSEL: 8537 return performCONDCombine(N, DCI, DAG, 2, 3); 8538 case AArch64ISD::DUP: 8539 return performPostLD1Combine(N, DCI, false); 8540 case ISD::INSERT_VECTOR_ELT: 8541 return performPostLD1Combine(N, DCI, true); 8542 case ISD::INTRINSIC_VOID: 8543 case ISD::INTRINSIC_W_CHAIN: 8544 switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) { 8545 case Intrinsic::aarch64_neon_ld2: 8546 case Intrinsic::aarch64_neon_ld3: 8547 case Intrinsic::aarch64_neon_ld4: 8548 case Intrinsic::aarch64_neon_ld1x2: 8549 case Intrinsic::aarch64_neon_ld1x3: 8550 case Intrinsic::aarch64_neon_ld1x4: 8551 case Intrinsic::aarch64_neon_ld2lane: 8552 case Intrinsic::aarch64_neon_ld3lane: 8553 case Intrinsic::aarch64_neon_ld4lane: 8554 case Intrinsic::aarch64_neon_ld2r: 8555 case Intrinsic::aarch64_neon_ld3r: 8556 case Intrinsic::aarch64_neon_ld4r: 8557 case Intrinsic::aarch64_neon_st2: 8558 case Intrinsic::aarch64_neon_st3: 8559 case Intrinsic::aarch64_neon_st4: 8560 case Intrinsic::aarch64_neon_st1x2: 8561 case Intrinsic::aarch64_neon_st1x3: 8562 case Intrinsic::aarch64_neon_st1x4: 8563 case Intrinsic::aarch64_neon_st2lane: 8564 case Intrinsic::aarch64_neon_st3lane: 8565 case Intrinsic::aarch64_neon_st4lane: 8566 return performNEONPostLDSTCombine(N, DCI, DAG); 8567 default: 8568 break; 8569 } 8570 } 8571 return SDValue(); 8572} 8573 8574// Check if the return value is used as only a return value, as otherwise 8575// we can't perform a tail-call. In particular, we need to check for 8576// target ISD nodes that are returns and any other "odd" constructs 8577// that the generic analysis code won't necessarily catch. 
8578bool AArch64TargetLowering::isUsedByReturnOnly(SDNode *N, 8579 SDValue &Chain) const { 8580 if (N->getNumValues() != 1) 8581 return false; 8582 if (!N->hasNUsesOfValue(1, 0)) 8583 return false; 8584 8585 SDValue TCChain = Chain; 8586 SDNode *Copy = *N->use_begin(); 8587 if (Copy->getOpcode() == ISD::CopyToReg) { 8588 // If the copy has a glue operand, we conservatively assume it isn't safe to 8589 // perform a tail call. 8590 if (Copy->getOperand(Copy->getNumOperands() - 1).getValueType() == 8591 MVT::Glue) 8592 return false; 8593 TCChain = Copy->getOperand(0); 8594 } else if (Copy->getOpcode() != ISD::FP_EXTEND) 8595 return false; 8596 8597 bool HasRet = false; 8598 for (SDNode *Node : Copy->uses()) { 8599 if (Node->getOpcode() != AArch64ISD::RET_FLAG) 8600 return false; 8601 HasRet = true; 8602 } 8603 8604 if (!HasRet) 8605 return false; 8606 8607 Chain = TCChain; 8608 return true; 8609} 8610 8611// Return whether the an instruction can potentially be optimized to a tail 8612// call. This will cause the optimizers to attempt to move, or duplicate, 8613// return instructions to help enable tail call optimizations for this 8614// instruction. 8615bool AArch64TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const { 8616 if (!CI->isTailCall()) 8617 return false; 8618 8619 return true; 8620} 8621 8622bool AArch64TargetLowering::getIndexedAddressParts(SDNode *Op, SDValue &Base, 8623 SDValue &Offset, 8624 ISD::MemIndexedMode &AM, 8625 bool &IsInc, 8626 SelectionDAG &DAG) const { 8627 if (Op->getOpcode() != ISD::ADD && Op->getOpcode() != ISD::SUB) 8628 return false; 8629 8630 Base = Op->getOperand(0); 8631 // All of the indexed addressing mode instructions take a signed 8632 // 9 bit immediate offset. 8633 if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Op->getOperand(1))) { 8634 int64_t RHSC = (int64_t)RHS->getZExtValue(); 8635 if (RHSC >= 256 || RHSC <= -256) 8636 return false; 8637 IsInc = (Op->getOpcode() == ISD::ADD); 8638 Offset = Op->getOperand(1); 8639 return true; 8640 } 8641 return false; 8642} 8643 8644bool AArch64TargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base, 8645 SDValue &Offset, 8646 ISD::MemIndexedMode &AM, 8647 SelectionDAG &DAG) const { 8648 EVT VT; 8649 SDValue Ptr; 8650 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) { 8651 VT = LD->getMemoryVT(); 8652 Ptr = LD->getBasePtr(); 8653 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) { 8654 VT = ST->getMemoryVT(); 8655 Ptr = ST->getBasePtr(); 8656 } else 8657 return false; 8658 8659 bool IsInc; 8660 if (!getIndexedAddressParts(Ptr.getNode(), Base, Offset, AM, IsInc, DAG)) 8661 return false; 8662 AM = IsInc ? ISD::PRE_INC : ISD::PRE_DEC; 8663 return true; 8664} 8665 8666bool AArch64TargetLowering::getPostIndexedAddressParts( 8667 SDNode *N, SDNode *Op, SDValue &Base, SDValue &Offset, 8668 ISD::MemIndexedMode &AM, SelectionDAG &DAG) const { 8669 EVT VT; 8670 SDValue Ptr; 8671 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) { 8672 VT = LD->getMemoryVT(); 8673 Ptr = LD->getBasePtr(); 8674 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) { 8675 VT = ST->getMemoryVT(); 8676 Ptr = ST->getBasePtr(); 8677 } else 8678 return false; 8679 8680 bool IsInc; 8681 if (!getIndexedAddressParts(Op, Base, Offset, AM, IsInc, DAG)) 8682 return false; 8683 // Post-indexing updates the base, so it's not a valid transform 8684 // if that's not the same as the load's pointer. 8685 if (Ptr != Base) 8686 return false; 8687 AM = IsInc ? 
ISD::POST_INC : ISD::POST_DEC; 8688 return true; 8689} 8690 8691static void ReplaceBITCASTResults(SDNode *N, SmallVectorImpl<SDValue> &Results, 8692 SelectionDAG &DAG) { 8693 SDLoc DL(N); 8694 SDValue Op = N->getOperand(0); 8695 8696 if (N->getValueType(0) != MVT::i16 || Op.getValueType() != MVT::f16) 8697 return; 8698 8699 Op = SDValue( 8700 DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, DL, MVT::f32, 8701 DAG.getUNDEF(MVT::i32), Op, 8702 DAG.getTargetConstant(AArch64::hsub, MVT::i32)), 8703 0); 8704 Op = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Op); 8705 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Op)); 8706} 8707 8708void AArch64TargetLowering::ReplaceNodeResults( 8709 SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const { 8710 switch (N->getOpcode()) { 8711 default: 8712 llvm_unreachable("Don't know how to custom expand this"); 8713 case ISD::BITCAST: 8714 ReplaceBITCASTResults(N, Results, DAG); 8715 return; 8716 case ISD::FP_TO_UINT: 8717 case ISD::FP_TO_SINT: 8718 assert(N->getValueType(0) == MVT::i128 && "unexpected illegal conversion"); 8719 // Let normal code take care of it by not adding anything to Results. 8720 return; 8721 } 8722} 8723 8724bool AArch64TargetLowering::useLoadStackGuardNode() const { 8725 return true; 8726} 8727 8728bool AArch64TargetLowering::combineRepeatedFPDivisors(unsigned NumUsers) const { 8729 // Combine multiple FDIVs with the same divisor into multiple FMULs by the 8730 // reciprocal if there are three or more FDIVs. 8731 return NumUsers > 2; 8732} 8733 8734TargetLoweringBase::LegalizeTypeAction 8735AArch64TargetLowering::getPreferredVectorAction(EVT VT) const { 8736 MVT SVT = VT.getSimpleVT(); 8737 // During type legalization, we prefer to widen v1i8, v1i16, v1i32 to v8i8, 8738 // v4i16, v2i32 instead of to promote. 8739 if (SVT == MVT::v1i8 || SVT == MVT::v1i16 || SVT == MVT::v1i32 8740 || SVT == MVT::v1f32) 8741 return TypeWidenVector; 8742 8743 return TargetLoweringBase::getPreferredVectorAction(VT); 8744} 8745 8746// Loads and stores less than 128-bits are already atomic; ones above that 8747// are doomed anyway, so defer to the default libcall and blame the OS when 8748// things go wrong. 8749bool AArch64TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const { 8750 unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits(); 8751 return Size == 128; 8752} 8753 8754// Loads and stores less than 128-bits are already atomic; ones above that 8755// are doomed anyway, so defer to the default libcall and blame the OS when 8756// things go wrong. 
8757bool AArch64TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const { 8758 unsigned Size = LI->getType()->getPrimitiveSizeInBits(); 8759 return Size == 128; 8760} 8761 8762// For the real atomic operations, we have ldxr/stxr up to 128 bits, 8763bool AArch64TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { 8764 unsigned Size = AI->getType()->getPrimitiveSizeInBits(); 8765 return Size <= 128; 8766} 8767 8768bool AArch64TargetLowering::hasLoadLinkedStoreConditional() const { 8769 return true; 8770} 8771 8772Value *AArch64TargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr, 8773 AtomicOrdering Ord) const { 8774 Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 8775 Type *ValTy = cast<PointerType>(Addr->getType())->getElementType(); 8776 bool IsAcquire = isAtLeastAcquire(Ord); 8777 8778 // Since i128 isn't legal and intrinsics don't get type-lowered, the ldrexd 8779 // intrinsic must return {i64, i64} and we have to recombine them into a 8780 // single i128 here. 8781 if (ValTy->getPrimitiveSizeInBits() == 128) { 8782 Intrinsic::ID Int = 8783 IsAcquire ? Intrinsic::aarch64_ldaxp : Intrinsic::aarch64_ldxp; 8784 Function *Ldxr = llvm::Intrinsic::getDeclaration(M, Int); 8785 8786 Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext())); 8787 Value *LoHi = Builder.CreateCall(Ldxr, Addr, "lohi"); 8788 8789 Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo"); 8790 Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi"); 8791 Lo = Builder.CreateZExt(Lo, ValTy, "lo64"); 8792 Hi = Builder.CreateZExt(Hi, ValTy, "hi64"); 8793 return Builder.CreateOr( 8794 Lo, Builder.CreateShl(Hi, ConstantInt::get(ValTy, 64)), "val64"); 8795 } 8796 8797 Type *Tys[] = { Addr->getType() }; 8798 Intrinsic::ID Int = 8799 IsAcquire ? Intrinsic::aarch64_ldaxr : Intrinsic::aarch64_ldxr; 8800 Function *Ldxr = llvm::Intrinsic::getDeclaration(M, Int, Tys); 8801 8802 return Builder.CreateTruncOrBitCast( 8803 Builder.CreateCall(Ldxr, Addr), 8804 cast<PointerType>(Addr->getType())->getElementType()); 8805} 8806 8807Value *AArch64TargetLowering::emitStoreConditional(IRBuilder<> &Builder, 8808 Value *Val, Value *Addr, 8809 AtomicOrdering Ord) const { 8810 Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 8811 bool IsRelease = isAtLeastRelease(Ord); 8812 8813 // Since the intrinsics must have legal type, the i128 intrinsics take two 8814 // parameters: "i64, i64". We must marshal Val into the appropriate form 8815 // before the call. 8816 if (Val->getType()->getPrimitiveSizeInBits() == 128) { 8817 Intrinsic::ID Int = 8818 IsRelease ? Intrinsic::aarch64_stlxp : Intrinsic::aarch64_stxp; 8819 Function *Stxr = Intrinsic::getDeclaration(M, Int); 8820 Type *Int64Ty = Type::getInt64Ty(M->getContext()); 8821 8822 Value *Lo = Builder.CreateTrunc(Val, Int64Ty, "lo"); 8823 Value *Hi = Builder.CreateTrunc(Builder.CreateLShr(Val, 64), Int64Ty, "hi"); 8824 Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext())); 8825 return Builder.CreateCall3(Stxr, Lo, Hi, Addr); 8826 } 8827 8828 Intrinsic::ID Int = 8829 IsRelease ? 
Intrinsic::aarch64_stlxr : Intrinsic::aarch64_stxr; 8830 Type *Tys[] = { Addr->getType() }; 8831 Function *Stxr = Intrinsic::getDeclaration(M, Int, Tys); 8832 8833 return Builder.CreateCall2( 8834 Stxr, Builder.CreateZExtOrBitCast( 8835 Val, Stxr->getFunctionType()->getParamType(0)), 8836 Addr); 8837} 8838 8839bool AArch64TargetLowering::functionArgumentNeedsConsecutiveRegisters( 8840 Type *Ty, CallingConv::ID CallConv, bool isVarArg) const { 8841 return Ty->isArrayTy(); 8842}