1//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===// 2// 3// The LLVM Compiler Infrastructure 4// 5// This file is distributed under the University of Illinois Open Source 6// License. See LICENSE.TXT for details. 7// 8//===----------------------------------------------------------------------===// 9// 10// This file defines the interfaces that X86 uses to lower LLVM code into a 11// selection DAG. 12// 13//===----------------------------------------------------------------------===// 14 15#include "X86.h" 16#include "X86InstrBuilder.h" 17#include "X86ISelLowering.h" 18#include "X86TargetMachine.h" 19#include "llvm/CallingConv.h" 20#include "llvm/Constants.h" 21#include "llvm/DerivedTypes.h" 22#include "llvm/GlobalVariable.h" 23#include "llvm/Function.h" 24#include "llvm/Intrinsics.h" 25#include "llvm/ADT/BitVector.h" 26#include "llvm/ADT/VectorExtras.h" 27#include "llvm/CodeGen/MachineFrameInfo.h" 28#include "llvm/CodeGen/MachineFunction.h" 29#include "llvm/CodeGen/MachineInstrBuilder.h" 30#include "llvm/CodeGen/MachineModuleInfo.h" 31#include "llvm/CodeGen/MachineRegisterInfo.h" 32#include "llvm/CodeGen/PseudoSourceValue.h" 33#include "llvm/Support/MathExtras.h" 34#include "llvm/Support/Debug.h" 35#include "llvm/Target/TargetOptions.h" 36#include "llvm/ADT/SmallSet.h" 37#include "llvm/ADT/StringExtras.h" 38#include "llvm/Support/CommandLine.h" 39using namespace llvm; 40 41static cl::opt<bool> 42DisableMMX("disable-mmx", cl::Hidden, cl::desc("Disable use of MMX")); 43 44// Forward declarations. 45static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, MVT VT, SDValue V1, 46 SDValue V2); 47 48X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) 49 : TargetLowering(TM) { 50 Subtarget = &TM.getSubtarget<X86Subtarget>(); 51 X86ScalarSSEf64 = Subtarget->hasSSE2(); 52 X86ScalarSSEf32 = Subtarget->hasSSE1(); 53 X86StackPtr = Subtarget->is64Bit() ? X86::RSP : X86::ESP; 54 55 RegInfo = TM.getRegisterInfo(); 56 TD = getTargetData(); 57 58 // Set up the TargetLowering object. 59 60 // X86 is weird, it always uses i8 for shift amounts and setcc results. 61 setShiftAmountType(MVT::i8); 62 setBooleanContents(ZeroOrOneBooleanContent); 63 setSchedulingPreference(SchedulingForRegPressure); 64 setShiftAmountFlavor(Mask); // shl X, 32 == shl X, 0 65 setStackPointerRegisterToSaveRestore(X86StackPtr); 66 67 if (Subtarget->isTargetDarwin()) { 68 // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp. 69 setUseUnderscoreSetJmp(false); 70 setUseUnderscoreLongJmp(false); 71 } else if (Subtarget->isTargetMingw()) { 72 // MS runtime is weird: it exports _setjmp, but longjmp! 73 setUseUnderscoreSetJmp(true); 74 setUseUnderscoreLongJmp(false); 75 } else { 76 setUseUnderscoreSetJmp(true); 77 setUseUnderscoreLongJmp(true); 78 } 79 80 // Set up the register classes. 81 addRegisterClass(MVT::i8, X86::GR8RegisterClass); 82 addRegisterClass(MVT::i16, X86::GR16RegisterClass); 83 addRegisterClass(MVT::i32, X86::GR32RegisterClass); 84 if (Subtarget->is64Bit()) 85 addRegisterClass(MVT::i64, X86::GR64RegisterClass); 86 87 setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote); 88 89 // We don't accept any truncstore of integer registers. 90 setTruncStoreAction(MVT::i64, MVT::i32, Expand); 91 setTruncStoreAction(MVT::i64, MVT::i16, Expand); 92 setTruncStoreAction(MVT::i64, MVT::i8 , Expand); 93 setTruncStoreAction(MVT::i32, MVT::i16, Expand); 94 setTruncStoreAction(MVT::i32, MVT::i8 , Expand); 95 setTruncStoreAction(MVT::i16, MVT::i8, Expand); 96 97 // SETOEQ and SETUNE require checking two conditions. 98 setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand); 99 setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand); 100 setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand); 101 setCondCodeAction(ISD::SETUNE, MVT::f32, Expand); 102 setCondCodeAction(ISD::SETUNE, MVT::f64, Expand); 103 setCondCodeAction(ISD::SETUNE, MVT::f80, Expand); 104 105 // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this 106 // operation. 107 setOperationAction(ISD::UINT_TO_FP , MVT::i1 , Promote); 108 setOperationAction(ISD::UINT_TO_FP , MVT::i8 , Promote); 109 setOperationAction(ISD::UINT_TO_FP , MVT::i16 , Promote); 110 111 if (Subtarget->is64Bit()) { 112 setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Promote); 113 setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Expand); 114 } else if (!UseSoftFloat) { 115 if (X86ScalarSSEf64) { 116 // We have an impenetrably clever algorithm for ui64->double only. 117 setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom); 118 } 119 // We have an algorithm for SSE2, and we turn this into a 64-bit 120 // FILD for other targets. 121 setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Custom); 122 } 123 124 // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have 125 // this operation. 126 setOperationAction(ISD::SINT_TO_FP , MVT::i1 , Promote); 127 setOperationAction(ISD::SINT_TO_FP , MVT::i8 , Promote); 128 129 if (!UseSoftFloat) { 130 // SSE has no i16 to fp conversion, only i32 131 if (X86ScalarSSEf32) { 132 setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote); 133 // f32 and f64 cases are Legal, f80 case is not 134 setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom); 135 } else { 136 setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Custom); 137 setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom); 138 } 139 } else { 140 setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote); 141 setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Promote); 142 } 143 144 // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64 145 // are Legal, f80 is custom lowered. 146 setOperationAction(ISD::FP_TO_SINT , MVT::i64 , Custom); 147 setOperationAction(ISD::SINT_TO_FP , MVT::i64 , Custom); 148 149 // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have 150 // this operation. 151 setOperationAction(ISD::FP_TO_SINT , MVT::i1 , Promote); 152 setOperationAction(ISD::FP_TO_SINT , MVT::i8 , Promote); 153 154 if (X86ScalarSSEf32) { 155 setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Promote); 156 // f32 and f64 cases are Legal, f80 case is not 157 setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom); 158 } else { 159 setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Custom); 160 setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom); 161 } 162 163 // Handle FP_TO_UINT by promoting the destination to a larger signed 164 // conversion. 165 setOperationAction(ISD::FP_TO_UINT , MVT::i1 , Promote); 166 setOperationAction(ISD::FP_TO_UINT , MVT::i8 , Promote); 167 setOperationAction(ISD::FP_TO_UINT , MVT::i16 , Promote); 168 169 if (Subtarget->is64Bit()) { 170 setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Expand); 171 setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Promote); 172 } else if (!UseSoftFloat) { 173 if (X86ScalarSSEf32 && !Subtarget->hasSSE3()) 174 // Expand FP_TO_UINT into a select. 175 // FIXME: We would like to use a Custom expander here eventually to do 176 // the optimal thing for SSE vs. the default expansion in the legalizer. 177 setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Expand); 178 else 179 // With SSE3 we can use fisttpll to convert to a signed i64; without 180 // SSE, we're stuck with a fistpll. 181 setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Custom); 182 } 183 184 // TODO: when we have SSE, these could be more efficient, by using movd/movq. 185 if (!X86ScalarSSEf64) { 186 setOperationAction(ISD::BIT_CONVERT , MVT::f32 , Expand); 187 setOperationAction(ISD::BIT_CONVERT , MVT::i32 , Expand); 188 } 189 190 // Scalar integer divide and remainder are lowered to use operations that 191 // produce two results, to match the available instructions. This exposes 192 // the two-result form to trivial CSE, which is able to combine x/y and x%y 193 // into a single instruction. 194 // 195 // Scalar integer multiply-high is also lowered to use two-result 196 // operations, to match the available instructions. However, plain multiply 197 // (low) operations are left as Legal, as there are single-result 198 // instructions for this in x86. Using the two-result multiply instructions 199 // when both high and low results are needed must be arranged by dagcombine. 200 setOperationAction(ISD::MULHS , MVT::i8 , Expand); 201 setOperationAction(ISD::MULHU , MVT::i8 , Expand); 202 setOperationAction(ISD::SDIV , MVT::i8 , Expand); 203 setOperationAction(ISD::UDIV , MVT::i8 , Expand); 204 setOperationAction(ISD::SREM , MVT::i8 , Expand); 205 setOperationAction(ISD::UREM , MVT::i8 , Expand); 206 setOperationAction(ISD::MULHS , MVT::i16 , Expand); 207 setOperationAction(ISD::MULHU , MVT::i16 , Expand); 208 setOperationAction(ISD::SDIV , MVT::i16 , Expand); 209 setOperationAction(ISD::UDIV , MVT::i16 , Expand); 210 setOperationAction(ISD::SREM , MVT::i16 , Expand); 211 setOperationAction(ISD::UREM , MVT::i16 , Expand); 212 setOperationAction(ISD::MULHS , MVT::i32 , Expand); 213 setOperationAction(ISD::MULHU , MVT::i32 , Expand); 214 setOperationAction(ISD::SDIV , MVT::i32 , Expand); 215 setOperationAction(ISD::UDIV , MVT::i32 , Expand); 216 setOperationAction(ISD::SREM , MVT::i32 , Expand); 217 setOperationAction(ISD::UREM , MVT::i32 , Expand); 218 setOperationAction(ISD::MULHS , MVT::i64 , Expand); 219 setOperationAction(ISD::MULHU , MVT::i64 , Expand); 220 setOperationAction(ISD::SDIV , MVT::i64 , Expand); 221 setOperationAction(ISD::UDIV , MVT::i64 , Expand); 222 setOperationAction(ISD::SREM , MVT::i64 , Expand); 223 setOperationAction(ISD::UREM , MVT::i64 , Expand); 224 225 setOperationAction(ISD::BR_JT , MVT::Other, Expand); 226 setOperationAction(ISD::BRCOND , MVT::Other, Custom); 227 setOperationAction(ISD::BR_CC , MVT::Other, Expand); 228 setOperationAction(ISD::SELECT_CC , MVT::Other, Expand); 229 if (Subtarget->is64Bit()) 230 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal); 231 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16 , Legal); 232 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal); 233 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1 , Expand); 234 setOperationAction(ISD::FP_ROUND_INREG , MVT::f32 , Expand); 235 setOperationAction(ISD::FREM , MVT::f32 , Expand); 236 setOperationAction(ISD::FREM , MVT::f64 , Expand); 237 setOperationAction(ISD::FREM , MVT::f80 , Expand); 238 setOperationAction(ISD::FLT_ROUNDS_ , MVT::i32 , Custom); 239 240 setOperationAction(ISD::CTPOP , MVT::i8 , Expand); 241 setOperationAction(ISD::CTTZ , MVT::i8 , Custom); 242 setOperationAction(ISD::CTLZ , MVT::i8 , Custom); 243 setOperationAction(ISD::CTPOP , MVT::i16 , Expand); 244 setOperationAction(ISD::CTTZ , MVT::i16 , Custom); 245 setOperationAction(ISD::CTLZ , MVT::i16 , Custom); 246 setOperationAction(ISD::CTPOP , MVT::i32 , Expand); 247 setOperationAction(ISD::CTTZ , MVT::i32 , Custom); 248 setOperationAction(ISD::CTLZ , MVT::i32 , Custom); 249 if (Subtarget->is64Bit()) { 250 setOperationAction(ISD::CTPOP , MVT::i64 , Expand); 251 setOperationAction(ISD::CTTZ , MVT::i64 , Custom); 252 setOperationAction(ISD::CTLZ , MVT::i64 , Custom); 253 } 254 255 setOperationAction(ISD::READCYCLECOUNTER , MVT::i64 , Custom); 256 setOperationAction(ISD::BSWAP , MVT::i16 , Expand); 257 258 // These should be promoted to a larger select which is supported. 259 setOperationAction(ISD::SELECT , MVT::i1 , Promote); 260 setOperationAction(ISD::SELECT , MVT::i8 , Promote); 261 // X86 wants to expand cmov itself. 262 setOperationAction(ISD::SELECT , MVT::i16 , Custom); 263 setOperationAction(ISD::SELECT , MVT::i32 , Custom); 264 setOperationAction(ISD::SELECT , MVT::f32 , Custom); 265 setOperationAction(ISD::SELECT , MVT::f64 , Custom); 266 setOperationAction(ISD::SELECT , MVT::f80 , Custom); 267 setOperationAction(ISD::SETCC , MVT::i8 , Custom); 268 setOperationAction(ISD::SETCC , MVT::i16 , Custom); 269 setOperationAction(ISD::SETCC , MVT::i32 , Custom); 270 setOperationAction(ISD::SETCC , MVT::f32 , Custom); 271 setOperationAction(ISD::SETCC , MVT::f64 , Custom); 272 setOperationAction(ISD::SETCC , MVT::f80 , Custom); 273 if (Subtarget->is64Bit()) { 274 setOperationAction(ISD::SELECT , MVT::i64 , Custom); 275 setOperationAction(ISD::SETCC , MVT::i64 , Custom); 276 } 277 // X86 ret instruction may pop stack. 278 setOperationAction(ISD::RET , MVT::Other, Custom); 279 setOperationAction(ISD::EH_RETURN , MVT::Other, Custom); 280 281 // Darwin ABI issue. 282 setOperationAction(ISD::ConstantPool , MVT::i32 , Custom); 283 setOperationAction(ISD::JumpTable , MVT::i32 , Custom); 284 setOperationAction(ISD::GlobalAddress , MVT::i32 , Custom); 285 setOperationAction(ISD::GlobalTLSAddress, MVT::i32 , Custom); 286 if (Subtarget->is64Bit()) 287 setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom); 288 setOperationAction(ISD::ExternalSymbol , MVT::i32 , Custom); 289 if (Subtarget->is64Bit()) { 290 setOperationAction(ISD::ConstantPool , MVT::i64 , Custom); 291 setOperationAction(ISD::JumpTable , MVT::i64 , Custom); 292 setOperationAction(ISD::GlobalAddress , MVT::i64 , Custom); 293 setOperationAction(ISD::ExternalSymbol, MVT::i64 , Custom); 294 } 295 // 64-bit addm sub, shl, sra, srl (iff 32-bit x86) 296 setOperationAction(ISD::SHL_PARTS , MVT::i32 , Custom); 297 setOperationAction(ISD::SRA_PARTS , MVT::i32 , Custom); 298 setOperationAction(ISD::SRL_PARTS , MVT::i32 , Custom); 299 if (Subtarget->is64Bit()) { 300 setOperationAction(ISD::SHL_PARTS , MVT::i64 , Custom); 301 setOperationAction(ISD::SRA_PARTS , MVT::i64 , Custom); 302 setOperationAction(ISD::SRL_PARTS , MVT::i64 , Custom); 303 } 304 305 if (Subtarget->hasSSE1()) 306 setOperationAction(ISD::PREFETCH , MVT::Other, Legal); 307 308 if (!Subtarget->hasSSE2()) 309 setOperationAction(ISD::MEMBARRIER , MVT::Other, Expand); 310 311 // Expand certain atomics 312 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i8, Custom); 313 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i16, Custom); 314 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Custom); 315 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Custom); 316 317 setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i8, Custom); 318 setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i16, Custom); 319 setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, Custom); 320 setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom); 321 322 if (!Subtarget->is64Bit()) { 323 setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i64, Custom); 324 setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom); 325 setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i64, Custom); 326 setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i64, Custom); 327 setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i64, Custom); 328 setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i64, Custom); 329 setOperationAction(ISD::ATOMIC_SWAP, MVT::i64, Custom); 330 } 331 332 // Use the default ISD::DBG_STOPPOINT, ISD::DECLARE expansion. 333 setOperationAction(ISD::DBG_STOPPOINT, MVT::Other, Expand); 334 // FIXME - use subtarget debug flags 335 if (!Subtarget->isTargetDarwin() && 336 !Subtarget->isTargetELF() && 337 !Subtarget->isTargetCygMing()) { 338 setOperationAction(ISD::DBG_LABEL, MVT::Other, Expand); 339 setOperationAction(ISD::EH_LABEL, MVT::Other, Expand); 340 } 341 342 setOperationAction(ISD::EXCEPTIONADDR, MVT::i64, Expand); 343 setOperationAction(ISD::EHSELECTION, MVT::i64, Expand); 344 setOperationAction(ISD::EXCEPTIONADDR, MVT::i32, Expand); 345 setOperationAction(ISD::EHSELECTION, MVT::i32, Expand); 346 if (Subtarget->is64Bit()) { 347 setExceptionPointerRegister(X86::RAX); 348 setExceptionSelectorRegister(X86::RDX); 349 } else { 350 setExceptionPointerRegister(X86::EAX); 351 setExceptionSelectorRegister(X86::EDX); 352 } 353 setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom); 354 setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom); 355 356 setOperationAction(ISD::TRAMPOLINE, MVT::Other, Custom); 357 358 setOperationAction(ISD::TRAP, MVT::Other, Legal); 359 360 // VASTART needs to be custom lowered to use the VarArgsFrameIndex 361 setOperationAction(ISD::VASTART , MVT::Other, Custom); 362 setOperationAction(ISD::VAEND , MVT::Other, Expand); 363 if (Subtarget->is64Bit()) { 364 setOperationAction(ISD::VAARG , MVT::Other, Custom); 365 setOperationAction(ISD::VACOPY , MVT::Other, Custom); 366 } else { 367 setOperationAction(ISD::VAARG , MVT::Other, Expand); 368 setOperationAction(ISD::VACOPY , MVT::Other, Expand); 369 } 370 371 setOperationAction(ISD::STACKSAVE, MVT::Other, Expand); 372 setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand); 373 if (Subtarget->is64Bit()) 374 setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand); 375 if (Subtarget->isTargetCygMing()) 376 setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom); 377 else 378 setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand); 379 380 if (!UseSoftFloat && X86ScalarSSEf64) { 381 // f32 and f64 use SSE. 382 // Set up the FP register classes. 383 addRegisterClass(MVT::f32, X86::FR32RegisterClass); 384 addRegisterClass(MVT::f64, X86::FR64RegisterClass); 385 386 // Use ANDPD to simulate FABS. 387 setOperationAction(ISD::FABS , MVT::f64, Custom); 388 setOperationAction(ISD::FABS , MVT::f32, Custom); 389 390 // Use XORP to simulate FNEG. 391 setOperationAction(ISD::FNEG , MVT::f64, Custom); 392 setOperationAction(ISD::FNEG , MVT::f32, Custom); 393 394 // Use ANDPD and ORPD to simulate FCOPYSIGN. 395 setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom); 396 setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom); 397 398 // We don't support sin/cos/fmod 399 setOperationAction(ISD::FSIN , MVT::f64, Expand); 400 setOperationAction(ISD::FCOS , MVT::f64, Expand); 401 setOperationAction(ISD::FSIN , MVT::f32, Expand); 402 setOperationAction(ISD::FCOS , MVT::f32, Expand); 403 404 // Expand FP immediates into loads from the stack, except for the special 405 // cases we handle. 406 addLegalFPImmediate(APFloat(+0.0)); // xorpd 407 addLegalFPImmediate(APFloat(+0.0f)); // xorps 408 } else if (!UseSoftFloat && X86ScalarSSEf32) { 409 // Use SSE for f32, x87 for f64. 410 // Set up the FP register classes. 411 addRegisterClass(MVT::f32, X86::FR32RegisterClass); 412 addRegisterClass(MVT::f64, X86::RFP64RegisterClass); 413 414 // Use ANDPS to simulate FABS. 415 setOperationAction(ISD::FABS , MVT::f32, Custom); 416 417 // Use XORP to simulate FNEG. 418 setOperationAction(ISD::FNEG , MVT::f32, Custom); 419 420 setOperationAction(ISD::UNDEF, MVT::f64, Expand); 421 422 // Use ANDPS and ORPS to simulate FCOPYSIGN. 423 setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand); 424 setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom); 425 426 // We don't support sin/cos/fmod 427 setOperationAction(ISD::FSIN , MVT::f32, Expand); 428 setOperationAction(ISD::FCOS , MVT::f32, Expand); 429 430 // Special cases we handle for FP constants. 431 addLegalFPImmediate(APFloat(+0.0f)); // xorps 432 addLegalFPImmediate(APFloat(+0.0)); // FLD0 433 addLegalFPImmediate(APFloat(+1.0)); // FLD1 434 addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS 435 addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS 436 437 if (!UnsafeFPMath) { 438 setOperationAction(ISD::FSIN , MVT::f64 , Expand); 439 setOperationAction(ISD::FCOS , MVT::f64 , Expand); 440 } 441 } else if (!UseSoftFloat) { 442 // f32 and f64 in x87. 443 // Set up the FP register classes. 444 addRegisterClass(MVT::f64, X86::RFP64RegisterClass); 445 addRegisterClass(MVT::f32, X86::RFP32RegisterClass); 446 447 setOperationAction(ISD::UNDEF, MVT::f64, Expand); 448 setOperationAction(ISD::UNDEF, MVT::f32, Expand); 449 setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand); 450 setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand); 451 452 if (!UnsafeFPMath) { 453 setOperationAction(ISD::FSIN , MVT::f64 , Expand); 454 setOperationAction(ISD::FCOS , MVT::f64 , Expand); 455 } 456 addLegalFPImmediate(APFloat(+0.0)); // FLD0 457 addLegalFPImmediate(APFloat(+1.0)); // FLD1 458 addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS 459 addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS 460 addLegalFPImmediate(APFloat(+0.0f)); // FLD0 461 addLegalFPImmediate(APFloat(+1.0f)); // FLD1 462 addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS 463 addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS 464 } 465 466 // Long double always uses X87. 467 if (!UseSoftFloat) { 468 addRegisterClass(MVT::f80, X86::RFP80RegisterClass); 469 setOperationAction(ISD::UNDEF, MVT::f80, Expand); 470 setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand); 471 { 472 bool ignored; 473 APFloat TmpFlt(+0.0); 474 TmpFlt.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven, 475 &ignored); 476 addLegalFPImmediate(TmpFlt); // FLD0 477 TmpFlt.changeSign(); 478 addLegalFPImmediate(TmpFlt); // FLD0/FCHS 479 APFloat TmpFlt2(+1.0); 480 TmpFlt2.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven, 481 &ignored); 482 addLegalFPImmediate(TmpFlt2); // FLD1 483 TmpFlt2.changeSign(); 484 addLegalFPImmediate(TmpFlt2); // FLD1/FCHS 485 } 486 487 if (!UnsafeFPMath) { 488 setOperationAction(ISD::FSIN , MVT::f80 , Expand); 489 setOperationAction(ISD::FCOS , MVT::f80 , Expand); 490 } 491 } 492 493 // Always use a library call for pow. 494 setOperationAction(ISD::FPOW , MVT::f32 , Expand); 495 setOperationAction(ISD::FPOW , MVT::f64 , Expand); 496 setOperationAction(ISD::FPOW , MVT::f80 , Expand); 497 498 setOperationAction(ISD::FLOG, MVT::f80, Expand); 499 setOperationAction(ISD::FLOG2, MVT::f80, Expand); 500 setOperationAction(ISD::FLOG10, MVT::f80, Expand); 501 setOperationAction(ISD::FEXP, MVT::f80, Expand); 502 setOperationAction(ISD::FEXP2, MVT::f80, Expand); 503 504 // First set operation action for all vector types to either promote 505 // (for widening) or expand (for scalarization). Then we will selectively 506 // turn on ones that can be effectively codegen'd. 507 for (unsigned VT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE; 508 VT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++VT) { 509 setOperationAction(ISD::ADD , (MVT::SimpleValueType)VT, Expand); 510 setOperationAction(ISD::SUB , (MVT::SimpleValueType)VT, Expand); 511 setOperationAction(ISD::FADD, (MVT::SimpleValueType)VT, Expand); 512 setOperationAction(ISD::FNEG, (MVT::SimpleValueType)VT, Expand); 513 setOperationAction(ISD::FSUB, (MVT::SimpleValueType)VT, Expand); 514 setOperationAction(ISD::MUL , (MVT::SimpleValueType)VT, Expand); 515 setOperationAction(ISD::FMUL, (MVT::SimpleValueType)VT, Expand); 516 setOperationAction(ISD::SDIV, (MVT::SimpleValueType)VT, Expand); 517 setOperationAction(ISD::UDIV, (MVT::SimpleValueType)VT, Expand); 518 setOperationAction(ISD::FDIV, (MVT::SimpleValueType)VT, Expand); 519 setOperationAction(ISD::SREM, (MVT::SimpleValueType)VT, Expand); 520 setOperationAction(ISD::UREM, (MVT::SimpleValueType)VT, Expand); 521 setOperationAction(ISD::LOAD, (MVT::SimpleValueType)VT, Expand); 522 setOperationAction(ISD::VECTOR_SHUFFLE, (MVT::SimpleValueType)VT, Expand); 523 setOperationAction(ISD::EXTRACT_VECTOR_ELT,(MVT::SimpleValueType)VT,Expand); 524 setOperationAction(ISD::EXTRACT_SUBVECTOR,(MVT::SimpleValueType)VT,Expand); 525 setOperationAction(ISD::INSERT_VECTOR_ELT,(MVT::SimpleValueType)VT, Expand); 526 setOperationAction(ISD::FABS, (MVT::SimpleValueType)VT, Expand); 527 setOperationAction(ISD::FSIN, (MVT::SimpleValueType)VT, Expand); 528 setOperationAction(ISD::FCOS, (MVT::SimpleValueType)VT, Expand); 529 setOperationAction(ISD::FREM, (MVT::SimpleValueType)VT, Expand); 530 setOperationAction(ISD::FPOWI, (MVT::SimpleValueType)VT, Expand); 531 setOperationAction(ISD::FSQRT, (MVT::SimpleValueType)VT, Expand); 532 setOperationAction(ISD::FCOPYSIGN, (MVT::SimpleValueType)VT, Expand); 533 setOperationAction(ISD::SMUL_LOHI, (MVT::SimpleValueType)VT, Expand); 534 setOperationAction(ISD::UMUL_LOHI, (MVT::SimpleValueType)VT, Expand); 535 setOperationAction(ISD::SDIVREM, (MVT::SimpleValueType)VT, Expand); 536 setOperationAction(ISD::UDIVREM, (MVT::SimpleValueType)VT, Expand); 537 setOperationAction(ISD::FPOW, (MVT::SimpleValueType)VT, Expand); 538 setOperationAction(ISD::CTPOP, (MVT::SimpleValueType)VT, Expand); 539 setOperationAction(ISD::CTTZ, (MVT::SimpleValueType)VT, Expand); 540 setOperationAction(ISD::CTLZ, (MVT::SimpleValueType)VT, Expand); 541 setOperationAction(ISD::SHL, (MVT::SimpleValueType)VT, Expand); 542 setOperationAction(ISD::SRA, (MVT::SimpleValueType)VT, Expand); 543 setOperationAction(ISD::SRL, (MVT::SimpleValueType)VT, Expand); 544 setOperationAction(ISD::ROTL, (MVT::SimpleValueType)VT, Expand); 545 setOperationAction(ISD::ROTR, (MVT::SimpleValueType)VT, Expand); 546 setOperationAction(ISD::BSWAP, (MVT::SimpleValueType)VT, Expand); 547 setOperationAction(ISD::VSETCC, (MVT::SimpleValueType)VT, Expand); 548 setOperationAction(ISD::FLOG, (MVT::SimpleValueType)VT, Expand); 549 setOperationAction(ISD::FLOG2, (MVT::SimpleValueType)VT, Expand); 550 setOperationAction(ISD::FLOG10, (MVT::SimpleValueType)VT, Expand); 551 setOperationAction(ISD::FEXP, (MVT::SimpleValueType)VT, Expand); 552 setOperationAction(ISD::FEXP2, (MVT::SimpleValueType)VT, Expand); 553 setOperationAction(ISD::FP_TO_UINT, (MVT::SimpleValueType)VT, Expand); 554 setOperationAction(ISD::FP_TO_SINT, (MVT::SimpleValueType)VT, Expand); 555 setOperationAction(ISD::UINT_TO_FP, (MVT::SimpleValueType)VT, Expand); 556 setOperationAction(ISD::SINT_TO_FP, (MVT::SimpleValueType)VT, Expand); 557 } 558 559 // FIXME: In order to prevent SSE instructions being expanded to MMX ones 560 // with -msoft-float, disable use of MMX as well. 561 if (!UseSoftFloat && !DisableMMX && Subtarget->hasMMX()) { 562 addRegisterClass(MVT::v8i8, X86::VR64RegisterClass); 563 addRegisterClass(MVT::v4i16, X86::VR64RegisterClass); 564 addRegisterClass(MVT::v2i32, X86::VR64RegisterClass); 565 addRegisterClass(MVT::v2f32, X86::VR64RegisterClass); 566 addRegisterClass(MVT::v1i64, X86::VR64RegisterClass); 567 568 setOperationAction(ISD::ADD, MVT::v8i8, Legal); 569 setOperationAction(ISD::ADD, MVT::v4i16, Legal); 570 setOperationAction(ISD::ADD, MVT::v2i32, Legal); 571 setOperationAction(ISD::ADD, MVT::v1i64, Legal); 572 573 setOperationAction(ISD::SUB, MVT::v8i8, Legal); 574 setOperationAction(ISD::SUB, MVT::v4i16, Legal); 575 setOperationAction(ISD::SUB, MVT::v2i32, Legal); 576 setOperationAction(ISD::SUB, MVT::v1i64, Legal); 577 578 setOperationAction(ISD::MULHS, MVT::v4i16, Legal); 579 setOperationAction(ISD::MUL, MVT::v4i16, Legal); 580 581 setOperationAction(ISD::AND, MVT::v8i8, Promote); 582 AddPromotedToType (ISD::AND, MVT::v8i8, MVT::v1i64); 583 setOperationAction(ISD::AND, MVT::v4i16, Promote); 584 AddPromotedToType (ISD::AND, MVT::v4i16, MVT::v1i64); 585 setOperationAction(ISD::AND, MVT::v2i32, Promote); 586 AddPromotedToType (ISD::AND, MVT::v2i32, MVT::v1i64); 587 setOperationAction(ISD::AND, MVT::v1i64, Legal); 588 589 setOperationAction(ISD::OR, MVT::v8i8, Promote); 590 AddPromotedToType (ISD::OR, MVT::v8i8, MVT::v1i64); 591 setOperationAction(ISD::OR, MVT::v4i16, Promote); 592 AddPromotedToType (ISD::OR, MVT::v4i16, MVT::v1i64); 593 setOperationAction(ISD::OR, MVT::v2i32, Promote); 594 AddPromotedToType (ISD::OR, MVT::v2i32, MVT::v1i64); 595 setOperationAction(ISD::OR, MVT::v1i64, Legal); 596 597 setOperationAction(ISD::XOR, MVT::v8i8, Promote); 598 AddPromotedToType (ISD::XOR, MVT::v8i8, MVT::v1i64); 599 setOperationAction(ISD::XOR, MVT::v4i16, Promote); 600 AddPromotedToType (ISD::XOR, MVT::v4i16, MVT::v1i64); 601 setOperationAction(ISD::XOR, MVT::v2i32, Promote); 602 AddPromotedToType (ISD::XOR, MVT::v2i32, MVT::v1i64); 603 setOperationAction(ISD::XOR, MVT::v1i64, Legal); 604 605 setOperationAction(ISD::LOAD, MVT::v8i8, Promote); 606 AddPromotedToType (ISD::LOAD, MVT::v8i8, MVT::v1i64); 607 setOperationAction(ISD::LOAD, MVT::v4i16, Promote); 608 AddPromotedToType (ISD::LOAD, MVT::v4i16, MVT::v1i64); 609 setOperationAction(ISD::LOAD, MVT::v2i32, Promote); 610 AddPromotedToType (ISD::LOAD, MVT::v2i32, MVT::v1i64); 611 setOperationAction(ISD::LOAD, MVT::v2f32, Promote); 612 AddPromotedToType (ISD::LOAD, MVT::v2f32, MVT::v1i64); 613 setOperationAction(ISD::LOAD, MVT::v1i64, Legal); 614 615 setOperationAction(ISD::BUILD_VECTOR, MVT::v8i8, Custom); 616 setOperationAction(ISD::BUILD_VECTOR, MVT::v4i16, Custom); 617 setOperationAction(ISD::BUILD_VECTOR, MVT::v2i32, Custom); 618 setOperationAction(ISD::BUILD_VECTOR, MVT::v2f32, Custom); 619 setOperationAction(ISD::BUILD_VECTOR, MVT::v1i64, Custom); 620 621 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i8, Custom); 622 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i16, Custom); 623 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i32, Custom); 624 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v1i64, Custom); 625 626 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2f32, Custom); 627 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i8, Custom); 628 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i16, Custom); 629 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v1i64, Custom); 630 631 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i16, Custom); 632 633 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Expand); 634 setOperationAction(ISD::TRUNCATE, MVT::v8i8, Expand); 635 setOperationAction(ISD::SELECT, MVT::v8i8, Promote); 636 setOperationAction(ISD::SELECT, MVT::v4i16, Promote); 637 setOperationAction(ISD::SELECT, MVT::v2i32, Promote); 638 setOperationAction(ISD::SELECT, MVT::v1i64, Custom); 639 } 640 641 if (!UseSoftFloat && Subtarget->hasSSE1()) { 642 addRegisterClass(MVT::v4f32, X86::VR128RegisterClass); 643 644 setOperationAction(ISD::FADD, MVT::v4f32, Legal); 645 setOperationAction(ISD::FSUB, MVT::v4f32, Legal); 646 setOperationAction(ISD::FMUL, MVT::v4f32, Legal); 647 setOperationAction(ISD::FDIV, MVT::v4f32, Legal); 648 setOperationAction(ISD::FSQRT, MVT::v4f32, Legal); 649 setOperationAction(ISD::FNEG, MVT::v4f32, Custom); 650 setOperationAction(ISD::LOAD, MVT::v4f32, Legal); 651 setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom); 652 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom); 653 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom); 654 setOperationAction(ISD::SELECT, MVT::v4f32, Custom); 655 setOperationAction(ISD::VSETCC, MVT::v4f32, Custom); 656 } 657 658 if (!UseSoftFloat && Subtarget->hasSSE2()) { 659 addRegisterClass(MVT::v2f64, X86::VR128RegisterClass); 660 661 // FIXME: Unfortunately -soft-float and -no-implicit-float means XMM 662 // registers cannot be used even for integer operations. 663 addRegisterClass(MVT::v16i8, X86::VR128RegisterClass); 664 addRegisterClass(MVT::v8i16, X86::VR128RegisterClass); 665 addRegisterClass(MVT::v4i32, X86::VR128RegisterClass); 666 addRegisterClass(MVT::v2i64, X86::VR128RegisterClass); 667 668 setOperationAction(ISD::ADD, MVT::v16i8, Legal); 669 setOperationAction(ISD::ADD, MVT::v8i16, Legal); 670 setOperationAction(ISD::ADD, MVT::v4i32, Legal); 671 setOperationAction(ISD::ADD, MVT::v2i64, Legal); 672 setOperationAction(ISD::MUL, MVT::v2i64, Custom); 673 setOperationAction(ISD::SUB, MVT::v16i8, Legal); 674 setOperationAction(ISD::SUB, MVT::v8i16, Legal); 675 setOperationAction(ISD::SUB, MVT::v4i32, Legal); 676 setOperationAction(ISD::SUB, MVT::v2i64, Legal); 677 setOperationAction(ISD::MUL, MVT::v8i16, Legal); 678 setOperationAction(ISD::FADD, MVT::v2f64, Legal); 679 setOperationAction(ISD::FSUB, MVT::v2f64, Legal); 680 setOperationAction(ISD::FMUL, MVT::v2f64, Legal); 681 setOperationAction(ISD::FDIV, MVT::v2f64, Legal); 682 setOperationAction(ISD::FSQRT, MVT::v2f64, Legal); 683 setOperationAction(ISD::FNEG, MVT::v2f64, Custom); 684 685 setOperationAction(ISD::VSETCC, MVT::v2f64, Custom); 686 setOperationAction(ISD::VSETCC, MVT::v16i8, Custom); 687 setOperationAction(ISD::VSETCC, MVT::v8i16, Custom); 688 setOperationAction(ISD::VSETCC, MVT::v4i32, Custom); 689 690 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16i8, Custom); 691 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i16, Custom); 692 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom); 693 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom); 694 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom); 695 696 // Custom lower build_vector, vector_shuffle, and extract_vector_elt. 697 for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v2i64; ++i) { 698 MVT VT = (MVT::SimpleValueType)i; 699 // Do not attempt to custom lower non-power-of-2 vectors 700 if (!isPowerOf2_32(VT.getVectorNumElements())) 701 continue; 702 setOperationAction(ISD::BUILD_VECTOR, VT, Custom); 703 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); 704 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); 705 } 706 707 setOperationAction(ISD::BUILD_VECTOR, MVT::v2f64, Custom); 708 setOperationAction(ISD::BUILD_VECTOR, MVT::v2i64, Custom); 709 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f64, Custom); 710 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i64, Custom); 711 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f64, Custom); 712 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom); 713 714 if (Subtarget->is64Bit()) { 715 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Custom); 716 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom); 717 } 718 719 // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64. 720 for (unsigned VT = (unsigned)MVT::v16i8; VT != (unsigned)MVT::v2i64; VT++) { 721 setOperationAction(ISD::AND, (MVT::SimpleValueType)VT, Promote); 722 AddPromotedToType (ISD::AND, (MVT::SimpleValueType)VT, MVT::v2i64); 723 setOperationAction(ISD::OR, (MVT::SimpleValueType)VT, Promote); 724 AddPromotedToType (ISD::OR, (MVT::SimpleValueType)VT, MVT::v2i64); 725 setOperationAction(ISD::XOR, (MVT::SimpleValueType)VT, Promote); 726 AddPromotedToType (ISD::XOR, (MVT::SimpleValueType)VT, MVT::v2i64); 727 setOperationAction(ISD::LOAD, (MVT::SimpleValueType)VT, Promote); 728 AddPromotedToType (ISD::LOAD, (MVT::SimpleValueType)VT, MVT::v2i64); 729 setOperationAction(ISD::SELECT, (MVT::SimpleValueType)VT, Promote); 730 AddPromotedToType (ISD::SELECT, (MVT::SimpleValueType)VT, MVT::v2i64); 731 } 732 733 setTruncStoreAction(MVT::f64, MVT::f32, Expand); 734 735 // Custom lower v2i64 and v2f64 selects. 736 setOperationAction(ISD::LOAD, MVT::v2f64, Legal); 737 setOperationAction(ISD::LOAD, MVT::v2i64, Legal); 738 setOperationAction(ISD::SELECT, MVT::v2f64, Custom); 739 setOperationAction(ISD::SELECT, MVT::v2i64, Custom); 740 741 setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal); 742 setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal); 743 if (!DisableMMX && Subtarget->hasMMX()) { 744 setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Custom); 745 setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom); 746 } 747 } 748 749 if (Subtarget->hasSSE41()) { 750 // FIXME: Do we need to handle scalar-to-vector here? 751 setOperationAction(ISD::MUL, MVT::v4i32, Legal); 752 753 // i8 and i16 vectors are custom , because the source register and source 754 // source memory operand types are not the same width. f32 vectors are 755 // custom since the immediate controlling the insert encodes additional 756 // information. 757 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom); 758 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom); 759 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom); 760 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom); 761 762 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i8, Custom); 763 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Custom); 764 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom); 765 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom); 766 767 if (Subtarget->is64Bit()) { 768 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Legal); 769 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Legal); 770 } 771 } 772 773 if (Subtarget->hasSSE42()) { 774 setOperationAction(ISD::VSETCC, MVT::v2i64, Custom); 775 } 776 777 // We want to custom lower some of our intrinsics. 778 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); 779 780 // Add/Sub/Mul with overflow operations are custom lowered. 781 setOperationAction(ISD::SADDO, MVT::i32, Custom); 782 setOperationAction(ISD::SADDO, MVT::i64, Custom); 783 setOperationAction(ISD::UADDO, MVT::i32, Custom); 784 setOperationAction(ISD::UADDO, MVT::i64, Custom); 785 setOperationAction(ISD::SSUBO, MVT::i32, Custom); 786 setOperationAction(ISD::SSUBO, MVT::i64, Custom); 787 setOperationAction(ISD::USUBO, MVT::i32, Custom); 788 setOperationAction(ISD::USUBO, MVT::i64, Custom); 789 setOperationAction(ISD::SMULO, MVT::i32, Custom); 790 setOperationAction(ISD::SMULO, MVT::i64, Custom); 791 setOperationAction(ISD::UMULO, MVT::i32, Custom); 792 setOperationAction(ISD::UMULO, MVT::i64, Custom); 793 794 if (!Subtarget->is64Bit()) { 795 // These libcalls are not available in 32-bit. 796 setLibcallName(RTLIB::SHL_I128, 0); 797 setLibcallName(RTLIB::SRL_I128, 0); 798 setLibcallName(RTLIB::SRA_I128, 0); 799 } 800 801 // We have target-specific dag combine patterns for the following nodes: 802 setTargetDAGCombine(ISD::VECTOR_SHUFFLE); 803 setTargetDAGCombine(ISD::BUILD_VECTOR); 804 setTargetDAGCombine(ISD::SELECT); 805 setTargetDAGCombine(ISD::SHL); 806 setTargetDAGCombine(ISD::SRA); 807 setTargetDAGCombine(ISD::SRL); 808 setTargetDAGCombine(ISD::STORE); 809 if (Subtarget->is64Bit()) 810 setTargetDAGCombine(ISD::MUL); 811 812 computeRegisterProperties(); 813 814 // FIXME: These should be based on subtarget info. Plus, the values should 815 // be smaller when we are in optimizing for size mode. 816 maxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores 817 maxStoresPerMemcpy = 16; // For @llvm.memcpy -> sequence of stores 818 maxStoresPerMemmove = 3; // For @llvm.memmove -> sequence of stores 819 allowUnalignedMemoryAccesses = true; // x86 supports it! 820 setPrefLoopAlignment(16); 821 benefitFromCodePlacementOpt = true; 822} 823 824 825MVT X86TargetLowering::getSetCCResultType(MVT VT) const { 826 return MVT::i8; 827} 828 829 830/// getMaxByValAlign - Helper for getByValTypeAlignment to determine 831/// the desired ByVal argument alignment. 832static void getMaxByValAlign(const Type *Ty, unsigned &MaxAlign) { 833 if (MaxAlign == 16) 834 return; 835 if (const VectorType *VTy = dyn_cast<VectorType>(Ty)) { 836 if (VTy->getBitWidth() == 128) 837 MaxAlign = 16; 838 } else if (const ArrayType *ATy = dyn_cast<ArrayType>(Ty)) { 839 unsigned EltAlign = 0; 840 getMaxByValAlign(ATy->getElementType(), EltAlign); 841 if (EltAlign > MaxAlign) 842 MaxAlign = EltAlign; 843 } else if (const StructType *STy = dyn_cast<StructType>(Ty)) { 844 for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) { 845 unsigned EltAlign = 0; 846 getMaxByValAlign(STy->getElementType(i), EltAlign); 847 if (EltAlign > MaxAlign) 848 MaxAlign = EltAlign; 849 if (MaxAlign == 16) 850 break; 851 } 852 } 853 return; 854} 855 856/// getByValTypeAlignment - Return the desired alignment for ByVal aggregate 857/// function arguments in the caller parameter area. For X86, aggregates 858/// that contain SSE vectors are placed at 16-byte boundaries while the rest 859/// are at 4-byte boundaries. 860unsigned X86TargetLowering::getByValTypeAlignment(const Type *Ty) const { 861 if (Subtarget->is64Bit()) { 862 // Max of 8 and alignment of type. 863 unsigned TyAlign = TD->getABITypeAlignment(Ty); 864 if (TyAlign > 8) 865 return TyAlign; 866 return 8; 867 } 868 869 unsigned Align = 4; 870 if (Subtarget->hasSSE1()) 871 getMaxByValAlign(Ty, Align); 872 return Align; 873} 874 875/// getOptimalMemOpType - Returns the target specific optimal type for load 876/// and store operations as a result of memset, memcpy, and memmove 877/// lowering. It returns MVT::iAny if SelectionDAG should be responsible for 878/// determining it. 879MVT 880X86TargetLowering::getOptimalMemOpType(uint64_t Size, unsigned Align, 881 bool isSrcConst, bool isSrcStr, 882 SelectionDAG &DAG) const { 883 // FIXME: This turns off use of xmm stores for memset/memcpy on targets like 884 // linux. This is because the stack realignment code can't handle certain 885 // cases like PR2962. This should be removed when PR2962 is fixed. 886 const Function *F = DAG.getMachineFunction().getFunction(); 887 bool NoImplicitFloatOps = F->hasFnAttr(Attribute::NoImplicitFloat); 888 if (!NoImplicitFloatOps && Subtarget->getStackAlignment() >= 16) { 889 if ((isSrcConst || isSrcStr) && Subtarget->hasSSE2() && Size >= 16) 890 return MVT::v4i32; 891 if ((isSrcConst || isSrcStr) && Subtarget->hasSSE1() && Size >= 16) 892 return MVT::v4f32; 893 } 894 if (Subtarget->is64Bit() && Size >= 8) 895 return MVT::i64; 896 return MVT::i32; 897} 898 899/// getPICJumpTableRelocaBase - Returns relocation base for the given PIC 900/// jumptable. 901SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table, 902 SelectionDAG &DAG) const { 903 if (usesGlobalOffsetTable()) 904 return DAG.getGLOBAL_OFFSET_TABLE(getPointerTy()); 905 if (!Subtarget->isPICStyleRIPRel()) 906 // This doesn't have DebugLoc associated with it, but is not really the 907 // same as a Register. 908 return DAG.getNode(X86ISD::GlobalBaseReg, DebugLoc::getUnknownLoc(), 909 getPointerTy()); 910 return Table; 911} 912 913//===----------------------------------------------------------------------===// 914// Return Value Calling Convention Implementation 915//===----------------------------------------------------------------------===// 916 917#include "X86GenCallingConv.inc" 918 919/// LowerRET - Lower an ISD::RET node. 920SDValue X86TargetLowering::LowerRET(SDValue Op, SelectionDAG &DAG) { 921 DebugLoc dl = Op.getDebugLoc(); 922 assert((Op.getNumOperands() & 1) == 1 && "ISD::RET should have odd # args"); 923 924 SmallVector<CCValAssign, 16> RVLocs; 925 unsigned CC = DAG.getMachineFunction().getFunction()->getCallingConv(); 926 bool isVarArg = DAG.getMachineFunction().getFunction()->isVarArg(); 927 CCState CCInfo(CC, isVarArg, getTargetMachine(), RVLocs); 928 CCInfo.AnalyzeReturn(Op.getNode(), RetCC_X86); 929 930 // If this is the first return lowered for this function, add the regs to the 931 // liveout set for the function. 932 if (DAG.getMachineFunction().getRegInfo().liveout_empty()) { 933 for (unsigned i = 0; i != RVLocs.size(); ++i) 934 if (RVLocs[i].isRegLoc()) 935 DAG.getMachineFunction().getRegInfo().addLiveOut(RVLocs[i].getLocReg()); 936 } 937 SDValue Chain = Op.getOperand(0); 938 939 // Handle tail call return. 940 Chain = GetPossiblePreceedingTailCall(Chain, X86ISD::TAILCALL); 941 if (Chain.getOpcode() == X86ISD::TAILCALL) { 942 SDValue TailCall = Chain; 943 SDValue TargetAddress = TailCall.getOperand(1); 944 SDValue StackAdjustment = TailCall.getOperand(2); 945 assert(((TargetAddress.getOpcode() == ISD::Register && 946 (cast<RegisterSDNode>(TargetAddress)->getReg() == X86::EAX || 947 cast<RegisterSDNode>(TargetAddress)->getReg() == X86::R9)) || 948 TargetAddress.getOpcode() == ISD::TargetExternalSymbol || 949 TargetAddress.getOpcode() == ISD::TargetGlobalAddress) && 950 "Expecting an global address, external symbol, or register"); 951 assert(StackAdjustment.getOpcode() == ISD::Constant && 952 "Expecting a const value"); 953 954 SmallVector<SDValue,8> Operands; 955 Operands.push_back(Chain.getOperand(0)); 956 Operands.push_back(TargetAddress); 957 Operands.push_back(StackAdjustment); 958 // Copy registers used by the call. Last operand is a flag so it is not 959 // copied. 960 for (unsigned i=3; i < TailCall.getNumOperands()-1; i++) { 961 Operands.push_back(Chain.getOperand(i)); 962 } 963 return DAG.getNode(X86ISD::TC_RETURN, dl, MVT::Other, &Operands[0], 964 Operands.size()); 965 } 966 967 // Regular return. 968 SDValue Flag; 969 970 SmallVector<SDValue, 6> RetOps; 971 RetOps.push_back(Chain); // Operand #0 = Chain (updated below) 972 // Operand #1 = Bytes To Pop 973 RetOps.push_back(DAG.getConstant(getBytesToPopOnReturn(), MVT::i16)); 974 975 // Copy the result values into the output registers. 976 for (unsigned i = 0; i != RVLocs.size(); ++i) { 977 CCValAssign &VA = RVLocs[i]; 978 assert(VA.isRegLoc() && "Can only return in registers!"); 979 SDValue ValToCopy = Op.getOperand(i*2+1); 980 981 // Returns in ST0/ST1 are handled specially: these are pushed as operands to 982 // the RET instruction and handled by the FP Stackifier. 983 if (VA.getLocReg() == X86::ST0 || 984 VA.getLocReg() == X86::ST1) { 985 // If this is a copy from an xmm register to ST(0), use an FPExtend to 986 // change the value to the FP stack register class. 987 if (isScalarFPTypeInSSEReg(VA.getValVT())) 988 ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy); 989 RetOps.push_back(ValToCopy); 990 // Don't emit a copytoreg. 991 continue; 992 } 993 994 // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64 995 // which is returned in RAX / RDX. 996 if (Subtarget->is64Bit()) { 997 MVT ValVT = ValToCopy.getValueType(); 998 if (ValVT.isVector() && ValVT.getSizeInBits() == 64) { 999 ValToCopy = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i64, ValToCopy); 1000 if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) 1001 ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, ValToCopy); 1002 } 1003 } 1004 1005 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), ValToCopy, Flag); 1006 Flag = Chain.getValue(1); 1007 } 1008 1009 // The x86-64 ABI for returning structs by value requires that we copy 1010 // the sret argument into %rax for the return. We saved the argument into 1011 // a virtual register in the entry block, so now we copy the value out 1012 // and into %rax. 1013 if (Subtarget->is64Bit() && 1014 DAG.getMachineFunction().getFunction()->hasStructRetAttr()) { 1015 MachineFunction &MF = DAG.getMachineFunction(); 1016 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 1017 unsigned Reg = FuncInfo->getSRetReturnReg(); 1018 if (!Reg) { 1019 Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(MVT::i64)); 1020 FuncInfo->setSRetReturnReg(Reg); 1021 } 1022 SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg, getPointerTy()); 1023 1024 Chain = DAG.getCopyToReg(Chain, dl, X86::RAX, Val, Flag); 1025 Flag = Chain.getValue(1); 1026 } 1027 1028 RetOps[0] = Chain; // Update chain. 1029 1030 // Add the flag if we have it. 1031 if (Flag.getNode()) 1032 RetOps.push_back(Flag); 1033 1034 return DAG.getNode(X86ISD::RET_FLAG, dl, 1035 MVT::Other, &RetOps[0], RetOps.size()); 1036} 1037 1038 1039/// LowerCallResult - Lower the result values of an ISD::CALL into the 1040/// appropriate copies out of appropriate physical registers. This assumes that 1041/// Chain/InFlag are the input chain/flag to use, and that TheCall is the call 1042/// being lowered. The returns a SDNode with the same number of values as the 1043/// ISD::CALL. 1044SDNode *X86TargetLowering:: 1045LowerCallResult(SDValue Chain, SDValue InFlag, CallSDNode *TheCall, 1046 unsigned CallingConv, SelectionDAG &DAG) { 1047 1048 DebugLoc dl = TheCall->getDebugLoc(); 1049 // Assign locations to each value returned by this call. 1050 SmallVector<CCValAssign, 16> RVLocs; 1051 bool isVarArg = TheCall->isVarArg(); 1052 bool Is64Bit = Subtarget->is64Bit(); 1053 CCState CCInfo(CallingConv, isVarArg, getTargetMachine(), RVLocs); 1054 CCInfo.AnalyzeCallResult(TheCall, RetCC_X86); 1055 1056 SmallVector<SDValue, 8> ResultVals; 1057 1058 // Copy all of the result registers out of their specified physreg. 1059 for (unsigned i = 0; i != RVLocs.size(); ++i) { 1060 CCValAssign &VA = RVLocs[i]; 1061 MVT CopyVT = VA.getValVT(); 1062 1063 // If this is x86-64, and we disabled SSE, we can't return FP values 1064 if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) && 1065 ((Is64Bit || TheCall->isInreg()) && !Subtarget->hasSSE1())) { 1066 cerr << "SSE register return with SSE disabled\n"; 1067 exit(1); 1068 } 1069 1070 // If this is a call to a function that returns an fp value on the floating 1071 // point stack, but where we prefer to use the value in xmm registers, copy 1072 // it out as F80 and use a truncate to move it from fp stack reg to xmm reg. 1073 if ((VA.getLocReg() == X86::ST0 || 1074 VA.getLocReg() == X86::ST1) && 1075 isScalarFPTypeInSSEReg(VA.getValVT())) { 1076 CopyVT = MVT::f80; 1077 } 1078 1079 SDValue Val; 1080 if (Is64Bit && CopyVT.isVector() && CopyVT.getSizeInBits() == 64) { 1081 // For x86-64, MMX values are returned in XMM0 / XMM1 except for v1i64. 1082 if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) { 1083 Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), 1084 MVT::v2i64, InFlag).getValue(1); 1085 Val = Chain.getValue(0); 1086 Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, 1087 Val, DAG.getConstant(0, MVT::i64)); 1088 } else { 1089 Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), 1090 MVT::i64, InFlag).getValue(1); 1091 Val = Chain.getValue(0); 1092 } 1093 Val = DAG.getNode(ISD::BIT_CONVERT, dl, CopyVT, Val); 1094 } else { 1095 Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), 1096 CopyVT, InFlag).getValue(1); 1097 Val = Chain.getValue(0); 1098 } 1099 InFlag = Chain.getValue(2); 1100 1101 if (CopyVT != VA.getValVT()) { 1102 // Round the F80 the right size, which also moves to the appropriate xmm 1103 // register. 1104 Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val, 1105 // This truncation won't change the value. 1106 DAG.getIntPtrConstant(1)); 1107 } 1108 1109 ResultVals.push_back(Val); 1110 } 1111 1112 // Merge everything together with a MERGE_VALUES node. 1113 ResultVals.push_back(Chain); 1114 return DAG.getNode(ISD::MERGE_VALUES, dl, TheCall->getVTList(), 1115 &ResultVals[0], ResultVals.size()).getNode(); 1116} 1117 1118 1119//===----------------------------------------------------------------------===// 1120// C & StdCall & Fast Calling Convention implementation 1121//===----------------------------------------------------------------------===// 1122// StdCall calling convention seems to be standard for many Windows' API 1123// routines and around. It differs from C calling convention just a little: 1124// callee should clean up the stack, not caller. Symbols should be also 1125// decorated in some fancy way :) It doesn't support any vector arguments. 1126// For info on fast calling convention see Fast Calling Convention (tail call) 1127// implementation LowerX86_32FastCCCallTo. 1128 1129/// CallIsStructReturn - Determines whether a CALL node uses struct return 1130/// semantics. 1131static bool CallIsStructReturn(CallSDNode *TheCall) { 1132 unsigned NumOps = TheCall->getNumArgs(); 1133 if (!NumOps) 1134 return false; 1135 1136 return TheCall->getArgFlags(0).isSRet(); 1137} 1138 1139/// ArgsAreStructReturn - Determines whether a FORMAL_ARGUMENTS node uses struct 1140/// return semantics. 1141static bool ArgsAreStructReturn(SDValue Op) { 1142 unsigned NumArgs = Op.getNode()->getNumValues() - 1; 1143 if (!NumArgs) 1144 return false; 1145 1146 return cast<ARG_FLAGSSDNode>(Op.getOperand(3))->getArgFlags().isSRet(); 1147} 1148 1149/// IsCalleePop - Determines whether a CALL or FORMAL_ARGUMENTS node requires 1150/// the callee to pop its own arguments. Callee pop is necessary to support tail 1151/// calls. 1152bool X86TargetLowering::IsCalleePop(bool IsVarArg, unsigned CallingConv) { 1153 if (IsVarArg) 1154 return false; 1155 1156 switch (CallingConv) { 1157 default: 1158 return false; 1159 case CallingConv::X86_StdCall: 1160 return !Subtarget->is64Bit(); 1161 case CallingConv::X86_FastCall: 1162 return !Subtarget->is64Bit(); 1163 case CallingConv::Fast: 1164 return PerformTailCallOpt; 1165 } 1166} 1167 1168/// CCAssignFnForNode - Selects the correct CCAssignFn for a the 1169/// given CallingConvention value. 1170CCAssignFn *X86TargetLowering::CCAssignFnForNode(unsigned CC) const { 1171 if (Subtarget->is64Bit()) { 1172 if (Subtarget->isTargetWin64()) 1173 return CC_X86_Win64_C; 1174 else if (CC == CallingConv::Fast && PerformTailCallOpt) 1175 return CC_X86_64_TailCall; 1176 else 1177 return CC_X86_64_C; 1178 } 1179 1180 if (CC == CallingConv::X86_FastCall) 1181 return CC_X86_32_FastCall; 1182 else if (CC == CallingConv::Fast) 1183 return CC_X86_32_FastCC; 1184 else 1185 return CC_X86_32_C; 1186} 1187 1188/// NameDecorationForFORMAL_ARGUMENTS - Selects the appropriate decoration to 1189/// apply to a MachineFunction containing a given FORMAL_ARGUMENTS node. 1190NameDecorationStyle 1191X86TargetLowering::NameDecorationForFORMAL_ARGUMENTS(SDValue Op) { 1192 unsigned CC = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 1193 if (CC == CallingConv::X86_FastCall) 1194 return FastCall; 1195 else if (CC == CallingConv::X86_StdCall) 1196 return StdCall; 1197 return None; 1198} 1199 1200 1201/// CallRequiresGOTInRegister - Check whether the call requires the GOT pointer 1202/// in a register before calling. 1203bool X86TargetLowering::CallRequiresGOTPtrInReg(bool Is64Bit, bool IsTailCall) { 1204 return !IsTailCall && !Is64Bit && 1205 getTargetMachine().getRelocationModel() == Reloc::PIC_ && 1206 Subtarget->isPICStyleGOT(); 1207} 1208 1209/// CallRequiresFnAddressInReg - Check whether the call requires the function 1210/// address to be loaded in a register. 1211bool 1212X86TargetLowering::CallRequiresFnAddressInReg(bool Is64Bit, bool IsTailCall) { 1213 return !Is64Bit && IsTailCall && 1214 getTargetMachine().getRelocationModel() == Reloc::PIC_ && 1215 Subtarget->isPICStyleGOT(); 1216} 1217 1218/// CreateCopyOfByValArgument - Make a copy of an aggregate at address specified 1219/// by "Src" to address "Dst" with size and alignment information specified by 1220/// the specific parameter attribute. The copy will be passed as a byval 1221/// function parameter. 1222static SDValue 1223CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain, 1224 ISD::ArgFlagsTy Flags, SelectionDAG &DAG, 1225 DebugLoc dl) { 1226 SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i32); 1227 return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(), 1228 /*AlwaysInline=*/true, NULL, 0, NULL, 0); 1229} 1230 1231SDValue X86TargetLowering::LowerMemArgument(SDValue Op, SelectionDAG &DAG, 1232 const CCValAssign &VA, 1233 MachineFrameInfo *MFI, 1234 unsigned CC, 1235 SDValue Root, unsigned i) { 1236 // Create the nodes corresponding to a load from this parameter slot. 1237 ISD::ArgFlagsTy Flags = 1238 cast<ARG_FLAGSSDNode>(Op.getOperand(3 + i))->getArgFlags(); 1239 bool AlwaysUseMutable = (CC==CallingConv::Fast) && PerformTailCallOpt; 1240 bool isImmutable = !AlwaysUseMutable && !Flags.isByVal(); 1241 1242 // FIXME: For now, all byval parameter objects are marked mutable. This can be 1243 // changed with more analysis. 1244 // In case of tail call optimization mark all arguments mutable. Since they 1245 // could be overwritten by lowering of arguments in case of a tail call. 1246 int FI = MFI->CreateFixedObject(VA.getValVT().getSizeInBits()/8, 1247 VA.getLocMemOffset(), isImmutable); 1248 SDValue FIN = DAG.getFrameIndex(FI, getPointerTy()); 1249 if (Flags.isByVal()) 1250 return FIN; 1251 return DAG.getLoad(VA.getValVT(), Op.getDebugLoc(), Root, FIN, 1252 PseudoSourceValue::getFixedStack(FI), 0); 1253} 1254 1255SDValue 1256X86TargetLowering::LowerFORMAL_ARGUMENTS(SDValue Op, SelectionDAG &DAG) { 1257 MachineFunction &MF = DAG.getMachineFunction(); 1258 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 1259 DebugLoc dl = Op.getDebugLoc(); 1260 1261 const Function* Fn = MF.getFunction(); 1262 if (Fn->hasExternalLinkage() && 1263 Subtarget->isTargetCygMing() && 1264 Fn->getName() == "main") 1265 FuncInfo->setForceFramePointer(true); 1266 1267 // Decorate the function name. 1268 FuncInfo->setDecorationStyle(NameDecorationForFORMAL_ARGUMENTS(Op)); 1269 1270 MachineFrameInfo *MFI = MF.getFrameInfo(); 1271 SDValue Root = Op.getOperand(0); 1272 bool isVarArg = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue() != 0; 1273 unsigned CC = MF.getFunction()->getCallingConv(); 1274 bool Is64Bit = Subtarget->is64Bit(); 1275 bool IsWin64 = Subtarget->isTargetWin64(); 1276 1277 assert(!(isVarArg && CC == CallingConv::Fast) && 1278 "Var args not supported with calling convention fastcc"); 1279 1280 // Assign locations to all of the incoming arguments. 1281 SmallVector<CCValAssign, 16> ArgLocs; 1282 CCState CCInfo(CC, isVarArg, getTargetMachine(), ArgLocs); 1283 CCInfo.AnalyzeFormalArguments(Op.getNode(), CCAssignFnForNode(CC)); 1284 1285 SmallVector<SDValue, 8> ArgValues; 1286 unsigned LastVal = ~0U; 1287 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 1288 CCValAssign &VA = ArgLocs[i]; 1289 // TODO: If an arg is passed in two places (e.g. reg and stack), skip later 1290 // places. 1291 assert(VA.getValNo() != LastVal && 1292 "Don't support value assigned to multiple locs yet"); 1293 LastVal = VA.getValNo(); 1294 1295 if (VA.isRegLoc()) { 1296 MVT RegVT = VA.getLocVT(); 1297 TargetRegisterClass *RC = NULL; 1298 if (RegVT == MVT::i32) 1299 RC = X86::GR32RegisterClass; 1300 else if (Is64Bit && RegVT == MVT::i64) 1301 RC = X86::GR64RegisterClass; 1302 else if (RegVT == MVT::f32) 1303 RC = X86::FR32RegisterClass; 1304 else if (RegVT == MVT::f64) 1305 RC = X86::FR64RegisterClass; 1306 else if (RegVT.isVector() && RegVT.getSizeInBits() == 128) 1307 RC = X86::VR128RegisterClass; 1308 else if (RegVT.isVector()) { 1309 assert(RegVT.getSizeInBits() == 64); 1310 if (!Is64Bit) 1311 RC = X86::VR64RegisterClass; // MMX values are passed in MMXs. 1312 else { 1313 // Darwin calling convention passes MMX values in either GPRs or 1314 // XMMs in x86-64. Other targets pass them in memory. 1315 if (RegVT != MVT::v1i64 && Subtarget->hasSSE2()) { 1316 RC = X86::VR128RegisterClass; // MMX values are passed in XMMs. 1317 RegVT = MVT::v2i64; 1318 } else { 1319 RC = X86::GR64RegisterClass; // v1i64 values are passed in GPRs. 1320 RegVT = MVT::i64; 1321 } 1322 } 1323 } else { 1324 assert(0 && "Unknown argument type!"); 1325 } 1326 1327 unsigned Reg = DAG.getMachineFunction().addLiveIn(VA.getLocReg(), RC); 1328 SDValue ArgValue = DAG.getCopyFromReg(Root, dl, Reg, RegVT); 1329 1330 // If this is an 8 or 16-bit value, it is really passed promoted to 32 1331 // bits. Insert an assert[sz]ext to capture this, then truncate to the 1332 // right size. 1333 if (VA.getLocInfo() == CCValAssign::SExt) 1334 ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue, 1335 DAG.getValueType(VA.getValVT())); 1336 else if (VA.getLocInfo() == CCValAssign::ZExt) 1337 ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue, 1338 DAG.getValueType(VA.getValVT())); 1339 1340 if (VA.getLocInfo() != CCValAssign::Full) 1341 ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue); 1342 1343 // Handle MMX values passed in GPRs. 1344 if (Is64Bit && RegVT != VA.getLocVT()) { 1345 if (RegVT.getSizeInBits() == 64 && RC == X86::GR64RegisterClass) 1346 ArgValue = DAG.getNode(ISD::BIT_CONVERT, dl, VA.getLocVT(), ArgValue); 1347 else if (RC == X86::VR128RegisterClass) { 1348 ArgValue = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, 1349 ArgValue, DAG.getConstant(0, MVT::i64)); 1350 ArgValue = DAG.getNode(ISD::BIT_CONVERT, dl, VA.getLocVT(), ArgValue); 1351 } 1352 } 1353 1354 ArgValues.push_back(ArgValue); 1355 } else { 1356 assert(VA.isMemLoc()); 1357 ArgValues.push_back(LowerMemArgument(Op, DAG, VA, MFI, CC, Root, i)); 1358 } 1359 } 1360 1361 // The x86-64 ABI for returning structs by value requires that we copy 1362 // the sret argument into %rax for the return. Save the argument into 1363 // a virtual register so that we can access it from the return points. 1364 if (Is64Bit && DAG.getMachineFunction().getFunction()->hasStructRetAttr()) { 1365 MachineFunction &MF = DAG.getMachineFunction(); 1366 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 1367 unsigned Reg = FuncInfo->getSRetReturnReg(); 1368 if (!Reg) { 1369 Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(MVT::i64)); 1370 FuncInfo->setSRetReturnReg(Reg); 1371 } 1372 SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, ArgValues[0]); 1373 Root = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Root); 1374 } 1375 1376 unsigned StackSize = CCInfo.getNextStackOffset(); 1377 // align stack specially for tail calls 1378 if (PerformTailCallOpt && CC == CallingConv::Fast) 1379 StackSize = GetAlignedArgumentStackSize(StackSize, DAG); 1380 1381 // If the function takes variable number of arguments, make a frame index for 1382 // the start of the first vararg value... for expansion of llvm.va_start. 1383 if (isVarArg) { 1384 if (Is64Bit || CC != CallingConv::X86_FastCall) { 1385 VarArgsFrameIndex = MFI->CreateFixedObject(1, StackSize); 1386 } 1387 if (Is64Bit) { 1388 unsigned TotalNumIntRegs = 0, TotalNumXMMRegs = 0; 1389 1390 // FIXME: We should really autogenerate these arrays 1391 static const unsigned GPR64ArgRegsWin64[] = { 1392 X86::RCX, X86::RDX, X86::R8, X86::R9 1393 }; 1394 static const unsigned XMMArgRegsWin64[] = { 1395 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3 1396 }; 1397 static const unsigned GPR64ArgRegs64Bit[] = { 1398 X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9 1399 }; 1400 static const unsigned XMMArgRegs64Bit[] = { 1401 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, 1402 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7 1403 }; 1404 const unsigned *GPR64ArgRegs, *XMMArgRegs; 1405 1406 if (IsWin64) { 1407 TotalNumIntRegs = 4; TotalNumXMMRegs = 4; 1408 GPR64ArgRegs = GPR64ArgRegsWin64; 1409 XMMArgRegs = XMMArgRegsWin64; 1410 } else { 1411 TotalNumIntRegs = 6; TotalNumXMMRegs = 8; 1412 GPR64ArgRegs = GPR64ArgRegs64Bit; 1413 XMMArgRegs = XMMArgRegs64Bit; 1414 } 1415 unsigned NumIntRegs = CCInfo.getFirstUnallocated(GPR64ArgRegs, 1416 TotalNumIntRegs); 1417 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 1418 TotalNumXMMRegs); 1419 1420 bool NoImplicitFloatOps = Fn->hasFnAttr(Attribute::NoImplicitFloat); 1421 assert(!(NumXMMRegs && !Subtarget->hasSSE1()) && 1422 "SSE register cannot be used when SSE is disabled!"); 1423 assert(!(NumXMMRegs && UseSoftFloat && NoImplicitFloatOps) && 1424 "SSE register cannot be used when SSE is disabled!"); 1425 if (UseSoftFloat || NoImplicitFloatOps || !Subtarget->hasSSE1()) 1426 // Kernel mode asks for SSE to be disabled, so don't push them 1427 // on the stack. 1428 TotalNumXMMRegs = 0; 1429 1430 // For X86-64, if there are vararg parameters that are passed via 1431 // registers, then we must store them to their spots on the stack so they 1432 // may be loaded by deferencing the result of va_next. 1433 VarArgsGPOffset = NumIntRegs * 8; 1434 VarArgsFPOffset = TotalNumIntRegs * 8 + NumXMMRegs * 16; 1435 RegSaveFrameIndex = MFI->CreateStackObject(TotalNumIntRegs * 8 + 1436 TotalNumXMMRegs * 16, 16); 1437 1438 // Store the integer parameter registers. 1439 SmallVector<SDValue, 8> MemOps; 1440 SDValue RSFIN = DAG.getFrameIndex(RegSaveFrameIndex, getPointerTy()); 1441 SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN, 1442 DAG.getIntPtrConstant(VarArgsGPOffset)); 1443 for (; NumIntRegs != TotalNumIntRegs; ++NumIntRegs) { 1444 unsigned VReg = MF.addLiveIn(GPR64ArgRegs[NumIntRegs], 1445 X86::GR64RegisterClass); 1446 SDValue Val = DAG.getCopyFromReg(Root, dl, VReg, MVT::i64); 1447 SDValue Store = 1448 DAG.getStore(Val.getValue(1), dl, Val, FIN, 1449 PseudoSourceValue::getFixedStack(RegSaveFrameIndex), 0); 1450 MemOps.push_back(Store); 1451 FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), FIN, 1452 DAG.getIntPtrConstant(8)); 1453 } 1454 1455 // Now store the XMM (fp + vector) parameter registers. 1456 FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN, 1457 DAG.getIntPtrConstant(VarArgsFPOffset)); 1458 for (; NumXMMRegs != TotalNumXMMRegs; ++NumXMMRegs) { 1459 unsigned VReg = MF.addLiveIn(XMMArgRegs[NumXMMRegs], 1460 X86::VR128RegisterClass); 1461 SDValue Val = DAG.getCopyFromReg(Root, dl, VReg, MVT::v4f32); 1462 SDValue Store = 1463 DAG.getStore(Val.getValue(1), dl, Val, FIN, 1464 PseudoSourceValue::getFixedStack(RegSaveFrameIndex), 0); 1465 MemOps.push_back(Store); 1466 FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), FIN, 1467 DAG.getIntPtrConstant(16)); 1468 } 1469 if (!MemOps.empty()) 1470 Root = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, 1471 &MemOps[0], MemOps.size()); 1472 } 1473 } 1474 1475 ArgValues.push_back(Root); 1476 1477 // Some CCs need callee pop. 1478 if (IsCalleePop(isVarArg, CC)) { 1479 BytesToPopOnReturn = StackSize; // Callee pops everything. 1480 BytesCallerReserves = 0; 1481 } else { 1482 BytesToPopOnReturn = 0; // Callee pops nothing. 1483 // If this is an sret function, the return should pop the hidden pointer. 1484 if (!Is64Bit && CC != CallingConv::Fast && ArgsAreStructReturn(Op)) 1485 BytesToPopOnReturn = 4; 1486 BytesCallerReserves = StackSize; 1487 } 1488 1489 if (!Is64Bit) { 1490 RegSaveFrameIndex = 0xAAAAAAA; // RegSaveFrameIndex is X86-64 only. 1491 if (CC == CallingConv::X86_FastCall) 1492 VarArgsFrameIndex = 0xAAAAAAA; // fastcc functions can't have varargs. 1493 } 1494 1495 FuncInfo->setBytesToPopOnReturn(BytesToPopOnReturn); 1496 1497 // Return the new list of results. 1498 return DAG.getNode(ISD::MERGE_VALUES, dl, Op.getNode()->getVTList(), 1499 &ArgValues[0], ArgValues.size()).getValue(Op.getResNo()); 1500} 1501 1502SDValue 1503X86TargetLowering::LowerMemOpCallTo(CallSDNode *TheCall, SelectionDAG &DAG, 1504 const SDValue &StackPtr, 1505 const CCValAssign &VA, 1506 SDValue Chain, 1507 SDValue Arg, ISD::ArgFlagsTy Flags) { 1508 DebugLoc dl = TheCall->getDebugLoc(); 1509 unsigned LocMemOffset = VA.getLocMemOffset(); 1510 SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset); 1511 PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff); 1512 if (Flags.isByVal()) { 1513 return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl); 1514 } 1515 return DAG.getStore(Chain, dl, Arg, PtrOff, 1516 PseudoSourceValue::getStack(), LocMemOffset); 1517} 1518 1519/// EmitTailCallLoadRetAddr - Emit a load of return address if tail call 1520/// optimization is performed and it is required. 1521SDValue 1522X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG, 1523 SDValue &OutRetAddr, 1524 SDValue Chain, 1525 bool IsTailCall, 1526 bool Is64Bit, 1527 int FPDiff, 1528 DebugLoc dl) { 1529 if (!IsTailCall || FPDiff==0) return Chain; 1530 1531 // Adjust the Return address stack slot. 1532 MVT VT = getPointerTy(); 1533 OutRetAddr = getReturnAddressFrameIndex(DAG); 1534 1535 // Load the "old" Return address. 1536 OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, NULL, 0); 1537 return SDValue(OutRetAddr.getNode(), 1); 1538} 1539 1540/// EmitTailCallStoreRetAddr - Emit a store of the return adress if tail call 1541/// optimization is performed and it is required (FPDiff!=0). 1542static SDValue 1543EmitTailCallStoreRetAddr(SelectionDAG & DAG, MachineFunction &MF, 1544 SDValue Chain, SDValue RetAddrFrIdx, 1545 bool Is64Bit, int FPDiff, DebugLoc dl) { 1546 // Store the return address to the appropriate stack slot. 1547 if (!FPDiff) return Chain; 1548 // Calculate the new stack slot for the return address. 1549 int SlotSize = Is64Bit ? 8 : 4; 1550 int NewReturnAddrFI = 1551 MF.getFrameInfo()->CreateFixedObject(SlotSize, FPDiff-SlotSize); 1552 MVT VT = Is64Bit ? MVT::i64 : MVT::i32; 1553 SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, VT); 1554 Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx, 1555 PseudoSourceValue::getFixedStack(NewReturnAddrFI), 0); 1556 return Chain; 1557} 1558 1559SDValue X86TargetLowering::LowerCALL(SDValue Op, SelectionDAG &DAG) { 1560 MachineFunction &MF = DAG.getMachineFunction(); 1561 CallSDNode *TheCall = cast<CallSDNode>(Op.getNode()); 1562 SDValue Chain = TheCall->getChain(); 1563 unsigned CC = TheCall->getCallingConv(); 1564 bool isVarArg = TheCall->isVarArg(); 1565 bool IsTailCall = TheCall->isTailCall() && 1566 CC == CallingConv::Fast && PerformTailCallOpt; 1567 SDValue Callee = TheCall->getCallee(); 1568 bool Is64Bit = Subtarget->is64Bit(); 1569 bool IsStructRet = CallIsStructReturn(TheCall); 1570 DebugLoc dl = TheCall->getDebugLoc(); 1571 1572 assert(!(isVarArg && CC == CallingConv::Fast) && 1573 "Var args not supported with calling convention fastcc"); 1574 1575 // Analyze operands of the call, assigning locations to each operand. 1576 SmallVector<CCValAssign, 16> ArgLocs; 1577 CCState CCInfo(CC, isVarArg, getTargetMachine(), ArgLocs); 1578 CCInfo.AnalyzeCallOperands(TheCall, CCAssignFnForNode(CC)); 1579 1580 // Get a count of how many bytes are to be pushed on the stack. 1581 unsigned NumBytes = CCInfo.getNextStackOffset(); 1582 if (PerformTailCallOpt && CC == CallingConv::Fast) 1583 NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG); 1584 1585 int FPDiff = 0; 1586 if (IsTailCall) { 1587 // Lower arguments at fp - stackoffset + fpdiff. 1588 unsigned NumBytesCallerPushed = 1589 MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn(); 1590 FPDiff = NumBytesCallerPushed - NumBytes; 1591 1592 // Set the delta of movement of the returnaddr stackslot. 1593 // But only set if delta is greater than previous delta. 1594 if (FPDiff < (MF.getInfo<X86MachineFunctionInfo>()->getTCReturnAddrDelta())) 1595 MF.getInfo<X86MachineFunctionInfo>()->setTCReturnAddrDelta(FPDiff); 1596 } 1597 1598 Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true)); 1599 1600 SDValue RetAddrFrIdx; 1601 // Load return adress for tail calls. 1602 Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, IsTailCall, Is64Bit, 1603 FPDiff, dl); 1604 1605 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass; 1606 SmallVector<SDValue, 8> MemOpChains; 1607 SDValue StackPtr; 1608 1609 // Walk the register/memloc assignments, inserting copies/loads. In the case 1610 // of tail call optimization arguments are handle later. 1611 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 1612 CCValAssign &VA = ArgLocs[i]; 1613 SDValue Arg = TheCall->getArg(i); 1614 ISD::ArgFlagsTy Flags = TheCall->getArgFlags(i); 1615 bool isByVal = Flags.isByVal(); 1616 1617 // Promote the value if needed. 1618 switch (VA.getLocInfo()) { 1619 default: assert(0 && "Unknown loc info!"); 1620 case CCValAssign::Full: break; 1621 case CCValAssign::SExt: 1622 Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg); 1623 break; 1624 case CCValAssign::ZExt: 1625 Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Arg); 1626 break; 1627 case CCValAssign::AExt: 1628 Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg); 1629 break; 1630 } 1631 1632 if (VA.isRegLoc()) { 1633 if (Is64Bit) { 1634 MVT RegVT = VA.getLocVT(); 1635 if (RegVT.isVector() && RegVT.getSizeInBits() == 64) 1636 switch (VA.getLocReg()) { 1637 default: 1638 break; 1639 case X86::RDI: case X86::RSI: case X86::RDX: case X86::RCX: 1640 case X86::R8: { 1641 // Special case: passing MMX values in GPR registers. 1642 Arg = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i64, Arg); 1643 break; 1644 } 1645 case X86::XMM0: case X86::XMM1: case X86::XMM2: case X86::XMM3: 1646 case X86::XMM4: case X86::XMM5: case X86::XMM6: case X86::XMM7: { 1647 // Special case: passing MMX values in XMM registers. 1648 Arg = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i64, Arg); 1649 Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg); 1650 Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg); 1651 break; 1652 } 1653 } 1654 } 1655 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); 1656 } else { 1657 if (!IsTailCall || (IsTailCall && isByVal)) { 1658 assert(VA.isMemLoc()); 1659 if (StackPtr.getNode() == 0) 1660 StackPtr = DAG.getCopyFromReg(Chain, dl, X86StackPtr, getPointerTy()); 1661 1662 MemOpChains.push_back(LowerMemOpCallTo(TheCall, DAG, StackPtr, VA, 1663 Chain, Arg, Flags)); 1664 } 1665 } 1666 } 1667 1668 if (!MemOpChains.empty()) 1669 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, 1670 &MemOpChains[0], MemOpChains.size()); 1671 1672 // Build a sequence of copy-to-reg nodes chained together with token chain 1673 // and flag operands which copy the outgoing args into registers. 1674 SDValue InFlag; 1675 // Tail call byval lowering might overwrite argument registers so in case of 1676 // tail call optimization the copies to registers are lowered later. 1677 if (!IsTailCall) 1678 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { 1679 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, 1680 RegsToPass[i].second, InFlag); 1681 InFlag = Chain.getValue(1); 1682 } 1683 1684 // ELF / PIC requires GOT in the EBX register before function calls via PLT 1685 // GOT pointer. 1686 if (CallRequiresGOTPtrInReg(Is64Bit, IsTailCall)) { 1687 Chain = DAG.getCopyToReg(Chain, dl, X86::EBX, 1688 DAG.getNode(X86ISD::GlobalBaseReg, 1689 DebugLoc::getUnknownLoc(), 1690 getPointerTy()), 1691 InFlag); 1692 InFlag = Chain.getValue(1); 1693 } 1694 // If we are tail calling and generating PIC/GOT style code load the address 1695 // of the callee into ecx. The value in ecx is used as target of the tail 1696 // jump. This is done to circumvent the ebx/callee-saved problem for tail 1697 // calls on PIC/GOT architectures. Normally we would just put the address of 1698 // GOT into ebx and then call target@PLT. But for tail callss ebx would be 1699 // restored (since ebx is callee saved) before jumping to the target@PLT. 1700 if (CallRequiresFnAddressInReg(Is64Bit, IsTailCall)) { 1701 // Note: The actual moving to ecx is done further down. 1702 GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee); 1703 if (G && !G->getGlobal()->hasHiddenVisibility() && 1704 !G->getGlobal()->hasProtectedVisibility()) 1705 Callee = LowerGlobalAddress(Callee, DAG); 1706 else if (isa<ExternalSymbolSDNode>(Callee)) 1707 Callee = LowerExternalSymbol(Callee,DAG); 1708 } 1709 1710 if (Is64Bit && isVarArg) { 1711 // From AMD64 ABI document: 1712 // For calls that may call functions that use varargs or stdargs 1713 // (prototype-less calls or calls to functions containing ellipsis (...) in 1714 // the declaration) %al is used as hidden argument to specify the number 1715 // of SSE registers used. The contents of %al do not need to match exactly 1716 // the number of registers, but must be an ubound on the number of SSE 1717 // registers used and is in the range 0 - 8 inclusive. 1718 1719 // FIXME: Verify this on Win64 1720 // Count the number of XMM registers allocated. 1721 static const unsigned XMMArgRegs[] = { 1722 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, 1723 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7 1724 }; 1725 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8); 1726 assert((Subtarget->hasSSE1() || !NumXMMRegs) 1727 && "SSE registers cannot be used when SSE is disabled"); 1728 1729 Chain = DAG.getCopyToReg(Chain, dl, X86::AL, 1730 DAG.getConstant(NumXMMRegs, MVT::i8), InFlag); 1731 InFlag = Chain.getValue(1); 1732 } 1733 1734 1735 // For tail calls lower the arguments to the 'real' stack slot. 1736 if (IsTailCall) { 1737 SmallVector<SDValue, 8> MemOpChains2; 1738 SDValue FIN; 1739 int FI = 0; 1740 // Do not flag preceeding copytoreg stuff together with the following stuff. 1741 InFlag = SDValue(); 1742 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 1743 CCValAssign &VA = ArgLocs[i]; 1744 if (!VA.isRegLoc()) { 1745 assert(VA.isMemLoc()); 1746 SDValue Arg = TheCall->getArg(i); 1747 ISD::ArgFlagsTy Flags = TheCall->getArgFlags(i); 1748 // Create frame index. 1749 int32_t Offset = VA.getLocMemOffset()+FPDiff; 1750 uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8; 1751 FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset); 1752 FIN = DAG.getFrameIndex(FI, getPointerTy()); 1753 1754 if (Flags.isByVal()) { 1755 // Copy relative to framepointer. 1756 SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset()); 1757 if (StackPtr.getNode() == 0) 1758 StackPtr = DAG.getCopyFromReg(Chain, dl, X86StackPtr, 1759 getPointerTy()); 1760 Source = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, Source); 1761 1762 MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN, Chain, 1763 Flags, DAG, dl)); 1764 } else { 1765 // Store relative to framepointer. 1766 MemOpChains2.push_back( 1767 DAG.getStore(Chain, dl, Arg, FIN, 1768 PseudoSourceValue::getFixedStack(FI), 0)); 1769 } 1770 } 1771 } 1772 1773 if (!MemOpChains2.empty()) 1774 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, 1775 &MemOpChains2[0], MemOpChains2.size()); 1776 1777 // Copy arguments to their registers. 1778 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { 1779 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, 1780 RegsToPass[i].second, InFlag); 1781 InFlag = Chain.getValue(1); 1782 } 1783 InFlag =SDValue(); 1784 1785 // Store the return address to the appropriate stack slot. 1786 Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx, Is64Bit, 1787 FPDiff, dl); 1788 } 1789 1790 // If the callee is a GlobalAddress node (quite common, every direct call is) 1791 // turn it into a TargetGlobalAddress node so that legalize doesn't hack it. 1792 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) { 1793 // We should use extra load for direct calls to dllimported functions in 1794 // non-JIT mode. 1795 if (!Subtarget->GVRequiresExtraLoad(G->getGlobal(), 1796 getTargetMachine(), true)) 1797 Callee = DAG.getTargetGlobalAddress(G->getGlobal(), getPointerTy(), 1798 G->getOffset()); 1799 } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) { 1800 Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy()); 1801 } else if (IsTailCall) { 1802 unsigned Opc = Is64Bit ? X86::R9 : X86::EAX; 1803 1804 Chain = DAG.getCopyToReg(Chain, dl, 1805 DAG.getRegister(Opc, getPointerTy()), 1806 Callee,InFlag); 1807 Callee = DAG.getRegister(Opc, getPointerTy()); 1808 // Add register as live out. 1809 DAG.getMachineFunction().getRegInfo().addLiveOut(Opc); 1810 } 1811 1812 // Returns a chain & a flag for retval copy to use. 1813 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag); 1814 SmallVector<SDValue, 8> Ops; 1815 1816 if (IsTailCall) { 1817 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true), 1818 DAG.getIntPtrConstant(0, true), InFlag); 1819 InFlag = Chain.getValue(1); 1820 1821 // Returns a chain & a flag for retval copy to use. 1822 NodeTys = DAG.getVTList(MVT::Other, MVT::Flag); 1823 Ops.clear(); 1824 } 1825 1826 Ops.push_back(Chain); 1827 Ops.push_back(Callee); 1828 1829 if (IsTailCall) 1830 Ops.push_back(DAG.getConstant(FPDiff, MVT::i32)); 1831 1832 // Add argument registers to the end of the list so that they are known live 1833 // into the call. 1834 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) 1835 Ops.push_back(DAG.getRegister(RegsToPass[i].first, 1836 RegsToPass[i].second.getValueType())); 1837 1838 // Add an implicit use GOT pointer in EBX. 1839 if (!IsTailCall && !Is64Bit && 1840 getTargetMachine().getRelocationModel() == Reloc::PIC_ && 1841 Subtarget->isPICStyleGOT()) 1842 Ops.push_back(DAG.getRegister(X86::EBX, getPointerTy())); 1843 1844 // Add an implicit use of AL for x86 vararg functions. 1845 if (Is64Bit && isVarArg) 1846 Ops.push_back(DAG.getRegister(X86::AL, MVT::i8)); 1847 1848 if (InFlag.getNode()) 1849 Ops.push_back(InFlag); 1850 1851 if (IsTailCall) { 1852 assert(InFlag.getNode() && 1853 "Flag must be set. Depend on flag being set in LowerRET"); 1854 Chain = DAG.getNode(X86ISD::TAILCALL, dl, 1855 TheCall->getVTList(), &Ops[0], Ops.size()); 1856 1857 return SDValue(Chain.getNode(), Op.getResNo()); 1858 } 1859 1860 Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, &Ops[0], Ops.size()); 1861 InFlag = Chain.getValue(1); 1862 1863 // Create the CALLSEQ_END node. 1864 unsigned NumBytesForCalleeToPush; 1865 if (IsCalleePop(isVarArg, CC)) 1866 NumBytesForCalleeToPush = NumBytes; // Callee pops everything 1867 else if (!Is64Bit && CC != CallingConv::Fast && IsStructRet) 1868 // If this is is a call to a struct-return function, the callee 1869 // pops the hidden struct pointer, so we have to push it back. 1870 // This is common for Darwin/X86, Linux & Mingw32 targets. 1871 NumBytesForCalleeToPush = 4; 1872 else 1873 NumBytesForCalleeToPush = 0; // Callee pops nothing. 1874 1875 // Returns a flag for retval copy to use. 1876 Chain = DAG.getCALLSEQ_END(Chain, 1877 DAG.getIntPtrConstant(NumBytes, true), 1878 DAG.getIntPtrConstant(NumBytesForCalleeToPush, 1879 true), 1880 InFlag); 1881 InFlag = Chain.getValue(1); 1882 1883 // Handle result values, copying them out of physregs into vregs that we 1884 // return. 1885 return SDValue(LowerCallResult(Chain, InFlag, TheCall, CC, DAG), 1886 Op.getResNo()); 1887} 1888 1889 1890//===----------------------------------------------------------------------===// 1891// Fast Calling Convention (tail call) implementation 1892//===----------------------------------------------------------------------===// 1893 1894// Like std call, callee cleans arguments, convention except that ECX is 1895// reserved for storing the tail called function address. Only 2 registers are 1896// free for argument passing (inreg). Tail call optimization is performed 1897// provided: 1898// * tailcallopt is enabled 1899// * caller/callee are fastcc 1900// On X86_64 architecture with GOT-style position independent code only local 1901// (within module) calls are supported at the moment. 1902// To keep the stack aligned according to platform abi the function 1903// GetAlignedArgumentStackSize ensures that argument delta is always multiples 1904// of stack alignment. (Dynamic linkers need this - darwin's dyld for example) 1905// If a tail called function callee has more arguments than the caller the 1906// caller needs to make sure that there is room to move the RETADDR to. This is 1907// achieved by reserving an area the size of the argument delta right after the 1908// original REtADDR, but before the saved framepointer or the spilled registers 1909// e.g. caller(arg1, arg2) calls callee(arg1, arg2,arg3,arg4) 1910// stack layout: 1911// arg1 1912// arg2 1913// RETADDR 1914// [ new RETADDR 1915// move area ] 1916// (possible EBP) 1917// ESI 1918// EDI 1919// local1 .. 1920 1921/// GetAlignedArgumentStackSize - Make the stack size align e.g 16n + 12 aligned 1922/// for a 16 byte align requirement. 1923unsigned X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize, 1924 SelectionDAG& DAG) { 1925 MachineFunction &MF = DAG.getMachineFunction(); 1926 const TargetMachine &TM = MF.getTarget(); 1927 const TargetFrameInfo &TFI = *TM.getFrameInfo(); 1928 unsigned StackAlignment = TFI.getStackAlignment(); 1929 uint64_t AlignMask = StackAlignment - 1; 1930 int64_t Offset = StackSize; 1931 uint64_t SlotSize = TD->getPointerSize(); 1932 if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) { 1933 // Number smaller than 12 so just add the difference. 1934 Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask)); 1935 } else { 1936 // Mask out lower bits, add stackalignment once plus the 12 bytes. 1937 Offset = ((~AlignMask) & Offset) + StackAlignment + 1938 (StackAlignment-SlotSize); 1939 } 1940 return Offset; 1941} 1942 1943/// IsEligibleForTailCallElimination - Check to see whether the next instruction 1944/// following the call is a return. A function is eligible if caller/callee 1945/// calling conventions match, currently only fastcc supports tail calls, and 1946/// the function CALL is immediatly followed by a RET. 1947bool X86TargetLowering::IsEligibleForTailCallOptimization(CallSDNode *TheCall, 1948 SDValue Ret, 1949 SelectionDAG& DAG) const { 1950 if (!PerformTailCallOpt) 1951 return false; 1952 1953 if (CheckTailCallReturnConstraints(TheCall, Ret)) { 1954 MachineFunction &MF = DAG.getMachineFunction(); 1955 unsigned CallerCC = MF.getFunction()->getCallingConv(); 1956 unsigned CalleeCC= TheCall->getCallingConv(); 1957 if (CalleeCC == CallingConv::Fast && CallerCC == CalleeCC) { 1958 SDValue Callee = TheCall->getCallee(); 1959 // On x86/32Bit PIC/GOT tail calls are supported. 1960 if (getTargetMachine().getRelocationModel() != Reloc::PIC_ || 1961 !Subtarget->isPICStyleGOT()|| !Subtarget->is64Bit()) 1962 return true; 1963 1964 // Can only do local tail calls (in same module, hidden or protected) on 1965 // x86_64 PIC/GOT at the moment. 1966 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) 1967 return G->getGlobal()->hasHiddenVisibility() 1968 || G->getGlobal()->hasProtectedVisibility(); 1969 } 1970 } 1971 1972 return false; 1973} 1974 1975FastISel * 1976X86TargetLowering::createFastISel(MachineFunction &mf, 1977 MachineModuleInfo *mmo, 1978 DwarfWriter *dw, 1979 DenseMap<const Value *, unsigned> &vm, 1980 DenseMap<const BasicBlock *, 1981 MachineBasicBlock *> &bm, 1982 DenseMap<const AllocaInst *, int> &am 1983#ifndef NDEBUG 1984 , SmallSet<Instruction*, 8> &cil 1985#endif 1986 ) { 1987 return X86::createFastISel(mf, mmo, dw, vm, bm, am 1988#ifndef NDEBUG 1989 , cil 1990#endif 1991 ); 1992} 1993 1994 1995//===----------------------------------------------------------------------===// 1996// Other Lowering Hooks 1997//===----------------------------------------------------------------------===// 1998 1999 2000SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) { 2001 MachineFunction &MF = DAG.getMachineFunction(); 2002 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 2003 int ReturnAddrIndex = FuncInfo->getRAIndex(); 2004 2005 if (ReturnAddrIndex == 0) { 2006 // Set up a frame object for the return address. 2007 uint64_t SlotSize = TD->getPointerSize(); 2008 ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize, -SlotSize); 2009 FuncInfo->setRAIndex(ReturnAddrIndex); 2010 } 2011 2012 return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy()); 2013} 2014 2015 2016/// TranslateX86CC - do a one to one translation of a ISD::CondCode to the X86 2017/// specific condition code, returning the condition code and the LHS/RHS of the 2018/// comparison to make. 2019static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, bool isFP, 2020 SDValue &LHS, SDValue &RHS, SelectionDAG &DAG) { 2021 if (!isFP) { 2022 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) { 2023 if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) { 2024 // X > -1 -> X == 0, jump !sign. 2025 RHS = DAG.getConstant(0, RHS.getValueType()); 2026 return X86::COND_NS; 2027 } else if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) { 2028 // X < 0 -> X == 0, jump on sign. 2029 return X86::COND_S; 2030 } else if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) { 2031 // X < 1 -> X <= 0 2032 RHS = DAG.getConstant(0, RHS.getValueType()); 2033 return X86::COND_LE; 2034 } 2035 } 2036 2037 switch (SetCCOpcode) { 2038 default: assert(0 && "Invalid integer condition!"); 2039 case ISD::SETEQ: return X86::COND_E; 2040 case ISD::SETGT: return X86::COND_G; 2041 case ISD::SETGE: return X86::COND_GE; 2042 case ISD::SETLT: return X86::COND_L; 2043 case ISD::SETLE: return X86::COND_LE; 2044 case ISD::SETNE: return X86::COND_NE; 2045 case ISD::SETULT: return X86::COND_B; 2046 case ISD::SETUGT: return X86::COND_A; 2047 case ISD::SETULE: return X86::COND_BE; 2048 case ISD::SETUGE: return X86::COND_AE; 2049 } 2050 } 2051 2052 // First determine if it is required or is profitable to flip the operands. 2053 2054 // If LHS is a foldable load, but RHS is not, flip the condition. 2055 if ((ISD::isNON_EXTLoad(LHS.getNode()) && LHS.hasOneUse()) && 2056 !(ISD::isNON_EXTLoad(RHS.getNode()) && RHS.hasOneUse())) { 2057 SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode); 2058 std::swap(LHS, RHS); 2059 } 2060 2061 switch (SetCCOpcode) { 2062 default: break; 2063 case ISD::SETOLT: 2064 case ISD::SETOLE: 2065 case ISD::SETUGT: 2066 case ISD::SETUGE: 2067 std::swap(LHS, RHS); 2068 break; 2069 } 2070 2071 // On a floating point condition, the flags are set as follows: 2072 // ZF PF CF op 2073 // 0 | 0 | 0 | X > Y 2074 // 0 | 0 | 1 | X < Y 2075 // 1 | 0 | 0 | X == Y 2076 // 1 | 1 | 1 | unordered 2077 switch (SetCCOpcode) { 2078 default: assert(0 && "Condcode should be pre-legalized away"); 2079 case ISD::SETUEQ: 2080 case ISD::SETEQ: return X86::COND_E; 2081 case ISD::SETOLT: // flipped 2082 case ISD::SETOGT: 2083 case ISD::SETGT: return X86::COND_A; 2084 case ISD::SETOLE: // flipped 2085 case ISD::SETOGE: 2086 case ISD::SETGE: return X86::COND_AE; 2087 case ISD::SETUGT: // flipped 2088 case ISD::SETULT: 2089 case ISD::SETLT: return X86::COND_B; 2090 case ISD::SETUGE: // flipped 2091 case ISD::SETULE: 2092 case ISD::SETLE: return X86::COND_BE; 2093 case ISD::SETONE: 2094 case ISD::SETNE: return X86::COND_NE; 2095 case ISD::SETUO: return X86::COND_P; 2096 case ISD::SETO: return X86::COND_NP; 2097 } 2098} 2099 2100/// hasFPCMov - is there a floating point cmov for the specific X86 condition 2101/// code. Current x86 isa includes the following FP cmov instructions: 2102/// fcmovb, fcomvbe, fcomve, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu. 2103static bool hasFPCMov(unsigned X86CC) { 2104 switch (X86CC) { 2105 default: 2106 return false; 2107 case X86::COND_B: 2108 case X86::COND_BE: 2109 case X86::COND_E: 2110 case X86::COND_P: 2111 case X86::COND_A: 2112 case X86::COND_AE: 2113 case X86::COND_NE: 2114 case X86::COND_NP: 2115 return true; 2116 } 2117} 2118 2119/// isUndefOrInRange - Return true if Val is undef or if its value falls within 2120/// the specified range (L, H]. 2121static bool isUndefOrInRange(int Val, int Low, int Hi) { 2122 return (Val < 0) || (Val >= Low && Val < Hi); 2123} 2124 2125/// isUndefOrEqual - Val is either less than zero (undef) or equal to the 2126/// specified value. 2127static bool isUndefOrEqual(int Val, int CmpVal) { 2128 if (Val < 0 || Val == CmpVal) 2129 return true; 2130 return false; 2131} 2132 2133/// isPSHUFDMask - Return true if the node specifies a shuffle of elements that 2134/// is suitable for input to PSHUFD or PSHUFW. That is, it doesn't reference 2135/// the second operand. 2136static bool isPSHUFDMask(const SmallVectorImpl<int> &Mask, MVT VT) { 2137 if (VT == MVT::v4f32 || VT == MVT::v4i32 || VT == MVT::v4i16) 2138 return (Mask[0] < 4 && Mask[1] < 4 && Mask[2] < 4 && Mask[3] < 4); 2139 if (VT == MVT::v2f64 || VT == MVT::v2i64) 2140 return (Mask[0] < 2 && Mask[1] < 2); 2141 return false; 2142} 2143 2144bool X86::isPSHUFDMask(ShuffleVectorSDNode *N) { 2145 SmallVector<int, 8> M; 2146 N->getMask(M); 2147 return ::isPSHUFDMask(M, N->getValueType(0)); 2148} 2149 2150/// isPSHUFHWMask - Return true if the node specifies a shuffle of elements that 2151/// is suitable for input to PSHUFHW. 2152static bool isPSHUFHWMask(const SmallVectorImpl<int> &Mask, MVT VT) { 2153 if (VT != MVT::v8i16) 2154 return false; 2155 2156 // Lower quadword copied in order or undef. 2157 for (int i = 0; i != 4; ++i) 2158 if (Mask[i] >= 0 && Mask[i] != i) 2159 return false; 2160 2161 // Upper quadword shuffled. 2162 for (int i = 4; i != 8; ++i) 2163 if (Mask[i] >= 0 && (Mask[i] < 4 || Mask[i] > 7)) 2164 return false; 2165 2166 return true; 2167} 2168 2169bool X86::isPSHUFHWMask(ShuffleVectorSDNode *N) { 2170 SmallVector<int, 8> M; 2171 N->getMask(M); 2172 return ::isPSHUFHWMask(M, N->getValueType(0)); 2173} 2174 2175/// isPSHUFLWMask - Return true if the node specifies a shuffle of elements that 2176/// is suitable for input to PSHUFLW. 2177static bool isPSHUFLWMask(const SmallVectorImpl<int> &Mask, MVT VT) { 2178 if (VT != MVT::v8i16) 2179 return false; 2180 2181 // Upper quadword copied in order. 2182 for (int i = 4; i != 8; ++i) 2183 if (Mask[i] >= 0 && Mask[i] != i) 2184 return false; 2185 2186 // Lower quadword shuffled. 2187 for (int i = 0; i != 4; ++i) 2188 if (Mask[i] >= 4) 2189 return false; 2190 2191 return true; 2192} 2193 2194bool X86::isPSHUFLWMask(ShuffleVectorSDNode *N) { 2195 SmallVector<int, 8> M; 2196 N->getMask(M); 2197 return ::isPSHUFLWMask(M, N->getValueType(0)); 2198} 2199 2200/// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand 2201/// specifies a shuffle of elements that is suitable for input to SHUFP*. 2202static bool isSHUFPMask(const SmallVectorImpl<int> &Mask, MVT VT) { 2203 int NumElems = VT.getVectorNumElements(); 2204 if (NumElems != 2 && NumElems != 4) 2205 return false; 2206 2207 int Half = NumElems / 2; 2208 for (int i = 0; i < Half; ++i) 2209 if (!isUndefOrInRange(Mask[i], 0, NumElems)) 2210 return false; 2211 for (int i = Half; i < NumElems; ++i) 2212 if (!isUndefOrInRange(Mask[i], NumElems, NumElems*2)) 2213 return false; 2214 2215 return true; 2216} 2217 2218bool X86::isSHUFPMask(ShuffleVectorSDNode *N) { 2219 SmallVector<int, 8> M; 2220 N->getMask(M); 2221 return ::isSHUFPMask(M, N->getValueType(0)); 2222} 2223 2224/// isCommutedSHUFP - Returns true if the shuffle mask is exactly 2225/// the reverse of what x86 shuffles want. x86 shuffles requires the lower 2226/// half elements to come from vector 1 (which would equal the dest.) and 2227/// the upper half to come from vector 2. 2228static bool isCommutedSHUFPMask(const SmallVectorImpl<int> &Mask, MVT VT) { 2229 int NumElems = VT.getVectorNumElements(); 2230 2231 if (NumElems != 2 && NumElems != 4) 2232 return false; 2233 2234 int Half = NumElems / 2; 2235 for (int i = 0; i < Half; ++i) 2236 if (!isUndefOrInRange(Mask[i], NumElems, NumElems*2)) 2237 return false; 2238 for (int i = Half; i < NumElems; ++i) 2239 if (!isUndefOrInRange(Mask[i], 0, NumElems)) 2240 return false; 2241 return true; 2242} 2243 2244static bool isCommutedSHUFP(ShuffleVectorSDNode *N) { 2245 SmallVector<int, 8> M; 2246 N->getMask(M); 2247 return isCommutedSHUFPMask(M, N->getValueType(0)); 2248} 2249 2250/// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand 2251/// specifies a shuffle of elements that is suitable for input to MOVHLPS. 2252bool X86::isMOVHLPSMask(ShuffleVectorSDNode *N) { 2253 if (N->getValueType(0).getVectorNumElements() != 4) 2254 return false; 2255 2256 // Expect bit0 == 6, bit1 == 7, bit2 == 2, bit3 == 3 2257 return isUndefOrEqual(N->getMaskElt(0), 6) && 2258 isUndefOrEqual(N->getMaskElt(1), 7) && 2259 isUndefOrEqual(N->getMaskElt(2), 2) && 2260 isUndefOrEqual(N->getMaskElt(3), 3); 2261} 2262 2263/// isMOVLPMask - Return true if the specified VECTOR_SHUFFLE operand 2264/// specifies a shuffle of elements that is suitable for input to MOVLP{S|D}. 2265bool X86::isMOVLPMask(ShuffleVectorSDNode *N) { 2266 unsigned NumElems = N->getValueType(0).getVectorNumElements(); 2267 2268 if (NumElems != 2 && NumElems != 4) 2269 return false; 2270 2271 for (unsigned i = 0; i < NumElems/2; ++i) 2272 if (!isUndefOrEqual(N->getMaskElt(i), i + NumElems)) 2273 return false; 2274 2275 for (unsigned i = NumElems/2; i < NumElems; ++i) 2276 if (!isUndefOrEqual(N->getMaskElt(i), i)) 2277 return false; 2278 2279 return true; 2280} 2281 2282/// isMOVHPMask - Return true if the specified VECTOR_SHUFFLE operand 2283/// specifies a shuffle of elements that is suitable for input to MOVHP{S|D} 2284/// and MOVLHPS. 2285bool X86::isMOVHPMask(ShuffleVectorSDNode *N) { 2286 unsigned NumElems = N->getValueType(0).getVectorNumElements(); 2287 2288 if (NumElems != 2 && NumElems != 4) 2289 return false; 2290 2291 for (unsigned i = 0; i < NumElems/2; ++i) 2292 if (!isUndefOrEqual(N->getMaskElt(i), i)) 2293 return false; 2294 2295 for (unsigned i = 0; i < NumElems/2; ++i) 2296 if (!isUndefOrEqual(N->getMaskElt(i + NumElems/2), i + NumElems)) 2297 return false; 2298 2299 return true; 2300} 2301 2302/// isMOVHLPS_v_undef_Mask - Special case of isMOVHLPSMask for canonical form 2303/// of vector_shuffle v, v, <2, 3, 2, 3>, i.e. vector_shuffle v, undef, 2304/// <2, 3, 2, 3> 2305bool X86::isMOVHLPS_v_undef_Mask(ShuffleVectorSDNode *N) { 2306 unsigned NumElems = N->getValueType(0).getVectorNumElements(); 2307 2308 if (NumElems != 4) 2309 return false; 2310 2311 return isUndefOrEqual(N->getMaskElt(0), 2) && 2312 isUndefOrEqual(N->getMaskElt(1), 3) && 2313 isUndefOrEqual(N->getMaskElt(2), 2) && 2314 isUndefOrEqual(N->getMaskElt(3), 3); 2315} 2316 2317/// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand 2318/// specifies a shuffle of elements that is suitable for input to UNPCKL. 2319static bool isUNPCKLMask(const SmallVectorImpl<int> &Mask, MVT VT, 2320 bool V2IsSplat = false) { 2321 int NumElts = VT.getVectorNumElements(); 2322 if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16) 2323 return false; 2324 2325 for (int i = 0, j = 0; i != NumElts; i += 2, ++j) { 2326 int BitI = Mask[i]; 2327 int BitI1 = Mask[i+1]; 2328 if (!isUndefOrEqual(BitI, j)) 2329 return false; 2330 if (V2IsSplat) { 2331 if (!isUndefOrEqual(BitI1, NumElts)) 2332 return false; 2333 } else { 2334 if (!isUndefOrEqual(BitI1, j + NumElts)) 2335 return false; 2336 } 2337 } 2338 return true; 2339} 2340 2341bool X86::isUNPCKLMask(ShuffleVectorSDNode *N, bool V2IsSplat) { 2342 SmallVector<int, 8> M; 2343 N->getMask(M); 2344 return ::isUNPCKLMask(M, N->getValueType(0), V2IsSplat); 2345} 2346 2347/// isUNPCKHMask - Return true if the specified VECTOR_SHUFFLE operand 2348/// specifies a shuffle of elements that is suitable for input to UNPCKH. 2349static bool isUNPCKHMask(const SmallVectorImpl<int> &Mask, MVT VT, 2350 bool V2IsSplat = false) { 2351 int NumElts = VT.getVectorNumElements(); 2352 if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16) 2353 return false; 2354 2355 for (int i = 0, j = 0; i != NumElts; i += 2, ++j) { 2356 int BitI = Mask[i]; 2357 int BitI1 = Mask[i+1]; 2358 if (!isUndefOrEqual(BitI, j + NumElts/2)) 2359 return false; 2360 if (V2IsSplat) { 2361 if (isUndefOrEqual(BitI1, NumElts)) 2362 return false; 2363 } else { 2364 if (!isUndefOrEqual(BitI1, j + NumElts/2 + NumElts)) 2365 return false; 2366 } 2367 } 2368 return true; 2369} 2370 2371bool X86::isUNPCKHMask(ShuffleVectorSDNode *N, bool V2IsSplat) { 2372 SmallVector<int, 8> M; 2373 N->getMask(M); 2374 return ::isUNPCKHMask(M, N->getValueType(0), V2IsSplat); 2375} 2376 2377/// isUNPCKL_v_undef_Mask - Special case of isUNPCKLMask for canonical form 2378/// of vector_shuffle v, v, <0, 4, 1, 5>, i.e. vector_shuffle v, undef, 2379/// <0, 0, 1, 1> 2380static bool isUNPCKL_v_undef_Mask(const SmallVectorImpl<int> &Mask, MVT VT) { 2381 int NumElems = VT.getVectorNumElements(); 2382 if (NumElems != 2 && NumElems != 4 && NumElems != 8 && NumElems != 16) 2383 return false; 2384 2385 for (int i = 0, j = 0; i != NumElems; i += 2, ++j) { 2386 int BitI = Mask[i]; 2387 int BitI1 = Mask[i+1]; 2388 if (!isUndefOrEqual(BitI, j)) 2389 return false; 2390 if (!isUndefOrEqual(BitI1, j)) 2391 return false; 2392 } 2393 return true; 2394} 2395 2396bool X86::isUNPCKL_v_undef_Mask(ShuffleVectorSDNode *N) { 2397 SmallVector<int, 8> M; 2398 N->getMask(M); 2399 return ::isUNPCKL_v_undef_Mask(M, N->getValueType(0)); 2400} 2401 2402/// isUNPCKH_v_undef_Mask - Special case of isUNPCKHMask for canonical form 2403/// of vector_shuffle v, v, <2, 6, 3, 7>, i.e. vector_shuffle v, undef, 2404/// <2, 2, 3, 3> 2405static bool isUNPCKH_v_undef_Mask(const SmallVectorImpl<int> &Mask, MVT VT) { 2406 int NumElems = VT.getVectorNumElements(); 2407 if (NumElems != 2 && NumElems != 4 && NumElems != 8 && NumElems != 16) 2408 return false; 2409 2410 for (int i = 0, j = NumElems / 2; i != NumElems; i += 2, ++j) { 2411 int BitI = Mask[i]; 2412 int BitI1 = Mask[i+1]; 2413 if (!isUndefOrEqual(BitI, j)) 2414 return false; 2415 if (!isUndefOrEqual(BitI1, j)) 2416 return false; 2417 } 2418 return true; 2419} 2420 2421bool X86::isUNPCKH_v_undef_Mask(ShuffleVectorSDNode *N) { 2422 SmallVector<int, 8> M; 2423 N->getMask(M); 2424 return ::isUNPCKH_v_undef_Mask(M, N->getValueType(0)); 2425} 2426 2427/// isMOVLMask - Return true if the specified VECTOR_SHUFFLE operand 2428/// specifies a shuffle of elements that is suitable for input to MOVSS, 2429/// MOVSD, and MOVD, i.e. setting the lowest element. 2430static bool isMOVLMask(const SmallVectorImpl<int> &Mask, MVT VT) { 2431 if (VT.getVectorElementType().getSizeInBits() < 32) 2432 return false; 2433 2434 int NumElts = VT.getVectorNumElements(); 2435 2436 if (!isUndefOrEqual(Mask[0], NumElts)) 2437 return false; 2438 2439 for (int i = 1; i < NumElts; ++i) 2440 if (!isUndefOrEqual(Mask[i], i)) 2441 return false; 2442 2443 return true; 2444} 2445 2446bool X86::isMOVLMask(ShuffleVectorSDNode *N) { 2447 SmallVector<int, 8> M; 2448 N->getMask(M); 2449 return ::isMOVLMask(M, N->getValueType(0)); 2450} 2451 2452/// isCommutedMOVL - Returns true if the shuffle mask is except the reverse 2453/// of what x86 movss want. X86 movs requires the lowest element to be lowest 2454/// element of vector 2 and the other elements to come from vector 1 in order. 2455static bool isCommutedMOVLMask(const SmallVectorImpl<int> &Mask, MVT VT, 2456 bool V2IsSplat = false, bool V2IsUndef = false) { 2457 int NumOps = VT.getVectorNumElements(); 2458 if (NumOps != 2 && NumOps != 4 && NumOps != 8 && NumOps != 16) 2459 return false; 2460 2461 if (!isUndefOrEqual(Mask[0], 0)) 2462 return false; 2463 2464 for (int i = 1; i < NumOps; ++i) 2465 if (!(isUndefOrEqual(Mask[i], i+NumOps) || 2466 (V2IsUndef && isUndefOrInRange(Mask[i], NumOps, NumOps*2)) || 2467 (V2IsSplat && isUndefOrEqual(Mask[i], NumOps)))) 2468 return false; 2469 2470 return true; 2471} 2472 2473static bool isCommutedMOVL(ShuffleVectorSDNode *N, bool V2IsSplat = false, 2474 bool V2IsUndef = false) { 2475 SmallVector<int, 8> M; 2476 N->getMask(M); 2477 return isCommutedMOVLMask(M, N->getValueType(0), V2IsSplat, V2IsUndef); 2478} 2479 2480/// isMOVSHDUPMask - Return true if the specified VECTOR_SHUFFLE operand 2481/// specifies a shuffle of elements that is suitable for input to MOVSHDUP. 2482bool X86::isMOVSHDUPMask(ShuffleVectorSDNode *N) { 2483 if (N->getValueType(0).getVectorNumElements() != 4) 2484 return false; 2485 2486 // Expect 1, 1, 3, 3 2487 for (unsigned i = 0; i < 2; ++i) { 2488 int Elt = N->getMaskElt(i); 2489 if (Elt >= 0 && Elt != 1) 2490 return false; 2491 } 2492 2493 bool HasHi = false; 2494 for (unsigned i = 2; i < 4; ++i) { 2495 int Elt = N->getMaskElt(i); 2496 if (Elt >= 0 && Elt != 3) 2497 return false; 2498 if (Elt == 3) 2499 HasHi = true; 2500 } 2501 // Don't use movshdup if it can be done with a shufps. 2502 // FIXME: verify that matching u, u, 3, 3 is what we want. 2503 return HasHi; 2504} 2505 2506/// isMOVSLDUPMask - Return true if the specified VECTOR_SHUFFLE operand 2507/// specifies a shuffle of elements that is suitable for input to MOVSLDUP. 2508bool X86::isMOVSLDUPMask(ShuffleVectorSDNode *N) { 2509 if (N->getValueType(0).getVectorNumElements() != 4) 2510 return false; 2511 2512 // Expect 0, 0, 2, 2 2513 for (unsigned i = 0; i < 2; ++i) 2514 if (N->getMaskElt(i) > 0) 2515 return false; 2516 2517 bool HasHi = false; 2518 for (unsigned i = 2; i < 4; ++i) { 2519 int Elt = N->getMaskElt(i); 2520 if (Elt >= 0 && Elt != 2) 2521 return false; 2522 if (Elt == 2) 2523 HasHi = true; 2524 } 2525 // Don't use movsldup if it can be done with a shufps. 2526 return HasHi; 2527} 2528 2529/// isMOVDDUPMask - Return true if the specified VECTOR_SHUFFLE operand 2530/// specifies a shuffle of elements that is suitable for input to MOVDDUP. 2531bool X86::isMOVDDUPMask(ShuffleVectorSDNode *N) { 2532 int e = N->getValueType(0).getVectorNumElements() / 2; 2533 2534 for (int i = 0; i < e; ++i) 2535 if (!isUndefOrEqual(N->getMaskElt(i), i)) 2536 return false; 2537 for (int i = 0; i < e; ++i) 2538 if (!isUndefOrEqual(N->getMaskElt(e+i), i)) 2539 return false; 2540 return true; 2541} 2542 2543/// getShuffleSHUFImmediate - Return the appropriate immediate to shuffle 2544/// the specified isShuffleMask VECTOR_SHUFFLE mask with PSHUF* and SHUFP* 2545/// instructions. 2546unsigned X86::getShuffleSHUFImmediate(SDNode *N) { 2547 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 2548 int NumOperands = SVOp->getValueType(0).getVectorNumElements(); 2549 2550 unsigned Shift = (NumOperands == 4) ? 2 : 1; 2551 unsigned Mask = 0; 2552 for (int i = 0; i < NumOperands; ++i) { 2553 int Val = SVOp->getMaskElt(NumOperands-i-1); 2554 if (Val < 0) Val = 0; 2555 if (Val >= NumOperands) Val -= NumOperands; 2556 Mask |= Val; 2557 if (i != NumOperands - 1) 2558 Mask <<= Shift; 2559 } 2560 return Mask; 2561} 2562 2563/// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle 2564/// the specified isShuffleMask VECTOR_SHUFFLE mask with PSHUFHW 2565/// instructions. 2566unsigned X86::getShufflePSHUFHWImmediate(SDNode *N) { 2567 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 2568 unsigned Mask = 0; 2569 // 8 nodes, but we only care about the last 4. 2570 for (unsigned i = 7; i >= 4; --i) { 2571 int Val = SVOp->getMaskElt(i); 2572 if (Val >= 0) 2573 Mask |= (Val - 4); 2574 if (i != 4) 2575 Mask <<= 2; 2576 } 2577 return Mask; 2578} 2579 2580/// getShufflePSHUFLWImmediate - Return the appropriate immediate to shuffle 2581/// the specified isShuffleMask VECTOR_SHUFFLE mask with PSHUFLW 2582/// instructions. 2583unsigned X86::getShufflePSHUFLWImmediate(SDNode *N) { 2584 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 2585 unsigned Mask = 0; 2586 // 8 nodes, but we only care about the first 4. 2587 for (int i = 3; i >= 0; --i) { 2588 int Val = SVOp->getMaskElt(i); 2589 if (Val >= 0) 2590 Mask |= Val; 2591 if (i != 0) 2592 Mask <<= 2; 2593 } 2594 return Mask; 2595} 2596 2597/// CommuteVectorShuffle - Swap vector_shuffle operands as well as values in 2598/// their permute mask. 2599static SDValue CommuteVectorShuffle(ShuffleVectorSDNode *SVOp, 2600 SelectionDAG &DAG) { 2601 MVT VT = SVOp->getValueType(0); 2602 unsigned NumElems = VT.getVectorNumElements(); 2603 SmallVector<int, 8> MaskVec; 2604 2605 for (unsigned i = 0; i != NumElems; ++i) { 2606 int idx = SVOp->getMaskElt(i); 2607 if (idx < 0) 2608 MaskVec.push_back(idx); 2609 else if (idx < (int)NumElems) 2610 MaskVec.push_back(idx + NumElems); 2611 else 2612 MaskVec.push_back(idx - NumElems); 2613 } 2614 return DAG.getVectorShuffle(VT, SVOp->getDebugLoc(), SVOp->getOperand(1), 2615 SVOp->getOperand(0), &MaskVec[0]); 2616} 2617 2618/// CommuteVectorShuffleMask - Change values in a shuffle permute mask assuming 2619/// the two vector operands have swapped position. 2620static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask, MVT VT) { 2621 unsigned NumElems = VT.getVectorNumElements(); 2622 for (unsigned i = 0; i != NumElems; ++i) { 2623 int idx = Mask[i]; 2624 if (idx < 0) 2625 continue; 2626 else if (idx < (int)NumElems) 2627 Mask[i] = idx + NumElems; 2628 else 2629 Mask[i] = idx - NumElems; 2630 } 2631} 2632 2633/// ShouldXformToMOVHLPS - Return true if the node should be transformed to 2634/// match movhlps. The lower half elements should come from upper half of 2635/// V1 (and in order), and the upper half elements should come from the upper 2636/// half of V2 (and in order). 2637static bool ShouldXformToMOVHLPS(ShuffleVectorSDNode *Op) { 2638 if (Op->getValueType(0).getVectorNumElements() != 4) 2639 return false; 2640 for (unsigned i = 0, e = 2; i != e; ++i) 2641 if (!isUndefOrEqual(Op->getMaskElt(i), i+2)) 2642 return false; 2643 for (unsigned i = 2; i != 4; ++i) 2644 if (!isUndefOrEqual(Op->getMaskElt(i), i+4)) 2645 return false; 2646 return true; 2647} 2648 2649/// isScalarLoadToVector - Returns true if the node is a scalar load that 2650/// is promoted to a vector. It also returns the LoadSDNode by reference if 2651/// required. 2652static bool isScalarLoadToVector(SDNode *N, LoadSDNode **LD = NULL) { 2653 if (N->getOpcode() != ISD::SCALAR_TO_VECTOR) 2654 return false; 2655 N = N->getOperand(0).getNode(); 2656 if (!ISD::isNON_EXTLoad(N)) 2657 return false; 2658 if (LD) 2659 *LD = cast<LoadSDNode>(N); 2660 return true; 2661} 2662 2663/// ShouldXformToMOVLP{S|D} - Return true if the node should be transformed to 2664/// match movlp{s|d}. The lower half elements should come from lower half of 2665/// V1 (and in order), and the upper half elements should come from the upper 2666/// half of V2 (and in order). And since V1 will become the source of the 2667/// MOVLP, it must be either a vector load or a scalar load to vector. 2668static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2, 2669 ShuffleVectorSDNode *Op) { 2670 if (!ISD::isNON_EXTLoad(V1) && !isScalarLoadToVector(V1)) 2671 return false; 2672 // Is V2 is a vector load, don't do this transformation. We will try to use 2673 // load folding shufps op. 2674 if (ISD::isNON_EXTLoad(V2)) 2675 return false; 2676 2677 unsigned NumElems = Op->getValueType(0).getVectorNumElements(); 2678 2679 if (NumElems != 2 && NumElems != 4) 2680 return false; 2681 for (unsigned i = 0, e = NumElems/2; i != e; ++i) 2682 if (!isUndefOrEqual(Op->getMaskElt(i), i)) 2683 return false; 2684 for (unsigned i = NumElems/2; i != NumElems; ++i) 2685 if (!isUndefOrEqual(Op->getMaskElt(i), i+NumElems)) 2686 return false; 2687 return true; 2688} 2689 2690/// isSplatVector - Returns true if N is a BUILD_VECTOR node whose elements are 2691/// all the same. 2692static bool isSplatVector(SDNode *N) { 2693 if (N->getOpcode() != ISD::BUILD_VECTOR) 2694 return false; 2695 2696 SDValue SplatValue = N->getOperand(0); 2697 for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i) 2698 if (N->getOperand(i) != SplatValue) 2699 return false; 2700 return true; 2701} 2702 2703/// isZeroNode - Returns true if Elt is a constant zero or a floating point 2704/// constant +0.0. 2705static inline bool isZeroNode(SDValue Elt) { 2706 return ((isa<ConstantSDNode>(Elt) && 2707 cast<ConstantSDNode>(Elt)->getZExtValue() == 0) || 2708 (isa<ConstantFPSDNode>(Elt) && 2709 cast<ConstantFPSDNode>(Elt)->getValueAPF().isPosZero())); 2710} 2711 2712/// isZeroShuffle - Returns true if N is a VECTOR_SHUFFLE that can be resolved 2713/// to an zero vector. 2714/// FIXME: move to dag combiner / method on ShuffleVectorSDNode 2715static bool isZeroShuffle(ShuffleVectorSDNode *N) { 2716 SDValue V1 = N->getOperand(0); 2717 SDValue V2 = N->getOperand(1); 2718 unsigned NumElems = N->getValueType(0).getVectorNumElements(); 2719 for (unsigned i = 0; i != NumElems; ++i) { 2720 int Idx = N->getMaskElt(i); 2721 if (Idx >= (int)NumElems) { 2722 unsigned Opc = V2.getOpcode(); 2723 if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V2.getNode())) 2724 continue; 2725 if (Opc != ISD::BUILD_VECTOR || !isZeroNode(V2.getOperand(Idx-NumElems))) 2726 return false; 2727 } else if (Idx >= 0) { 2728 unsigned Opc = V1.getOpcode(); 2729 if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V1.getNode())) 2730 continue; 2731 if (Opc != ISD::BUILD_VECTOR || !isZeroNode(V1.getOperand(Idx))) 2732 return false; 2733 } 2734 } 2735 return true; 2736} 2737 2738/// getZeroVector - Returns a vector of specified type with all zero elements. 2739/// 2740static SDValue getZeroVector(MVT VT, bool HasSSE2, SelectionDAG &DAG, 2741 DebugLoc dl) { 2742 assert(VT.isVector() && "Expected a vector type"); 2743 2744 // Always build zero vectors as <4 x i32> or <2 x i32> bitcasted to their dest 2745 // type. This ensures they get CSE'd. 2746 SDValue Vec; 2747 if (VT.getSizeInBits() == 64) { // MMX 2748 SDValue Cst = DAG.getTargetConstant(0, MVT::i32); 2749 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i32, Cst, Cst); 2750 } else if (HasSSE2) { // SSE2 2751 SDValue Cst = DAG.getTargetConstant(0, MVT::i32); 2752 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst); 2753 } else { // SSE1 2754 SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32); 2755 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst); 2756 } 2757 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Vec); 2758} 2759 2760/// getOnesVector - Returns a vector of specified type with all bits set. 2761/// 2762static SDValue getOnesVector(MVT VT, SelectionDAG &DAG, DebugLoc dl) { 2763 assert(VT.isVector() && "Expected a vector type"); 2764 2765 // Always build ones vectors as <4 x i32> or <2 x i32> bitcasted to their dest 2766 // type. This ensures they get CSE'd. 2767 SDValue Cst = DAG.getTargetConstant(~0U, MVT::i32); 2768 SDValue Vec; 2769 if (VT.getSizeInBits() == 64) // MMX 2770 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i32, Cst, Cst); 2771 else // SSE 2772 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst); 2773 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Vec); 2774} 2775 2776 2777/// NormalizeMask - V2 is a splat, modify the mask (if needed) so all elements 2778/// that point to V2 points to its first element. 2779static SDValue NormalizeMask(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { 2780 MVT VT = SVOp->getValueType(0); 2781 unsigned NumElems = VT.getVectorNumElements(); 2782 2783 bool Changed = false; 2784 SmallVector<int, 8> MaskVec; 2785 SVOp->getMask(MaskVec); 2786 2787 for (unsigned i = 0; i != NumElems; ++i) { 2788 if (MaskVec[i] > (int)NumElems) { 2789 MaskVec[i] = NumElems; 2790 Changed = true; 2791 } 2792 } 2793 if (Changed) 2794 return DAG.getVectorShuffle(VT, SVOp->getDebugLoc(), SVOp->getOperand(0), 2795 SVOp->getOperand(1), &MaskVec[0]); 2796 return SDValue(SVOp, 0); 2797} 2798 2799/// getMOVLMask - Returns a vector_shuffle mask for an movs{s|d}, movd 2800/// operation of specified width. 2801static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, MVT VT, SDValue V1, 2802 SDValue V2) { 2803 unsigned NumElems = VT.getVectorNumElements(); 2804 SmallVector<int, 8> Mask; 2805 Mask.push_back(NumElems); 2806 for (unsigned i = 1; i != NumElems; ++i) 2807 Mask.push_back(i); 2808 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]); 2809} 2810 2811/// getUnpackl - Returns a vector_shuffle node for an unpackl operation. 2812static SDValue getUnpackl(SelectionDAG &DAG, DebugLoc dl, MVT VT, SDValue V1, 2813 SDValue V2) { 2814 unsigned NumElems = VT.getVectorNumElements(); 2815 SmallVector<int, 8> Mask; 2816 for (unsigned i = 0, e = NumElems/2; i != e; ++i) { 2817 Mask.push_back(i); 2818 Mask.push_back(i + NumElems); 2819 } 2820 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]); 2821} 2822 2823/// getUnpackhMask - Returns a vector_shuffle node for an unpackh operation. 2824static SDValue getUnpackh(SelectionDAG &DAG, DebugLoc dl, MVT VT, SDValue V1, 2825 SDValue V2) { 2826 unsigned NumElems = VT.getVectorNumElements(); 2827 unsigned Half = NumElems/2; 2828 SmallVector<int, 8> Mask; 2829 for (unsigned i = 0; i != Half; ++i) { 2830 Mask.push_back(i + Half); 2831 Mask.push_back(i + NumElems + Half); 2832 } 2833 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]); 2834} 2835 2836/// PromoteSplat - Promote a splat of v4f32, v8i16 or v16i8 to v4i32. 2837static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG, 2838 bool HasSSE2) { 2839 if (SV->getValueType(0).getVectorNumElements() <= 4) 2840 return SDValue(SV, 0); 2841 2842 MVT PVT = MVT::v4f32; 2843 MVT VT = SV->getValueType(0); 2844 DebugLoc dl = SV->getDebugLoc(); 2845 SDValue V1 = SV->getOperand(0); 2846 int NumElems = VT.getVectorNumElements(); 2847 int EltNo = SV->getSplatIndex(); 2848 2849 // unpack elements to the correct location 2850 while (NumElems > 4) { 2851 if (EltNo < NumElems/2) { 2852 V1 = getUnpackl(DAG, dl, VT, V1, V1); 2853 } else { 2854 V1 = getUnpackh(DAG, dl, VT, V1, V1); 2855 EltNo -= NumElems/2; 2856 } 2857 NumElems >>= 1; 2858 } 2859 2860 // Perform the splat. 2861 int SplatMask[4] = { EltNo, EltNo, EltNo, EltNo }; 2862 V1 = DAG.getNode(ISD::BIT_CONVERT, dl, PVT, V1); 2863 V1 = DAG.getVectorShuffle(PVT, dl, V1, DAG.getUNDEF(PVT), &SplatMask[0]); 2864 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, V1); 2865} 2866 2867/// getShuffleVectorZeroOrUndef - Return a vector_shuffle of the specified 2868/// vector of zero or undef vector. This produces a shuffle where the low 2869/// element of V2 is swizzled into the zero/undef vector, landing at element 2870/// Idx. This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3). 2871static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx, 2872 bool isZero, bool HasSSE2, 2873 SelectionDAG &DAG) { 2874 MVT VT = V2.getValueType(); 2875 SDValue V1 = isZero 2876 ? getZeroVector(VT, HasSSE2, DAG, V2.getDebugLoc()) : DAG.getUNDEF(VT); 2877 unsigned NumElems = VT.getVectorNumElements(); 2878 SmallVector<int, 16> MaskVec; 2879 for (unsigned i = 0; i != NumElems; ++i) 2880 // If this is the insertion idx, put the low elt of V2 here. 2881 MaskVec.push_back(i == Idx ? NumElems : i); 2882 return DAG.getVectorShuffle(VT, V2.getDebugLoc(), V1, V2, &MaskVec[0]); 2883} 2884 2885/// getNumOfConsecutiveZeros - Return the number of elements in a result of 2886/// a shuffle that is zero. 2887static 2888unsigned getNumOfConsecutiveZeros(ShuffleVectorSDNode *SVOp, int NumElems, 2889 bool Low, SelectionDAG &DAG) { 2890 unsigned NumZeros = 0; 2891 for (int i = 0; i < NumElems; ++i) { 2892 unsigned Index = Low ? i : NumElems-i-1; 2893 int Idx = SVOp->getMaskElt(Index); 2894 if (Idx < 0) { 2895 ++NumZeros; 2896 continue; 2897 } 2898 SDValue Elt = DAG.getShuffleScalarElt(SVOp, Index); 2899 if (Elt.getNode() && isZeroNode(Elt)) 2900 ++NumZeros; 2901 else 2902 break; 2903 } 2904 return NumZeros; 2905} 2906 2907/// isVectorShift - Returns true if the shuffle can be implemented as a 2908/// logical left or right shift of a vector. 2909/// FIXME: split into pslldqi, psrldqi, palignr variants. 2910static bool isVectorShift(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG, 2911 bool &isLeft, SDValue &ShVal, unsigned &ShAmt) { 2912 int NumElems = SVOp->getValueType(0).getVectorNumElements(); 2913 2914 isLeft = true; 2915 unsigned NumZeros = getNumOfConsecutiveZeros(SVOp, NumElems, true, DAG); 2916 if (!NumZeros) { 2917 isLeft = false; 2918 NumZeros = getNumOfConsecutiveZeros(SVOp, NumElems, false, DAG); 2919 if (!NumZeros) 2920 return false; 2921 } 2922 bool SeenV1 = false; 2923 bool SeenV2 = false; 2924 for (int i = NumZeros; i < NumElems; ++i) { 2925 int Val = isLeft ? (i - NumZeros) : i; 2926 int Idx = SVOp->getMaskElt(isLeft ? i : (i - NumZeros)); 2927 if (Idx < 0) 2928 continue; 2929 if (Idx < NumElems) 2930 SeenV1 = true; 2931 else { 2932 Idx -= NumElems; 2933 SeenV2 = true; 2934 } 2935 if (Idx != Val) 2936 return false; 2937 } 2938 if (SeenV1 && SeenV2) 2939 return false; 2940 2941 ShVal = SeenV1 ? SVOp->getOperand(0) : SVOp->getOperand(1); 2942 ShAmt = NumZeros; 2943 return true; 2944} 2945 2946 2947/// LowerBuildVectorv16i8 - Custom lower build_vector of v16i8. 2948/// 2949static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros, 2950 unsigned NumNonZero, unsigned NumZero, 2951 SelectionDAG &DAG, TargetLowering &TLI) { 2952 if (NumNonZero > 8) 2953 return SDValue(); 2954 2955 DebugLoc dl = Op.getDebugLoc(); 2956 SDValue V(0, 0); 2957 bool First = true; 2958 for (unsigned i = 0; i < 16; ++i) { 2959 bool ThisIsNonZero = (NonZeros & (1 << i)) != 0; 2960 if (ThisIsNonZero && First) { 2961 if (NumZero) 2962 V = getZeroVector(MVT::v8i16, true, DAG, dl); 2963 else 2964 V = DAG.getUNDEF(MVT::v8i16); 2965 First = false; 2966 } 2967 2968 if ((i & 1) != 0) { 2969 SDValue ThisElt(0, 0), LastElt(0, 0); 2970 bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0; 2971 if (LastIsNonZero) { 2972 LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl, 2973 MVT::i16, Op.getOperand(i-1)); 2974 } 2975 if (ThisIsNonZero) { 2976 ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i)); 2977 ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16, 2978 ThisElt, DAG.getConstant(8, MVT::i8)); 2979 if (LastIsNonZero) 2980 ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt); 2981 } else 2982 ThisElt = LastElt; 2983 2984 if (ThisElt.getNode()) 2985 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt, 2986 DAG.getIntPtrConstant(i/2)); 2987 } 2988 } 2989 2990 return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, V); 2991} 2992 2993/// LowerBuildVectorv8i16 - Custom lower build_vector of v8i16. 2994/// 2995static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros, 2996 unsigned NumNonZero, unsigned NumZero, 2997 SelectionDAG &DAG, TargetLowering &TLI) { 2998 if (NumNonZero > 4) 2999 return SDValue(); 3000 3001 DebugLoc dl = Op.getDebugLoc(); 3002 SDValue V(0, 0); 3003 bool First = true; 3004 for (unsigned i = 0; i < 8; ++i) { 3005 bool isNonZero = (NonZeros & (1 << i)) != 0; 3006 if (isNonZero) { 3007 if (First) { 3008 if (NumZero) 3009 V = getZeroVector(MVT::v8i16, true, DAG, dl); 3010 else 3011 V = DAG.getUNDEF(MVT::v8i16); 3012 First = false; 3013 } 3014 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, 3015 MVT::v8i16, V, Op.getOperand(i), 3016 DAG.getIntPtrConstant(i)); 3017 } 3018 } 3019 3020 return V; 3021} 3022 3023/// getVShift - Return a vector logical shift node. 3024/// 3025static SDValue getVShift(bool isLeft, MVT VT, SDValue SrcOp, 3026 unsigned NumBits, SelectionDAG &DAG, 3027 const TargetLowering &TLI, DebugLoc dl) { 3028 bool isMMX = VT.getSizeInBits() == 64; 3029 MVT ShVT = isMMX ? MVT::v1i64 : MVT::v2i64; 3030 unsigned Opc = isLeft ? X86ISD::VSHL : X86ISD::VSRL; 3031 SrcOp = DAG.getNode(ISD::BIT_CONVERT, dl, ShVT, SrcOp); 3032 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, 3033 DAG.getNode(Opc, dl, ShVT, SrcOp, 3034 DAG.getConstant(NumBits, TLI.getShiftAmountTy()))); 3035} 3036 3037SDValue 3038X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) { 3039 DebugLoc dl = Op.getDebugLoc(); 3040 // All zero's are handled with pxor, all one's are handled with pcmpeqd. 3041 if (ISD::isBuildVectorAllZeros(Op.getNode()) 3042 || ISD::isBuildVectorAllOnes(Op.getNode())) { 3043 // Canonicalize this to either <4 x i32> or <2 x i32> (SSE vs MMX) to 3044 // 1) ensure the zero vectors are CSE'd, and 2) ensure that i64 scalars are 3045 // eliminated on x86-32 hosts. 3046 if (Op.getValueType() == MVT::v4i32 || Op.getValueType() == MVT::v2i32) 3047 return Op; 3048 3049 if (ISD::isBuildVectorAllOnes(Op.getNode())) 3050 return getOnesVector(Op.getValueType(), DAG, dl); 3051 return getZeroVector(Op.getValueType(), Subtarget->hasSSE2(), DAG, dl); 3052 } 3053 3054 MVT VT = Op.getValueType(); 3055 MVT EVT = VT.getVectorElementType(); 3056 unsigned EVTBits = EVT.getSizeInBits(); 3057 3058 unsigned NumElems = Op.getNumOperands(); 3059 unsigned NumZero = 0; 3060 unsigned NumNonZero = 0; 3061 unsigned NonZeros = 0; 3062 bool IsAllConstants = true; 3063 SmallSet<SDValue, 8> Values; 3064 for (unsigned i = 0; i < NumElems; ++i) { 3065 SDValue Elt = Op.getOperand(i); 3066 if (Elt.getOpcode() == ISD::UNDEF) 3067 continue; 3068 Values.insert(Elt); 3069 if (Elt.getOpcode() != ISD::Constant && 3070 Elt.getOpcode() != ISD::ConstantFP) 3071 IsAllConstants = false; 3072 if (isZeroNode(Elt)) 3073 NumZero++; 3074 else { 3075 NonZeros |= (1 << i); 3076 NumNonZero++; 3077 } 3078 } 3079 3080 if (NumNonZero == 0) { 3081 // All undef vector. Return an UNDEF. All zero vectors were handled above. 3082 return DAG.getUNDEF(VT); 3083 } 3084 3085 // Special case for single non-zero, non-undef, element. 3086 if (NumNonZero == 1) { 3087 unsigned Idx = CountTrailingZeros_32(NonZeros); 3088 SDValue Item = Op.getOperand(Idx); 3089 3090 // If this is an insertion of an i64 value on x86-32, and if the top bits of 3091 // the value are obviously zero, truncate the value to i32 and do the 3092 // insertion that way. Only do this if the value is non-constant or if the 3093 // value is a constant being inserted into element 0. It is cheaper to do 3094 // a constant pool load than it is to do a movd + shuffle. 3095 if (EVT == MVT::i64 && !Subtarget->is64Bit() && 3096 (!IsAllConstants || Idx == 0)) { 3097 if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) { 3098 // Handle MMX and SSE both. 3099 MVT VecVT = VT == MVT::v2i64 ? MVT::v4i32 : MVT::v2i32; 3100 unsigned VecElts = VT == MVT::v2i64 ? 4 : 2; 3101 3102 // Truncate the value (which may itself be a constant) to i32, and 3103 // convert it to a vector with movd (S2V+shuffle to zero extend). 3104 Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item); 3105 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item); 3106 Item = getShuffleVectorZeroOrUndef(Item, 0, true, 3107 Subtarget->hasSSE2(), DAG); 3108 3109 // Now we have our 32-bit value zero extended in the low element of 3110 // a vector. If Idx != 0, swizzle it into place. 3111 if (Idx != 0) { 3112 SmallVector<int, 4> Mask; 3113 Mask.push_back(Idx); 3114 for (unsigned i = 1; i != VecElts; ++i) 3115 Mask.push_back(i); 3116 Item = DAG.getVectorShuffle(VecVT, dl, Item, 3117 DAG.getUNDEF(Item.getValueType()), 3118 &Mask[0]); 3119 } 3120 return DAG.getNode(ISD::BIT_CONVERT, dl, Op.getValueType(), Item); 3121 } 3122 } 3123 3124 // If we have a constant or non-constant insertion into the low element of 3125 // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into 3126 // the rest of the elements. This will be matched as movd/movq/movss/movsd 3127 // depending on what the source datatype is. 3128 if (Idx == 0) { 3129 if (NumZero == 0) { 3130 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); 3131 } else if (EVT == MVT::i32 || EVT == MVT::f32 || EVT == MVT::f64 || 3132 (EVT == MVT::i64 && Subtarget->is64Bit())) { 3133 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); 3134 // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector. 3135 return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget->hasSSE2(), 3136 DAG); 3137 } else if (EVT == MVT::i16 || EVT == MVT::i8) { 3138 Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item); 3139 MVT MiddleVT = VT.getSizeInBits() == 64 ? MVT::v2i32 : MVT::v4i32; 3140 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MiddleVT, Item); 3141 Item = getShuffleVectorZeroOrUndef(Item, 0, true, 3142 Subtarget->hasSSE2(), DAG); 3143 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Item); 3144 } 3145 } 3146 3147 // Is it a vector logical left shift? 3148 if (NumElems == 2 && Idx == 1 && 3149 isZeroNode(Op.getOperand(0)) && !isZeroNode(Op.getOperand(1))) { 3150 unsigned NumBits = VT.getSizeInBits(); 3151 return getVShift(true, VT, 3152 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, 3153 VT, Op.getOperand(1)), 3154 NumBits/2, DAG, *this, dl); 3155 } 3156 3157 if (IsAllConstants) // Otherwise, it's better to do a constpool load. 3158 return SDValue(); 3159 3160 // Otherwise, if this is a vector with i32 or f32 elements, and the element 3161 // is a non-constant being inserted into an element other than the low one, 3162 // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka 3163 // movd/movss) to move this into the low element, then shuffle it into 3164 // place. 3165 if (EVTBits == 32) { 3166 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); 3167 3168 // Turn it into a shuffle of zero and zero-extended scalar to vector. 3169 Item = getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0, 3170 Subtarget->hasSSE2(), DAG); 3171 SmallVector<int, 8> MaskVec; 3172 for (unsigned i = 0; i < NumElems; i++) 3173 MaskVec.push_back(i == Idx ? 0 : 1); 3174 return DAG.getVectorShuffle(VT, dl, Item, DAG.getUNDEF(VT), &MaskVec[0]); 3175 } 3176 } 3177 3178 // Splat is obviously ok. Let legalizer expand it to a shuffle. 3179 if (Values.size() == 1) 3180 return SDValue(); 3181 3182 // A vector full of immediates; various special cases are already 3183 // handled, so this is best done with a single constant-pool load. 3184 if (IsAllConstants) 3185 return SDValue(); 3186 3187 // Let legalizer expand 2-wide build_vectors. 3188 if (EVTBits == 64) { 3189 if (NumNonZero == 1) { 3190 // One half is zero or undef. 3191 unsigned Idx = CountTrailingZeros_32(NonZeros); 3192 SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, 3193 Op.getOperand(Idx)); 3194 return getShuffleVectorZeroOrUndef(V2, Idx, true, 3195 Subtarget->hasSSE2(), DAG); 3196 } 3197 return SDValue(); 3198 } 3199 3200 // If element VT is < 32 bits, convert it to inserts into a zero vector. 3201 if (EVTBits == 8 && NumElems == 16) { 3202 SDValue V = LowerBuildVectorv16i8(Op, NonZeros,NumNonZero,NumZero, DAG, 3203 *this); 3204 if (V.getNode()) return V; 3205 } 3206 3207 if (EVTBits == 16 && NumElems == 8) { 3208 SDValue V = LowerBuildVectorv8i16(Op, NonZeros,NumNonZero,NumZero, DAG, 3209 *this); 3210 if (V.getNode()) return V; 3211 } 3212 3213 // If element VT is == 32 bits, turn it into a number of shuffles. 3214 SmallVector<SDValue, 8> V; 3215 V.resize(NumElems); 3216 if (NumElems == 4 && NumZero > 0) { 3217 for (unsigned i = 0; i < 4; ++i) { 3218 bool isZero = !(NonZeros & (1 << i)); 3219 if (isZero) 3220 V[i] = getZeroVector(VT, Subtarget->hasSSE2(), DAG, dl); 3221 else 3222 V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i)); 3223 } 3224 3225 for (unsigned i = 0; i < 2; ++i) { 3226 switch ((NonZeros & (0x3 << i*2)) >> (i*2)) { 3227 default: break; 3228 case 0: 3229 V[i] = V[i*2]; // Must be a zero vector. 3230 break; 3231 case 1: 3232 V[i] = getMOVL(DAG, dl, VT, V[i*2+1], V[i*2]); 3233 break; 3234 case 2: 3235 V[i] = getMOVL(DAG, dl, VT, V[i*2], V[i*2+1]); 3236 break; 3237 case 3: 3238 V[i] = getUnpackl(DAG, dl, VT, V[i*2], V[i*2+1]); 3239 break; 3240 } 3241 } 3242 3243 SmallVector<int, 8> MaskVec; 3244 bool Reverse = (NonZeros & 0x3) == 2; 3245 for (unsigned i = 0; i < 2; ++i) 3246 MaskVec.push_back(Reverse ? 1-i : i); 3247 Reverse = ((NonZeros & (0x3 << 2)) >> 2) == 2; 3248 for (unsigned i = 0; i < 2; ++i) 3249 MaskVec.push_back(Reverse ? 1-i+NumElems : i+NumElems); 3250 return DAG.getVectorShuffle(VT, dl, V[0], V[1], &MaskVec[0]); 3251 } 3252 3253 if (Values.size() > 2) { 3254 // If we have SSE 4.1, Expand into a number of inserts unless the number of 3255 // values to be inserted is equal to the number of elements, in which case 3256 // use the unpack code below in the hopes of matching the consecutive elts 3257 // load merge pattern for shuffles. 3258 // FIXME: We could probably just check that here directly. 3259 if (Values.size() < NumElems && VT.getSizeInBits() == 128 && 3260 getSubtarget()->hasSSE41()) { 3261 V[0] = DAG.getUNDEF(VT); 3262 for (unsigned i = 0; i < NumElems; ++i) 3263 if (Op.getOperand(i).getOpcode() != ISD::UNDEF) 3264 V[0] = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, V[0], 3265 Op.getOperand(i), DAG.getIntPtrConstant(i)); 3266 return V[0]; 3267 } 3268 // Expand into a number of unpckl*. 3269 // e.g. for v4f32 3270 // Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0> 3271 // : unpcklps 1, 3 ==> Y: <?, ?, 3, 1> 3272 // Step 2: unpcklps X, Y ==> <3, 2, 1, 0> 3273 for (unsigned i = 0; i < NumElems; ++i) 3274 V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i)); 3275 NumElems >>= 1; 3276 while (NumElems != 0) { 3277 for (unsigned i = 0; i < NumElems; ++i) 3278 V[i] = getUnpackl(DAG, dl, VT, V[i], V[i + NumElems]); 3279 NumElems >>= 1; 3280 } 3281 return V[0]; 3282 } 3283 3284 return SDValue(); 3285} 3286 3287// v8i16 shuffles - Prefer shuffles in the following order: 3288// 1. [all] pshuflw, pshufhw, optional move 3289// 2. [ssse3] 1 x pshufb 3290// 3. [ssse3] 2 x pshufb + 1 x por 3291// 4. [all] mov + pshuflw + pshufhw + N x (pextrw + pinsrw) 3292static 3293SDValue LowerVECTOR_SHUFFLEv8i16(ShuffleVectorSDNode *SVOp, 3294 SelectionDAG &DAG, X86TargetLowering &TLI) { 3295 SDValue V1 = SVOp->getOperand(0); 3296 SDValue V2 = SVOp->getOperand(1); 3297 DebugLoc dl = SVOp->getDebugLoc(); 3298 SmallVector<int, 8> MaskVals; 3299 3300 // Determine if more than 1 of the words in each of the low and high quadwords 3301 // of the result come from the same quadword of one of the two inputs. Undef 3302 // mask values count as coming from any quadword, for better codegen. 3303 SmallVector<unsigned, 4> LoQuad(4); 3304 SmallVector<unsigned, 4> HiQuad(4); 3305 BitVector InputQuads(4); 3306 for (unsigned i = 0; i < 8; ++i) { 3307 SmallVectorImpl<unsigned> &Quad = i < 4 ? LoQuad : HiQuad; 3308 int EltIdx = SVOp->getMaskElt(i); 3309 MaskVals.push_back(EltIdx); 3310 if (EltIdx < 0) { 3311 ++Quad[0]; 3312 ++Quad[1]; 3313 ++Quad[2]; 3314 ++Quad[3]; 3315 continue; 3316 } 3317 ++Quad[EltIdx / 4]; 3318 InputQuads.set(EltIdx / 4); 3319 } 3320 3321 int BestLoQuad = -1; 3322 unsigned MaxQuad = 1; 3323 for (unsigned i = 0; i < 4; ++i) { 3324 if (LoQuad[i] > MaxQuad) { 3325 BestLoQuad = i; 3326 MaxQuad = LoQuad[i]; 3327 } 3328 } 3329 3330 int BestHiQuad = -1; 3331 MaxQuad = 1; 3332 for (unsigned i = 0; i < 4; ++i) { 3333 if (HiQuad[i] > MaxQuad) { 3334 BestHiQuad = i; 3335 MaxQuad = HiQuad[i]; 3336 } 3337 } 3338 3339 // For SSSE3, If all 8 words of the result come from only 1 quadword of each 3340 // of the two input vectors, shuffle them into one input vector so only a 3341 // single pshufb instruction is necessary. If There are more than 2 input 3342 // quads, disable the next transformation since it does not help SSSE3. 3343 bool V1Used = InputQuads[0] || InputQuads[1]; 3344 bool V2Used = InputQuads[2] || InputQuads[3]; 3345 if (TLI.getSubtarget()->hasSSSE3()) { 3346 if (InputQuads.count() == 2 && V1Used && V2Used) { 3347 BestLoQuad = InputQuads.find_first(); 3348 BestHiQuad = InputQuads.find_next(BestLoQuad); 3349 } 3350 if (InputQuads.count() > 2) { 3351 BestLoQuad = -1; 3352 BestHiQuad = -1; 3353 } 3354 } 3355 3356 // If BestLoQuad or BestHiQuad are set, shuffle the quads together and update 3357 // the shuffle mask. If a quad is scored as -1, that means that it contains 3358 // words from all 4 input quadwords. 3359 SDValue NewV; 3360 if (BestLoQuad >= 0 || BestHiQuad >= 0) { 3361 SmallVector<int, 8> MaskV; 3362 MaskV.push_back(BestLoQuad < 0 ? 0 : BestLoQuad); 3363 MaskV.push_back(BestHiQuad < 0 ? 1 : BestHiQuad); 3364 NewV = DAG.getVectorShuffle(MVT::v2i64, dl, 3365 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, V1), 3366 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, V2), &MaskV[0]); 3367 NewV = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, NewV); 3368 3369 // Rewrite the MaskVals and assign NewV to V1 if NewV now contains all the 3370 // source words for the shuffle, to aid later transformations. 3371 bool AllWordsInNewV = true; 3372 bool InOrder[2] = { true, true }; 3373 for (unsigned i = 0; i != 8; ++i) { 3374 int idx = MaskVals[i]; 3375 if (idx != (int)i) 3376 InOrder[i/4] = false; 3377 if (idx < 0 || (idx/4) == BestLoQuad || (idx/4) == BestHiQuad) 3378 continue; 3379 AllWordsInNewV = false; 3380 break; 3381 } 3382 3383 bool pshuflw = AllWordsInNewV, pshufhw = AllWordsInNewV; 3384 if (AllWordsInNewV) { 3385 for (int i = 0; i != 8; ++i) { 3386 int idx = MaskVals[i]; 3387 if (idx < 0) 3388 continue; 3389 idx = MaskVals[i] = (idx / 4) == BestLoQuad ? (idx & 3) : (idx & 3) + 4; 3390 if ((idx != i) && idx < 4) 3391 pshufhw = false; 3392 if ((idx != i) && idx > 3) 3393 pshuflw = false; 3394 } 3395 V1 = NewV; 3396 V2Used = false; 3397 BestLoQuad = 0; 3398 BestHiQuad = 1; 3399 } 3400 3401 // If we've eliminated the use of V2, and the new mask is a pshuflw or 3402 // pshufhw, that's as cheap as it gets. Return the new shuffle. 3403 if ((pshufhw && InOrder[0]) || (pshuflw && InOrder[1])) { 3404 return DAG.getVectorShuffle(MVT::v8i16, dl, NewV, 3405 DAG.getUNDEF(MVT::v8i16), &MaskVals[0]); 3406 } 3407 } 3408 3409 // If we have SSSE3, and all words of the result are from 1 input vector, 3410 // case 2 is generated, otherwise case 3 is generated. If no SSSE3 3411 // is present, fall back to case 4. 3412 if (TLI.getSubtarget()->hasSSSE3()) { 3413 SmallVector<SDValue,16> pshufbMask; 3414 3415 // If we have elements from both input vectors, set the high bit of the 3416 // shuffle mask element to zero out elements that come from V2 in the V1 3417 // mask, and elements that come from V1 in the V2 mask, so that the two 3418 // results can be OR'd together. 3419 bool TwoInputs = V1Used && V2Used; 3420 for (unsigned i = 0; i != 8; ++i) { 3421 int EltIdx = MaskVals[i] * 2; 3422 if (TwoInputs && (EltIdx >= 16)) { 3423 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 3424 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 3425 continue; 3426 } 3427 pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8)); 3428 pshufbMask.push_back(DAG.getConstant(EltIdx+1, MVT::i8)); 3429 } 3430 V1 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, V1); 3431 V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1, 3432 DAG.getNode(ISD::BUILD_VECTOR, dl, 3433 MVT::v16i8, &pshufbMask[0], 16)); 3434 if (!TwoInputs) 3435 return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V1); 3436 3437 // Calculate the shuffle mask for the second input, shuffle it, and 3438 // OR it with the first shuffled input. 3439 pshufbMask.clear(); 3440 for (unsigned i = 0; i != 8; ++i) { 3441 int EltIdx = MaskVals[i] * 2; 3442 if (EltIdx < 16) { 3443 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 3444 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 3445 continue; 3446 } 3447 pshufbMask.push_back(DAG.getConstant(EltIdx - 16, MVT::i8)); 3448 pshufbMask.push_back(DAG.getConstant(EltIdx - 15, MVT::i8)); 3449 } 3450 V2 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, V2); 3451 V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2, 3452 DAG.getNode(ISD::BUILD_VECTOR, dl, 3453 MVT::v16i8, &pshufbMask[0], 16)); 3454 V1 = DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2); 3455 return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V1); 3456 } 3457 3458 // If BestLoQuad >= 0, generate a pshuflw to put the low elements in order, 3459 // and update MaskVals with new element order. 3460 BitVector InOrder(8); 3461 if (BestLoQuad >= 0) { 3462 SmallVector<int, 8> MaskV; 3463 for (int i = 0; i != 4; ++i) { 3464 int idx = MaskVals[i]; 3465 if (idx < 0) { 3466 MaskV.push_back(-1); 3467 InOrder.set(i); 3468 } else if ((idx / 4) == BestLoQuad) { 3469 MaskV.push_back(idx & 3); 3470 InOrder.set(i); 3471 } else { 3472 MaskV.push_back(-1); 3473 } 3474 } 3475 for (unsigned i = 4; i != 8; ++i) 3476 MaskV.push_back(i); 3477 NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16), 3478 &MaskV[0]); 3479 } 3480 3481 // If BestHi >= 0, generate a pshufhw to put the high elements in order, 3482 // and update MaskVals with the new element order. 3483 if (BestHiQuad >= 0) { 3484 SmallVector<int, 8> MaskV; 3485 for (unsigned i = 0; i != 4; ++i) 3486 MaskV.push_back(i); 3487 for (unsigned i = 4; i != 8; ++i) { 3488 int idx = MaskVals[i]; 3489 if (idx < 0) { 3490 MaskV.push_back(-1); 3491 InOrder.set(i); 3492 } else if ((idx / 4) == BestHiQuad) { 3493 MaskV.push_back((idx & 3) + 4); 3494 InOrder.set(i); 3495 } else { 3496 MaskV.push_back(-1); 3497 } 3498 } 3499 NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16), 3500 &MaskV[0]); 3501 } 3502 3503 // In case BestHi & BestLo were both -1, which means each quadword has a word 3504 // from each of the four input quadwords, calculate the InOrder bitvector now 3505 // before falling through to the insert/extract cleanup. 3506 if (BestLoQuad == -1 && BestHiQuad == -1) { 3507 NewV = V1; 3508 for (int i = 0; i != 8; ++i) 3509 if (MaskVals[i] < 0 || MaskVals[i] == i) 3510 InOrder.set(i); 3511 } 3512 3513 // The other elements are put in the right place using pextrw and pinsrw. 3514 for (unsigned i = 0; i != 8; ++i) { 3515 if (InOrder[i]) 3516 continue; 3517 int EltIdx = MaskVals[i]; 3518 if (EltIdx < 0) 3519 continue; 3520 SDValue ExtOp = (EltIdx < 8) 3521 ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V1, 3522 DAG.getIntPtrConstant(EltIdx)) 3523 : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V2, 3524 DAG.getIntPtrConstant(EltIdx - 8)); 3525 NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, ExtOp, 3526 DAG.getIntPtrConstant(i)); 3527 } 3528 return NewV; 3529} 3530 3531// v16i8 shuffles - Prefer shuffles in the following order: 3532// 1. [ssse3] 1 x pshufb 3533// 2. [ssse3] 2 x pshufb + 1 x por 3534// 3. [all] v8i16 shuffle + N x pextrw + rotate + pinsrw 3535static 3536SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp, 3537 SelectionDAG &DAG, X86TargetLowering &TLI) { 3538 SDValue V1 = SVOp->getOperand(0); 3539 SDValue V2 = SVOp->getOperand(1); 3540 DebugLoc dl = SVOp->getDebugLoc(); 3541 SmallVector<int, 16> MaskVals; 3542 SVOp->getMask(MaskVals); 3543 3544 // If we have SSSE3, case 1 is generated when all result bytes come from 3545 // one of the inputs. Otherwise, case 2 is generated. If no SSSE3 is 3546 // present, fall back to case 3. 3547 // FIXME: kill V2Only once shuffles are canonizalized by getNode. 3548 bool V1Only = true; 3549 bool V2Only = true; 3550 for (unsigned i = 0; i < 16; ++i) { 3551 int EltIdx = MaskVals[i]; 3552 if (EltIdx < 0) 3553 continue; 3554 if (EltIdx < 16) 3555 V2Only = false; 3556 else 3557 V1Only = false; 3558 } 3559 3560 // If SSSE3, use 1 pshufb instruction per vector with elements in the result. 3561 if (TLI.getSubtarget()->hasSSSE3()) { 3562 SmallVector<SDValue,16> pshufbMask; 3563 3564 // If all result elements are from one input vector, then only translate 3565 // undef mask values to 0x80 (zero out result) in the pshufb mask. 3566 // 3567 // Otherwise, we have elements from both input vectors, and must zero out 3568 // elements that come from V2 in the first mask, and V1 in the second mask 3569 // so that we can OR them together. 3570 bool TwoInputs = !(V1Only || V2Only); 3571 for (unsigned i = 0; i != 16; ++i) { 3572 int EltIdx = MaskVals[i]; 3573 if (EltIdx < 0 || (TwoInputs && EltIdx >= 16)) { 3574 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 3575 continue; 3576 } 3577 pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8)); 3578 } 3579 // If all the elements are from V2, assign it to V1 and return after 3580 // building the first pshufb. 3581 if (V2Only) 3582 V1 = V2; 3583 V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1, 3584 DAG.getNode(ISD::BUILD_VECTOR, dl, 3585 MVT::v16i8, &pshufbMask[0], 16)); 3586 if (!TwoInputs) 3587 return V1; 3588 3589 // Calculate the shuffle mask for the second input, shuffle it, and 3590 // OR it with the first shuffled input. 3591 pshufbMask.clear(); 3592 for (unsigned i = 0; i != 16; ++i) { 3593 int EltIdx = MaskVals[i]; 3594 if (EltIdx < 16) { 3595 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 3596 continue; 3597 } 3598 pshufbMask.push_back(DAG.getConstant(EltIdx - 16, MVT::i8)); 3599 } 3600 V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2, 3601 DAG.getNode(ISD::BUILD_VECTOR, dl, 3602 MVT::v16i8, &pshufbMask[0], 16)); 3603 return DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2); 3604 } 3605 3606 // No SSSE3 - Calculate in place words and then fix all out of place words 3607 // With 0-16 extracts & inserts. Worst case is 16 bytes out of order from 3608 // the 16 different words that comprise the two doublequadword input vectors. 3609 V1 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V1); 3610 V2 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V2); 3611 SDValue NewV = V2Only ? V2 : V1; 3612 for (int i = 0; i != 8; ++i) { 3613 int Elt0 = MaskVals[i*2]; 3614 int Elt1 = MaskVals[i*2+1]; 3615 3616 // This word of the result is all undef, skip it. 3617 if (Elt0 < 0 && Elt1 < 0) 3618 continue; 3619 3620 // This word of the result is already in the correct place, skip it. 3621 if (V1Only && (Elt0 == i*2) && (Elt1 == i*2+1)) 3622 continue; 3623 if (V2Only && (Elt0 == i*2+16) && (Elt1 == i*2+17)) 3624 continue; 3625 3626 SDValue Elt0Src = Elt0 < 16 ? V1 : V2; 3627 SDValue Elt1Src = Elt1 < 16 ? V1 : V2; 3628 SDValue InsElt; 3629 3630 // If Elt0 and Elt1 are defined, are consecutive, and can be load 3631 // using a single extract together, load it and store it. 3632 if ((Elt0 >= 0) && ((Elt0 + 1) == Elt1) && ((Elt0 & 1) == 0)) { 3633 InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src, 3634 DAG.getIntPtrConstant(Elt1 / 2)); 3635 NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt, 3636 DAG.getIntPtrConstant(i)); 3637 continue; 3638 } 3639 3640 // If Elt1 is defined, extract it from the appropriate source. If the 3641 // source byte is not also odd, shift the extracted word left 8 bits 3642 // otherwise clear the bottom 8 bits if we need to do an or. 3643 if (Elt1 >= 0) { 3644 InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src, 3645 DAG.getIntPtrConstant(Elt1 / 2)); 3646 if ((Elt1 & 1) == 0) 3647 InsElt = DAG.getNode(ISD::SHL, dl, MVT::i16, InsElt, 3648 DAG.getConstant(8, TLI.getShiftAmountTy())); 3649 else if (Elt0 >= 0) 3650 InsElt = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt, 3651 DAG.getConstant(0xFF00, MVT::i16)); 3652 } 3653 // If Elt0 is defined, extract it from the appropriate source. If the 3654 // source byte is not also even, shift the extracted word right 8 bits. If 3655 // Elt1 was also defined, OR the extracted values together before 3656 // inserting them in the result. 3657 if (Elt0 >= 0) { 3658 SDValue InsElt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, 3659 Elt0Src, DAG.getIntPtrConstant(Elt0 / 2)); 3660 if ((Elt0 & 1) != 0) 3661 InsElt0 = DAG.getNode(ISD::SRL, dl, MVT::i16, InsElt0, 3662 DAG.getConstant(8, TLI.getShiftAmountTy())); 3663 else if (Elt1 >= 0) 3664 InsElt0 = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt0, 3665 DAG.getConstant(0x00FF, MVT::i16)); 3666 InsElt = Elt1 >= 0 ? DAG.getNode(ISD::OR, dl, MVT::i16, InsElt, InsElt0) 3667 : InsElt0; 3668 } 3669 NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt, 3670 DAG.getIntPtrConstant(i)); 3671 } 3672 return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, NewV); 3673} 3674 3675/// RewriteAsNarrowerShuffle - Try rewriting v8i16 and v16i8 shuffles as 4 wide 3676/// ones, or rewriting v4i32 / v2f32 as 2 wide ones if possible. This can be 3677/// done when every pair / quad of shuffle mask elements point to elements in 3678/// the right sequence. e.g. 3679/// vector_shuffle <>, <>, < 3, 4, | 10, 11, | 0, 1, | 14, 15> 3680static 3681SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp, 3682 SelectionDAG &DAG, 3683 TargetLowering &TLI, DebugLoc dl) { 3684 MVT VT = SVOp->getValueType(0); 3685 SDValue V1 = SVOp->getOperand(0); 3686 SDValue V2 = SVOp->getOperand(1); 3687 unsigned NumElems = VT.getVectorNumElements(); 3688 unsigned NewWidth = (NumElems == 4) ? 2 : 4; 3689 MVT MaskVT = MVT::getIntVectorWithNumElements(NewWidth); 3690 MVT MaskEltVT = MaskVT.getVectorElementType(); 3691 MVT NewVT = MaskVT; 3692 switch (VT.getSimpleVT()) { 3693 default: assert(false && "Unexpected!"); 3694 case MVT::v4f32: NewVT = MVT::v2f64; break; 3695 case MVT::v4i32: NewVT = MVT::v2i64; break; 3696 case MVT::v8i16: NewVT = MVT::v4i32; break; 3697 case MVT::v16i8: NewVT = MVT::v4i32; break; 3698 } 3699 3700 if (NewWidth == 2) { 3701 if (VT.isInteger()) 3702 NewVT = MVT::v2i64; 3703 else 3704 NewVT = MVT::v2f64; 3705 } 3706 int Scale = NumElems / NewWidth; 3707 SmallVector<int, 8> MaskVec; 3708 for (unsigned i = 0; i < NumElems; i += Scale) { 3709 int StartIdx = -1; 3710 for (int j = 0; j < Scale; ++j) { 3711 int EltIdx = SVOp->getMaskElt(i+j); 3712 if (EltIdx < 0) 3713 continue; 3714 if (StartIdx == -1) 3715 StartIdx = EltIdx - (EltIdx % Scale); 3716 if (EltIdx != StartIdx + j) 3717 return SDValue(); 3718 } 3719 if (StartIdx == -1) 3720 MaskVec.push_back(-1); 3721 else 3722 MaskVec.push_back(StartIdx / Scale); 3723 } 3724 3725 V1 = DAG.getNode(ISD::BIT_CONVERT, dl, NewVT, V1); 3726 V2 = DAG.getNode(ISD::BIT_CONVERT, dl, NewVT, V2); 3727 return DAG.getVectorShuffle(NewVT, dl, V1, V2, &MaskVec[0]); 3728} 3729 3730/// getVZextMovL - Return a zero-extending vector move low node. 3731/// 3732static SDValue getVZextMovL(MVT VT, MVT OpVT, 3733 SDValue SrcOp, SelectionDAG &DAG, 3734 const X86Subtarget *Subtarget, DebugLoc dl) { 3735 if (VT == MVT::v2f64 || VT == MVT::v4f32) { 3736 LoadSDNode *LD = NULL; 3737 if (!isScalarLoadToVector(SrcOp.getNode(), &LD)) 3738 LD = dyn_cast<LoadSDNode>(SrcOp); 3739 if (!LD) { 3740 // movssrr and movsdrr do not clear top bits. Try to use movd, movq 3741 // instead. 3742 MVT EVT = (OpVT == MVT::v2f64) ? MVT::i64 : MVT::i32; 3743 if ((EVT != MVT::i64 || Subtarget->is64Bit()) && 3744 SrcOp.getOpcode() == ISD::SCALAR_TO_VECTOR && 3745 SrcOp.getOperand(0).getOpcode() == ISD::BIT_CONVERT && 3746 SrcOp.getOperand(0).getOperand(0).getValueType() == EVT) { 3747 // PR2108 3748 OpVT = (OpVT == MVT::v2f64) ? MVT::v2i64 : MVT::v4i32; 3749 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, 3750 DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT, 3751 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, 3752 OpVT, 3753 SrcOp.getOperand(0) 3754 .getOperand(0)))); 3755 } 3756 } 3757 } 3758 3759 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, 3760 DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT, 3761 DAG.getNode(ISD::BIT_CONVERT, dl, 3762 OpVT, SrcOp))); 3763} 3764 3765/// LowerVECTOR_SHUFFLE_4wide - Handle all 4 wide cases with a number of 3766/// shuffles. 3767static SDValue 3768LowerVECTOR_SHUFFLE_4wide(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { 3769 SDValue V1 = SVOp->getOperand(0); 3770 SDValue V2 = SVOp->getOperand(1); 3771 DebugLoc dl = SVOp->getDebugLoc(); 3772 MVT VT = SVOp->getValueType(0); 3773 3774 SmallVector<std::pair<int, int>, 8> Locs; 3775 Locs.resize(4); 3776 SmallVector<int, 8> Mask1(4U, -1); 3777 SmallVector<int, 8> PermMask; 3778 SVOp->getMask(PermMask); 3779 3780 unsigned NumHi = 0; 3781 unsigned NumLo = 0; 3782 for (unsigned i = 0; i != 4; ++i) { 3783 int Idx = PermMask[i]; 3784 if (Idx < 0) { 3785 Locs[i] = std::make_pair(-1, -1); 3786 } else { 3787 assert(Idx < 8 && "Invalid VECTOR_SHUFFLE index!"); 3788 if (Idx < 4) { 3789 Locs[i] = std::make_pair(0, NumLo); 3790 Mask1[NumLo] = Idx; 3791 NumLo++; 3792 } else { 3793 Locs[i] = std::make_pair(1, NumHi); 3794 if (2+NumHi < 4) 3795 Mask1[2+NumHi] = Idx; 3796 NumHi++; 3797 } 3798 } 3799 } 3800 3801 if (NumLo <= 2 && NumHi <= 2) { 3802 // If no more than two elements come from either vector. This can be 3803 // implemented with two shuffles. First shuffle gather the elements. 3804 // The second shuffle, which takes the first shuffle as both of its 3805 // vector operands, put the elements into the right order. 3806 V1 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]); 3807 3808 SmallVector<int, 8> Mask2(4U, -1); 3809 3810 for (unsigned i = 0; i != 4; ++i) { 3811 if (Locs[i].first == -1) 3812 continue; 3813 else { 3814 unsigned Idx = (i < 2) ? 0 : 4; 3815 Idx += Locs[i].first * 2 + Locs[i].second; 3816 Mask2[i] = Idx; 3817 } 3818 } 3819 3820 return DAG.getVectorShuffle(VT, dl, V1, V1, &Mask2[0]); 3821 } else if (NumLo == 3 || NumHi == 3) { 3822 // Otherwise, we must have three elements from one vector, call it X, and 3823 // one element from the other, call it Y. First, use a shufps to build an 3824 // intermediate vector with the one element from Y and the element from X 3825 // that will be in the same half in the final destination (the indexes don't 3826 // matter). Then, use a shufps to build the final vector, taking the half 3827 // containing the element from Y from the intermediate, and the other half 3828 // from X. 3829 if (NumHi == 3) { 3830 // Normalize it so the 3 elements come from V1. 3831 CommuteVectorShuffleMask(PermMask, VT); 3832 std::swap(V1, V2); 3833 } 3834 3835 // Find the element from V2. 3836 unsigned HiIndex; 3837 for (HiIndex = 0; HiIndex < 3; ++HiIndex) { 3838 int Val = PermMask[HiIndex]; 3839 if (Val < 0) 3840 continue; 3841 if (Val >= 4) 3842 break; 3843 } 3844 3845 Mask1[0] = PermMask[HiIndex]; 3846 Mask1[1] = -1; 3847 Mask1[2] = PermMask[HiIndex^1]; 3848 Mask1[3] = -1; 3849 V2 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]); 3850 3851 if (HiIndex >= 2) { 3852 Mask1[0] = PermMask[0]; 3853 Mask1[1] = PermMask[1]; 3854 Mask1[2] = HiIndex & 1 ? 6 : 4; 3855 Mask1[3] = HiIndex & 1 ? 4 : 6; 3856 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]); 3857 } else { 3858 Mask1[0] = HiIndex & 1 ? 2 : 0; 3859 Mask1[1] = HiIndex & 1 ? 0 : 2; 3860 Mask1[2] = PermMask[2]; 3861 Mask1[3] = PermMask[3]; 3862 if (Mask1[2] >= 0) 3863 Mask1[2] += 4; 3864 if (Mask1[3] >= 0) 3865 Mask1[3] += 4; 3866 return DAG.getVectorShuffle(VT, dl, V2, V1, &Mask1[0]); 3867 } 3868 } 3869 3870 // Break it into (shuffle shuffle_hi, shuffle_lo). 3871 Locs.clear(); 3872 SmallVector<int,8> LoMask(4U, -1); 3873 SmallVector<int,8> HiMask(4U, -1); 3874 3875 SmallVector<int,8> *MaskPtr = &LoMask; 3876 unsigned MaskIdx = 0; 3877 unsigned LoIdx = 0; 3878 unsigned HiIdx = 2; 3879 for (unsigned i = 0; i != 4; ++i) { 3880 if (i == 2) { 3881 MaskPtr = &HiMask; 3882 MaskIdx = 1; 3883 LoIdx = 0; 3884 HiIdx = 2; 3885 } 3886 int Idx = PermMask[i]; 3887 if (Idx < 0) { 3888 Locs[i] = std::make_pair(-1, -1); 3889 } else if (Idx < 4) { 3890 Locs[i] = std::make_pair(MaskIdx, LoIdx); 3891 (*MaskPtr)[LoIdx] = Idx; 3892 LoIdx++; 3893 } else { 3894 Locs[i] = std::make_pair(MaskIdx, HiIdx); 3895 (*MaskPtr)[HiIdx] = Idx; 3896 HiIdx++; 3897 } 3898 } 3899 3900 SDValue LoShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &LoMask[0]); 3901 SDValue HiShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &HiMask[0]); 3902 SmallVector<int, 8> MaskOps; 3903 for (unsigned i = 0; i != 4; ++i) { 3904 if (Locs[i].first == -1) { 3905 MaskOps.push_back(-1); 3906 } else { 3907 unsigned Idx = Locs[i].first * 4 + Locs[i].second; 3908 MaskOps.push_back(Idx); 3909 } 3910 } 3911 return DAG.getVectorShuffle(VT, dl, LoShuffle, HiShuffle, &MaskOps[0]); 3912} 3913 3914SDValue 3915X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) { 3916 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); 3917 SDValue V1 = Op.getOperand(0); 3918 SDValue V2 = Op.getOperand(1); 3919 MVT VT = Op.getValueType(); 3920 DebugLoc dl = Op.getDebugLoc(); 3921 unsigned NumElems = VT.getVectorNumElements(); 3922 bool isMMX = VT.getSizeInBits() == 64; 3923 bool V1IsUndef = V1.getOpcode() == ISD::UNDEF; 3924 bool V2IsUndef = V2.getOpcode() == ISD::UNDEF; 3925 bool V1IsSplat = false; 3926 bool V2IsSplat = false; 3927 3928 if (isZeroShuffle(SVOp)) 3929 return getZeroVector(VT, Subtarget->hasSSE2(), DAG, dl); 3930 3931 // Promote splats to v4f32. 3932 if (SVOp->isSplat()) { 3933 if (isMMX || NumElems < 4) 3934 return Op; 3935 return PromoteSplat(SVOp, DAG, Subtarget->hasSSE2()); 3936 } 3937 3938 // If the shuffle can be profitably rewritten as a narrower shuffle, then 3939 // do it! 3940 if (VT == MVT::v8i16 || VT == MVT::v16i8) { 3941 SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, *this, dl); 3942 if (NewOp.getNode()) 3943 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, 3944 LowerVECTOR_SHUFFLE(NewOp, DAG)); 3945 } else if ((VT == MVT::v4i32 || (VT == MVT::v4f32 && Subtarget->hasSSE2()))) { 3946 // FIXME: Figure out a cleaner way to do this. 3947 // Try to make use of movq to zero out the top part. 3948 if (ISD::isBuildVectorAllZeros(V2.getNode())) { 3949 SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, *this, dl); 3950 if (NewOp.getNode()) { 3951 if (isCommutedMOVL(cast<ShuffleVectorSDNode>(NewOp), true, false)) 3952 return getVZextMovL(VT, NewOp.getValueType(), NewOp.getOperand(0), 3953 DAG, Subtarget, dl); 3954 } 3955 } else if (ISD::isBuildVectorAllZeros(V1.getNode())) { 3956 SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, *this, dl); 3957 if (NewOp.getNode() && X86::isMOVLMask(cast<ShuffleVectorSDNode>(NewOp))) 3958 return getVZextMovL(VT, NewOp.getValueType(), NewOp.getOperand(1), 3959 DAG, Subtarget, dl); 3960 } 3961 } 3962 3963 if (X86::isPSHUFDMask(SVOp)) 3964 return Op; 3965 3966 // Check if this can be converted into a logical shift. 3967 bool isLeft = false; 3968 unsigned ShAmt = 0; 3969 SDValue ShVal; 3970 bool isShift = getSubtarget()->hasSSE2() && 3971 isVectorShift(SVOp, DAG, isLeft, ShVal, ShAmt); 3972 if (isShift && ShVal.hasOneUse()) { 3973 // If the shifted value has multiple uses, it may be cheaper to use 3974 // v_set0 + movlhps or movhlps, etc. 3975 MVT EVT = VT.getVectorElementType(); 3976 ShAmt *= EVT.getSizeInBits(); 3977 return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl); 3978 } 3979 3980 if (X86::isMOVLMask(SVOp)) { 3981 if (V1IsUndef) 3982 return V2; 3983 if (ISD::isBuildVectorAllZeros(V1.getNode())) 3984 return getVZextMovL(VT, VT, V2, DAG, Subtarget, dl); 3985 if (!isMMX) 3986 return Op; 3987 } 3988 3989 // FIXME: fold these into legal mask. 3990 if (!isMMX && (X86::isMOVSHDUPMask(SVOp) || 3991 X86::isMOVSLDUPMask(SVOp) || 3992 X86::isMOVHLPSMask(SVOp) || 3993 X86::isMOVHPMask(SVOp) || 3994 X86::isMOVLPMask(SVOp))) 3995 return Op; 3996 3997 if (ShouldXformToMOVHLPS(SVOp) || 3998 ShouldXformToMOVLP(V1.getNode(), V2.getNode(), SVOp)) 3999 return CommuteVectorShuffle(SVOp, DAG); 4000 4001 if (isShift) { 4002 // No better options. Use a vshl / vsrl. 4003 MVT EVT = VT.getVectorElementType(); 4004 ShAmt *= EVT.getSizeInBits(); 4005 return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl); 4006 } 4007 4008 bool Commuted = false; 4009 // FIXME: This should also accept a bitcast of a splat? Be careful, not 4010 // 1,1,1,1 -> v8i16 though. 4011 V1IsSplat = isSplatVector(V1.getNode()); 4012 V2IsSplat = isSplatVector(V2.getNode()); 4013 4014 // Canonicalize the splat or undef, if present, to be on the RHS. 4015 if ((V1IsSplat || V1IsUndef) && !(V2IsSplat || V2IsUndef)) { 4016 Op = CommuteVectorShuffle(SVOp, DAG); 4017 SVOp = cast<ShuffleVectorSDNode>(Op); 4018 V1 = SVOp->getOperand(0); 4019 V2 = SVOp->getOperand(1); 4020 std::swap(V1IsSplat, V2IsSplat); 4021 std::swap(V1IsUndef, V2IsUndef); 4022 Commuted = true; 4023 } 4024 4025 if (isCommutedMOVL(SVOp, V2IsSplat, V2IsUndef)) { 4026 // Shuffling low element of v1 into undef, just return v1. 4027 if (V2IsUndef) 4028 return V1; 4029 // If V2 is a splat, the mask may be malformed such as <4,3,3,3>, which 4030 // the instruction selector will not match, so get a canonical MOVL with 4031 // swapped operands to undo the commute. 4032 return getMOVL(DAG, dl, VT, V2, V1); 4033 } 4034 4035 if (X86::isUNPCKL_v_undef_Mask(SVOp) || 4036 X86::isUNPCKH_v_undef_Mask(SVOp) || 4037 X86::isUNPCKLMask(SVOp) || 4038 X86::isUNPCKHMask(SVOp)) 4039 return Op; 4040 4041 if (V2IsSplat) { 4042 // Normalize mask so all entries that point to V2 points to its first 4043 // element then try to match unpck{h|l} again. If match, return a 4044 // new vector_shuffle with the corrected mask. 4045 SDValue NewMask = NormalizeMask(SVOp, DAG); 4046 ShuffleVectorSDNode *NSVOp = cast<ShuffleVectorSDNode>(NewMask); 4047 if (NSVOp != SVOp) { 4048 if (X86::isUNPCKLMask(NSVOp, true)) { 4049 return NewMask; 4050 } else if (X86::isUNPCKHMask(NSVOp, true)) { 4051 return NewMask; 4052 } 4053 } 4054 } 4055 4056 if (Commuted) { 4057 // Commute is back and try unpck* again. 4058 // FIXME: this seems wrong. 4059 SDValue NewOp = CommuteVectorShuffle(SVOp, DAG); 4060 ShuffleVectorSDNode *NewSVOp = cast<ShuffleVectorSDNode>(NewOp); 4061 if (X86::isUNPCKL_v_undef_Mask(NewSVOp) || 4062 X86::isUNPCKH_v_undef_Mask(NewSVOp) || 4063 X86::isUNPCKLMask(NewSVOp) || 4064 X86::isUNPCKHMask(NewSVOp)) 4065 return NewOp; 4066 } 4067 4068 // FIXME: for mmx, bitcast v2i32 to v4i16 for shuffle. 4069 4070 // Normalize the node to match x86 shuffle ops if needed 4071 if (!isMMX && V2.getOpcode() != ISD::UNDEF && isCommutedSHUFP(SVOp)) 4072 return CommuteVectorShuffle(SVOp, DAG); 4073 4074 // Check for legal shuffle and return? 4075 SmallVector<int, 16> PermMask; 4076 SVOp->getMask(PermMask); 4077 if (isShuffleMaskLegal(PermMask, VT)) 4078 return Op; 4079 4080 // Handle v8i16 specifically since SSE can do byte extraction and insertion. 4081 if (VT == MVT::v8i16) { 4082 SDValue NewOp = LowerVECTOR_SHUFFLEv8i16(SVOp, DAG, *this); 4083 if (NewOp.getNode()) 4084 return NewOp; 4085 } 4086 4087 if (VT == MVT::v16i8) { 4088 SDValue NewOp = LowerVECTOR_SHUFFLEv16i8(SVOp, DAG, *this); 4089 if (NewOp.getNode()) 4090 return NewOp; 4091 } 4092 4093 // Handle all 4 wide cases with a number of shuffles except for MMX. 4094 if (NumElems == 4 && !isMMX) 4095 return LowerVECTOR_SHUFFLE_4wide(SVOp, DAG); 4096 4097 return SDValue(); 4098} 4099 4100SDValue 4101X86TargetLowering::LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, 4102 SelectionDAG &DAG) { 4103 MVT VT = Op.getValueType(); 4104 DebugLoc dl = Op.getDebugLoc(); 4105 if (VT.getSizeInBits() == 8) { 4106 SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32, 4107 Op.getOperand(0), Op.getOperand(1)); 4108 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract, 4109 DAG.getValueType(VT)); 4110 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); 4111 } else if (VT.getSizeInBits() == 16) { 4112 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 4113 // If Idx is 0, it's cheaper to do a move instead of a pextrw. 4114 if (Idx == 0) 4115 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, 4116 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, 4117 DAG.getNode(ISD::BIT_CONVERT, dl, 4118 MVT::v4i32, 4119 Op.getOperand(0)), 4120 Op.getOperand(1))); 4121 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32, 4122 Op.getOperand(0), Op.getOperand(1)); 4123 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract, 4124 DAG.getValueType(VT)); 4125 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); 4126 } else if (VT == MVT::f32) { 4127 // EXTRACTPS outputs to a GPR32 register which will require a movd to copy 4128 // the result back to FR32 register. It's only worth matching if the 4129 // result has a single use which is a store or a bitcast to i32. And in 4130 // the case of a store, it's not worth it if the index is a constant 0, 4131 // because a MOVSSmr can be used instead, which is smaller and faster. 4132 if (!Op.hasOneUse()) 4133 return SDValue(); 4134 SDNode *User = *Op.getNode()->use_begin(); 4135 if ((User->getOpcode() != ISD::STORE || 4136 (isa<ConstantSDNode>(Op.getOperand(1)) && 4137 cast<ConstantSDNode>(Op.getOperand(1))->isNullValue())) && 4138 (User->getOpcode() != ISD::BIT_CONVERT || 4139 User->getValueType(0) != MVT::i32)) 4140 return SDValue(); 4141 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, 4142 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4i32, 4143 Op.getOperand(0)), 4144 Op.getOperand(1)); 4145 return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::f32, Extract); 4146 } else if (VT == MVT::i32) { 4147 // ExtractPS works with constant index. 4148 if (isa<ConstantSDNode>(Op.getOperand(1))) 4149 return Op; 4150 } 4151 return SDValue(); 4152} 4153 4154 4155SDValue 4156X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) { 4157 if (!isa<ConstantSDNode>(Op.getOperand(1))) 4158 return SDValue(); 4159 4160 if (Subtarget->hasSSE41()) { 4161 SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG); 4162 if (Res.getNode()) 4163 return Res; 4164 } 4165 4166 MVT VT = Op.getValueType(); 4167 DebugLoc dl = Op.getDebugLoc(); 4168 // TODO: handle v16i8. 4169 if (VT.getSizeInBits() == 16) { 4170 SDValue Vec = Op.getOperand(0); 4171 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 4172 if (Idx == 0) 4173 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, 4174 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, 4175 DAG.getNode(ISD::BIT_CONVERT, dl, 4176 MVT::v4i32, Vec), 4177 Op.getOperand(1))); 4178 // Transform it so it match pextrw which produces a 32-bit result. 4179 MVT EVT = (MVT::SimpleValueType)(VT.getSimpleVT()+1); 4180 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, EVT, 4181 Op.getOperand(0), Op.getOperand(1)); 4182 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, EVT, Extract, 4183 DAG.getValueType(VT)); 4184 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); 4185 } else if (VT.getSizeInBits() == 32) { 4186 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 4187 if (Idx == 0) 4188 return Op; 4189 4190 // SHUFPS the element to the lowest double word, then movss. 4191 int Mask[4] = { Idx, -1, -1, -1 }; 4192 MVT VVT = Op.getOperand(0).getValueType(); 4193 SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0), 4194 DAG.getUNDEF(VVT), Mask); 4195 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec, 4196 DAG.getIntPtrConstant(0)); 4197 } else if (VT.getSizeInBits() == 64) { 4198 // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b 4199 // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught 4200 // to match extract_elt for f64. 4201 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 4202 if (Idx == 0) 4203 return Op; 4204 4205 // UNPCKHPD the element to the lowest double word, then movsd. 4206 // Note if the lower 64 bits of the result of the UNPCKHPD is then stored 4207 // to a f64mem, the whole operation is folded into a single MOVHPDmr. 4208 int Mask[2] = { 1, -1 }; 4209 MVT VVT = Op.getOperand(0).getValueType(); 4210 SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0), 4211 DAG.getUNDEF(VVT), Mask); 4212 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec, 4213 DAG.getIntPtrConstant(0)); 4214 } 4215 4216 return SDValue(); 4217} 4218 4219SDValue 4220X86TargetLowering::LowerINSERT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG){ 4221 MVT VT = Op.getValueType(); 4222 MVT EVT = VT.getVectorElementType(); 4223 DebugLoc dl = Op.getDebugLoc(); 4224 4225 SDValue N0 = Op.getOperand(0); 4226 SDValue N1 = Op.getOperand(1); 4227 SDValue N2 = Op.getOperand(2); 4228 4229 if ((EVT.getSizeInBits() == 8 || EVT.getSizeInBits() == 16) && 4230 isa<ConstantSDNode>(N2)) { 4231 unsigned Opc = (EVT.getSizeInBits() == 8) ? X86ISD::PINSRB 4232 : X86ISD::PINSRW; 4233 // Transform it so it match pinsr{b,w} which expects a GR32 as its second 4234 // argument. 4235 if (N1.getValueType() != MVT::i32) 4236 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1); 4237 if (N2.getValueType() != MVT::i32) 4238 N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue()); 4239 return DAG.getNode(Opc, dl, VT, N0, N1, N2); 4240 } else if (EVT == MVT::f32 && isa<ConstantSDNode>(N2)) { 4241 // Bits [7:6] of the constant are the source select. This will always be 4242 // zero here. The DAG Combiner may combine an extract_elt index into these 4243 // bits. For example (insert (extract, 3), 2) could be matched by putting 4244 // the '3' into bits [7:6] of X86ISD::INSERTPS. 4245 // Bits [5:4] of the constant are the destination select. This is the 4246 // value of the incoming immediate. 4247 // Bits [3:0] of the constant are the zero mask. The DAG Combiner may 4248 // combine either bitwise AND or insert of float 0.0 to set these bits. 4249 N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue() << 4); 4250 return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2); 4251 } else if (EVT == MVT::i32) { 4252 // InsertPS works with constant index. 4253 if (isa<ConstantSDNode>(N2)) 4254 return Op; 4255 } 4256 return SDValue(); 4257} 4258 4259SDValue 4260X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) { 4261 MVT VT = Op.getValueType(); 4262 MVT EVT = VT.getVectorElementType(); 4263 4264 if (Subtarget->hasSSE41()) 4265 return LowerINSERT_VECTOR_ELT_SSE4(Op, DAG); 4266 4267 if (EVT == MVT::i8) 4268 return SDValue(); 4269 4270 DebugLoc dl = Op.getDebugLoc(); 4271 SDValue N0 = Op.getOperand(0); 4272 SDValue N1 = Op.getOperand(1); 4273 SDValue N2 = Op.getOperand(2); 4274 4275 if (EVT.getSizeInBits() == 16 && isa<ConstantSDNode>(N2)) { 4276 // Transform it so it match pinsrw which expects a 16-bit value in a GR32 4277 // as its second argument. 4278 if (N1.getValueType() != MVT::i32) 4279 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1); 4280 if (N2.getValueType() != MVT::i32) 4281 N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue()); 4282 return DAG.getNode(X86ISD::PINSRW, dl, VT, N0, N1, N2); 4283 } 4284 return SDValue(); 4285} 4286 4287SDValue 4288X86TargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) { 4289 DebugLoc dl = Op.getDebugLoc(); 4290 if (Op.getValueType() == MVT::v2f32) 4291 return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f32, 4292 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i32, 4293 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i32, 4294 Op.getOperand(0)))); 4295 4296 SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0)); 4297 MVT VT = MVT::v2i32; 4298 switch (Op.getValueType().getSimpleVT()) { 4299 default: break; 4300 case MVT::v16i8: 4301 case MVT::v8i16: 4302 VT = MVT::v4i32; 4303 break; 4304 } 4305 return DAG.getNode(ISD::BIT_CONVERT, dl, Op.getValueType(), 4306 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, AnyExt)); 4307} 4308 4309// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as 4310// their target countpart wrapped in the X86ISD::Wrapper node. Suppose N is 4311// one of the above mentioned nodes. It has to be wrapped because otherwise 4312// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only 4313// be used to form addressing mode. These wrapped nodes will be selected 4314// into MOV32ri. 4315SDValue 4316X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) { 4317 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op); 4318 // FIXME there isn't really any debug info here, should come from the parent 4319 DebugLoc dl = CP->getDebugLoc(); 4320 SDValue Result = DAG.getTargetConstantPool(CP->getConstVal(), getPointerTy(), 4321 CP->getAlignment()); 4322 Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result); 4323 // With PIC, the address is actually $g + Offset. 4324 if (getTargetMachine().getRelocationModel() == Reloc::PIC_ && 4325 !Subtarget->isPICStyleRIPRel()) { 4326 Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), 4327 DAG.getNode(X86ISD::GlobalBaseReg, 4328 DebugLoc::getUnknownLoc(), 4329 getPointerTy()), 4330 Result); 4331 } 4332 4333 return Result; 4334} 4335 4336SDValue 4337X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, DebugLoc dl, 4338 int64_t Offset, 4339 SelectionDAG &DAG) const { 4340 bool IsPic = getTargetMachine().getRelocationModel() == Reloc::PIC_; 4341 bool ExtraLoadRequired = 4342 Subtarget->GVRequiresExtraLoad(GV, getTargetMachine(), false); 4343 4344 // Create the TargetGlobalAddress node, folding in the constant 4345 // offset if it is legal. 4346 SDValue Result; 4347 if (!IsPic && !ExtraLoadRequired && isInt32(Offset)) { 4348 Result = DAG.getTargetGlobalAddress(GV, getPointerTy(), Offset); 4349 Offset = 0; 4350 } else 4351 Result = DAG.getTargetGlobalAddress(GV, getPointerTy(), 0); 4352 Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result); 4353 4354 // With PIC, the address is actually $g + Offset. 4355 if (IsPic && !Subtarget->isPICStyleRIPRel()) { 4356 Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), 4357 DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()), 4358 Result); 4359 } 4360 4361 // For Darwin & Mingw32, external and weak symbols are indirect, so we want to 4362 // load the value at address GV, not the value of GV itself. This means that 4363 // the GlobalAddress must be in the base or index register of the address, not 4364 // the GV offset field. Platform check is inside GVRequiresExtraLoad() call 4365 // The same applies for external symbols during PIC codegen 4366 if (ExtraLoadRequired) 4367 Result = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Result, 4368 PseudoSourceValue::getGOT(), 0); 4369 4370 // If there was a non-zero offset that we didn't fold, create an explicit 4371 // addition for it. 4372 if (Offset != 0) 4373 Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), Result, 4374 DAG.getConstant(Offset, getPointerTy())); 4375 4376 return Result; 4377} 4378 4379SDValue 4380X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) { 4381 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); 4382 int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset(); 4383 return LowerGlobalAddress(GV, Op.getDebugLoc(), Offset, DAG); 4384} 4385 4386static SDValue 4387GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA, 4388 SDValue *InFlag, const MVT PtrVT, unsigned ReturnReg) { 4389 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag); 4390 DebugLoc dl = GA->getDebugLoc(); 4391 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), 4392 GA->getValueType(0), 4393 GA->getOffset()); 4394 if (InFlag) { 4395 SDValue Ops[] = { Chain, TGA, *InFlag }; 4396 Chain = DAG.getNode(X86ISD::TLSADDR, dl, NodeTys, Ops, 3); 4397 } else { 4398 SDValue Ops[] = { Chain, TGA }; 4399 Chain = DAG.getNode(X86ISD::TLSADDR, dl, NodeTys, Ops, 2); 4400 } 4401 SDValue Flag = Chain.getValue(1); 4402 return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag); 4403} 4404 4405// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit 4406static SDValue 4407LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG, 4408 const MVT PtrVT) { 4409 SDValue InFlag; 4410 DebugLoc dl = GA->getDebugLoc(); // ? function entry point might be better 4411 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX, 4412 DAG.getNode(X86ISD::GlobalBaseReg, 4413 DebugLoc::getUnknownLoc(), 4414 PtrVT), InFlag); 4415 InFlag = Chain.getValue(1); 4416 4417 return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX); 4418} 4419 4420// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit 4421static SDValue 4422LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG, 4423 const MVT PtrVT) { 4424 return GetTLSADDR(DAG, DAG.getEntryNode(), GA, NULL, PtrVT, X86::RAX); 4425} 4426 4427// Lower ISD::GlobalTLSAddress using the "initial exec" (for no-pic) or 4428// "local exec" model. 4429static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG, 4430 const MVT PtrVT, TLSModel::Model model, 4431 bool is64Bit) { 4432 DebugLoc dl = GA->getDebugLoc(); 4433 // Get the Thread Pointer 4434 SDValue Base = DAG.getNode(X86ISD::SegmentBaseAddress, 4435 DebugLoc::getUnknownLoc(), PtrVT, 4436 DAG.getRegister(is64Bit? X86::FS : X86::GS, 4437 MVT::i32)); 4438 4439 SDValue ThreadPointer = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Base, 4440 NULL, 0); 4441 4442 // emit "addl x@ntpoff,%eax" (local exec) or "addl x@indntpoff,%eax" (initial 4443 // exec) 4444 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), 4445 GA->getValueType(0), 4446 GA->getOffset()); 4447 SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, TGA); 4448 4449 if (model == TLSModel::InitialExec) 4450 Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset, 4451 PseudoSourceValue::getGOT(), 0); 4452 4453 // The address of the thread local variable is the add of the thread 4454 // pointer with the offset of the variable. 4455 return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset); 4456} 4457 4458SDValue 4459X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) { 4460 // TODO: implement the "local dynamic" model 4461 // TODO: implement the "initial exec"model for pic executables 4462 assert(Subtarget->isTargetELF() && 4463 "TLS not implemented for non-ELF targets"); 4464 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op); 4465 GlobalValue *GV = GA->getGlobal(); 4466 TLSModel::Model model = 4467 getTLSModel (GV, getTargetMachine().getRelocationModel()); 4468 if (Subtarget->is64Bit()) { 4469 switch (model) { 4470 case TLSModel::GeneralDynamic: 4471 case TLSModel::LocalDynamic: // not implemented 4472 return LowerToTLSGeneralDynamicModel64(GA, DAG, getPointerTy()); 4473 4474 case TLSModel::InitialExec: 4475 case TLSModel::LocalExec: 4476 return LowerToTLSExecModel(GA, DAG, getPointerTy(), model, true); 4477 } 4478 } else { 4479 switch (model) { 4480 case TLSModel::GeneralDynamic: 4481 case TLSModel::LocalDynamic: // not implemented 4482 return LowerToTLSGeneralDynamicModel32(GA, DAG, getPointerTy()); 4483 4484 case TLSModel::InitialExec: 4485 case TLSModel::LocalExec: 4486 return LowerToTLSExecModel(GA, DAG, getPointerTy(), model, false); 4487 } 4488 } 4489 assert(0 && "Unreachable"); 4490 return SDValue(); 4491} 4492 4493SDValue 4494X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) { 4495 // FIXME there isn't really any debug info here 4496 DebugLoc dl = Op.getDebugLoc(); 4497 const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol(); 4498 SDValue Result = DAG.getTargetExternalSymbol(Sym, getPointerTy()); 4499 Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result); 4500 // With PIC, the address is actually $g + Offset. 4501 if (getTargetMachine().getRelocationModel() == Reloc::PIC_ && 4502 !Subtarget->isPICStyleRIPRel()) { 4503 Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), 4504 DAG.getNode(X86ISD::GlobalBaseReg, 4505 DebugLoc::getUnknownLoc(), 4506 getPointerTy()), 4507 Result); 4508 } 4509 4510 return Result; 4511} 4512 4513SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) { 4514 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op); 4515 // FIXME there isn't really any debug into here 4516 DebugLoc dl = JT->getDebugLoc(); 4517 SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), getPointerTy()); 4518 Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result); 4519 // With PIC, the address is actually $g + Offset. 4520 if (getTargetMachine().getRelocationModel() == Reloc::PIC_ && 4521 !Subtarget->isPICStyleRIPRel()) { 4522 Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), 4523 DAG.getNode(X86ISD::GlobalBaseReg, 4524 DebugLoc::getUnknownLoc(), 4525 getPointerTy()), 4526 Result); 4527 } 4528 4529 return Result; 4530} 4531 4532/// LowerShift - Lower SRA_PARTS and friends, which return two i32 values and 4533/// take a 2 x i32 value to shift plus a shift amount. 4534SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) { 4535 assert(Op.getNumOperands() == 3 && "Not a double-shift!"); 4536 MVT VT = Op.getValueType(); 4537 unsigned VTBits = VT.getSizeInBits(); 4538 DebugLoc dl = Op.getDebugLoc(); 4539 bool isSRA = Op.getOpcode() == ISD::SRA_PARTS; 4540 SDValue ShOpLo = Op.getOperand(0); 4541 SDValue ShOpHi = Op.getOperand(1); 4542 SDValue ShAmt = Op.getOperand(2); 4543 SDValue Tmp1 = isSRA ? 4544 DAG.getNode(ISD::SRA, dl, VT, ShOpHi, 4545 DAG.getConstant(VTBits - 1, MVT::i8)) : 4546 DAG.getConstant(0, VT); 4547 4548 SDValue Tmp2, Tmp3; 4549 if (Op.getOpcode() == ISD::SHL_PARTS) { 4550 Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt); 4551 Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt); 4552 } else { 4553 Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt); 4554 Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, ShAmt); 4555 } 4556 4557 SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt, 4558 DAG.getConstant(VTBits, MVT::i8)); 4559 SDValue Cond = DAG.getNode(X86ISD::CMP, dl, VT, 4560 AndNode, DAG.getConstant(0, MVT::i8)); 4561 4562 SDValue Hi, Lo; 4563 SDValue CC = DAG.getConstant(X86::COND_NE, MVT::i8); 4564 SDValue Ops0[4] = { Tmp2, Tmp3, CC, Cond }; 4565 SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond }; 4566 4567 if (Op.getOpcode() == ISD::SHL_PARTS) { 4568 Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4); 4569 Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4); 4570 } else { 4571 Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4); 4572 Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4); 4573 } 4574 4575 SDValue Ops[2] = { Lo, Hi }; 4576 return DAG.getMergeValues(Ops, 2, dl); 4577} 4578 4579SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) { 4580 MVT SrcVT = Op.getOperand(0).getValueType(); 4581 4582 if (SrcVT.isVector()) { 4583 if (SrcVT == MVT::v2i32 && Op.getValueType() == MVT::v2f64) { 4584 return Op; 4585 } 4586 return SDValue(); 4587 } 4588 4589 assert(SrcVT.getSimpleVT() <= MVT::i64 && SrcVT.getSimpleVT() >= MVT::i16 && 4590 "Unknown SINT_TO_FP to lower!"); 4591 4592 // These are really Legal; return the operand so the caller accepts it as 4593 // Legal. 4594 if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(Op.getValueType())) 4595 return Op; 4596 if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) && 4597 Subtarget->is64Bit()) { 4598 return Op; 4599 } 4600 4601 DebugLoc dl = Op.getDebugLoc(); 4602 unsigned Size = SrcVT.getSizeInBits()/8; 4603 MachineFunction &MF = DAG.getMachineFunction(); 4604 int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size); 4605 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 4606 SDValue Chain = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), 4607 StackSlot, 4608 PseudoSourceValue::getFixedStack(SSFI), 0); 4609 return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG); 4610} 4611 4612SDValue X86TargetLowering::BuildFILD(SDValue Op, MVT SrcVT, SDValue Chain, 4613 SDValue StackSlot, 4614 SelectionDAG &DAG) { 4615 // Build the FILD 4616 DebugLoc dl = Op.getDebugLoc(); 4617 SDVTList Tys; 4618 bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType()); 4619 if (useSSE) 4620 Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Flag); 4621 else 4622 Tys = DAG.getVTList(Op.getValueType(), MVT::Other); 4623 SmallVector<SDValue, 8> Ops; 4624 Ops.push_back(Chain); 4625 Ops.push_back(StackSlot); 4626 Ops.push_back(DAG.getValueType(SrcVT)); 4627 SDValue Result = DAG.getNode(useSSE ? X86ISD::FILD_FLAG : X86ISD::FILD, dl, 4628 Tys, &Ops[0], Ops.size()); 4629 4630 if (useSSE) { 4631 Chain = Result.getValue(1); 4632 SDValue InFlag = Result.getValue(2); 4633 4634 // FIXME: Currently the FST is flagged to the FILD_FLAG. This 4635 // shouldn't be necessary except that RFP cannot be live across 4636 // multiple blocks. When stackifier is fixed, they can be uncoupled. 4637 MachineFunction &MF = DAG.getMachineFunction(); 4638 int SSFI = MF.getFrameInfo()->CreateStackObject(8, 8); 4639 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 4640 Tys = DAG.getVTList(MVT::Other); 4641 SmallVector<SDValue, 8> Ops; 4642 Ops.push_back(Chain); 4643 Ops.push_back(Result); 4644 Ops.push_back(StackSlot); 4645 Ops.push_back(DAG.getValueType(Op.getValueType())); 4646 Ops.push_back(InFlag); 4647 Chain = DAG.getNode(X86ISD::FST, dl, Tys, &Ops[0], Ops.size()); 4648 Result = DAG.getLoad(Op.getValueType(), dl, Chain, StackSlot, 4649 PseudoSourceValue::getFixedStack(SSFI), 0); 4650 } 4651 4652 return Result; 4653} 4654 4655// LowerUINT_TO_FP_i64 - 64-bit unsigned integer to double expansion. 4656SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG) { 4657 // This algorithm is not obvious. Here it is in C code, more or less: 4658 /* 4659 double uint64_to_double( uint32_t hi, uint32_t lo ) { 4660 static const __m128i exp = { 0x4330000045300000ULL, 0 }; 4661 static const __m128d bias = { 0x1.0p84, 0x1.0p52 }; 4662 4663 // Copy ints to xmm registers. 4664 __m128i xh = _mm_cvtsi32_si128( hi ); 4665 __m128i xl = _mm_cvtsi32_si128( lo ); 4666 4667 // Combine into low half of a single xmm register. 4668 __m128i x = _mm_unpacklo_epi32( xh, xl ); 4669 __m128d d; 4670 double sd; 4671 4672 // Merge in appropriate exponents to give the integer bits the right 4673 // magnitude. 4674 x = _mm_unpacklo_epi32( x, exp ); 4675 4676 // Subtract away the biases to deal with the IEEE-754 double precision 4677 // implicit 1. 4678 d = _mm_sub_pd( (__m128d) x, bias ); 4679 4680 // All conversions up to here are exact. The correctly rounded result is 4681 // calculated using the current rounding mode using the following 4682 // horizontal add. 4683 d = _mm_add_sd( d, _mm_unpackhi_pd( d, d ) ); 4684 _mm_store_sd( &sd, d ); // Because we are returning doubles in XMM, this 4685 // store doesn't really need to be here (except 4686 // maybe to zero the other double) 4687 return sd; 4688 } 4689 */ 4690 4691 DebugLoc dl = Op.getDebugLoc(); 4692 4693 // Build some magic constants. 4694 std::vector<Constant*> CV0; 4695 CV0.push_back(ConstantInt::get(APInt(32, 0x45300000))); 4696 CV0.push_back(ConstantInt::get(APInt(32, 0x43300000))); 4697 CV0.push_back(ConstantInt::get(APInt(32, 0))); 4698 CV0.push_back(ConstantInt::get(APInt(32, 0))); 4699 Constant *C0 = ConstantVector::get(CV0); 4700 SDValue CPIdx0 = DAG.getConstantPool(C0, getPointerTy(), 16); 4701 4702 std::vector<Constant*> CV1; 4703 CV1.push_back(ConstantFP::get(APFloat(APInt(64, 0x4530000000000000ULL)))); 4704 CV1.push_back(ConstantFP::get(APFloat(APInt(64, 0x4330000000000000ULL)))); 4705 Constant *C1 = ConstantVector::get(CV1); 4706 SDValue CPIdx1 = DAG.getConstantPool(C1, getPointerTy(), 16); 4707 4708 SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, 4709 DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 4710 Op.getOperand(0), 4711 DAG.getIntPtrConstant(1))); 4712 SDValue XR2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, 4713 DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 4714 Op.getOperand(0), 4715 DAG.getIntPtrConstant(0))); 4716 SDValue Unpck1 = getUnpackl(DAG, dl, MVT::v4i32, XR1, XR2); 4717 SDValue CLod0 = DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0, 4718 PseudoSourceValue::getConstantPool(), 0, 4719 false, 16); 4720 SDValue Unpck2 = getUnpackl(DAG, dl, MVT::v4i32, Unpck1, CLod0); 4721 SDValue XR2F = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f64, Unpck2); 4722 SDValue CLod1 = DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1, 4723 PseudoSourceValue::getConstantPool(), 0, 4724 false, 16); 4725 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1); 4726 4727 // Add the halves; easiest way is to swap them into another reg first. 4728 int ShufMask[2] = { 1, -1 }; 4729 SDValue Shuf = DAG.getVectorShuffle(MVT::v2f64, dl, Sub, 4730 DAG.getUNDEF(MVT::v2f64), ShufMask); 4731 SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuf, Sub); 4732 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Add, 4733 DAG.getIntPtrConstant(0)); 4734} 4735 4736// LowerUINT_TO_FP_i32 - 32-bit unsigned integer to float expansion. 4737SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op, SelectionDAG &DAG) { 4738 DebugLoc dl = Op.getDebugLoc(); 4739 // FP constant to bias correct the final result. 4740 SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), 4741 MVT::f64); 4742 4743 // Load the 32-bit value into an XMM register. 4744 SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, 4745 DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 4746 Op.getOperand(0), 4747 DAG.getIntPtrConstant(0))); 4748 4749 Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, 4750 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f64, Load), 4751 DAG.getIntPtrConstant(0)); 4752 4753 // Or the load with the bias. 4754 SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, 4755 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, 4756 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, 4757 MVT::v2f64, Load)), 4758 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, 4759 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, 4760 MVT::v2f64, Bias))); 4761 Or = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, 4762 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f64, Or), 4763 DAG.getIntPtrConstant(0)); 4764 4765 // Subtract the bias. 4766 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias); 4767 4768 // Handle final rounding. 4769 MVT DestVT = Op.getValueType(); 4770 4771 if (DestVT.bitsLT(MVT::f64)) { 4772 return DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub, 4773 DAG.getIntPtrConstant(0)); 4774 } else if (DestVT.bitsGT(MVT::f64)) { 4775 return DAG.getNode(ISD::FP_EXTEND, dl, DestVT, Sub); 4776 } 4777 4778 // Handle final rounding. 4779 return Sub; 4780} 4781 4782SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) { 4783 SDValue N0 = Op.getOperand(0); 4784 DebugLoc dl = Op.getDebugLoc(); 4785 4786 // Now not UINT_TO_FP is legal (it's marked custom), dag combiner won't 4787 // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform 4788 // the optimization here. 4789 if (DAG.SignBitIsZero(N0)) 4790 return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), N0); 4791 4792 MVT SrcVT = N0.getValueType(); 4793 if (SrcVT == MVT::i64) { 4794 // We only handle SSE2 f64 target here; caller can expand the rest. 4795 if (Op.getValueType() != MVT::f64 || !X86ScalarSSEf64) 4796 return SDValue(); 4797 4798 return LowerUINT_TO_FP_i64(Op, DAG); 4799 } else if (SrcVT == MVT::i32 && X86ScalarSSEf64) { 4800 return LowerUINT_TO_FP_i32(Op, DAG); 4801 } 4802 4803 assert(SrcVT == MVT::i32 && "Unknown UINT_TO_FP to lower!"); 4804 4805 // Make a 64-bit buffer, and use it to build an FILD. 4806 SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64); 4807 SDValue WordOff = DAG.getConstant(4, getPointerTy()); 4808 SDValue OffsetSlot = DAG.getNode(ISD::ADD, dl, 4809 getPointerTy(), StackSlot, WordOff); 4810 SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), 4811 StackSlot, NULL, 0); 4812 SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, MVT::i32), 4813 OffsetSlot, NULL, 0); 4814 return BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG); 4815} 4816 4817std::pair<SDValue,SDValue> X86TargetLowering:: 4818FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool IsSigned) { 4819 DebugLoc dl = Op.getDebugLoc(); 4820 4821 MVT DstTy = Op.getValueType(); 4822 4823 if (!IsSigned) { 4824 assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT"); 4825 DstTy = MVT::i64; 4826 } 4827 4828 assert(DstTy.getSimpleVT() <= MVT::i64 && 4829 DstTy.getSimpleVT() >= MVT::i16 && 4830 "Unknown FP_TO_SINT to lower!"); 4831 4832 // These are really Legal. 4833 if (DstTy == MVT::i32 && 4834 isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType())) 4835 return std::make_pair(SDValue(), SDValue()); 4836 if (Subtarget->is64Bit() && 4837 DstTy == MVT::i64 && 4838 isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType())) 4839 return std::make_pair(SDValue(), SDValue()); 4840 4841 // We lower FP->sint64 into FISTP64, followed by a load, all to a temporary 4842 // stack slot. 4843 MachineFunction &MF = DAG.getMachineFunction(); 4844 unsigned MemSize = DstTy.getSizeInBits()/8; 4845 int SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize); 4846 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 4847 4848 unsigned Opc; 4849 switch (DstTy.getSimpleVT()) { 4850 default: assert(0 && "Invalid FP_TO_SINT to lower!"); 4851 case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break; 4852 case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break; 4853 case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break; 4854 } 4855 4856 SDValue Chain = DAG.getEntryNode(); 4857 SDValue Value = Op.getOperand(0); 4858 if (isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType())) { 4859 assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!"); 4860 Chain = DAG.getStore(Chain, dl, Value, StackSlot, 4861 PseudoSourceValue::getFixedStack(SSFI), 0); 4862 SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other); 4863 SDValue Ops[] = { 4864 Chain, StackSlot, DAG.getValueType(Op.getOperand(0).getValueType()) 4865 }; 4866 Value = DAG.getNode(X86ISD::FLD, dl, Tys, Ops, 3); 4867 Chain = Value.getValue(1); 4868 SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize); 4869 StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 4870 } 4871 4872 // Build the FP_TO_INT*_IN_MEM 4873 SDValue Ops[] = { Chain, Value, StackSlot }; 4874 SDValue FIST = DAG.getNode(Opc, dl, MVT::Other, Ops, 3); 4875 4876 return std::make_pair(FIST, StackSlot); 4877} 4878 4879SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) { 4880 if (Op.getValueType().isVector()) { 4881 if (Op.getValueType() == MVT::v2i32 && 4882 Op.getOperand(0).getValueType() == MVT::v2f64) { 4883 return Op; 4884 } 4885 return SDValue(); 4886 } 4887 4888 std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG, true); 4889 SDValue FIST = Vals.first, StackSlot = Vals.second; 4890 // If FP_TO_INTHelper failed, the node is actually supposed to be Legal. 4891 if (FIST.getNode() == 0) return Op; 4892 4893 // Load the result. 4894 return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(), 4895 FIST, StackSlot, NULL, 0); 4896} 4897 4898SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op, SelectionDAG &DAG) { 4899 std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG, false); 4900 SDValue FIST = Vals.first, StackSlot = Vals.second; 4901 assert(FIST.getNode() && "Unexpected failure"); 4902 4903 // Load the result. 4904 return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(), 4905 FIST, StackSlot, NULL, 0); 4906} 4907 4908SDValue X86TargetLowering::LowerFABS(SDValue Op, SelectionDAG &DAG) { 4909 DebugLoc dl = Op.getDebugLoc(); 4910 MVT VT = Op.getValueType(); 4911 MVT EltVT = VT; 4912 if (VT.isVector()) 4913 EltVT = VT.getVectorElementType(); 4914 std::vector<Constant*> CV; 4915 if (EltVT == MVT::f64) { 4916 Constant *C = ConstantFP::get(APFloat(APInt(64, ~(1ULL << 63)))); 4917 CV.push_back(C); 4918 CV.push_back(C); 4919 } else { 4920 Constant *C = ConstantFP::get(APFloat(APInt(32, ~(1U << 31)))); 4921 CV.push_back(C); 4922 CV.push_back(C); 4923 CV.push_back(C); 4924 CV.push_back(C); 4925 } 4926 Constant *C = ConstantVector::get(CV); 4927 SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 4928 SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 4929 PseudoSourceValue::getConstantPool(), 0, 4930 false, 16); 4931 return DAG.getNode(X86ISD::FAND, dl, VT, Op.getOperand(0), Mask); 4932} 4933 4934SDValue X86TargetLowering::LowerFNEG(SDValue Op, SelectionDAG &DAG) { 4935 DebugLoc dl = Op.getDebugLoc(); 4936 MVT VT = Op.getValueType(); 4937 MVT EltVT = VT; 4938 unsigned EltNum = 1; 4939 if (VT.isVector()) { 4940 EltVT = VT.getVectorElementType(); 4941 EltNum = VT.getVectorNumElements(); 4942 } 4943 std::vector<Constant*> CV; 4944 if (EltVT == MVT::f64) { 4945 Constant *C = ConstantFP::get(APFloat(APInt(64, 1ULL << 63))); 4946 CV.push_back(C); 4947 CV.push_back(C); 4948 } else { 4949 Constant *C = ConstantFP::get(APFloat(APInt(32, 1U << 31))); 4950 CV.push_back(C); 4951 CV.push_back(C); 4952 CV.push_back(C); 4953 CV.push_back(C); 4954 } 4955 Constant *C = ConstantVector::get(CV); 4956 SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 4957 SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 4958 PseudoSourceValue::getConstantPool(), 0, 4959 false, 16); 4960 if (VT.isVector()) { 4961 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, 4962 DAG.getNode(ISD::XOR, dl, MVT::v2i64, 4963 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, 4964 Op.getOperand(0)), 4965 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, Mask))); 4966 } else { 4967 return DAG.getNode(X86ISD::FXOR, dl, VT, Op.getOperand(0), Mask); 4968 } 4969} 4970 4971SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) { 4972 SDValue Op0 = Op.getOperand(0); 4973 SDValue Op1 = Op.getOperand(1); 4974 DebugLoc dl = Op.getDebugLoc(); 4975 MVT VT = Op.getValueType(); 4976 MVT SrcVT = Op1.getValueType(); 4977 4978 // If second operand is smaller, extend it first. 4979 if (SrcVT.bitsLT(VT)) { 4980 Op1 = DAG.getNode(ISD::FP_EXTEND, dl, VT, Op1); 4981 SrcVT = VT; 4982 } 4983 // And if it is bigger, shrink it first. 4984 if (SrcVT.bitsGT(VT)) { 4985 Op1 = DAG.getNode(ISD::FP_ROUND, dl, VT, Op1, DAG.getIntPtrConstant(1)); 4986 SrcVT = VT; 4987 } 4988 4989 // At this point the operands and the result should have the same 4990 // type, and that won't be f80 since that is not custom lowered. 4991 4992 // First get the sign bit of second operand. 4993 std::vector<Constant*> CV; 4994 if (SrcVT == MVT::f64) { 4995 CV.push_back(ConstantFP::get(APFloat(APInt(64, 1ULL << 63)))); 4996 CV.push_back(ConstantFP::get(APFloat(APInt(64, 0)))); 4997 } else { 4998 CV.push_back(ConstantFP::get(APFloat(APInt(32, 1U << 31)))); 4999 CV.push_back(ConstantFP::get(APFloat(APInt(32, 0)))); 5000 CV.push_back(ConstantFP::get(APFloat(APInt(32, 0)))); 5001 CV.push_back(ConstantFP::get(APFloat(APInt(32, 0)))); 5002 } 5003 Constant *C = ConstantVector::get(CV); 5004 SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 5005 SDValue Mask1 = DAG.getLoad(SrcVT, dl, DAG.getEntryNode(), CPIdx, 5006 PseudoSourceValue::getConstantPool(), 0, 5007 false, 16); 5008 SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, SrcVT, Op1, Mask1); 5009 5010 // Shift sign bit right or left if the two operands have different types. 5011 if (SrcVT.bitsGT(VT)) { 5012 // Op0 is MVT::f32, Op1 is MVT::f64. 5013 SignBit = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, SignBit); 5014 SignBit = DAG.getNode(X86ISD::FSRL, dl, MVT::v2f64, SignBit, 5015 DAG.getConstant(32, MVT::i32)); 5016 SignBit = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4f32, SignBit); 5017 SignBit = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, SignBit, 5018 DAG.getIntPtrConstant(0)); 5019 } 5020 5021 // Clear first operand sign bit. 5022 CV.clear(); 5023 if (VT == MVT::f64) { 5024 CV.push_back(ConstantFP::get(APFloat(APInt(64, ~(1ULL << 63))))); 5025 CV.push_back(ConstantFP::get(APFloat(APInt(64, 0)))); 5026 } else { 5027 CV.push_back(ConstantFP::get(APFloat(APInt(32, ~(1U << 31))))); 5028 CV.push_back(ConstantFP::get(APFloat(APInt(32, 0)))); 5029 CV.push_back(ConstantFP::get(APFloat(APInt(32, 0)))); 5030 CV.push_back(ConstantFP::get(APFloat(APInt(32, 0)))); 5031 } 5032 C = ConstantVector::get(CV); 5033 CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 5034 SDValue Mask2 = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 5035 PseudoSourceValue::getConstantPool(), 0, 5036 false, 16); 5037 SDValue Val = DAG.getNode(X86ISD::FAND, dl, VT, Op0, Mask2); 5038 5039 // Or the value with the sign bit. 5040 return DAG.getNode(X86ISD::FOR, dl, VT, Val, SignBit); 5041} 5042 5043/// Emit nodes that will be selected as "test Op0,Op0", or something 5044/// equivalent. 5045SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, 5046 SelectionDAG &DAG) { 5047 DebugLoc dl = Op.getDebugLoc(); 5048 5049 // CF and OF aren't always set the way we want. Determine which 5050 // of these we need. 5051 bool NeedCF = false; 5052 bool NeedOF = false; 5053 switch (X86CC) { 5054 case X86::COND_A: case X86::COND_AE: 5055 case X86::COND_B: case X86::COND_BE: 5056 NeedCF = true; 5057 break; 5058 case X86::COND_G: case X86::COND_GE: 5059 case X86::COND_L: case X86::COND_LE: 5060 case X86::COND_O: case X86::COND_NO: 5061 NeedOF = true; 5062 break; 5063 default: break; 5064 } 5065 5066 // See if we can use the EFLAGS value from the operand instead of 5067 // doing a separate TEST. TEST always sets OF and CF to 0, so unless 5068 // we prove that the arithmetic won't overflow, we can't use OF or CF. 5069 if (Op.getResNo() == 0 && !NeedOF && !NeedCF) { 5070 unsigned Opcode = 0; 5071 unsigned NumOperands = 0; 5072 switch (Op.getNode()->getOpcode()) { 5073 case ISD::ADD: 5074 // Due to an isel shortcoming, be conservative if this add is likely to 5075 // be selected as part of a load-modify-store instruction. When the root 5076 // node in a match is a store, isel doesn't know how to remap non-chain 5077 // non-flag uses of other nodes in the match, such as the ADD in this 5078 // case. This leads to the ADD being left around and reselected, with 5079 // the result being two adds in the output. 5080 for (SDNode::use_iterator UI = Op.getNode()->use_begin(), 5081 UE = Op.getNode()->use_end(); UI != UE; ++UI) 5082 if (UI->getOpcode() == ISD::STORE) 5083 goto default_case; 5084 if (ConstantSDNode *C = 5085 dyn_cast<ConstantSDNode>(Op.getNode()->getOperand(1))) { 5086 // An add of one will be selected as an INC. 5087 if (C->getAPIntValue() == 1) { 5088 Opcode = X86ISD::INC; 5089 NumOperands = 1; 5090 break; 5091 } 5092 // An add of negative one (subtract of one) will be selected as a DEC. 5093 if (C->getAPIntValue().isAllOnesValue()) { 5094 Opcode = X86ISD::DEC; 5095 NumOperands = 1; 5096 break; 5097 } 5098 } 5099 // Otherwise use a regular EFLAGS-setting add. 5100 Opcode = X86ISD::ADD; 5101 NumOperands = 2; 5102 break; 5103 case ISD::SUB: 5104 // Due to the ISEL shortcoming noted above, be conservative if this sub is 5105 // likely to be selected as part of a load-modify-store instruction. 5106 for (SDNode::use_iterator UI = Op.getNode()->use_begin(), 5107 UE = Op.getNode()->use_end(); UI != UE; ++UI) 5108 if (UI->getOpcode() == ISD::STORE) 5109 goto default_case; 5110 // Otherwise use a regular EFLAGS-setting sub. 5111 Opcode = X86ISD::SUB; 5112 NumOperands = 2; 5113 break; 5114 case X86ISD::ADD: 5115 case X86ISD::SUB: 5116 case X86ISD::INC: 5117 case X86ISD::DEC: 5118 return SDValue(Op.getNode(), 1); 5119 default: 5120 default_case: 5121 break; 5122 } 5123 if (Opcode != 0) { 5124 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); 5125 SmallVector<SDValue, 4> Ops; 5126 for (unsigned i = 0; i != NumOperands; ++i) 5127 Ops.push_back(Op.getOperand(i)); 5128 SDValue New = DAG.getNode(Opcode, dl, VTs, &Ops[0], NumOperands); 5129 DAG.ReplaceAllUsesWith(Op, New); 5130 return SDValue(New.getNode(), 1); 5131 } 5132 } 5133 5134 // Otherwise just emit a CMP with 0, which is the TEST pattern. 5135 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op, 5136 DAG.getConstant(0, Op.getValueType())); 5137} 5138 5139/// Emit nodes that will be selected as "cmp Op0,Op1", or something 5140/// equivalent. 5141SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC, 5142 SelectionDAG &DAG) { 5143 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op1)) 5144 if (C->getAPIntValue() == 0) 5145 return EmitTest(Op0, X86CC, DAG); 5146 5147 DebugLoc dl = Op0.getDebugLoc(); 5148 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1); 5149} 5150 5151SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) { 5152 assert(Op.getValueType() == MVT::i8 && "SetCC type must be 8-bit integer"); 5153 SDValue Op0 = Op.getOperand(0); 5154 SDValue Op1 = Op.getOperand(1); 5155 DebugLoc dl = Op.getDebugLoc(); 5156 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get(); 5157 5158 // Lower (X & (1 << N)) == 0 to BT(X, N). 5159 // Lower ((X >>u N) & 1) != 0 to BT(X, N). 5160 // Lower ((X >>s N) & 1) != 0 to BT(X, N). 5161 if (Op0.getOpcode() == ISD::AND && 5162 Op0.hasOneUse() && 5163 Op1.getOpcode() == ISD::Constant && 5164 cast<ConstantSDNode>(Op1)->getZExtValue() == 0 && 5165 (CC == ISD::SETEQ || CC == ISD::SETNE)) { 5166 SDValue LHS, RHS; 5167 if (Op0.getOperand(1).getOpcode() == ISD::SHL) { 5168 if (ConstantSDNode *Op010C = 5169 dyn_cast<ConstantSDNode>(Op0.getOperand(1).getOperand(0))) 5170 if (Op010C->getZExtValue() == 1) { 5171 LHS = Op0.getOperand(0); 5172 RHS = Op0.getOperand(1).getOperand(1); 5173 } 5174 } else if (Op0.getOperand(0).getOpcode() == ISD::SHL) { 5175 if (ConstantSDNode *Op000C = 5176 dyn_cast<ConstantSDNode>(Op0.getOperand(0).getOperand(0))) 5177 if (Op000C->getZExtValue() == 1) { 5178 LHS = Op0.getOperand(1); 5179 RHS = Op0.getOperand(0).getOperand(1); 5180 } 5181 } else if (Op0.getOperand(1).getOpcode() == ISD::Constant) { 5182 ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op0.getOperand(1)); 5183 SDValue AndLHS = Op0.getOperand(0); 5184 if (AndRHS->getZExtValue() == 1 && AndLHS.getOpcode() == ISD::SRL) { 5185 LHS = AndLHS.getOperand(0); 5186 RHS = AndLHS.getOperand(1); 5187 } 5188 } 5189 5190 if (LHS.getNode()) { 5191 // If LHS is i8, promote it to i16 with any_extend. There is no i8 BT 5192 // instruction. Since the shift amount is in-range-or-undefined, we know 5193 // that doing a bittest on the i16 value is ok. We extend to i32 because 5194 // the encoding for the i16 version is larger than the i32 version. 5195 if (LHS.getValueType() == MVT::i8) 5196 LHS = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, LHS); 5197 5198 // If the operand types disagree, extend the shift amount to match. Since 5199 // BT ignores high bits (like shifts) we can use anyextend. 5200 if (LHS.getValueType() != RHS.getValueType()) 5201 RHS = DAG.getNode(ISD::ANY_EXTEND, dl, LHS.getValueType(), RHS); 5202 5203 SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, LHS, RHS); 5204 unsigned Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B; 5205 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 5206 DAG.getConstant(Cond, MVT::i8), BT); 5207 } 5208 } 5209 5210 bool isFP = Op.getOperand(1).getValueType().isFloatingPoint(); 5211 unsigned X86CC = TranslateX86CC(CC, isFP, Op0, Op1, DAG); 5212 5213 SDValue Cond = EmitCmp(Op0, Op1, X86CC, DAG); 5214 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 5215 DAG.getConstant(X86CC, MVT::i8), Cond); 5216} 5217 5218SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) { 5219 SDValue Cond; 5220 SDValue Op0 = Op.getOperand(0); 5221 SDValue Op1 = Op.getOperand(1); 5222 SDValue CC = Op.getOperand(2); 5223 MVT VT = Op.getValueType(); 5224 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get(); 5225 bool isFP = Op.getOperand(1).getValueType().isFloatingPoint(); 5226 DebugLoc dl = Op.getDebugLoc(); 5227 5228 if (isFP) { 5229 unsigned SSECC = 8; 5230 MVT VT0 = Op0.getValueType(); 5231 assert(VT0 == MVT::v4f32 || VT0 == MVT::v2f64); 5232 unsigned Opc = VT0 == MVT::v4f32 ? X86ISD::CMPPS : X86ISD::CMPPD; 5233 bool Swap = false; 5234 5235 switch (SetCCOpcode) { 5236 default: break; 5237 case ISD::SETOEQ: 5238 case ISD::SETEQ: SSECC = 0; break; 5239 case ISD::SETOGT: 5240 case ISD::SETGT: Swap = true; // Fallthrough 5241 case ISD::SETLT: 5242 case ISD::SETOLT: SSECC = 1; break; 5243 case ISD::SETOGE: 5244 case ISD::SETGE: Swap = true; // Fallthrough 5245 case ISD::SETLE: 5246 case ISD::SETOLE: SSECC = 2; break; 5247 case ISD::SETUO: SSECC = 3; break; 5248 case ISD::SETUNE: 5249 case ISD::SETNE: SSECC = 4; break; 5250 case ISD::SETULE: Swap = true; 5251 case ISD::SETUGE: SSECC = 5; break; 5252 case ISD::SETULT: Swap = true; 5253 case ISD::SETUGT: SSECC = 6; break; 5254 case ISD::SETO: SSECC = 7; break; 5255 } 5256 if (Swap) 5257 std::swap(Op0, Op1); 5258 5259 // In the two special cases we can't handle, emit two comparisons. 5260 if (SSECC == 8) { 5261 if (SetCCOpcode == ISD::SETUEQ) { 5262 SDValue UNORD, EQ; 5263 UNORD = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(3, MVT::i8)); 5264 EQ = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(0, MVT::i8)); 5265 return DAG.getNode(ISD::OR, dl, VT, UNORD, EQ); 5266 } 5267 else if (SetCCOpcode == ISD::SETONE) { 5268 SDValue ORD, NEQ; 5269 ORD = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(7, MVT::i8)); 5270 NEQ = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(4, MVT::i8)); 5271 return DAG.getNode(ISD::AND, dl, VT, ORD, NEQ); 5272 } 5273 assert(0 && "Illegal FP comparison"); 5274 } 5275 // Handle all other FP comparisons here. 5276 return DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(SSECC, MVT::i8)); 5277 } 5278 5279 // We are handling one of the integer comparisons here. Since SSE only has 5280 // GT and EQ comparisons for integer, swapping operands and multiple 5281 // operations may be required for some comparisons. 5282 unsigned Opc = 0, EQOpc = 0, GTOpc = 0; 5283 bool Swap = false, Invert = false, FlipSigns = false; 5284 5285 switch (VT.getSimpleVT()) { 5286 default: break; 5287 case MVT::v16i8: EQOpc = X86ISD::PCMPEQB; GTOpc = X86ISD::PCMPGTB; break; 5288 case MVT::v8i16: EQOpc = X86ISD::PCMPEQW; GTOpc = X86ISD::PCMPGTW; break; 5289 case MVT::v4i32: EQOpc = X86ISD::PCMPEQD; GTOpc = X86ISD::PCMPGTD; break; 5290 case MVT::v2i64: EQOpc = X86ISD::PCMPEQQ; GTOpc = X86ISD::PCMPGTQ; break; 5291 } 5292 5293 switch (SetCCOpcode) { 5294 default: break; 5295 case ISD::SETNE: Invert = true; 5296 case ISD::SETEQ: Opc = EQOpc; break; 5297 case ISD::SETLT: Swap = true; 5298 case ISD::SETGT: Opc = GTOpc; break; 5299 case ISD::SETGE: Swap = true; 5300 case ISD::SETLE: Opc = GTOpc; Invert = true; break; 5301 case ISD::SETULT: Swap = true; 5302 case ISD::SETUGT: Opc = GTOpc; FlipSigns = true; break; 5303 case ISD::SETUGE: Swap = true; 5304 case ISD::SETULE: Opc = GTOpc; FlipSigns = true; Invert = true; break; 5305 } 5306 if (Swap) 5307 std::swap(Op0, Op1); 5308 5309 // Since SSE has no unsigned integer comparisons, we need to flip the sign 5310 // bits of the inputs before performing those operations. 5311 if (FlipSigns) { 5312 MVT EltVT = VT.getVectorElementType(); 5313 SDValue SignBit = DAG.getConstant(APInt::getSignBit(EltVT.getSizeInBits()), 5314 EltVT); 5315 std::vector<SDValue> SignBits(VT.getVectorNumElements(), SignBit); 5316 SDValue SignVec = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &SignBits[0], 5317 SignBits.size()); 5318 Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SignVec); 5319 Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SignVec); 5320 } 5321 5322 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1); 5323 5324 // If the logical-not of the result is required, perform that now. 5325 if (Invert) 5326 Result = DAG.getNOT(dl, Result, VT); 5327 5328 return Result; 5329} 5330 5331// isX86LogicalCmp - Return true if opcode is a X86 logical comparison. 5332static bool isX86LogicalCmp(SDValue Op) { 5333 unsigned Opc = Op.getNode()->getOpcode(); 5334 if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI) 5335 return true; 5336 if (Op.getResNo() == 1 && 5337 (Opc == X86ISD::ADD || 5338 Opc == X86ISD::SUB || 5339 Opc == X86ISD::SMUL || 5340 Opc == X86ISD::UMUL || 5341 Opc == X86ISD::INC || 5342 Opc == X86ISD::DEC)) 5343 return true; 5344 5345 return false; 5346} 5347 5348SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) { 5349 bool addTest = true; 5350 SDValue Cond = Op.getOperand(0); 5351 DebugLoc dl = Op.getDebugLoc(); 5352 SDValue CC; 5353 5354 if (Cond.getOpcode() == ISD::SETCC) 5355 Cond = LowerSETCC(Cond, DAG); 5356 5357 // If condition flag is set by a X86ISD::CMP, then use it as the condition 5358 // setting operand in place of the X86ISD::SETCC. 5359 if (Cond.getOpcode() == X86ISD::SETCC) { 5360 CC = Cond.getOperand(0); 5361 5362 SDValue Cmp = Cond.getOperand(1); 5363 unsigned Opc = Cmp.getOpcode(); 5364 MVT VT = Op.getValueType(); 5365 5366 bool IllegalFPCMov = false; 5367 if (VT.isFloatingPoint() && !VT.isVector() && 5368 !isScalarFPTypeInSSEReg(VT)) // FPStack? 5369 IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue()); 5370 5371 if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) || 5372 Opc == X86ISD::BT) { // FIXME 5373 Cond = Cmp; 5374 addTest = false; 5375 } 5376 } 5377 5378 if (addTest) { 5379 CC = DAG.getConstant(X86::COND_NE, MVT::i8); 5380 Cond = EmitTest(Cond, X86::COND_NE, DAG); 5381 } 5382 5383 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Flag); 5384 SmallVector<SDValue, 4> Ops; 5385 // X86ISD::CMOV means set the result (which is operand 1) to the RHS if 5386 // condition is true. 5387 Ops.push_back(Op.getOperand(2)); 5388 Ops.push_back(Op.getOperand(1)); 5389 Ops.push_back(CC); 5390 Ops.push_back(Cond); 5391 return DAG.getNode(X86ISD::CMOV, dl, VTs, &Ops[0], Ops.size()); 5392} 5393 5394// isAndOrOfSingleUseSetCCs - Return true if node is an ISD::AND or 5395// ISD::OR of two X86ISD::SETCC nodes each of which has no other use apart 5396// from the AND / OR. 5397static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) { 5398 Opc = Op.getOpcode(); 5399 if (Opc != ISD::OR && Opc != ISD::AND) 5400 return false; 5401 return (Op.getOperand(0).getOpcode() == X86ISD::SETCC && 5402 Op.getOperand(0).hasOneUse() && 5403 Op.getOperand(1).getOpcode() == X86ISD::SETCC && 5404 Op.getOperand(1).hasOneUse()); 5405} 5406 5407// isXor1OfSetCC - Return true if node is an ISD::XOR of a X86ISD::SETCC and 5408// 1 and that the SETCC node has a single use. 5409static bool isXor1OfSetCC(SDValue Op) { 5410 if (Op.getOpcode() != ISD::XOR) 5411 return false; 5412 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(Op.getOperand(1)); 5413 if (N1C && N1C->getAPIntValue() == 1) { 5414 return Op.getOperand(0).getOpcode() == X86ISD::SETCC && 5415 Op.getOperand(0).hasOneUse(); 5416 } 5417 return false; 5418} 5419 5420SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) { 5421 bool addTest = true; 5422 SDValue Chain = Op.getOperand(0); 5423 SDValue Cond = Op.getOperand(1); 5424 SDValue Dest = Op.getOperand(2); 5425 DebugLoc dl = Op.getDebugLoc(); 5426 SDValue CC; 5427 5428 if (Cond.getOpcode() == ISD::SETCC) 5429 Cond = LowerSETCC(Cond, DAG); 5430#if 0 5431 // FIXME: LowerXALUO doesn't handle these!! 5432 else if (Cond.getOpcode() == X86ISD::ADD || 5433 Cond.getOpcode() == X86ISD::SUB || 5434 Cond.getOpcode() == X86ISD::SMUL || 5435 Cond.getOpcode() == X86ISD::UMUL) 5436 Cond = LowerXALUO(Cond, DAG); 5437#endif 5438 5439 // If condition flag is set by a X86ISD::CMP, then use it as the condition 5440 // setting operand in place of the X86ISD::SETCC. 5441 if (Cond.getOpcode() == X86ISD::SETCC) { 5442 CC = Cond.getOperand(0); 5443 5444 SDValue Cmp = Cond.getOperand(1); 5445 unsigned Opc = Cmp.getOpcode(); 5446 // FIXME: WHY THE SPECIAL CASING OF LogicalCmp?? 5447 if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) { 5448 Cond = Cmp; 5449 addTest = false; 5450 } else { 5451 switch (cast<ConstantSDNode>(CC)->getZExtValue()) { 5452 default: break; 5453 case X86::COND_O: 5454 case X86::COND_B: 5455 // These can only come from an arithmetic instruction with overflow, 5456 // e.g. SADDO, UADDO. 5457 Cond = Cond.getNode()->getOperand(1); 5458 addTest = false; 5459 break; 5460 } 5461 } 5462 } else { 5463 unsigned CondOpc; 5464 if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) { 5465 SDValue Cmp = Cond.getOperand(0).getOperand(1); 5466 if (CondOpc == ISD::OR) { 5467 // Also, recognize the pattern generated by an FCMP_UNE. We can emit 5468 // two branches instead of an explicit OR instruction with a 5469 // separate test. 5470 if (Cmp == Cond.getOperand(1).getOperand(1) && 5471 isX86LogicalCmp(Cmp)) { 5472 CC = Cond.getOperand(0).getOperand(0); 5473 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), 5474 Chain, Dest, CC, Cmp); 5475 CC = Cond.getOperand(1).getOperand(0); 5476 Cond = Cmp; 5477 addTest = false; 5478 } 5479 } else { // ISD::AND 5480 // Also, recognize the pattern generated by an FCMP_OEQ. We can emit 5481 // two branches instead of an explicit AND instruction with a 5482 // separate test. However, we only do this if this block doesn't 5483 // have a fall-through edge, because this requires an explicit 5484 // jmp when the condition is false. 5485 if (Cmp == Cond.getOperand(1).getOperand(1) && 5486 isX86LogicalCmp(Cmp) && 5487 Op.getNode()->hasOneUse()) { 5488 X86::CondCode CCode = 5489 (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0); 5490 CCode = X86::GetOppositeBranchCondition(CCode); 5491 CC = DAG.getConstant(CCode, MVT::i8); 5492 SDValue User = SDValue(*Op.getNode()->use_begin(), 0); 5493 // Look for an unconditional branch following this conditional branch. 5494 // We need this because we need to reverse the successors in order 5495 // to implement FCMP_OEQ. 5496 if (User.getOpcode() == ISD::BR) { 5497 SDValue FalseBB = User.getOperand(1); 5498 SDValue NewBR = 5499 DAG.UpdateNodeOperands(User, User.getOperand(0), Dest); 5500 assert(NewBR == User); 5501 Dest = FalseBB; 5502 5503 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), 5504 Chain, Dest, CC, Cmp); 5505 X86::CondCode CCode = 5506 (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0); 5507 CCode = X86::GetOppositeBranchCondition(CCode); 5508 CC = DAG.getConstant(CCode, MVT::i8); 5509 Cond = Cmp; 5510 addTest = false; 5511 } 5512 } 5513 } 5514 } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) { 5515 // Recognize for xorb (setcc), 1 patterns. The xor inverts the condition. 5516 // It should be transformed during dag combiner except when the condition 5517 // is set by a arithmetics with overflow node. 5518 X86::CondCode CCode = 5519 (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0); 5520 CCode = X86::GetOppositeBranchCondition(CCode); 5521 CC = DAG.getConstant(CCode, MVT::i8); 5522 Cond = Cond.getOperand(0).getOperand(1); 5523 addTest = false; 5524 } 5525 } 5526 5527 if (addTest) { 5528 CC = DAG.getConstant(X86::COND_NE, MVT::i8); 5529 Cond = EmitTest(Cond, X86::COND_NE, DAG); 5530 } 5531 return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), 5532 Chain, Dest, CC, Cond); 5533} 5534 5535 5536// Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets. 5537// Calls to _alloca is needed to probe the stack when allocating more than 4k 5538// bytes in one go. Touching the stack at 4K increments is necessary to ensure 5539// that the guard pages used by the OS virtual memory manager are allocated in 5540// correct sequence. 5541SDValue 5542X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, 5543 SelectionDAG &DAG) { 5544 assert(Subtarget->isTargetCygMing() && 5545 "This should be used only on Cygwin/Mingw targets"); 5546 DebugLoc dl = Op.getDebugLoc(); 5547 5548 // Get the inputs. 5549 SDValue Chain = Op.getOperand(0); 5550 SDValue Size = Op.getOperand(1); 5551 // FIXME: Ensure alignment here 5552 5553 SDValue Flag; 5554 5555 MVT IntPtr = getPointerTy(); 5556 MVT SPTy = Subtarget->is64Bit() ? MVT::i64 : MVT::i32; 5557 5558 Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, true)); 5559 5560 Chain = DAG.getCopyToReg(Chain, dl, X86::EAX, Size, Flag); 5561 Flag = Chain.getValue(1); 5562 5563 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag); 5564 SDValue Ops[] = { Chain, 5565 DAG.getTargetExternalSymbol("_alloca", IntPtr), 5566 DAG.getRegister(X86::EAX, IntPtr), 5567 DAG.getRegister(X86StackPtr, SPTy), 5568 Flag }; 5569 Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops, 5); 5570 Flag = Chain.getValue(1); 5571 5572 Chain = DAG.getCALLSEQ_END(Chain, 5573 DAG.getIntPtrConstant(0, true), 5574 DAG.getIntPtrConstant(0, true), 5575 Flag); 5576 5577 Chain = DAG.getCopyFromReg(Chain, dl, X86StackPtr, SPTy).getValue(1); 5578 5579 SDValue Ops1[2] = { Chain.getValue(0), Chain }; 5580 return DAG.getMergeValues(Ops1, 2, dl); 5581} 5582 5583SDValue 5584X86TargetLowering::EmitTargetCodeForMemset(SelectionDAG &DAG, DebugLoc dl, 5585 SDValue Chain, 5586 SDValue Dst, SDValue Src, 5587 SDValue Size, unsigned Align, 5588 const Value *DstSV, 5589 uint64_t DstSVOff) { 5590 ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size); 5591 5592 // If not DWORD aligned or size is more than the threshold, call the library. 5593 // The libc version is likely to be faster for these cases. It can use the 5594 // address value and run time information about the CPU. 5595 if ((Align & 3) != 0 || 5596 !ConstantSize || 5597 ConstantSize->getZExtValue() > 5598 getSubtarget()->getMaxInlineSizeThreshold()) { 5599 SDValue InFlag(0, 0); 5600 5601 // Check to see if there is a specialized entry-point for memory zeroing. 5602 ConstantSDNode *V = dyn_cast<ConstantSDNode>(Src); 5603 5604 if (const char *bzeroEntry = V && 5605 V->isNullValue() ? Subtarget->getBZeroEntry() : 0) { 5606 MVT IntPtr = getPointerTy(); 5607 const Type *IntPtrTy = TD->getIntPtrType(); 5608 TargetLowering::ArgListTy Args; 5609 TargetLowering::ArgListEntry Entry; 5610 Entry.Node = Dst; 5611 Entry.Ty = IntPtrTy; 5612 Args.push_back(Entry); 5613 Entry.Node = Size; 5614 Args.push_back(Entry); 5615 std::pair<SDValue,SDValue> CallResult = 5616 LowerCallTo(Chain, Type::VoidTy, false, false, false, false, 5617 CallingConv::C, false, 5618 DAG.getExternalSymbol(bzeroEntry, IntPtr), Args, DAG, dl); 5619 return CallResult.second; 5620 } 5621 5622 // Otherwise have the target-independent code call memset. 5623 return SDValue(); 5624 } 5625 5626 uint64_t SizeVal = ConstantSize->getZExtValue(); 5627 SDValue InFlag(0, 0); 5628 MVT AVT; 5629 SDValue Count; 5630 ConstantSDNode *ValC = dyn_cast<ConstantSDNode>(Src); 5631 unsigned BytesLeft = 0; 5632 bool TwoRepStos = false; 5633 if (ValC) { 5634 unsigned ValReg; 5635 uint64_t Val = ValC->getZExtValue() & 255; 5636 5637 // If the value is a constant, then we can potentially use larger sets. 5638 switch (Align & 3) { 5639 case 2: // WORD aligned 5640 AVT = MVT::i16; 5641 ValReg = X86::AX; 5642 Val = (Val << 8) | Val; 5643 break; 5644 case 0: // DWORD aligned 5645 AVT = MVT::i32; 5646 ValReg = X86::EAX; 5647 Val = (Val << 8) | Val; 5648 Val = (Val << 16) | Val; 5649 if (Subtarget->is64Bit() && ((Align & 0x7) == 0)) { // QWORD aligned 5650 AVT = MVT::i64; 5651 ValReg = X86::RAX; 5652 Val = (Val << 32) | Val; 5653 } 5654 break; 5655 default: // Byte aligned 5656 AVT = MVT::i8; 5657 ValReg = X86::AL; 5658 Count = DAG.getIntPtrConstant(SizeVal); 5659 break; 5660 } 5661 5662 if (AVT.bitsGT(MVT::i8)) { 5663 unsigned UBytes = AVT.getSizeInBits() / 8; 5664 Count = DAG.getIntPtrConstant(SizeVal / UBytes); 5665 BytesLeft = SizeVal % UBytes; 5666 } 5667 5668 Chain = DAG.getCopyToReg(Chain, dl, ValReg, DAG.getConstant(Val, AVT), 5669 InFlag); 5670 InFlag = Chain.getValue(1); 5671 } else { 5672 AVT = MVT::i8; 5673 Count = DAG.getIntPtrConstant(SizeVal); 5674 Chain = DAG.getCopyToReg(Chain, dl, X86::AL, Src, InFlag); 5675 InFlag = Chain.getValue(1); 5676 } 5677 5678 Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RCX : 5679 X86::ECX, 5680 Count, InFlag); 5681 InFlag = Chain.getValue(1); 5682 Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RDI : 5683 X86::EDI, 5684 Dst, InFlag); 5685 InFlag = Chain.getValue(1); 5686 5687 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag); 5688 SmallVector<SDValue, 8> Ops; 5689 Ops.push_back(Chain); 5690 Ops.push_back(DAG.getValueType(AVT)); 5691 Ops.push_back(InFlag); 5692 Chain = DAG.getNode(X86ISD::REP_STOS, dl, Tys, &Ops[0], Ops.size()); 5693 5694 if (TwoRepStos) { 5695 InFlag = Chain.getValue(1); 5696 Count = Size; 5697 MVT CVT = Count.getValueType(); 5698 SDValue Left = DAG.getNode(ISD::AND, dl, CVT, Count, 5699 DAG.getConstant((AVT == MVT::i64) ? 7 : 3, CVT)); 5700 Chain = DAG.getCopyToReg(Chain, dl, (CVT == MVT::i64) ? X86::RCX : 5701 X86::ECX, 5702 Left, InFlag); 5703 InFlag = Chain.getValue(1); 5704 Tys = DAG.getVTList(MVT::Other, MVT::Flag); 5705 Ops.clear(); 5706 Ops.push_back(Chain); 5707 Ops.push_back(DAG.getValueType(MVT::i8)); 5708 Ops.push_back(InFlag); 5709 Chain = DAG.getNode(X86ISD::REP_STOS, dl, Tys, &Ops[0], Ops.size()); 5710 } else if (BytesLeft) { 5711 // Handle the last 1 - 7 bytes. 5712 unsigned Offset = SizeVal - BytesLeft; 5713 MVT AddrVT = Dst.getValueType(); 5714 MVT SizeVT = Size.getValueType(); 5715 5716 Chain = DAG.getMemset(Chain, dl, 5717 DAG.getNode(ISD::ADD, dl, AddrVT, Dst, 5718 DAG.getConstant(Offset, AddrVT)), 5719 Src, 5720 DAG.getConstant(BytesLeft, SizeVT), 5721 Align, DstSV, DstSVOff + Offset); 5722 } 5723 5724 // TODO: Use a Tokenfactor, as in memcpy, instead of a single chain. 5725 return Chain; 5726} 5727 5728SDValue 5729X86TargetLowering::EmitTargetCodeForMemcpy(SelectionDAG &DAG, DebugLoc dl, 5730 SDValue Chain, SDValue Dst, SDValue Src, 5731 SDValue Size, unsigned Align, 5732 bool AlwaysInline, 5733 const Value *DstSV, uint64_t DstSVOff, 5734 const Value *SrcSV, uint64_t SrcSVOff) { 5735 // This requires the copy size to be a constant, preferrably 5736 // within a subtarget-specific limit. 5737 ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size); 5738 if (!ConstantSize) 5739 return SDValue(); 5740 uint64_t SizeVal = ConstantSize->getZExtValue(); 5741 if (!AlwaysInline && SizeVal > getSubtarget()->getMaxInlineSizeThreshold()) 5742 return SDValue(); 5743 5744 /// If not DWORD aligned, call the library. 5745 if ((Align & 3) != 0) 5746 return SDValue(); 5747 5748 // DWORD aligned 5749 MVT AVT = MVT::i32; 5750 if (Subtarget->is64Bit() && ((Align & 0x7) == 0)) // QWORD aligned 5751 AVT = MVT::i64; 5752 5753 unsigned UBytes = AVT.getSizeInBits() / 8; 5754 unsigned CountVal = SizeVal / UBytes; 5755 SDValue Count = DAG.getIntPtrConstant(CountVal); 5756 unsigned BytesLeft = SizeVal % UBytes; 5757 5758 SDValue InFlag(0, 0); 5759 Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RCX : 5760 X86::ECX, 5761 Count, InFlag); 5762 InFlag = Chain.getValue(1); 5763 Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RDI : 5764 X86::EDI, 5765 Dst, InFlag); 5766 InFlag = Chain.getValue(1); 5767 Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RSI : 5768 X86::ESI, 5769 Src, InFlag); 5770 InFlag = Chain.getValue(1); 5771 5772 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag); 5773 SmallVector<SDValue, 8> Ops; 5774 Ops.push_back(Chain); 5775 Ops.push_back(DAG.getValueType(AVT)); 5776 Ops.push_back(InFlag); 5777 SDValue RepMovs = DAG.getNode(X86ISD::REP_MOVS, dl, Tys, &Ops[0], Ops.size()); 5778 5779 SmallVector<SDValue, 4> Results; 5780 Results.push_back(RepMovs); 5781 if (BytesLeft) { 5782 // Handle the last 1 - 7 bytes. 5783 unsigned Offset = SizeVal - BytesLeft; 5784 MVT DstVT = Dst.getValueType(); 5785 MVT SrcVT = Src.getValueType(); 5786 MVT SizeVT = Size.getValueType(); 5787 Results.push_back(DAG.getMemcpy(Chain, dl, 5788 DAG.getNode(ISD::ADD, dl, DstVT, Dst, 5789 DAG.getConstant(Offset, DstVT)), 5790 DAG.getNode(ISD::ADD, dl, SrcVT, Src, 5791 DAG.getConstant(Offset, SrcVT)), 5792 DAG.getConstant(BytesLeft, SizeVT), 5793 Align, AlwaysInline, 5794 DstSV, DstSVOff + Offset, 5795 SrcSV, SrcSVOff + Offset)); 5796 } 5797 5798 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, 5799 &Results[0], Results.size()); 5800} 5801 5802SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) { 5803 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); 5804 DebugLoc dl = Op.getDebugLoc(); 5805 5806 if (!Subtarget->is64Bit()) { 5807 // vastart just stores the address of the VarArgsFrameIndex slot into the 5808 // memory location argument. 5809 SDValue FR = DAG.getFrameIndex(VarArgsFrameIndex, getPointerTy()); 5810 return DAG.getStore(Op.getOperand(0), dl, FR, Op.getOperand(1), SV, 0); 5811 } 5812 5813 // __va_list_tag: 5814 // gp_offset (0 - 6 * 8) 5815 // fp_offset (48 - 48 + 8 * 16) 5816 // overflow_arg_area (point to parameters coming in memory). 5817 // reg_save_area 5818 SmallVector<SDValue, 8> MemOps; 5819 SDValue FIN = Op.getOperand(1); 5820 // Store gp_offset 5821 SDValue Store = DAG.getStore(Op.getOperand(0), dl, 5822 DAG.getConstant(VarArgsGPOffset, MVT::i32), 5823 FIN, SV, 0); 5824 MemOps.push_back(Store); 5825 5826 // Store fp_offset 5827 FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), 5828 FIN, DAG.getIntPtrConstant(4)); 5829 Store = DAG.getStore(Op.getOperand(0), dl, 5830 DAG.getConstant(VarArgsFPOffset, MVT::i32), 5831 FIN, SV, 0); 5832 MemOps.push_back(Store); 5833 5834 // Store ptr to overflow_arg_area 5835 FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), 5836 FIN, DAG.getIntPtrConstant(4)); 5837 SDValue OVFIN = DAG.getFrameIndex(VarArgsFrameIndex, getPointerTy()); 5838 Store = DAG.getStore(Op.getOperand(0), dl, OVFIN, FIN, SV, 0); 5839 MemOps.push_back(Store); 5840 5841 // Store ptr to reg_save_area. 5842 FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), 5843 FIN, DAG.getIntPtrConstant(8)); 5844 SDValue RSFIN = DAG.getFrameIndex(RegSaveFrameIndex, getPointerTy()); 5845 Store = DAG.getStore(Op.getOperand(0), dl, RSFIN, FIN, SV, 0); 5846 MemOps.push_back(Store); 5847 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, 5848 &MemOps[0], MemOps.size()); 5849} 5850 5851SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) { 5852 // X86-64 va_list is a struct { i32, i32, i8*, i8* }. 5853 assert(Subtarget->is64Bit() && "This code only handles 64-bit va_arg!"); 5854 SDValue Chain = Op.getOperand(0); 5855 SDValue SrcPtr = Op.getOperand(1); 5856 SDValue SrcSV = Op.getOperand(2); 5857 5858 assert(0 && "VAArgInst is not yet implemented for x86-64!"); 5859 abort(); 5860 return SDValue(); 5861} 5862 5863SDValue X86TargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG) { 5864 // X86-64 va_list is a struct { i32, i32, i8*, i8* }. 5865 assert(Subtarget->is64Bit() && "This code only handles 64-bit va_copy!"); 5866 SDValue Chain = Op.getOperand(0); 5867 SDValue DstPtr = Op.getOperand(1); 5868 SDValue SrcPtr = Op.getOperand(2); 5869 const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue(); 5870 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue(); 5871 DebugLoc dl = Op.getDebugLoc(); 5872 5873 return DAG.getMemcpy(Chain, dl, DstPtr, SrcPtr, 5874 DAG.getIntPtrConstant(24), 8, false, 5875 DstSV, 0, SrcSV, 0); 5876} 5877 5878SDValue 5879X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) { 5880 DebugLoc dl = Op.getDebugLoc(); 5881 unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 5882 switch (IntNo) { 5883 default: return SDValue(); // Don't custom lower most intrinsics. 5884 // Comparison intrinsics. 5885 case Intrinsic::x86_sse_comieq_ss: 5886 case Intrinsic::x86_sse_comilt_ss: 5887 case Intrinsic::x86_sse_comile_ss: 5888 case Intrinsic::x86_sse_comigt_ss: 5889 case Intrinsic::x86_sse_comige_ss: 5890 case Intrinsic::x86_sse_comineq_ss: 5891 case Intrinsic::x86_sse_ucomieq_ss: 5892 case Intrinsic::x86_sse_ucomilt_ss: 5893 case Intrinsic::x86_sse_ucomile_ss: 5894 case Intrinsic::x86_sse_ucomigt_ss: 5895 case Intrinsic::x86_sse_ucomige_ss: 5896 case Intrinsic::x86_sse_ucomineq_ss: 5897 case Intrinsic::x86_sse2_comieq_sd: 5898 case Intrinsic::x86_sse2_comilt_sd: 5899 case Intrinsic::x86_sse2_comile_sd: 5900 case Intrinsic::x86_sse2_comigt_sd: 5901 case Intrinsic::x86_sse2_comige_sd: 5902 case Intrinsic::x86_sse2_comineq_sd: 5903 case Intrinsic::x86_sse2_ucomieq_sd: 5904 case Intrinsic::x86_sse2_ucomilt_sd: 5905 case Intrinsic::x86_sse2_ucomile_sd: 5906 case Intrinsic::x86_sse2_ucomigt_sd: 5907 case Intrinsic::x86_sse2_ucomige_sd: 5908 case Intrinsic::x86_sse2_ucomineq_sd: { 5909 unsigned Opc = 0; 5910 ISD::CondCode CC = ISD::SETCC_INVALID; 5911 switch (IntNo) { 5912 default: break; 5913 case Intrinsic::x86_sse_comieq_ss: 5914 case Intrinsic::x86_sse2_comieq_sd: 5915 Opc = X86ISD::COMI; 5916 CC = ISD::SETEQ; 5917 break; 5918 case Intrinsic::x86_sse_comilt_ss: 5919 case Intrinsic::x86_sse2_comilt_sd: 5920 Opc = X86ISD::COMI; 5921 CC = ISD::SETLT; 5922 break; 5923 case Intrinsic::x86_sse_comile_ss: 5924 case Intrinsic::x86_sse2_comile_sd: 5925 Opc = X86ISD::COMI; 5926 CC = ISD::SETLE; 5927 break; 5928 case Intrinsic::x86_sse_comigt_ss: 5929 case Intrinsic::x86_sse2_comigt_sd: 5930 Opc = X86ISD::COMI; 5931 CC = ISD::SETGT; 5932 break; 5933 case Intrinsic::x86_sse_comige_ss: 5934 case Intrinsic::x86_sse2_comige_sd: 5935 Opc = X86ISD::COMI; 5936 CC = ISD::SETGE; 5937 break; 5938 case Intrinsic::x86_sse_comineq_ss: 5939 case Intrinsic::x86_sse2_comineq_sd: 5940 Opc = X86ISD::COMI; 5941 CC = ISD::SETNE; 5942 break; 5943 case Intrinsic::x86_sse_ucomieq_ss: 5944 case Intrinsic::x86_sse2_ucomieq_sd: 5945 Opc = X86ISD::UCOMI; 5946 CC = ISD::SETEQ; 5947 break; 5948 case Intrinsic::x86_sse_ucomilt_ss: 5949 case Intrinsic::x86_sse2_ucomilt_sd: 5950 Opc = X86ISD::UCOMI; 5951 CC = ISD::SETLT; 5952 break; 5953 case Intrinsic::x86_sse_ucomile_ss: 5954 case Intrinsic::x86_sse2_ucomile_sd: 5955 Opc = X86ISD::UCOMI; 5956 CC = ISD::SETLE; 5957 break; 5958 case Intrinsic::x86_sse_ucomigt_ss: 5959 case Intrinsic::x86_sse2_ucomigt_sd: 5960 Opc = X86ISD::UCOMI; 5961 CC = ISD::SETGT; 5962 break; 5963 case Intrinsic::x86_sse_ucomige_ss: 5964 case Intrinsic::x86_sse2_ucomige_sd: 5965 Opc = X86ISD::UCOMI; 5966 CC = ISD::SETGE; 5967 break; 5968 case Intrinsic::x86_sse_ucomineq_ss: 5969 case Intrinsic::x86_sse2_ucomineq_sd: 5970 Opc = X86ISD::UCOMI; 5971 CC = ISD::SETNE; 5972 break; 5973 } 5974 5975 SDValue LHS = Op.getOperand(1); 5976 SDValue RHS = Op.getOperand(2); 5977 unsigned X86CC = TranslateX86CC(CC, true, LHS, RHS, DAG); 5978 SDValue Cond = DAG.getNode(Opc, dl, MVT::i32, LHS, RHS); 5979 SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 5980 DAG.getConstant(X86CC, MVT::i8), Cond); 5981 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); 5982 } 5983 5984 // Fix vector shift instructions where the last operand is a non-immediate 5985 // i32 value. 5986 case Intrinsic::x86_sse2_pslli_w: 5987 case Intrinsic::x86_sse2_pslli_d: 5988 case Intrinsic::x86_sse2_pslli_q: 5989 case Intrinsic::x86_sse2_psrli_w: 5990 case Intrinsic::x86_sse2_psrli_d: 5991 case Intrinsic::x86_sse2_psrli_q: 5992 case Intrinsic::x86_sse2_psrai_w: 5993 case Intrinsic::x86_sse2_psrai_d: 5994 case Intrinsic::x86_mmx_pslli_w: 5995 case Intrinsic::x86_mmx_pslli_d: 5996 case Intrinsic::x86_mmx_pslli_q: 5997 case Intrinsic::x86_mmx_psrli_w: 5998 case Intrinsic::x86_mmx_psrli_d: 5999 case Intrinsic::x86_mmx_psrli_q: 6000 case Intrinsic::x86_mmx_psrai_w: 6001 case Intrinsic::x86_mmx_psrai_d: { 6002 SDValue ShAmt = Op.getOperand(2); 6003 if (isa<ConstantSDNode>(ShAmt)) 6004 return SDValue(); 6005 6006 unsigned NewIntNo = 0; 6007 MVT ShAmtVT = MVT::v4i32; 6008 switch (IntNo) { 6009 case Intrinsic::x86_sse2_pslli_w: 6010 NewIntNo = Intrinsic::x86_sse2_psll_w; 6011 break; 6012 case Intrinsic::x86_sse2_pslli_d: 6013 NewIntNo = Intrinsic::x86_sse2_psll_d; 6014 break; 6015 case Intrinsic::x86_sse2_pslli_q: 6016 NewIntNo = Intrinsic::x86_sse2_psll_q; 6017 break; 6018 case Intrinsic::x86_sse2_psrli_w: 6019 NewIntNo = Intrinsic::x86_sse2_psrl_w; 6020 break; 6021 case Intrinsic::x86_sse2_psrli_d: 6022 NewIntNo = Intrinsic::x86_sse2_psrl_d; 6023 break; 6024 case Intrinsic::x86_sse2_psrli_q: 6025 NewIntNo = Intrinsic::x86_sse2_psrl_q; 6026 break; 6027 case Intrinsic::x86_sse2_psrai_w: 6028 NewIntNo = Intrinsic::x86_sse2_psra_w; 6029 break; 6030 case Intrinsic::x86_sse2_psrai_d: 6031 NewIntNo = Intrinsic::x86_sse2_psra_d; 6032 break; 6033 default: { 6034 ShAmtVT = MVT::v2i32; 6035 switch (IntNo) { 6036 case Intrinsic::x86_mmx_pslli_w: 6037 NewIntNo = Intrinsic::x86_mmx_psll_w; 6038 break; 6039 case Intrinsic::x86_mmx_pslli_d: 6040 NewIntNo = Intrinsic::x86_mmx_psll_d; 6041 break; 6042 case Intrinsic::x86_mmx_pslli_q: 6043 NewIntNo = Intrinsic::x86_mmx_psll_q; 6044 break; 6045 case Intrinsic::x86_mmx_psrli_w: 6046 NewIntNo = Intrinsic::x86_mmx_psrl_w; 6047 break; 6048 case Intrinsic::x86_mmx_psrli_d: 6049 NewIntNo = Intrinsic::x86_mmx_psrl_d; 6050 break; 6051 case Intrinsic::x86_mmx_psrli_q: 6052 NewIntNo = Intrinsic::x86_mmx_psrl_q; 6053 break; 6054 case Intrinsic::x86_mmx_psrai_w: 6055 NewIntNo = Intrinsic::x86_mmx_psra_w; 6056 break; 6057 case Intrinsic::x86_mmx_psrai_d: 6058 NewIntNo = Intrinsic::x86_mmx_psra_d; 6059 break; 6060 default: abort(); // Can't reach here. 6061 } 6062 break; 6063 } 6064 } 6065 MVT VT = Op.getValueType(); 6066 ShAmt = DAG.getNode(ISD::BIT_CONVERT, dl, VT, 6067 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShAmtVT, ShAmt)); 6068 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 6069 DAG.getConstant(NewIntNo, MVT::i32), 6070 Op.getOperand(1), ShAmt); 6071 } 6072 } 6073} 6074 6075SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) { 6076 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 6077 DebugLoc dl = Op.getDebugLoc(); 6078 6079 if (Depth > 0) { 6080 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG); 6081 SDValue Offset = 6082 DAG.getConstant(TD->getPointerSize(), 6083 Subtarget->is64Bit() ? MVT::i64 : MVT::i32); 6084 return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), 6085 DAG.getNode(ISD::ADD, dl, getPointerTy(), 6086 FrameAddr, Offset), 6087 NULL, 0); 6088 } 6089 6090 // Just load the return address. 6091 SDValue RetAddrFI = getReturnAddressFrameIndex(DAG); 6092 return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), 6093 RetAddrFI, NULL, 0); 6094} 6095 6096SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) { 6097 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 6098 MFI->setFrameAddressIsTaken(true); 6099 MVT VT = Op.getValueType(); 6100 DebugLoc dl = Op.getDebugLoc(); // FIXME probably not meaningful 6101 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 6102 unsigned FrameReg = Subtarget->is64Bit() ? X86::RBP : X86::EBP; 6103 SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT); 6104 while (Depth--) 6105 FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr, NULL, 0); 6106 return FrameAddr; 6107} 6108 6109SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op, 6110 SelectionDAG &DAG) { 6111 return DAG.getIntPtrConstant(2*TD->getPointerSize()); 6112} 6113 6114SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) 6115{ 6116 MachineFunction &MF = DAG.getMachineFunction(); 6117 SDValue Chain = Op.getOperand(0); 6118 SDValue Offset = Op.getOperand(1); 6119 SDValue Handler = Op.getOperand(2); 6120 DebugLoc dl = Op.getDebugLoc(); 6121 6122 SDValue Frame = DAG.getRegister(Subtarget->is64Bit() ? X86::RBP : X86::EBP, 6123 getPointerTy()); 6124 unsigned StoreAddrReg = (Subtarget->is64Bit() ? X86::RCX : X86::ECX); 6125 6126 SDValue StoreAddr = DAG.getNode(ISD::SUB, dl, getPointerTy(), Frame, 6127 DAG.getIntPtrConstant(-TD->getPointerSize())); 6128 StoreAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), StoreAddr, Offset); 6129 Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, NULL, 0); 6130 Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr); 6131 MF.getRegInfo().addLiveOut(StoreAddrReg); 6132 6133 return DAG.getNode(X86ISD::EH_RETURN, dl, 6134 MVT::Other, 6135 Chain, DAG.getRegister(StoreAddrReg, getPointerTy())); 6136} 6137 6138SDValue X86TargetLowering::LowerTRAMPOLINE(SDValue Op, 6139 SelectionDAG &DAG) { 6140 SDValue Root = Op.getOperand(0); 6141 SDValue Trmp = Op.getOperand(1); // trampoline 6142 SDValue FPtr = Op.getOperand(2); // nested function 6143 SDValue Nest = Op.getOperand(3); // 'nest' parameter value 6144 DebugLoc dl = Op.getDebugLoc(); 6145 6146 const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue(); 6147 6148 const X86InstrInfo *TII = 6149 ((X86TargetMachine&)getTargetMachine()).getInstrInfo(); 6150 6151 if (Subtarget->is64Bit()) { 6152 SDValue OutChains[6]; 6153 6154 // Large code-model. 6155 6156 const unsigned char JMP64r = TII->getBaseOpcodeFor(X86::JMP64r); 6157 const unsigned char MOV64ri = TII->getBaseOpcodeFor(X86::MOV64ri); 6158 6159 const unsigned char N86R10 = RegInfo->getX86RegNum(X86::R10); 6160 const unsigned char N86R11 = RegInfo->getX86RegNum(X86::R11); 6161 6162 const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix 6163 6164 // Load the pointer to the nested function into R11. 6165 unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11 6166 SDValue Addr = Trmp; 6167 OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16), 6168 Addr, TrmpAddr, 0); 6169 6170 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 6171 DAG.getConstant(2, MVT::i64)); 6172 OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr, TrmpAddr, 2, false, 2); 6173 6174 // Load the 'nest' parameter value into R10. 6175 // R10 is specified in X86CallingConv.td 6176 OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10 6177 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 6178 DAG.getConstant(10, MVT::i64)); 6179 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16), 6180 Addr, TrmpAddr, 10); 6181 6182 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 6183 DAG.getConstant(12, MVT::i64)); 6184 OutChains[3] = DAG.getStore(Root, dl, Nest, Addr, TrmpAddr, 12, false, 2); 6185 6186 // Jump to the nested function. 6187 OpCode = (JMP64r << 8) | REX_WB; // jmpq *... 6188 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 6189 DAG.getConstant(20, MVT::i64)); 6190 OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16), 6191 Addr, TrmpAddr, 20); 6192 6193 unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11 6194 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 6195 DAG.getConstant(22, MVT::i64)); 6196 OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, MVT::i8), Addr, 6197 TrmpAddr, 22); 6198 6199 SDValue Ops[] = 6200 { Trmp, DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 6) }; 6201 return DAG.getMergeValues(Ops, 2, dl); 6202 } else { 6203 const Function *Func = 6204 cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue()); 6205 unsigned CC = Func->getCallingConv(); 6206 unsigned NestReg; 6207 6208 switch (CC) { 6209 default: 6210 assert(0 && "Unsupported calling convention"); 6211 case CallingConv::C: 6212 case CallingConv::X86_StdCall: { 6213 // Pass 'nest' parameter in ECX. 6214 // Must be kept in sync with X86CallingConv.td 6215 NestReg = X86::ECX; 6216 6217 // Check that ECX wasn't needed by an 'inreg' parameter. 6218 const FunctionType *FTy = Func->getFunctionType(); 6219 const AttrListPtr &Attrs = Func->getAttributes(); 6220 6221 if (!Attrs.isEmpty() && !Func->isVarArg()) { 6222 unsigned InRegCount = 0; 6223 unsigned Idx = 1; 6224 6225 for (FunctionType::param_iterator I = FTy->param_begin(), 6226 E = FTy->param_end(); I != E; ++I, ++Idx) 6227 if (Attrs.paramHasAttr(Idx, Attribute::InReg)) 6228 // FIXME: should only count parameters that are lowered to integers. 6229 InRegCount += (TD->getTypeSizeInBits(*I) + 31) / 32; 6230 6231 if (InRegCount > 2) { 6232 cerr << "Nest register in use - reduce number of inreg parameters!\n"; 6233 abort(); 6234 } 6235 } 6236 break; 6237 } 6238 case CallingConv::X86_FastCall: 6239 case CallingConv::Fast: 6240 // Pass 'nest' parameter in EAX. 6241 // Must be kept in sync with X86CallingConv.td 6242 NestReg = X86::EAX; 6243 break; 6244 } 6245 6246 SDValue OutChains[4]; 6247 SDValue Addr, Disp; 6248 6249 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, 6250 DAG.getConstant(10, MVT::i32)); 6251 Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr); 6252 6253 const unsigned char MOV32ri = TII->getBaseOpcodeFor(X86::MOV32ri); 6254 const unsigned char N86Reg = RegInfo->getX86RegNum(NestReg); 6255 OutChains[0] = DAG.getStore(Root, dl, 6256 DAG.getConstant(MOV32ri|N86Reg, MVT::i8), 6257 Trmp, TrmpAddr, 0); 6258 6259 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, 6260 DAG.getConstant(1, MVT::i32)); 6261 OutChains[1] = DAG.getStore(Root, dl, Nest, Addr, TrmpAddr, 1, false, 1); 6262 6263 const unsigned char JMP = TII->getBaseOpcodeFor(X86::JMP); 6264 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, 6265 DAG.getConstant(5, MVT::i32)); 6266 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, MVT::i8), Addr, 6267 TrmpAddr, 5, false, 1); 6268 6269 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, 6270 DAG.getConstant(6, MVT::i32)); 6271 OutChains[3] = DAG.getStore(Root, dl, Disp, Addr, TrmpAddr, 6, false, 1); 6272 6273 SDValue Ops[] = 6274 { Trmp, DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 4) }; 6275 return DAG.getMergeValues(Ops, 2, dl); 6276 } 6277} 6278 6279SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG) { 6280 /* 6281 The rounding mode is in bits 11:10 of FPSR, and has the following 6282 settings: 6283 00 Round to nearest 6284 01 Round to -inf 6285 10 Round to +inf 6286 11 Round to 0 6287 6288 FLT_ROUNDS, on the other hand, expects the following: 6289 -1 Undefined 6290 0 Round to 0 6291 1 Round to nearest 6292 2 Round to +inf 6293 3 Round to -inf 6294 6295 To perform the conversion, we do: 6296 (((((FPSR & 0x800) >> 11) | ((FPSR & 0x400) >> 9)) + 1) & 3) 6297 */ 6298 6299 MachineFunction &MF = DAG.getMachineFunction(); 6300 const TargetMachine &TM = MF.getTarget(); 6301 const TargetFrameInfo &TFI = *TM.getFrameInfo(); 6302 unsigned StackAlignment = TFI.getStackAlignment(); 6303 MVT VT = Op.getValueType(); 6304 DebugLoc dl = Op.getDebugLoc(); 6305 6306 // Save FP Control Word to stack slot 6307 int SSFI = MF.getFrameInfo()->CreateStackObject(2, StackAlignment); 6308 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 6309 6310 SDValue Chain = DAG.getNode(X86ISD::FNSTCW16m, dl, MVT::Other, 6311 DAG.getEntryNode(), StackSlot); 6312 6313 // Load FP Control Word from stack slot 6314 SDValue CWD = DAG.getLoad(MVT::i16, dl, Chain, StackSlot, NULL, 0); 6315 6316 // Transform as necessary 6317 SDValue CWD1 = 6318 DAG.getNode(ISD::SRL, dl, MVT::i16, 6319 DAG.getNode(ISD::AND, dl, MVT::i16, 6320 CWD, DAG.getConstant(0x800, MVT::i16)), 6321 DAG.getConstant(11, MVT::i8)); 6322 SDValue CWD2 = 6323 DAG.getNode(ISD::SRL, dl, MVT::i16, 6324 DAG.getNode(ISD::AND, dl, MVT::i16, 6325 CWD, DAG.getConstant(0x400, MVT::i16)), 6326 DAG.getConstant(9, MVT::i8)); 6327 6328 SDValue RetVal = 6329 DAG.getNode(ISD::AND, dl, MVT::i16, 6330 DAG.getNode(ISD::ADD, dl, MVT::i16, 6331 DAG.getNode(ISD::OR, dl, MVT::i16, CWD1, CWD2), 6332 DAG.getConstant(1, MVT::i16)), 6333 DAG.getConstant(3, MVT::i16)); 6334 6335 6336 return DAG.getNode((VT.getSizeInBits() < 16 ? 6337 ISD::TRUNCATE : ISD::ZERO_EXTEND), dl, VT, RetVal); 6338} 6339 6340SDValue X86TargetLowering::LowerCTLZ(SDValue Op, SelectionDAG &DAG) { 6341 MVT VT = Op.getValueType(); 6342 MVT OpVT = VT; 6343 unsigned NumBits = VT.getSizeInBits(); 6344 DebugLoc dl = Op.getDebugLoc(); 6345 6346 Op = Op.getOperand(0); 6347 if (VT == MVT::i8) { 6348 // Zero extend to i32 since there is not an i8 bsr. 6349 OpVT = MVT::i32; 6350 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op); 6351 } 6352 6353 // Issue a bsr (scan bits in reverse) which also sets EFLAGS. 6354 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32); 6355 Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op); 6356 6357 // If src is zero (i.e. bsr sets ZF), returns NumBits. 6358 SmallVector<SDValue, 4> Ops; 6359 Ops.push_back(Op); 6360 Ops.push_back(DAG.getConstant(NumBits+NumBits-1, OpVT)); 6361 Ops.push_back(DAG.getConstant(X86::COND_E, MVT::i8)); 6362 Ops.push_back(Op.getValue(1)); 6363 Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, &Ops[0], 4); 6364 6365 // Finally xor with NumBits-1. 6366 Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits-1, OpVT)); 6367 6368 if (VT == MVT::i8) 6369 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op); 6370 return Op; 6371} 6372 6373SDValue X86TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) { 6374 MVT VT = Op.getValueType(); 6375 MVT OpVT = VT; 6376 unsigned NumBits = VT.getSizeInBits(); 6377 DebugLoc dl = Op.getDebugLoc(); 6378 6379 Op = Op.getOperand(0); 6380 if (VT == MVT::i8) { 6381 OpVT = MVT::i32; 6382 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op); 6383 } 6384 6385 // Issue a bsf (scan bits forward) which also sets EFLAGS. 6386 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32); 6387 Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op); 6388 6389 // If src is zero (i.e. bsf sets ZF), returns NumBits. 6390 SmallVector<SDValue, 4> Ops; 6391 Ops.push_back(Op); 6392 Ops.push_back(DAG.getConstant(NumBits, OpVT)); 6393 Ops.push_back(DAG.getConstant(X86::COND_E, MVT::i8)); 6394 Ops.push_back(Op.getValue(1)); 6395 Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, &Ops[0], 4); 6396 6397 if (VT == MVT::i8) 6398 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op); 6399 return Op; 6400} 6401 6402SDValue X86TargetLowering::LowerMUL_V2I64(SDValue Op, SelectionDAG &DAG) { 6403 MVT VT = Op.getValueType(); 6404 assert(VT == MVT::v2i64 && "Only know how to lower V2I64 multiply"); 6405 DebugLoc dl = Op.getDebugLoc(); 6406 6407 // ulong2 Ahi = __builtin_ia32_psrlqi128( a, 32); 6408 // ulong2 Bhi = __builtin_ia32_psrlqi128( b, 32); 6409 // ulong2 AloBlo = __builtin_ia32_pmuludq128( a, b ); 6410 // ulong2 AloBhi = __builtin_ia32_pmuludq128( a, Bhi ); 6411 // ulong2 AhiBlo = __builtin_ia32_pmuludq128( Ahi, b ); 6412 // 6413 // AloBhi = __builtin_ia32_psllqi128( AloBhi, 32 ); 6414 // AhiBlo = __builtin_ia32_psllqi128( AhiBlo, 32 ); 6415 // return AloBlo + AloBhi + AhiBlo; 6416 6417 SDValue A = Op.getOperand(0); 6418 SDValue B = Op.getOperand(1); 6419 6420 SDValue Ahi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 6421 DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32), 6422 A, DAG.getConstant(32, MVT::i32)); 6423 SDValue Bhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 6424 DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32), 6425 B, DAG.getConstant(32, MVT::i32)); 6426 SDValue AloBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 6427 DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32), 6428 A, B); 6429 SDValue AloBhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 6430 DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32), 6431 A, Bhi); 6432 SDValue AhiBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 6433 DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32), 6434 Ahi, B); 6435 AloBhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 6436 DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32), 6437 AloBhi, DAG.getConstant(32, MVT::i32)); 6438 AhiBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 6439 DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32), 6440 AhiBlo, DAG.getConstant(32, MVT::i32)); 6441 SDValue Res = DAG.getNode(ISD::ADD, dl, VT, AloBlo, AloBhi); 6442 Res = DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo); 6443 return Res; 6444} 6445 6446 6447SDValue X86TargetLowering::LowerXALUO(SDValue Op, SelectionDAG &DAG) { 6448 // Lower the "add/sub/mul with overflow" instruction into a regular ins plus 6449 // a "setcc" instruction that checks the overflow flag. The "brcond" lowering 6450 // looks for this combo and may remove the "setcc" instruction if the "setcc" 6451 // has only one use. 6452 SDNode *N = Op.getNode(); 6453 SDValue LHS = N->getOperand(0); 6454 SDValue RHS = N->getOperand(1); 6455 unsigned BaseOp = 0; 6456 unsigned Cond = 0; 6457 DebugLoc dl = Op.getDebugLoc(); 6458 6459 switch (Op.getOpcode()) { 6460 default: assert(0 && "Unknown ovf instruction!"); 6461 case ISD::SADDO: 6462 // A subtract of one will be selected as a INC. Note that INC doesn't 6463 // set CF, so we can't do this for UADDO. 6464 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) 6465 if (C->getAPIntValue() == 1) { 6466 BaseOp = X86ISD::INC; 6467 Cond = X86::COND_O; 6468 break; 6469 } 6470 BaseOp = X86ISD::ADD; 6471 Cond = X86::COND_O; 6472 break; 6473 case ISD::UADDO: 6474 BaseOp = X86ISD::ADD; 6475 Cond = X86::COND_B; 6476 break; 6477 case ISD::SSUBO: 6478 // A subtract of one will be selected as a DEC. Note that DEC doesn't 6479 // set CF, so we can't do this for USUBO. 6480 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) 6481 if (C->getAPIntValue() == 1) { 6482 BaseOp = X86ISD::DEC; 6483 Cond = X86::COND_O; 6484 break; 6485 } 6486 BaseOp = X86ISD::SUB; 6487 Cond = X86::COND_O; 6488 break; 6489 case ISD::USUBO: 6490 BaseOp = X86ISD::SUB; 6491 Cond = X86::COND_B; 6492 break; 6493 case ISD::SMULO: 6494 BaseOp = X86ISD::SMUL; 6495 Cond = X86::COND_O; 6496 break; 6497 case ISD::UMULO: 6498 BaseOp = X86ISD::UMUL; 6499 Cond = X86::COND_B; 6500 break; 6501 } 6502 6503 // Also sets EFLAGS. 6504 SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32); 6505 SDValue Sum = DAG.getNode(BaseOp, dl, VTs, LHS, RHS); 6506 6507 SDValue SetCC = 6508 DAG.getNode(X86ISD::SETCC, dl, N->getValueType(1), 6509 DAG.getConstant(Cond, MVT::i32), SDValue(Sum.getNode(), 1)); 6510 6511 DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), SetCC); 6512 return Sum; 6513} 6514 6515SDValue X86TargetLowering::LowerCMP_SWAP(SDValue Op, SelectionDAG &DAG) { 6516 MVT T = Op.getValueType(); 6517 DebugLoc dl = Op.getDebugLoc(); 6518 unsigned Reg = 0; 6519 unsigned size = 0; 6520 switch(T.getSimpleVT()) { 6521 default: 6522 assert(false && "Invalid value type!"); 6523 case MVT::i8: Reg = X86::AL; size = 1; break; 6524 case MVT::i16: Reg = X86::AX; size = 2; break; 6525 case MVT::i32: Reg = X86::EAX; size = 4; break; 6526 case MVT::i64: 6527 assert(Subtarget->is64Bit() && "Node not type legal!"); 6528 Reg = X86::RAX; size = 8; 6529 break; 6530 } 6531 SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), dl, Reg, 6532 Op.getOperand(2), SDValue()); 6533 SDValue Ops[] = { cpIn.getValue(0), 6534 Op.getOperand(1), 6535 Op.getOperand(3), 6536 DAG.getTargetConstant(size, MVT::i8), 6537 cpIn.getValue(1) }; 6538 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag); 6539 SDValue Result = DAG.getNode(X86ISD::LCMPXCHG_DAG, dl, Tys, Ops, 5); 6540 SDValue cpOut = 6541 DAG.getCopyFromReg(Result.getValue(0), dl, Reg, T, Result.getValue(1)); 6542 return cpOut; 6543} 6544 6545SDValue X86TargetLowering::LowerREADCYCLECOUNTER(SDValue Op, 6546 SelectionDAG &DAG) { 6547 assert(Subtarget->is64Bit() && "Result not type legalized?"); 6548 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag); 6549 SDValue TheChain = Op.getOperand(0); 6550 DebugLoc dl = Op.getDebugLoc(); 6551 SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1); 6552 SDValue rax = DAG.getCopyFromReg(rd, dl, X86::RAX, MVT::i64, rd.getValue(1)); 6553 SDValue rdx = DAG.getCopyFromReg(rax.getValue(1), dl, X86::RDX, MVT::i64, 6554 rax.getValue(2)); 6555 SDValue Tmp = DAG.getNode(ISD::SHL, dl, MVT::i64, rdx, 6556 DAG.getConstant(32, MVT::i8)); 6557 SDValue Ops[] = { 6558 DAG.getNode(ISD::OR, dl, MVT::i64, rax, Tmp), 6559 rdx.getValue(1) 6560 }; 6561 return DAG.getMergeValues(Ops, 2, dl); 6562} 6563 6564SDValue X86TargetLowering::LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) { 6565 SDNode *Node = Op.getNode(); 6566 DebugLoc dl = Node->getDebugLoc(); 6567 MVT T = Node->getValueType(0); 6568 SDValue negOp = DAG.getNode(ISD::SUB, dl, T, 6569 DAG.getConstant(0, T), Node->getOperand(2)); 6570 return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, dl, 6571 cast<AtomicSDNode>(Node)->getMemoryVT(), 6572 Node->getOperand(0), 6573 Node->getOperand(1), negOp, 6574 cast<AtomicSDNode>(Node)->getSrcValue(), 6575 cast<AtomicSDNode>(Node)->getAlignment()); 6576} 6577 6578/// LowerOperation - Provide custom lowering hooks for some operations. 6579/// 6580SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) { 6581 switch (Op.getOpcode()) { 6582 default: assert(0 && "Should not custom lower this!"); 6583 case ISD::ATOMIC_CMP_SWAP: return LowerCMP_SWAP(Op,DAG); 6584 case ISD::ATOMIC_LOAD_SUB: return LowerLOAD_SUB(Op,DAG); 6585 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG); 6586 case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG); 6587 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG); 6588 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG); 6589 case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, DAG); 6590 case ISD::ConstantPool: return LowerConstantPool(Op, DAG); 6591 case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG); 6592 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG); 6593 case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG); 6594 case ISD::SHL_PARTS: 6595 case ISD::SRA_PARTS: 6596 case ISD::SRL_PARTS: return LowerShift(Op, DAG); 6597 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG); 6598 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG); 6599 case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG); 6600 case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG); 6601 case ISD::FABS: return LowerFABS(Op, DAG); 6602 case ISD::FNEG: return LowerFNEG(Op, DAG); 6603 case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG); 6604 case ISD::SETCC: return LowerSETCC(Op, DAG); 6605 case ISD::VSETCC: return LowerVSETCC(Op, DAG); 6606 case ISD::SELECT: return LowerSELECT(Op, DAG); 6607 case ISD::BRCOND: return LowerBRCOND(Op, DAG); 6608 case ISD::JumpTable: return LowerJumpTable(Op, DAG); 6609 case ISD::CALL: return LowerCALL(Op, DAG); 6610 case ISD::RET: return LowerRET(Op, DAG); 6611 case ISD::FORMAL_ARGUMENTS: return LowerFORMAL_ARGUMENTS(Op, DAG); 6612 case ISD::VASTART: return LowerVASTART(Op, DAG); 6613 case ISD::VAARG: return LowerVAARG(Op, DAG); 6614 case ISD::VACOPY: return LowerVACOPY(Op, DAG); 6615 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG); 6616 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG); 6617 case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG); 6618 case ISD::FRAME_TO_ARGS_OFFSET: 6619 return LowerFRAME_TO_ARGS_OFFSET(Op, DAG); 6620 case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG); 6621 case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG); 6622 case ISD::TRAMPOLINE: return LowerTRAMPOLINE(Op, DAG); 6623 case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG); 6624 case ISD::CTLZ: return LowerCTLZ(Op, DAG); 6625 case ISD::CTTZ: return LowerCTTZ(Op, DAG); 6626 case ISD::MUL: return LowerMUL_V2I64(Op, DAG); 6627 case ISD::SADDO: 6628 case ISD::UADDO: 6629 case ISD::SSUBO: 6630 case ISD::USUBO: 6631 case ISD::SMULO: 6632 case ISD::UMULO: return LowerXALUO(Op, DAG); 6633 case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, DAG); 6634 } 6635} 6636 6637void X86TargetLowering:: 6638ReplaceATOMIC_BINARY_64(SDNode *Node, SmallVectorImpl<SDValue>&Results, 6639 SelectionDAG &DAG, unsigned NewOp) { 6640 MVT T = Node->getValueType(0); 6641 DebugLoc dl = Node->getDebugLoc(); 6642 assert (T == MVT::i64 && "Only know how to expand i64 atomics"); 6643 6644 SDValue Chain = Node->getOperand(0); 6645 SDValue In1 = Node->getOperand(1); 6646 SDValue In2L = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 6647 Node->getOperand(2), DAG.getIntPtrConstant(0)); 6648 SDValue In2H = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 6649 Node->getOperand(2), DAG.getIntPtrConstant(1)); 6650 // This is a generalized SDNode, not an AtomicSDNode, so it doesn't 6651 // have a MemOperand. Pass the info through as a normal operand. 6652 SDValue LSI = DAG.getMemOperand(cast<MemSDNode>(Node)->getMemOperand()); 6653 SDValue Ops[] = { Chain, In1, In2L, In2H, LSI }; 6654 SDVTList Tys = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other); 6655 SDValue Result = DAG.getNode(NewOp, dl, Tys, Ops, 5); 6656 SDValue OpsF[] = { Result.getValue(0), Result.getValue(1)}; 6657 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF, 2)); 6658 Results.push_back(Result.getValue(2)); 6659} 6660 6661/// ReplaceNodeResults - Replace a node with an illegal result type 6662/// with a new node built out of custom code. 6663void X86TargetLowering::ReplaceNodeResults(SDNode *N, 6664 SmallVectorImpl<SDValue>&Results, 6665 SelectionDAG &DAG) { 6666 DebugLoc dl = N->getDebugLoc(); 6667 switch (N->getOpcode()) { 6668 default: 6669 assert(false && "Do not know how to custom type legalize this operation!"); 6670 return; 6671 case ISD::FP_TO_SINT: { 6672 std::pair<SDValue,SDValue> Vals = 6673 FP_TO_INTHelper(SDValue(N, 0), DAG, true); 6674 SDValue FIST = Vals.first, StackSlot = Vals.second; 6675 if (FIST.getNode() != 0) { 6676 MVT VT = N->getValueType(0); 6677 // Return a load from the stack slot. 6678 Results.push_back(DAG.getLoad(VT, dl, FIST, StackSlot, NULL, 0)); 6679 } 6680 return; 6681 } 6682 case ISD::READCYCLECOUNTER: { 6683 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag); 6684 SDValue TheChain = N->getOperand(0); 6685 SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1); 6686 SDValue eax = DAG.getCopyFromReg(rd, dl, X86::EAX, MVT::i32, 6687 rd.getValue(1)); 6688 SDValue edx = DAG.getCopyFromReg(eax.getValue(1), dl, X86::EDX, MVT::i32, 6689 eax.getValue(2)); 6690 // Use a buildpair to merge the two 32-bit values into a 64-bit one. 6691 SDValue Ops[] = { eax, edx }; 6692 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Ops, 2)); 6693 Results.push_back(edx.getValue(1)); 6694 return; 6695 } 6696 case ISD::ATOMIC_CMP_SWAP: { 6697 MVT T = N->getValueType(0); 6698 assert (T == MVT::i64 && "Only know how to expand i64 Cmp and Swap"); 6699 SDValue cpInL, cpInH; 6700 cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(2), 6701 DAG.getConstant(0, MVT::i32)); 6702 cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(2), 6703 DAG.getConstant(1, MVT::i32)); 6704 cpInL = DAG.getCopyToReg(N->getOperand(0), dl, X86::EAX, cpInL, SDValue()); 6705 cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl, X86::EDX, cpInH, 6706 cpInL.getValue(1)); 6707 SDValue swapInL, swapInH; 6708 swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(3), 6709 DAG.getConstant(0, MVT::i32)); 6710 swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(3), 6711 DAG.getConstant(1, MVT::i32)); 6712 swapInL = DAG.getCopyToReg(cpInH.getValue(0), dl, X86::EBX, swapInL, 6713 cpInH.getValue(1)); 6714 swapInH = DAG.getCopyToReg(swapInL.getValue(0), dl, X86::ECX, swapInH, 6715 swapInL.getValue(1)); 6716 SDValue Ops[] = { swapInH.getValue(0), 6717 N->getOperand(1), 6718 swapInH.getValue(1) }; 6719 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag); 6720 SDValue Result = DAG.getNode(X86ISD::LCMPXCHG8_DAG, dl, Tys, Ops, 3); 6721 SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl, X86::EAX, 6722 MVT::i32, Result.getValue(1)); 6723 SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl, X86::EDX, 6724 MVT::i32, cpOutL.getValue(2)); 6725 SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)}; 6726 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF, 2)); 6727 Results.push_back(cpOutH.getValue(1)); 6728 return; 6729 } 6730 case ISD::ATOMIC_LOAD_ADD: 6731 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMADD64_DAG); 6732 return; 6733 case ISD::ATOMIC_LOAD_AND: 6734 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMAND64_DAG); 6735 return; 6736 case ISD::ATOMIC_LOAD_NAND: 6737 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMNAND64_DAG); 6738 return; 6739 case ISD::ATOMIC_LOAD_OR: 6740 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMOR64_DAG); 6741 return; 6742 case ISD::ATOMIC_LOAD_SUB: 6743 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMSUB64_DAG); 6744 return; 6745 case ISD::ATOMIC_LOAD_XOR: 6746 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMXOR64_DAG); 6747 return; 6748 case ISD::ATOMIC_SWAP: 6749 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMSWAP64_DAG); 6750 return; 6751 } 6752} 6753 6754const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { 6755 switch (Opcode) { 6756 default: return NULL; 6757 case X86ISD::BSF: return "X86ISD::BSF"; 6758 case X86ISD::BSR: return "X86ISD::BSR"; 6759 case X86ISD::SHLD: return "X86ISD::SHLD"; 6760 case X86ISD::SHRD: return "X86ISD::SHRD"; 6761 case X86ISD::FAND: return "X86ISD::FAND"; 6762 case X86ISD::FOR: return "X86ISD::FOR"; 6763 case X86ISD::FXOR: return "X86ISD::FXOR"; 6764 case X86ISD::FSRL: return "X86ISD::FSRL"; 6765 case X86ISD::FILD: return "X86ISD::FILD"; 6766 case X86ISD::FILD_FLAG: return "X86ISD::FILD_FLAG"; 6767 case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM"; 6768 case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM"; 6769 case X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM"; 6770 case X86ISD::FLD: return "X86ISD::FLD"; 6771 case X86ISD::FST: return "X86ISD::FST"; 6772 case X86ISD::CALL: return "X86ISD::CALL"; 6773 case X86ISD::TAILCALL: return "X86ISD::TAILCALL"; 6774 case X86ISD::RDTSC_DAG: return "X86ISD::RDTSC_DAG"; 6775 case X86ISD::BT: return "X86ISD::BT"; 6776 case X86ISD::CMP: return "X86ISD::CMP"; 6777 case X86ISD::COMI: return "X86ISD::COMI"; 6778 case X86ISD::UCOMI: return "X86ISD::UCOMI"; 6779 case X86ISD::SETCC: return "X86ISD::SETCC"; 6780 case X86ISD::CMOV: return "X86ISD::CMOV"; 6781 case X86ISD::BRCOND: return "X86ISD::BRCOND"; 6782 case X86ISD::RET_FLAG: return "X86ISD::RET_FLAG"; 6783 case X86ISD::REP_STOS: return "X86ISD::REP_STOS"; 6784 case X86ISD::REP_MOVS: return "X86ISD::REP_MOVS"; 6785 case X86ISD::GlobalBaseReg: return "X86ISD::GlobalBaseReg"; 6786 case X86ISD::Wrapper: return "X86ISD::Wrapper"; 6787 case X86ISD::PEXTRB: return "X86ISD::PEXTRB"; 6788 case X86ISD::PEXTRW: return "X86ISD::PEXTRW"; 6789 case X86ISD::INSERTPS: return "X86ISD::INSERTPS"; 6790 case X86ISD::PINSRB: return "X86ISD::PINSRB"; 6791 case X86ISD::PINSRW: return "X86ISD::PINSRW"; 6792 case X86ISD::PSHUFB: return "X86ISD::PSHUFB"; 6793 case X86ISD::FMAX: return "X86ISD::FMAX"; 6794 case X86ISD::FMIN: return "X86ISD::FMIN"; 6795 case X86ISD::FRSQRT: return "X86ISD::FRSQRT"; 6796 case X86ISD::FRCP: return "X86ISD::FRCP"; 6797 case X86ISD::TLSADDR: return "X86ISD::TLSADDR"; 6798 case X86ISD::SegmentBaseAddress: return "X86ISD::SegmentBaseAddress"; 6799 case X86ISD::EH_RETURN: return "X86ISD::EH_RETURN"; 6800 case X86ISD::TC_RETURN: return "X86ISD::TC_RETURN"; 6801 case X86ISD::FNSTCW16m: return "X86ISD::FNSTCW16m"; 6802 case X86ISD::LCMPXCHG_DAG: return "X86ISD::LCMPXCHG_DAG"; 6803 case X86ISD::LCMPXCHG8_DAG: return "X86ISD::LCMPXCHG8_DAG"; 6804 case X86ISD::ATOMADD64_DAG: return "X86ISD::ATOMADD64_DAG"; 6805 case X86ISD::ATOMSUB64_DAG: return "X86ISD::ATOMSUB64_DAG"; 6806 case X86ISD::ATOMOR64_DAG: return "X86ISD::ATOMOR64_DAG"; 6807 case X86ISD::ATOMXOR64_DAG: return "X86ISD::ATOMXOR64_DAG"; 6808 case X86ISD::ATOMAND64_DAG: return "X86ISD::ATOMAND64_DAG"; 6809 case X86ISD::ATOMNAND64_DAG: return "X86ISD::ATOMNAND64_DAG"; 6810 case X86ISD::VZEXT_MOVL: return "X86ISD::VZEXT_MOVL"; 6811 case X86ISD::VZEXT_LOAD: return "X86ISD::VZEXT_LOAD"; 6812 case X86ISD::VSHL: return "X86ISD::VSHL"; 6813 case X86ISD::VSRL: return "X86ISD::VSRL"; 6814 case X86ISD::CMPPD: return "X86ISD::CMPPD"; 6815 case X86ISD::CMPPS: return "X86ISD::CMPPS"; 6816 case X86ISD::PCMPEQB: return "X86ISD::PCMPEQB"; 6817 case X86ISD::PCMPEQW: return "X86ISD::PCMPEQW"; 6818 case X86ISD::PCMPEQD: return "X86ISD::PCMPEQD"; 6819 case X86ISD::PCMPEQQ: return "X86ISD::PCMPEQQ"; 6820 case X86ISD::PCMPGTB: return "X86ISD::PCMPGTB"; 6821 case X86ISD::PCMPGTW: return "X86ISD::PCMPGTW"; 6822 case X86ISD::PCMPGTD: return "X86ISD::PCMPGTD"; 6823 case X86ISD::PCMPGTQ: return "X86ISD::PCMPGTQ"; 6824 case X86ISD::ADD: return "X86ISD::ADD"; 6825 case X86ISD::SUB: return "X86ISD::SUB"; 6826 case X86ISD::SMUL: return "X86ISD::SMUL"; 6827 case X86ISD::UMUL: return "X86ISD::UMUL"; 6828 case X86ISD::INC: return "X86ISD::INC"; 6829 case X86ISD::DEC: return "X86ISD::DEC"; 6830 case X86ISD::MUL_IMM: return "X86ISD::MUL_IMM"; 6831 } 6832} 6833 6834// isLegalAddressingMode - Return true if the addressing mode represented 6835// by AM is legal for this target, for a load/store of the specified type. 6836bool X86TargetLowering::isLegalAddressingMode(const AddrMode &AM, 6837 const Type *Ty) const { 6838 // X86 supports extremely general addressing modes. 6839 6840 // X86 allows a sign-extended 32-bit immediate field as a displacement. 6841 if (AM.BaseOffs <= -(1LL << 32) || AM.BaseOffs >= (1LL << 32)-1) 6842 return false; 6843 6844 if (AM.BaseGV) { 6845 // We can only fold this if we don't need an extra load. 6846 if (Subtarget->GVRequiresExtraLoad(AM.BaseGV, getTargetMachine(), false)) 6847 return false; 6848 // If BaseGV requires a register, we cannot also have a BaseReg. 6849 if (Subtarget->GVRequiresRegister(AM.BaseGV, getTargetMachine(), false) && 6850 AM.HasBaseReg) 6851 return false; 6852 6853 // X86-64 only supports addr of globals in small code model. 6854 if (Subtarget->is64Bit()) { 6855 if (getTargetMachine().getCodeModel() != CodeModel::Small) 6856 return false; 6857 // If lower 4G is not available, then we must use rip-relative addressing. 6858 if (AM.BaseOffs || AM.Scale > 1) 6859 return false; 6860 } 6861 } 6862 6863 switch (AM.Scale) { 6864 case 0: 6865 case 1: 6866 case 2: 6867 case 4: 6868 case 8: 6869 // These scales always work. 6870 break; 6871 case 3: 6872 case 5: 6873 case 9: 6874 // These scales are formed with basereg+scalereg. Only accept if there is 6875 // no basereg yet. 6876 if (AM.HasBaseReg) 6877 return false; 6878 break; 6879 default: // Other stuff never works. 6880 return false; 6881 } 6882 6883 return true; 6884} 6885 6886 6887bool X86TargetLowering::isTruncateFree(const Type *Ty1, const Type *Ty2) const { 6888 if (!Ty1->isInteger() || !Ty2->isInteger()) 6889 return false; 6890 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits(); 6891 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits(); 6892 if (NumBits1 <= NumBits2) 6893 return false; 6894 return Subtarget->is64Bit() || NumBits1 < 64; 6895} 6896 6897bool X86TargetLowering::isTruncateFree(MVT VT1, MVT VT2) const { 6898 if (!VT1.isInteger() || !VT2.isInteger()) 6899 return false; 6900 unsigned NumBits1 = VT1.getSizeInBits(); 6901 unsigned NumBits2 = VT2.getSizeInBits(); 6902 if (NumBits1 <= NumBits2) 6903 return false; 6904 return Subtarget->is64Bit() || NumBits1 < 64; 6905} 6906 6907bool X86TargetLowering::isZExtFree(const Type *Ty1, const Type *Ty2) const { 6908 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers. 6909 return Ty1 == Type::Int32Ty && Ty2 == Type::Int64Ty && Subtarget->is64Bit(); 6910} 6911 6912bool X86TargetLowering::isZExtFree(MVT VT1, MVT VT2) const { 6913 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers. 6914 return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget->is64Bit(); 6915} 6916 6917bool X86TargetLowering::isNarrowingProfitable(MVT VT1, MVT VT2) const { 6918 // i16 instructions are longer (0x66 prefix) and potentially slower. 6919 return !(VT1 == MVT::i32 && VT2 == MVT::i16); 6920} 6921 6922/// isShuffleMaskLegal - Targets can use this to indicate that they only 6923/// support *some* VECTOR_SHUFFLE operations, those with specific masks. 6924/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values 6925/// are assumed to be legal. 6926bool 6927X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M, 6928 MVT VT) const { 6929 // Only do shuffles on 128-bit vector types for now. 6930 if (VT.getSizeInBits() == 64) 6931 return false; 6932 6933 // FIXME: pshufb, blends, palignr, shifts. 6934 return (VT.getVectorNumElements() == 2 || 6935 ShuffleVectorSDNode::isSplatMask(&M[0], VT) || 6936 isMOVLMask(M, VT) || 6937 isSHUFPMask(M, VT) || 6938 isPSHUFDMask(M, VT) || 6939 isPSHUFHWMask(M, VT) || 6940 isPSHUFLWMask(M, VT) || 6941 isUNPCKLMask(M, VT) || 6942 isUNPCKHMask(M, VT) || 6943 isUNPCKL_v_undef_Mask(M, VT) || 6944 isUNPCKH_v_undef_Mask(M, VT)); 6945} 6946 6947bool 6948X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask, 6949 MVT VT) const { 6950 unsigned NumElts = VT.getVectorNumElements(); 6951 // FIXME: This collection of masks seems suspect. 6952 if (NumElts == 2) 6953 return true; 6954 if (NumElts == 4 && VT.getSizeInBits() == 128) { 6955 return (isMOVLMask(Mask, VT) || 6956 isCommutedMOVLMask(Mask, VT, true) || 6957 isSHUFPMask(Mask, VT) || 6958 isCommutedSHUFPMask(Mask, VT)); 6959 } 6960 return false; 6961} 6962 6963//===----------------------------------------------------------------------===// 6964// X86 Scheduler Hooks 6965//===----------------------------------------------------------------------===// 6966 6967// private utility function 6968MachineBasicBlock * 6969X86TargetLowering::EmitAtomicBitwiseWithCustomInserter(MachineInstr *bInstr, 6970 MachineBasicBlock *MBB, 6971 unsigned regOpc, 6972 unsigned immOpc, 6973 unsigned LoadOpc, 6974 unsigned CXchgOpc, 6975 unsigned copyOpc, 6976 unsigned notOpc, 6977 unsigned EAXreg, 6978 TargetRegisterClass *RC, 6979 bool invSrc) const { 6980 // For the atomic bitwise operator, we generate 6981 // thisMBB: 6982 // newMBB: 6983 // ld t1 = [bitinstr.addr] 6984 // op t2 = t1, [bitinstr.val] 6985 // mov EAX = t1 6986 // lcs dest = [bitinstr.addr], t2 [EAX is implicit] 6987 // bz newMBB 6988 // fallthrough -->nextMBB 6989 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 6990 const BasicBlock *LLVM_BB = MBB->getBasicBlock(); 6991 MachineFunction::iterator MBBIter = MBB; 6992 ++MBBIter; 6993 6994 /// First build the CFG 6995 MachineFunction *F = MBB->getParent(); 6996 MachineBasicBlock *thisMBB = MBB; 6997 MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB); 6998 MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB); 6999 F->insert(MBBIter, newMBB); 7000 F->insert(MBBIter, nextMBB); 7001 7002 // Move all successors to thisMBB to nextMBB 7003 nextMBB->transferSuccessors(thisMBB); 7004 7005 // Update thisMBB to fall through to newMBB 7006 thisMBB->addSuccessor(newMBB); 7007 7008 // newMBB jumps to itself and fall through to nextMBB 7009 newMBB->addSuccessor(nextMBB); 7010 newMBB->addSuccessor(newMBB); 7011 7012 // Insert instructions into newMBB based on incoming instruction 7013 assert(bInstr->getNumOperands() < X86AddrNumOperands + 4 && 7014 "unexpected number of operands"); 7015 DebugLoc dl = bInstr->getDebugLoc(); 7016 MachineOperand& destOper = bInstr->getOperand(0); 7017 MachineOperand* argOpers[2 + X86AddrNumOperands]; 7018 int numArgs = bInstr->getNumOperands() - 1; 7019 for (int i=0; i < numArgs; ++i) 7020 argOpers[i] = &bInstr->getOperand(i+1); 7021 7022 // x86 address has 4 operands: base, index, scale, and displacement 7023 int lastAddrIndx = X86AddrNumOperands - 1; // [0,3] 7024 int valArgIndx = lastAddrIndx + 1; 7025 7026 unsigned t1 = F->getRegInfo().createVirtualRegister(RC); 7027 MachineInstrBuilder MIB = BuildMI(newMBB, dl, TII->get(LoadOpc), t1); 7028 for (int i=0; i <= lastAddrIndx; ++i) 7029 (*MIB).addOperand(*argOpers[i]); 7030 7031 unsigned tt = F->getRegInfo().createVirtualRegister(RC); 7032 if (invSrc) { 7033 MIB = BuildMI(newMBB, dl, TII->get(notOpc), tt).addReg(t1); 7034 } 7035 else 7036 tt = t1; 7037 7038 unsigned t2 = F->getRegInfo().createVirtualRegister(RC); 7039 assert((argOpers[valArgIndx]->isReg() || 7040 argOpers[valArgIndx]->isImm()) && 7041 "invalid operand"); 7042 if (argOpers[valArgIndx]->isReg()) 7043 MIB = BuildMI(newMBB, dl, TII->get(regOpc), t2); 7044 else 7045 MIB = BuildMI(newMBB, dl, TII->get(immOpc), t2); 7046 MIB.addReg(tt); 7047 (*MIB).addOperand(*argOpers[valArgIndx]); 7048 7049 MIB = BuildMI(newMBB, dl, TII->get(copyOpc), EAXreg); 7050 MIB.addReg(t1); 7051 7052 MIB = BuildMI(newMBB, dl, TII->get(CXchgOpc)); 7053 for (int i=0; i <= lastAddrIndx; ++i) 7054 (*MIB).addOperand(*argOpers[i]); 7055 MIB.addReg(t2); 7056 assert(bInstr->hasOneMemOperand() && "Unexpected number of memoperand"); 7057 (*MIB).addMemOperand(*F, *bInstr->memoperands_begin()); 7058 7059 MIB = BuildMI(newMBB, dl, TII->get(copyOpc), destOper.getReg()); 7060 MIB.addReg(EAXreg); 7061 7062 // insert branch 7063 BuildMI(newMBB, dl, TII->get(X86::JNE)).addMBB(newMBB); 7064 7065 F->DeleteMachineInstr(bInstr); // The pseudo instruction is gone now. 7066 return nextMBB; 7067} 7068 7069// private utility function: 64 bit atomics on 32 bit host. 7070MachineBasicBlock * 7071X86TargetLowering::EmitAtomicBit6432WithCustomInserter(MachineInstr *bInstr, 7072 MachineBasicBlock *MBB, 7073 unsigned regOpcL, 7074 unsigned regOpcH, 7075 unsigned immOpcL, 7076 unsigned immOpcH, 7077 bool invSrc) const { 7078 // For the atomic bitwise operator, we generate 7079 // thisMBB (instructions are in pairs, except cmpxchg8b) 7080 // ld t1,t2 = [bitinstr.addr] 7081 // newMBB: 7082 // out1, out2 = phi (thisMBB, t1/t2) (newMBB, t3/t4) 7083 // op t5, t6 <- out1, out2, [bitinstr.val] 7084 // (for SWAP, substitute: mov t5, t6 <- [bitinstr.val]) 7085 // mov ECX, EBX <- t5, t6 7086 // mov EAX, EDX <- t1, t2 7087 // cmpxchg8b [bitinstr.addr] [EAX, EDX, EBX, ECX implicit] 7088 // mov t3, t4 <- EAX, EDX 7089 // bz newMBB 7090 // result in out1, out2 7091 // fallthrough -->nextMBB 7092 7093 const TargetRegisterClass *RC = X86::GR32RegisterClass; 7094 const unsigned LoadOpc = X86::MOV32rm; 7095 const unsigned copyOpc = X86::MOV32rr; 7096 const unsigned NotOpc = X86::NOT32r; 7097 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 7098 const BasicBlock *LLVM_BB = MBB->getBasicBlock(); 7099 MachineFunction::iterator MBBIter = MBB; 7100 ++MBBIter; 7101 7102 /// First build the CFG 7103 MachineFunction *F = MBB->getParent(); 7104 MachineBasicBlock *thisMBB = MBB; 7105 MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB); 7106 MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB); 7107 F->insert(MBBIter, newMBB); 7108 F->insert(MBBIter, nextMBB); 7109 7110 // Move all successors to thisMBB to nextMBB 7111 nextMBB->transferSuccessors(thisMBB); 7112 7113 // Update thisMBB to fall through to newMBB 7114 thisMBB->addSuccessor(newMBB); 7115 7116 // newMBB jumps to itself and fall through to nextMBB 7117 newMBB->addSuccessor(nextMBB); 7118 newMBB->addSuccessor(newMBB); 7119 7120 DebugLoc dl = bInstr->getDebugLoc(); 7121 // Insert instructions into newMBB based on incoming instruction 7122 // There are 8 "real" operands plus 9 implicit def/uses, ignored here. 7123 assert(bInstr->getNumOperands() < X86AddrNumOperands + 14 && 7124 "unexpected number of operands"); 7125 MachineOperand& dest1Oper = bInstr->getOperand(0); 7126 MachineOperand& dest2Oper = bInstr->getOperand(1); 7127 MachineOperand* argOpers[2 + X86AddrNumOperands]; 7128 for (int i=0; i < 2 + X86AddrNumOperands; ++i) 7129 argOpers[i] = &bInstr->getOperand(i+2); 7130 7131 // x86 address has 4 operands: base, index, scale, and displacement 7132 int lastAddrIndx = X86AddrNumOperands - 1; // [0,3] 7133 7134 unsigned t1 = F->getRegInfo().createVirtualRegister(RC); 7135 MachineInstrBuilder MIB = BuildMI(thisMBB, dl, TII->get(LoadOpc), t1); 7136 for (int i=0; i <= lastAddrIndx; ++i) 7137 (*MIB).addOperand(*argOpers[i]); 7138 unsigned t2 = F->getRegInfo().createVirtualRegister(RC); 7139 MIB = BuildMI(thisMBB, dl, TII->get(LoadOpc), t2); 7140 // add 4 to displacement. 7141 for (int i=0; i <= lastAddrIndx-2; ++i) 7142 (*MIB).addOperand(*argOpers[i]); 7143 MachineOperand newOp3 = *(argOpers[3]); 7144 if (newOp3.isImm()) 7145 newOp3.setImm(newOp3.getImm()+4); 7146 else 7147 newOp3.setOffset(newOp3.getOffset()+4); 7148 (*MIB).addOperand(newOp3); 7149 (*MIB).addOperand(*argOpers[lastAddrIndx]); 7150 7151 // t3/4 are defined later, at the bottom of the loop 7152 unsigned t3 = F->getRegInfo().createVirtualRegister(RC); 7153 unsigned t4 = F->getRegInfo().createVirtualRegister(RC); 7154 BuildMI(newMBB, dl, TII->get(X86::PHI), dest1Oper.getReg()) 7155 .addReg(t1).addMBB(thisMBB).addReg(t3).addMBB(newMBB); 7156 BuildMI(newMBB, dl, TII->get(X86::PHI), dest2Oper.getReg()) 7157 .addReg(t2).addMBB(thisMBB).addReg(t4).addMBB(newMBB); 7158 7159 unsigned tt1 = F->getRegInfo().createVirtualRegister(RC); 7160 unsigned tt2 = F->getRegInfo().createVirtualRegister(RC); 7161 if (invSrc) { 7162 MIB = BuildMI(newMBB, dl, TII->get(NotOpc), tt1).addReg(t1); 7163 MIB = BuildMI(newMBB, dl, TII->get(NotOpc), tt2).addReg(t2); 7164 } else { 7165 tt1 = t1; 7166 tt2 = t2; 7167 } 7168 7169 int valArgIndx = lastAddrIndx + 1; 7170 assert((argOpers[valArgIndx]->isReg() || 7171 argOpers[valArgIndx]->isImm()) && 7172 "invalid operand"); 7173 unsigned t5 = F->getRegInfo().createVirtualRegister(RC); 7174 unsigned t6 = F->getRegInfo().createVirtualRegister(RC); 7175 if (argOpers[valArgIndx]->isReg()) 7176 MIB = BuildMI(newMBB, dl, TII->get(regOpcL), t5); 7177 else 7178 MIB = BuildMI(newMBB, dl, TII->get(immOpcL), t5); 7179 if (regOpcL != X86::MOV32rr) 7180 MIB.addReg(tt1); 7181 (*MIB).addOperand(*argOpers[valArgIndx]); 7182 assert(argOpers[valArgIndx + 1]->isReg() == 7183 argOpers[valArgIndx]->isReg()); 7184 assert(argOpers[valArgIndx + 1]->isImm() == 7185 argOpers[valArgIndx]->isImm()); 7186 if (argOpers[valArgIndx + 1]->isReg()) 7187 MIB = BuildMI(newMBB, dl, TII->get(regOpcH), t6); 7188 else 7189 MIB = BuildMI(newMBB, dl, TII->get(immOpcH), t6); 7190 if (regOpcH != X86::MOV32rr) 7191 MIB.addReg(tt2); 7192 (*MIB).addOperand(*argOpers[valArgIndx + 1]); 7193 7194 MIB = BuildMI(newMBB, dl, TII->get(copyOpc), X86::EAX); 7195 MIB.addReg(t1); 7196 MIB = BuildMI(newMBB, dl, TII->get(copyOpc), X86::EDX); 7197 MIB.addReg(t2); 7198 7199 MIB = BuildMI(newMBB, dl, TII->get(copyOpc), X86::EBX); 7200 MIB.addReg(t5); 7201 MIB = BuildMI(newMBB, dl, TII->get(copyOpc), X86::ECX); 7202 MIB.addReg(t6); 7203 7204 MIB = BuildMI(newMBB, dl, TII->get(X86::LCMPXCHG8B)); 7205 for (int i=0; i <= lastAddrIndx; ++i) 7206 (*MIB).addOperand(*argOpers[i]); 7207 7208 assert(bInstr->hasOneMemOperand() && "Unexpected number of memoperand"); 7209 (*MIB).addMemOperand(*F, *bInstr->memoperands_begin()); 7210 7211 MIB = BuildMI(newMBB, dl, TII->get(copyOpc), t3); 7212 MIB.addReg(X86::EAX); 7213 MIB = BuildMI(newMBB, dl, TII->get(copyOpc), t4); 7214 MIB.addReg(X86::EDX); 7215 7216 // insert branch 7217 BuildMI(newMBB, dl, TII->get(X86::JNE)).addMBB(newMBB); 7218 7219 F->DeleteMachineInstr(bInstr); // The pseudo instruction is gone now. 7220 return nextMBB; 7221} 7222 7223// private utility function 7224MachineBasicBlock * 7225X86TargetLowering::EmitAtomicMinMaxWithCustomInserter(MachineInstr *mInstr, 7226 MachineBasicBlock *MBB, 7227 unsigned cmovOpc) const { 7228 // For the atomic min/max operator, we generate 7229 // thisMBB: 7230 // newMBB: 7231 // ld t1 = [min/max.addr] 7232 // mov t2 = [min/max.val] 7233 // cmp t1, t2 7234 // cmov[cond] t2 = t1 7235 // mov EAX = t1 7236 // lcs dest = [bitinstr.addr], t2 [EAX is implicit] 7237 // bz newMBB 7238 // fallthrough -->nextMBB 7239 // 7240 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 7241 const BasicBlock *LLVM_BB = MBB->getBasicBlock(); 7242 MachineFunction::iterator MBBIter = MBB; 7243 ++MBBIter; 7244 7245 /// First build the CFG 7246 MachineFunction *F = MBB->getParent(); 7247 MachineBasicBlock *thisMBB = MBB; 7248 MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB); 7249 MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB); 7250 F->insert(MBBIter, newMBB); 7251 F->insert(MBBIter, nextMBB); 7252 7253 // Move all successors to thisMBB to nextMBB 7254 nextMBB->transferSuccessors(thisMBB); 7255 7256 // Update thisMBB to fall through to newMBB 7257 thisMBB->addSuccessor(newMBB); 7258 7259 // newMBB jumps to newMBB and fall through to nextMBB 7260 newMBB->addSuccessor(nextMBB); 7261 newMBB->addSuccessor(newMBB); 7262 7263 DebugLoc dl = mInstr->getDebugLoc(); 7264 // Insert instructions into newMBB based on incoming instruction 7265 assert(mInstr->getNumOperands() < X86AddrNumOperands + 4 && 7266 "unexpected number of operands"); 7267 MachineOperand& destOper = mInstr->getOperand(0); 7268 MachineOperand* argOpers[2 + X86AddrNumOperands]; 7269 int numArgs = mInstr->getNumOperands() - 1; 7270 for (int i=0; i < numArgs; ++i) 7271 argOpers[i] = &mInstr->getOperand(i+1); 7272 7273 // x86 address has 4 operands: base, index, scale, and displacement 7274 int lastAddrIndx = X86AddrNumOperands - 1; // [0,3] 7275 int valArgIndx = lastAddrIndx + 1; 7276 7277 unsigned t1 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass); 7278 MachineInstrBuilder MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rm), t1); 7279 for (int i=0; i <= lastAddrIndx; ++i) 7280 (*MIB).addOperand(*argOpers[i]); 7281 7282 // We only support register and immediate values 7283 assert((argOpers[valArgIndx]->isReg() || 7284 argOpers[valArgIndx]->isImm()) && 7285 "invalid operand"); 7286 7287 unsigned t2 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass); 7288 if (argOpers[valArgIndx]->isReg()) 7289 MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rr), t2); 7290 else 7291 MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rr), t2); 7292 (*MIB).addOperand(*argOpers[valArgIndx]); 7293 7294 MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rr), X86::EAX); 7295 MIB.addReg(t1); 7296 7297 MIB = BuildMI(newMBB, dl, TII->get(X86::CMP32rr)); 7298 MIB.addReg(t1); 7299 MIB.addReg(t2); 7300 7301 // Generate movc 7302 unsigned t3 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass); 7303 MIB = BuildMI(newMBB, dl, TII->get(cmovOpc),t3); 7304 MIB.addReg(t2); 7305 MIB.addReg(t1); 7306 7307 // Cmp and exchange if none has modified the memory location 7308 MIB = BuildMI(newMBB, dl, TII->get(X86::LCMPXCHG32)); 7309 for (int i=0; i <= lastAddrIndx; ++i) 7310 (*MIB).addOperand(*argOpers[i]); 7311 MIB.addReg(t3); 7312 assert(mInstr->hasOneMemOperand() && "Unexpected number of memoperand"); 7313 (*MIB).addMemOperand(*F, *mInstr->memoperands_begin()); 7314 7315 MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rr), destOper.getReg()); 7316 MIB.addReg(X86::EAX); 7317 7318 // insert branch 7319 BuildMI(newMBB, dl, TII->get(X86::JNE)).addMBB(newMBB); 7320 7321 F->DeleteMachineInstr(mInstr); // The pseudo instruction is gone now. 7322 return nextMBB; 7323} 7324 7325 7326MachineBasicBlock * 7327X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, 7328 MachineBasicBlock *BB) const { 7329 DebugLoc dl = MI->getDebugLoc(); 7330 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 7331 switch (MI->getOpcode()) { 7332 default: assert(false && "Unexpected instr type to insert"); 7333 case X86::CMOV_V1I64: 7334 case X86::CMOV_FR32: 7335 case X86::CMOV_FR64: 7336 case X86::CMOV_V4F32: 7337 case X86::CMOV_V2F64: 7338 case X86::CMOV_V2I64: { 7339 // To "insert" a SELECT_CC instruction, we actually have to insert the 7340 // diamond control-flow pattern. The incoming instruction knows the 7341 // destination vreg to set, the condition code register to branch on, the 7342 // true/false values to select between, and a branch opcode to use. 7343 const BasicBlock *LLVM_BB = BB->getBasicBlock(); 7344 MachineFunction::iterator It = BB; 7345 ++It; 7346 7347 // thisMBB: 7348 // ... 7349 // TrueVal = ... 7350 // cmpTY ccX, r1, r2 7351 // bCC copy1MBB 7352 // fallthrough --> copy0MBB 7353 MachineBasicBlock *thisMBB = BB; 7354 MachineFunction *F = BB->getParent(); 7355 MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB); 7356 MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB); 7357 unsigned Opc = 7358 X86::GetCondBranchFromCond((X86::CondCode)MI->getOperand(3).getImm()); 7359 BuildMI(BB, dl, TII->get(Opc)).addMBB(sinkMBB); 7360 F->insert(It, copy0MBB); 7361 F->insert(It, sinkMBB); 7362 // Update machine-CFG edges by transferring all successors of the current 7363 // block to the new block which will contain the Phi node for the select. 7364 sinkMBB->transferSuccessors(BB); 7365 7366 // Add the true and fallthrough blocks as its successors. 7367 BB->addSuccessor(copy0MBB); 7368 BB->addSuccessor(sinkMBB); 7369 7370 // copy0MBB: 7371 // %FalseValue = ... 7372 // # fallthrough to sinkMBB 7373 BB = copy0MBB; 7374 7375 // Update machine-CFG edges 7376 BB->addSuccessor(sinkMBB); 7377 7378 // sinkMBB: 7379 // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ] 7380 // ... 7381 BB = sinkMBB; 7382 BuildMI(BB, dl, TII->get(X86::PHI), MI->getOperand(0).getReg()) 7383 .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB) 7384 .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB); 7385 7386 F->DeleteMachineInstr(MI); // The pseudo instruction is gone now. 7387 return BB; 7388 } 7389 7390 case X86::FP32_TO_INT16_IN_MEM: 7391 case X86::FP32_TO_INT32_IN_MEM: 7392 case X86::FP32_TO_INT64_IN_MEM: 7393 case X86::FP64_TO_INT16_IN_MEM: 7394 case X86::FP64_TO_INT32_IN_MEM: 7395 case X86::FP64_TO_INT64_IN_MEM: 7396 case X86::FP80_TO_INT16_IN_MEM: 7397 case X86::FP80_TO_INT32_IN_MEM: 7398 case X86::FP80_TO_INT64_IN_MEM: { 7399 // Change the floating point control register to use "round towards zero" 7400 // mode when truncating to an integer value. 7401 MachineFunction *F = BB->getParent(); 7402 int CWFrameIdx = F->getFrameInfo()->CreateStackObject(2, 2); 7403 addFrameReference(BuildMI(BB, dl, TII->get(X86::FNSTCW16m)), CWFrameIdx); 7404 7405 // Load the old value of the high byte of the control word... 7406 unsigned OldCW = 7407 F->getRegInfo().createVirtualRegister(X86::GR16RegisterClass); 7408 addFrameReference(BuildMI(BB, dl, TII->get(X86::MOV16rm), OldCW), 7409 CWFrameIdx); 7410 7411 // Set the high part to be round to zero... 7412 addFrameReference(BuildMI(BB, dl, TII->get(X86::MOV16mi)), CWFrameIdx) 7413 .addImm(0xC7F); 7414 7415 // Reload the modified control word now... 7416 addFrameReference(BuildMI(BB, dl, TII->get(X86::FLDCW16m)), CWFrameIdx); 7417 7418 // Restore the memory image of control word to original value 7419 addFrameReference(BuildMI(BB, dl, TII->get(X86::MOV16mr)), CWFrameIdx) 7420 .addReg(OldCW); 7421 7422 // Get the X86 opcode to use. 7423 unsigned Opc; 7424 switch (MI->getOpcode()) { 7425 default: assert(0 && "illegal opcode!"); 7426 case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break; 7427 case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break; 7428 case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break; 7429 case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break; 7430 case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break; 7431 case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break; 7432 case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break; 7433 case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break; 7434 case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break; 7435 } 7436 7437 X86AddressMode AM; 7438 MachineOperand &Op = MI->getOperand(0); 7439 if (Op.isReg()) { 7440 AM.BaseType = X86AddressMode::RegBase; 7441 AM.Base.Reg = Op.getReg(); 7442 } else { 7443 AM.BaseType = X86AddressMode::FrameIndexBase; 7444 AM.Base.FrameIndex = Op.getIndex(); 7445 } 7446 Op = MI->getOperand(1); 7447 if (Op.isImm()) 7448 AM.Scale = Op.getImm(); 7449 Op = MI->getOperand(2); 7450 if (Op.isImm()) 7451 AM.IndexReg = Op.getImm(); 7452 Op = MI->getOperand(3); 7453 if (Op.isGlobal()) { 7454 AM.GV = Op.getGlobal(); 7455 } else { 7456 AM.Disp = Op.getImm(); 7457 } 7458 addFullAddress(BuildMI(BB, dl, TII->get(Opc)), AM) 7459 .addReg(MI->getOperand(X86AddrNumOperands).getReg()); 7460 7461 // Reload the original control word now. 7462 addFrameReference(BuildMI(BB, dl, TII->get(X86::FLDCW16m)), CWFrameIdx); 7463 7464 F->DeleteMachineInstr(MI); // The pseudo instruction is gone now. 7465 return BB; 7466 } 7467 case X86::ATOMAND32: 7468 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND32rr, 7469 X86::AND32ri, X86::MOV32rm, 7470 X86::LCMPXCHG32, X86::MOV32rr, 7471 X86::NOT32r, X86::EAX, 7472 X86::GR32RegisterClass); 7473 case X86::ATOMOR32: 7474 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR32rr, 7475 X86::OR32ri, X86::MOV32rm, 7476 X86::LCMPXCHG32, X86::MOV32rr, 7477 X86::NOT32r, X86::EAX, 7478 X86::GR32RegisterClass); 7479 case X86::ATOMXOR32: 7480 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR32rr, 7481 X86::XOR32ri, X86::MOV32rm, 7482 X86::LCMPXCHG32, X86::MOV32rr, 7483 X86::NOT32r, X86::EAX, 7484 X86::GR32RegisterClass); 7485 case X86::ATOMNAND32: 7486 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND32rr, 7487 X86::AND32ri, X86::MOV32rm, 7488 X86::LCMPXCHG32, X86::MOV32rr, 7489 X86::NOT32r, X86::EAX, 7490 X86::GR32RegisterClass, true); 7491 case X86::ATOMMIN32: 7492 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL32rr); 7493 case X86::ATOMMAX32: 7494 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG32rr); 7495 case X86::ATOMUMIN32: 7496 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB32rr); 7497 case X86::ATOMUMAX32: 7498 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA32rr); 7499 7500 case X86::ATOMAND16: 7501 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND16rr, 7502 X86::AND16ri, X86::MOV16rm, 7503 X86::LCMPXCHG16, X86::MOV16rr, 7504 X86::NOT16r, X86::AX, 7505 X86::GR16RegisterClass); 7506 case X86::ATOMOR16: 7507 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR16rr, 7508 X86::OR16ri, X86::MOV16rm, 7509 X86::LCMPXCHG16, X86::MOV16rr, 7510 X86::NOT16r, X86::AX, 7511 X86::GR16RegisterClass); 7512 case X86::ATOMXOR16: 7513 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR16rr, 7514 X86::XOR16ri, X86::MOV16rm, 7515 X86::LCMPXCHG16, X86::MOV16rr, 7516 X86::NOT16r, X86::AX, 7517 X86::GR16RegisterClass); 7518 case X86::ATOMNAND16: 7519 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND16rr, 7520 X86::AND16ri, X86::MOV16rm, 7521 X86::LCMPXCHG16, X86::MOV16rr, 7522 X86::NOT16r, X86::AX, 7523 X86::GR16RegisterClass, true); 7524 case X86::ATOMMIN16: 7525 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL16rr); 7526 case X86::ATOMMAX16: 7527 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG16rr); 7528 case X86::ATOMUMIN16: 7529 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB16rr); 7530 case X86::ATOMUMAX16: 7531 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA16rr); 7532 7533 case X86::ATOMAND8: 7534 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND8rr, 7535 X86::AND8ri, X86::MOV8rm, 7536 X86::LCMPXCHG8, X86::MOV8rr, 7537 X86::NOT8r, X86::AL, 7538 X86::GR8RegisterClass); 7539 case X86::ATOMOR8: 7540 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR8rr, 7541 X86::OR8ri, X86::MOV8rm, 7542 X86::LCMPXCHG8, X86::MOV8rr, 7543 X86::NOT8r, X86::AL, 7544 X86::GR8RegisterClass); 7545 case X86::ATOMXOR8: 7546 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR8rr, 7547 X86::XOR8ri, X86::MOV8rm, 7548 X86::LCMPXCHG8, X86::MOV8rr, 7549 X86::NOT8r, X86::AL, 7550 X86::GR8RegisterClass); 7551 case X86::ATOMNAND8: 7552 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND8rr, 7553 X86::AND8ri, X86::MOV8rm, 7554 X86::LCMPXCHG8, X86::MOV8rr, 7555 X86::NOT8r, X86::AL, 7556 X86::GR8RegisterClass, true); 7557 // FIXME: There are no CMOV8 instructions; MIN/MAX need some other way. 7558 // This group is for 64-bit host. 7559 case X86::ATOMAND64: 7560 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND64rr, 7561 X86::AND64ri32, X86::MOV64rm, 7562 X86::LCMPXCHG64, X86::MOV64rr, 7563 X86::NOT64r, X86::RAX, 7564 X86::GR64RegisterClass); 7565 case X86::ATOMOR64: 7566 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR64rr, 7567 X86::OR64ri32, X86::MOV64rm, 7568 X86::LCMPXCHG64, X86::MOV64rr, 7569 X86::NOT64r, X86::RAX, 7570 X86::GR64RegisterClass); 7571 case X86::ATOMXOR64: 7572 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR64rr, 7573 X86::XOR64ri32, X86::MOV64rm, 7574 X86::LCMPXCHG64, X86::MOV64rr, 7575 X86::NOT64r, X86::RAX, 7576 X86::GR64RegisterClass); 7577 case X86::ATOMNAND64: 7578 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND64rr, 7579 X86::AND64ri32, X86::MOV64rm, 7580 X86::LCMPXCHG64, X86::MOV64rr, 7581 X86::NOT64r, X86::RAX, 7582 X86::GR64RegisterClass, true); 7583 case X86::ATOMMIN64: 7584 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL64rr); 7585 case X86::ATOMMAX64: 7586 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG64rr); 7587 case X86::ATOMUMIN64: 7588 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB64rr); 7589 case X86::ATOMUMAX64: 7590 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA64rr); 7591 7592 // This group does 64-bit operations on a 32-bit host. 7593 case X86::ATOMAND6432: 7594 return EmitAtomicBit6432WithCustomInserter(MI, BB, 7595 X86::AND32rr, X86::AND32rr, 7596 X86::AND32ri, X86::AND32ri, 7597 false); 7598 case X86::ATOMOR6432: 7599 return EmitAtomicBit6432WithCustomInserter(MI, BB, 7600 X86::OR32rr, X86::OR32rr, 7601 X86::OR32ri, X86::OR32ri, 7602 false); 7603 case X86::ATOMXOR6432: 7604 return EmitAtomicBit6432WithCustomInserter(MI, BB, 7605 X86::XOR32rr, X86::XOR32rr, 7606 X86::XOR32ri, X86::XOR32ri, 7607 false); 7608 case X86::ATOMNAND6432: 7609 return EmitAtomicBit6432WithCustomInserter(MI, BB, 7610 X86::AND32rr, X86::AND32rr, 7611 X86::AND32ri, X86::AND32ri, 7612 true); 7613 case X86::ATOMADD6432: 7614 return EmitAtomicBit6432WithCustomInserter(MI, BB, 7615 X86::ADD32rr, X86::ADC32rr, 7616 X86::ADD32ri, X86::ADC32ri, 7617 false); 7618 case X86::ATOMSUB6432: 7619 return EmitAtomicBit6432WithCustomInserter(MI, BB, 7620 X86::SUB32rr, X86::SBB32rr, 7621 X86::SUB32ri, X86::SBB32ri, 7622 false); 7623 case X86::ATOMSWAP6432: 7624 return EmitAtomicBit6432WithCustomInserter(MI, BB, 7625 X86::MOV32rr, X86::MOV32rr, 7626 X86::MOV32ri, X86::MOV32ri, 7627 false); 7628 } 7629} 7630 7631//===----------------------------------------------------------------------===// 7632// X86 Optimization Hooks 7633//===----------------------------------------------------------------------===// 7634 7635void X86TargetLowering::computeMaskedBitsForTargetNode(const SDValue Op, 7636 const APInt &Mask, 7637 APInt &KnownZero, 7638 APInt &KnownOne, 7639 const SelectionDAG &DAG, 7640 unsigned Depth) const { 7641 unsigned Opc = Op.getOpcode(); 7642 assert((Opc >= ISD::BUILTIN_OP_END || 7643 Opc == ISD::INTRINSIC_WO_CHAIN || 7644 Opc == ISD::INTRINSIC_W_CHAIN || 7645 Opc == ISD::INTRINSIC_VOID) && 7646 "Should use MaskedValueIsZero if you don't know whether Op" 7647 " is a target node!"); 7648 7649 KnownZero = KnownOne = APInt(Mask.getBitWidth(), 0); // Don't know anything. 7650 switch (Opc) { 7651 default: break; 7652 case X86ISD::ADD: 7653 case X86ISD::SUB: 7654 case X86ISD::SMUL: 7655 case X86ISD::UMUL: 7656 case X86ISD::INC: 7657 case X86ISD::DEC: 7658 // These nodes' second result is a boolean. 7659 if (Op.getResNo() == 0) 7660 break; 7661 // Fallthrough 7662 case X86ISD::SETCC: 7663 KnownZero |= APInt::getHighBitsSet(Mask.getBitWidth(), 7664 Mask.getBitWidth() - 1); 7665 break; 7666 } 7667} 7668 7669/// isGAPlusOffset - Returns true (and the GlobalValue and the offset) if the 7670/// node is a GlobalAddress + offset. 7671bool X86TargetLowering::isGAPlusOffset(SDNode *N, 7672 GlobalValue* &GA, int64_t &Offset) const{ 7673 if (N->getOpcode() == X86ISD::Wrapper) { 7674 if (isa<GlobalAddressSDNode>(N->getOperand(0))) { 7675 GA = cast<GlobalAddressSDNode>(N->getOperand(0))->getGlobal(); 7676 Offset = cast<GlobalAddressSDNode>(N->getOperand(0))->getOffset(); 7677 return true; 7678 } 7679 } 7680 return TargetLowering::isGAPlusOffset(N, GA, Offset); 7681} 7682 7683static bool isBaseAlignmentOfN(unsigned N, SDNode *Base, 7684 const TargetLowering &TLI) { 7685 GlobalValue *GV; 7686 int64_t Offset = 0; 7687 if (TLI.isGAPlusOffset(Base, GV, Offset)) 7688 return (GV->getAlignment() >= N && (Offset % N) == 0); 7689 // DAG combine handles the stack object case. 7690 return false; 7691} 7692 7693static bool EltsFromConsecutiveLoads(ShuffleVectorSDNode *N, unsigned NumElems,
| 1//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===// 2// 3// The LLVM Compiler Infrastructure 4// 5// This file is distributed under the University of Illinois Open Source 6// License. See LICENSE.TXT for details. 7// 8//===----------------------------------------------------------------------===// 9// 10// This file defines the interfaces that X86 uses to lower LLVM code into a 11// selection DAG. 12// 13//===----------------------------------------------------------------------===// 14 15#include "X86.h" 16#include "X86InstrBuilder.h" 17#include "X86ISelLowering.h" 18#include "X86TargetMachine.h" 19#include "llvm/CallingConv.h" 20#include "llvm/Constants.h" 21#include "llvm/DerivedTypes.h" 22#include "llvm/GlobalVariable.h" 23#include "llvm/Function.h" 24#include "llvm/Intrinsics.h" 25#include "llvm/ADT/BitVector.h" 26#include "llvm/ADT/VectorExtras.h" 27#include "llvm/CodeGen/MachineFrameInfo.h" 28#include "llvm/CodeGen/MachineFunction.h" 29#include "llvm/CodeGen/MachineInstrBuilder.h" 30#include "llvm/CodeGen/MachineModuleInfo.h" 31#include "llvm/CodeGen/MachineRegisterInfo.h" 32#include "llvm/CodeGen/PseudoSourceValue.h" 33#include "llvm/Support/MathExtras.h" 34#include "llvm/Support/Debug.h" 35#include "llvm/Target/TargetOptions.h" 36#include "llvm/ADT/SmallSet.h" 37#include "llvm/ADT/StringExtras.h" 38#include "llvm/Support/CommandLine.h" 39using namespace llvm; 40 41static cl::opt<bool> 42DisableMMX("disable-mmx", cl::Hidden, cl::desc("Disable use of MMX")); 43 44// Forward declarations. 45static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, MVT VT, SDValue V1, 46 SDValue V2); 47 48X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) 49 : TargetLowering(TM) { 50 Subtarget = &TM.getSubtarget<X86Subtarget>(); 51 X86ScalarSSEf64 = Subtarget->hasSSE2(); 52 X86ScalarSSEf32 = Subtarget->hasSSE1(); 53 X86StackPtr = Subtarget->is64Bit() ? X86::RSP : X86::ESP; 54 55 RegInfo = TM.getRegisterInfo(); 56 TD = getTargetData(); 57 58 // Set up the TargetLowering object. 59 60 // X86 is weird, it always uses i8 for shift amounts and setcc results. 61 setShiftAmountType(MVT::i8); 62 setBooleanContents(ZeroOrOneBooleanContent); 63 setSchedulingPreference(SchedulingForRegPressure); 64 setShiftAmountFlavor(Mask); // shl X, 32 == shl X, 0 65 setStackPointerRegisterToSaveRestore(X86StackPtr); 66 67 if (Subtarget->isTargetDarwin()) { 68 // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp. 69 setUseUnderscoreSetJmp(false); 70 setUseUnderscoreLongJmp(false); 71 } else if (Subtarget->isTargetMingw()) { 72 // MS runtime is weird: it exports _setjmp, but longjmp! 73 setUseUnderscoreSetJmp(true); 74 setUseUnderscoreLongJmp(false); 75 } else { 76 setUseUnderscoreSetJmp(true); 77 setUseUnderscoreLongJmp(true); 78 } 79 80 // Set up the register classes. 81 addRegisterClass(MVT::i8, X86::GR8RegisterClass); 82 addRegisterClass(MVT::i16, X86::GR16RegisterClass); 83 addRegisterClass(MVT::i32, X86::GR32RegisterClass); 84 if (Subtarget->is64Bit()) 85 addRegisterClass(MVT::i64, X86::GR64RegisterClass); 86 87 setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote); 88 89 // We don't accept any truncstore of integer registers. 90 setTruncStoreAction(MVT::i64, MVT::i32, Expand); 91 setTruncStoreAction(MVT::i64, MVT::i16, Expand); 92 setTruncStoreAction(MVT::i64, MVT::i8 , Expand); 93 setTruncStoreAction(MVT::i32, MVT::i16, Expand); 94 setTruncStoreAction(MVT::i32, MVT::i8 , Expand); 95 setTruncStoreAction(MVT::i16, MVT::i8, Expand); 96 97 // SETOEQ and SETUNE require checking two conditions. 98 setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand); 99 setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand); 100 setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand); 101 setCondCodeAction(ISD::SETUNE, MVT::f32, Expand); 102 setCondCodeAction(ISD::SETUNE, MVT::f64, Expand); 103 setCondCodeAction(ISD::SETUNE, MVT::f80, Expand); 104 105 // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this 106 // operation. 107 setOperationAction(ISD::UINT_TO_FP , MVT::i1 , Promote); 108 setOperationAction(ISD::UINT_TO_FP , MVT::i8 , Promote); 109 setOperationAction(ISD::UINT_TO_FP , MVT::i16 , Promote); 110 111 if (Subtarget->is64Bit()) { 112 setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Promote); 113 setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Expand); 114 } else if (!UseSoftFloat) { 115 if (X86ScalarSSEf64) { 116 // We have an impenetrably clever algorithm for ui64->double only. 117 setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom); 118 } 119 // We have an algorithm for SSE2, and we turn this into a 64-bit 120 // FILD for other targets. 121 setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Custom); 122 } 123 124 // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have 125 // this operation. 126 setOperationAction(ISD::SINT_TO_FP , MVT::i1 , Promote); 127 setOperationAction(ISD::SINT_TO_FP , MVT::i8 , Promote); 128 129 if (!UseSoftFloat) { 130 // SSE has no i16 to fp conversion, only i32 131 if (X86ScalarSSEf32) { 132 setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote); 133 // f32 and f64 cases are Legal, f80 case is not 134 setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom); 135 } else { 136 setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Custom); 137 setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom); 138 } 139 } else { 140 setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote); 141 setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Promote); 142 } 143 144 // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64 145 // are Legal, f80 is custom lowered. 146 setOperationAction(ISD::FP_TO_SINT , MVT::i64 , Custom); 147 setOperationAction(ISD::SINT_TO_FP , MVT::i64 , Custom); 148 149 // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have 150 // this operation. 151 setOperationAction(ISD::FP_TO_SINT , MVT::i1 , Promote); 152 setOperationAction(ISD::FP_TO_SINT , MVT::i8 , Promote); 153 154 if (X86ScalarSSEf32) { 155 setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Promote); 156 // f32 and f64 cases are Legal, f80 case is not 157 setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom); 158 } else { 159 setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Custom); 160 setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom); 161 } 162 163 // Handle FP_TO_UINT by promoting the destination to a larger signed 164 // conversion. 165 setOperationAction(ISD::FP_TO_UINT , MVT::i1 , Promote); 166 setOperationAction(ISD::FP_TO_UINT , MVT::i8 , Promote); 167 setOperationAction(ISD::FP_TO_UINT , MVT::i16 , Promote); 168 169 if (Subtarget->is64Bit()) { 170 setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Expand); 171 setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Promote); 172 } else if (!UseSoftFloat) { 173 if (X86ScalarSSEf32 && !Subtarget->hasSSE3()) 174 // Expand FP_TO_UINT into a select. 175 // FIXME: We would like to use a Custom expander here eventually to do 176 // the optimal thing for SSE vs. the default expansion in the legalizer. 177 setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Expand); 178 else 179 // With SSE3 we can use fisttpll to convert to a signed i64; without 180 // SSE, we're stuck with a fistpll. 181 setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Custom); 182 } 183 184 // TODO: when we have SSE, these could be more efficient, by using movd/movq. 185 if (!X86ScalarSSEf64) { 186 setOperationAction(ISD::BIT_CONVERT , MVT::f32 , Expand); 187 setOperationAction(ISD::BIT_CONVERT , MVT::i32 , Expand); 188 } 189 190 // Scalar integer divide and remainder are lowered to use operations that 191 // produce two results, to match the available instructions. This exposes 192 // the two-result form to trivial CSE, which is able to combine x/y and x%y 193 // into a single instruction. 194 // 195 // Scalar integer multiply-high is also lowered to use two-result 196 // operations, to match the available instructions. However, plain multiply 197 // (low) operations are left as Legal, as there are single-result 198 // instructions for this in x86. Using the two-result multiply instructions 199 // when both high and low results are needed must be arranged by dagcombine. 200 setOperationAction(ISD::MULHS , MVT::i8 , Expand); 201 setOperationAction(ISD::MULHU , MVT::i8 , Expand); 202 setOperationAction(ISD::SDIV , MVT::i8 , Expand); 203 setOperationAction(ISD::UDIV , MVT::i8 , Expand); 204 setOperationAction(ISD::SREM , MVT::i8 , Expand); 205 setOperationAction(ISD::UREM , MVT::i8 , Expand); 206 setOperationAction(ISD::MULHS , MVT::i16 , Expand); 207 setOperationAction(ISD::MULHU , MVT::i16 , Expand); 208 setOperationAction(ISD::SDIV , MVT::i16 , Expand); 209 setOperationAction(ISD::UDIV , MVT::i16 , Expand); 210 setOperationAction(ISD::SREM , MVT::i16 , Expand); 211 setOperationAction(ISD::UREM , MVT::i16 , Expand); 212 setOperationAction(ISD::MULHS , MVT::i32 , Expand); 213 setOperationAction(ISD::MULHU , MVT::i32 , Expand); 214 setOperationAction(ISD::SDIV , MVT::i32 , Expand); 215 setOperationAction(ISD::UDIV , MVT::i32 , Expand); 216 setOperationAction(ISD::SREM , MVT::i32 , Expand); 217 setOperationAction(ISD::UREM , MVT::i32 , Expand); 218 setOperationAction(ISD::MULHS , MVT::i64 , Expand); 219 setOperationAction(ISD::MULHU , MVT::i64 , Expand); 220 setOperationAction(ISD::SDIV , MVT::i64 , Expand); 221 setOperationAction(ISD::UDIV , MVT::i64 , Expand); 222 setOperationAction(ISD::SREM , MVT::i64 , Expand); 223 setOperationAction(ISD::UREM , MVT::i64 , Expand); 224 225 setOperationAction(ISD::BR_JT , MVT::Other, Expand); 226 setOperationAction(ISD::BRCOND , MVT::Other, Custom); 227 setOperationAction(ISD::BR_CC , MVT::Other, Expand); 228 setOperationAction(ISD::SELECT_CC , MVT::Other, Expand); 229 if (Subtarget->is64Bit()) 230 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal); 231 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16 , Legal); 232 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal); 233 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1 , Expand); 234 setOperationAction(ISD::FP_ROUND_INREG , MVT::f32 , Expand); 235 setOperationAction(ISD::FREM , MVT::f32 , Expand); 236 setOperationAction(ISD::FREM , MVT::f64 , Expand); 237 setOperationAction(ISD::FREM , MVT::f80 , Expand); 238 setOperationAction(ISD::FLT_ROUNDS_ , MVT::i32 , Custom); 239 240 setOperationAction(ISD::CTPOP , MVT::i8 , Expand); 241 setOperationAction(ISD::CTTZ , MVT::i8 , Custom); 242 setOperationAction(ISD::CTLZ , MVT::i8 , Custom); 243 setOperationAction(ISD::CTPOP , MVT::i16 , Expand); 244 setOperationAction(ISD::CTTZ , MVT::i16 , Custom); 245 setOperationAction(ISD::CTLZ , MVT::i16 , Custom); 246 setOperationAction(ISD::CTPOP , MVT::i32 , Expand); 247 setOperationAction(ISD::CTTZ , MVT::i32 , Custom); 248 setOperationAction(ISD::CTLZ , MVT::i32 , Custom); 249 if (Subtarget->is64Bit()) { 250 setOperationAction(ISD::CTPOP , MVT::i64 , Expand); 251 setOperationAction(ISD::CTTZ , MVT::i64 , Custom); 252 setOperationAction(ISD::CTLZ , MVT::i64 , Custom); 253 } 254 255 setOperationAction(ISD::READCYCLECOUNTER , MVT::i64 , Custom); 256 setOperationAction(ISD::BSWAP , MVT::i16 , Expand); 257 258 // These should be promoted to a larger select which is supported. 259 setOperationAction(ISD::SELECT , MVT::i1 , Promote); 260 setOperationAction(ISD::SELECT , MVT::i8 , Promote); 261 // X86 wants to expand cmov itself. 262 setOperationAction(ISD::SELECT , MVT::i16 , Custom); 263 setOperationAction(ISD::SELECT , MVT::i32 , Custom); 264 setOperationAction(ISD::SELECT , MVT::f32 , Custom); 265 setOperationAction(ISD::SELECT , MVT::f64 , Custom); 266 setOperationAction(ISD::SELECT , MVT::f80 , Custom); 267 setOperationAction(ISD::SETCC , MVT::i8 , Custom); 268 setOperationAction(ISD::SETCC , MVT::i16 , Custom); 269 setOperationAction(ISD::SETCC , MVT::i32 , Custom); 270 setOperationAction(ISD::SETCC , MVT::f32 , Custom); 271 setOperationAction(ISD::SETCC , MVT::f64 , Custom); 272 setOperationAction(ISD::SETCC , MVT::f80 , Custom); 273 if (Subtarget->is64Bit()) { 274 setOperationAction(ISD::SELECT , MVT::i64 , Custom); 275 setOperationAction(ISD::SETCC , MVT::i64 , Custom); 276 } 277 // X86 ret instruction may pop stack. 278 setOperationAction(ISD::RET , MVT::Other, Custom); 279 setOperationAction(ISD::EH_RETURN , MVT::Other, Custom); 280 281 // Darwin ABI issue. 282 setOperationAction(ISD::ConstantPool , MVT::i32 , Custom); 283 setOperationAction(ISD::JumpTable , MVT::i32 , Custom); 284 setOperationAction(ISD::GlobalAddress , MVT::i32 , Custom); 285 setOperationAction(ISD::GlobalTLSAddress, MVT::i32 , Custom); 286 if (Subtarget->is64Bit()) 287 setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom); 288 setOperationAction(ISD::ExternalSymbol , MVT::i32 , Custom); 289 if (Subtarget->is64Bit()) { 290 setOperationAction(ISD::ConstantPool , MVT::i64 , Custom); 291 setOperationAction(ISD::JumpTable , MVT::i64 , Custom); 292 setOperationAction(ISD::GlobalAddress , MVT::i64 , Custom); 293 setOperationAction(ISD::ExternalSymbol, MVT::i64 , Custom); 294 } 295 // 64-bit addm sub, shl, sra, srl (iff 32-bit x86) 296 setOperationAction(ISD::SHL_PARTS , MVT::i32 , Custom); 297 setOperationAction(ISD::SRA_PARTS , MVT::i32 , Custom); 298 setOperationAction(ISD::SRL_PARTS , MVT::i32 , Custom); 299 if (Subtarget->is64Bit()) { 300 setOperationAction(ISD::SHL_PARTS , MVT::i64 , Custom); 301 setOperationAction(ISD::SRA_PARTS , MVT::i64 , Custom); 302 setOperationAction(ISD::SRL_PARTS , MVT::i64 , Custom); 303 } 304 305 if (Subtarget->hasSSE1()) 306 setOperationAction(ISD::PREFETCH , MVT::Other, Legal); 307 308 if (!Subtarget->hasSSE2()) 309 setOperationAction(ISD::MEMBARRIER , MVT::Other, Expand); 310 311 // Expand certain atomics 312 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i8, Custom); 313 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i16, Custom); 314 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Custom); 315 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Custom); 316 317 setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i8, Custom); 318 setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i16, Custom); 319 setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, Custom); 320 setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom); 321 322 if (!Subtarget->is64Bit()) { 323 setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i64, Custom); 324 setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom); 325 setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i64, Custom); 326 setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i64, Custom); 327 setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i64, Custom); 328 setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i64, Custom); 329 setOperationAction(ISD::ATOMIC_SWAP, MVT::i64, Custom); 330 } 331 332 // Use the default ISD::DBG_STOPPOINT, ISD::DECLARE expansion. 333 setOperationAction(ISD::DBG_STOPPOINT, MVT::Other, Expand); 334 // FIXME - use subtarget debug flags 335 if (!Subtarget->isTargetDarwin() && 336 !Subtarget->isTargetELF() && 337 !Subtarget->isTargetCygMing()) { 338 setOperationAction(ISD::DBG_LABEL, MVT::Other, Expand); 339 setOperationAction(ISD::EH_LABEL, MVT::Other, Expand); 340 } 341 342 setOperationAction(ISD::EXCEPTIONADDR, MVT::i64, Expand); 343 setOperationAction(ISD::EHSELECTION, MVT::i64, Expand); 344 setOperationAction(ISD::EXCEPTIONADDR, MVT::i32, Expand); 345 setOperationAction(ISD::EHSELECTION, MVT::i32, Expand); 346 if (Subtarget->is64Bit()) { 347 setExceptionPointerRegister(X86::RAX); 348 setExceptionSelectorRegister(X86::RDX); 349 } else { 350 setExceptionPointerRegister(X86::EAX); 351 setExceptionSelectorRegister(X86::EDX); 352 } 353 setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom); 354 setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom); 355 356 setOperationAction(ISD::TRAMPOLINE, MVT::Other, Custom); 357 358 setOperationAction(ISD::TRAP, MVT::Other, Legal); 359 360 // VASTART needs to be custom lowered to use the VarArgsFrameIndex 361 setOperationAction(ISD::VASTART , MVT::Other, Custom); 362 setOperationAction(ISD::VAEND , MVT::Other, Expand); 363 if (Subtarget->is64Bit()) { 364 setOperationAction(ISD::VAARG , MVT::Other, Custom); 365 setOperationAction(ISD::VACOPY , MVT::Other, Custom); 366 } else { 367 setOperationAction(ISD::VAARG , MVT::Other, Expand); 368 setOperationAction(ISD::VACOPY , MVT::Other, Expand); 369 } 370 371 setOperationAction(ISD::STACKSAVE, MVT::Other, Expand); 372 setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand); 373 if (Subtarget->is64Bit()) 374 setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand); 375 if (Subtarget->isTargetCygMing()) 376 setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom); 377 else 378 setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand); 379 380 if (!UseSoftFloat && X86ScalarSSEf64) { 381 // f32 and f64 use SSE. 382 // Set up the FP register classes. 383 addRegisterClass(MVT::f32, X86::FR32RegisterClass); 384 addRegisterClass(MVT::f64, X86::FR64RegisterClass); 385 386 // Use ANDPD to simulate FABS. 387 setOperationAction(ISD::FABS , MVT::f64, Custom); 388 setOperationAction(ISD::FABS , MVT::f32, Custom); 389 390 // Use XORP to simulate FNEG. 391 setOperationAction(ISD::FNEG , MVT::f64, Custom); 392 setOperationAction(ISD::FNEG , MVT::f32, Custom); 393 394 // Use ANDPD and ORPD to simulate FCOPYSIGN. 395 setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom); 396 setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom); 397 398 // We don't support sin/cos/fmod 399 setOperationAction(ISD::FSIN , MVT::f64, Expand); 400 setOperationAction(ISD::FCOS , MVT::f64, Expand); 401 setOperationAction(ISD::FSIN , MVT::f32, Expand); 402 setOperationAction(ISD::FCOS , MVT::f32, Expand); 403 404 // Expand FP immediates into loads from the stack, except for the special 405 // cases we handle. 406 addLegalFPImmediate(APFloat(+0.0)); // xorpd 407 addLegalFPImmediate(APFloat(+0.0f)); // xorps 408 } else if (!UseSoftFloat && X86ScalarSSEf32) { 409 // Use SSE for f32, x87 for f64. 410 // Set up the FP register classes. 411 addRegisterClass(MVT::f32, X86::FR32RegisterClass); 412 addRegisterClass(MVT::f64, X86::RFP64RegisterClass); 413 414 // Use ANDPS to simulate FABS. 415 setOperationAction(ISD::FABS , MVT::f32, Custom); 416 417 // Use XORP to simulate FNEG. 418 setOperationAction(ISD::FNEG , MVT::f32, Custom); 419 420 setOperationAction(ISD::UNDEF, MVT::f64, Expand); 421 422 // Use ANDPS and ORPS to simulate FCOPYSIGN. 423 setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand); 424 setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom); 425 426 // We don't support sin/cos/fmod 427 setOperationAction(ISD::FSIN , MVT::f32, Expand); 428 setOperationAction(ISD::FCOS , MVT::f32, Expand); 429 430 // Special cases we handle for FP constants. 431 addLegalFPImmediate(APFloat(+0.0f)); // xorps 432 addLegalFPImmediate(APFloat(+0.0)); // FLD0 433 addLegalFPImmediate(APFloat(+1.0)); // FLD1 434 addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS 435 addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS 436 437 if (!UnsafeFPMath) { 438 setOperationAction(ISD::FSIN , MVT::f64 , Expand); 439 setOperationAction(ISD::FCOS , MVT::f64 , Expand); 440 } 441 } else if (!UseSoftFloat) { 442 // f32 and f64 in x87. 443 // Set up the FP register classes. 444 addRegisterClass(MVT::f64, X86::RFP64RegisterClass); 445 addRegisterClass(MVT::f32, X86::RFP32RegisterClass); 446 447 setOperationAction(ISD::UNDEF, MVT::f64, Expand); 448 setOperationAction(ISD::UNDEF, MVT::f32, Expand); 449 setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand); 450 setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand); 451 452 if (!UnsafeFPMath) { 453 setOperationAction(ISD::FSIN , MVT::f64 , Expand); 454 setOperationAction(ISD::FCOS , MVT::f64 , Expand); 455 } 456 addLegalFPImmediate(APFloat(+0.0)); // FLD0 457 addLegalFPImmediate(APFloat(+1.0)); // FLD1 458 addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS 459 addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS 460 addLegalFPImmediate(APFloat(+0.0f)); // FLD0 461 addLegalFPImmediate(APFloat(+1.0f)); // FLD1 462 addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS 463 addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS 464 } 465 466 // Long double always uses X87. 467 if (!UseSoftFloat) { 468 addRegisterClass(MVT::f80, X86::RFP80RegisterClass); 469 setOperationAction(ISD::UNDEF, MVT::f80, Expand); 470 setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand); 471 { 472 bool ignored; 473 APFloat TmpFlt(+0.0); 474 TmpFlt.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven, 475 &ignored); 476 addLegalFPImmediate(TmpFlt); // FLD0 477 TmpFlt.changeSign(); 478 addLegalFPImmediate(TmpFlt); // FLD0/FCHS 479 APFloat TmpFlt2(+1.0); 480 TmpFlt2.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven, 481 &ignored); 482 addLegalFPImmediate(TmpFlt2); // FLD1 483 TmpFlt2.changeSign(); 484 addLegalFPImmediate(TmpFlt2); // FLD1/FCHS 485 } 486 487 if (!UnsafeFPMath) { 488 setOperationAction(ISD::FSIN , MVT::f80 , Expand); 489 setOperationAction(ISD::FCOS , MVT::f80 , Expand); 490 } 491 } 492 493 // Always use a library call for pow. 494 setOperationAction(ISD::FPOW , MVT::f32 , Expand); 495 setOperationAction(ISD::FPOW , MVT::f64 , Expand); 496 setOperationAction(ISD::FPOW , MVT::f80 , Expand); 497 498 setOperationAction(ISD::FLOG, MVT::f80, Expand); 499 setOperationAction(ISD::FLOG2, MVT::f80, Expand); 500 setOperationAction(ISD::FLOG10, MVT::f80, Expand); 501 setOperationAction(ISD::FEXP, MVT::f80, Expand); 502 setOperationAction(ISD::FEXP2, MVT::f80, Expand); 503 504 // First set operation action for all vector types to either promote 505 // (for widening) or expand (for scalarization). Then we will selectively 506 // turn on ones that can be effectively codegen'd. 507 for (unsigned VT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE; 508 VT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++VT) { 509 setOperationAction(ISD::ADD , (MVT::SimpleValueType)VT, Expand); 510 setOperationAction(ISD::SUB , (MVT::SimpleValueType)VT, Expand); 511 setOperationAction(ISD::FADD, (MVT::SimpleValueType)VT, Expand); 512 setOperationAction(ISD::FNEG, (MVT::SimpleValueType)VT, Expand); 513 setOperationAction(ISD::FSUB, (MVT::SimpleValueType)VT, Expand); 514 setOperationAction(ISD::MUL , (MVT::SimpleValueType)VT, Expand); 515 setOperationAction(ISD::FMUL, (MVT::SimpleValueType)VT, Expand); 516 setOperationAction(ISD::SDIV, (MVT::SimpleValueType)VT, Expand); 517 setOperationAction(ISD::UDIV, (MVT::SimpleValueType)VT, Expand); 518 setOperationAction(ISD::FDIV, (MVT::SimpleValueType)VT, Expand); 519 setOperationAction(ISD::SREM, (MVT::SimpleValueType)VT, Expand); 520 setOperationAction(ISD::UREM, (MVT::SimpleValueType)VT, Expand); 521 setOperationAction(ISD::LOAD, (MVT::SimpleValueType)VT, Expand); 522 setOperationAction(ISD::VECTOR_SHUFFLE, (MVT::SimpleValueType)VT, Expand); 523 setOperationAction(ISD::EXTRACT_VECTOR_ELT,(MVT::SimpleValueType)VT,Expand); 524 setOperationAction(ISD::EXTRACT_SUBVECTOR,(MVT::SimpleValueType)VT,Expand); 525 setOperationAction(ISD::INSERT_VECTOR_ELT,(MVT::SimpleValueType)VT, Expand); 526 setOperationAction(ISD::FABS, (MVT::SimpleValueType)VT, Expand); 527 setOperationAction(ISD::FSIN, (MVT::SimpleValueType)VT, Expand); 528 setOperationAction(ISD::FCOS, (MVT::SimpleValueType)VT, Expand); 529 setOperationAction(ISD::FREM, (MVT::SimpleValueType)VT, Expand); 530 setOperationAction(ISD::FPOWI, (MVT::SimpleValueType)VT, Expand); 531 setOperationAction(ISD::FSQRT, (MVT::SimpleValueType)VT, Expand); 532 setOperationAction(ISD::FCOPYSIGN, (MVT::SimpleValueType)VT, Expand); 533 setOperationAction(ISD::SMUL_LOHI, (MVT::SimpleValueType)VT, Expand); 534 setOperationAction(ISD::UMUL_LOHI, (MVT::SimpleValueType)VT, Expand); 535 setOperationAction(ISD::SDIVREM, (MVT::SimpleValueType)VT, Expand); 536 setOperationAction(ISD::UDIVREM, (MVT::SimpleValueType)VT, Expand); 537 setOperationAction(ISD::FPOW, (MVT::SimpleValueType)VT, Expand); 538 setOperationAction(ISD::CTPOP, (MVT::SimpleValueType)VT, Expand); 539 setOperationAction(ISD::CTTZ, (MVT::SimpleValueType)VT, Expand); 540 setOperationAction(ISD::CTLZ, (MVT::SimpleValueType)VT, Expand); 541 setOperationAction(ISD::SHL, (MVT::SimpleValueType)VT, Expand); 542 setOperationAction(ISD::SRA, (MVT::SimpleValueType)VT, Expand); 543 setOperationAction(ISD::SRL, (MVT::SimpleValueType)VT, Expand); 544 setOperationAction(ISD::ROTL, (MVT::SimpleValueType)VT, Expand); 545 setOperationAction(ISD::ROTR, (MVT::SimpleValueType)VT, Expand); 546 setOperationAction(ISD::BSWAP, (MVT::SimpleValueType)VT, Expand); 547 setOperationAction(ISD::VSETCC, (MVT::SimpleValueType)VT, Expand); 548 setOperationAction(ISD::FLOG, (MVT::SimpleValueType)VT, Expand); 549 setOperationAction(ISD::FLOG2, (MVT::SimpleValueType)VT, Expand); 550 setOperationAction(ISD::FLOG10, (MVT::SimpleValueType)VT, Expand); 551 setOperationAction(ISD::FEXP, (MVT::SimpleValueType)VT, Expand); 552 setOperationAction(ISD::FEXP2, (MVT::SimpleValueType)VT, Expand); 553 setOperationAction(ISD::FP_TO_UINT, (MVT::SimpleValueType)VT, Expand); 554 setOperationAction(ISD::FP_TO_SINT, (MVT::SimpleValueType)VT, Expand); 555 setOperationAction(ISD::UINT_TO_FP, (MVT::SimpleValueType)VT, Expand); 556 setOperationAction(ISD::SINT_TO_FP, (MVT::SimpleValueType)VT, Expand); 557 } 558 559 // FIXME: In order to prevent SSE instructions being expanded to MMX ones 560 // with -msoft-float, disable use of MMX as well. 561 if (!UseSoftFloat && !DisableMMX && Subtarget->hasMMX()) { 562 addRegisterClass(MVT::v8i8, X86::VR64RegisterClass); 563 addRegisterClass(MVT::v4i16, X86::VR64RegisterClass); 564 addRegisterClass(MVT::v2i32, X86::VR64RegisterClass); 565 addRegisterClass(MVT::v2f32, X86::VR64RegisterClass); 566 addRegisterClass(MVT::v1i64, X86::VR64RegisterClass); 567 568 setOperationAction(ISD::ADD, MVT::v8i8, Legal); 569 setOperationAction(ISD::ADD, MVT::v4i16, Legal); 570 setOperationAction(ISD::ADD, MVT::v2i32, Legal); 571 setOperationAction(ISD::ADD, MVT::v1i64, Legal); 572 573 setOperationAction(ISD::SUB, MVT::v8i8, Legal); 574 setOperationAction(ISD::SUB, MVT::v4i16, Legal); 575 setOperationAction(ISD::SUB, MVT::v2i32, Legal); 576 setOperationAction(ISD::SUB, MVT::v1i64, Legal); 577 578 setOperationAction(ISD::MULHS, MVT::v4i16, Legal); 579 setOperationAction(ISD::MUL, MVT::v4i16, Legal); 580 581 setOperationAction(ISD::AND, MVT::v8i8, Promote); 582 AddPromotedToType (ISD::AND, MVT::v8i8, MVT::v1i64); 583 setOperationAction(ISD::AND, MVT::v4i16, Promote); 584 AddPromotedToType (ISD::AND, MVT::v4i16, MVT::v1i64); 585 setOperationAction(ISD::AND, MVT::v2i32, Promote); 586 AddPromotedToType (ISD::AND, MVT::v2i32, MVT::v1i64); 587 setOperationAction(ISD::AND, MVT::v1i64, Legal); 588 589 setOperationAction(ISD::OR, MVT::v8i8, Promote); 590 AddPromotedToType (ISD::OR, MVT::v8i8, MVT::v1i64); 591 setOperationAction(ISD::OR, MVT::v4i16, Promote); 592 AddPromotedToType (ISD::OR, MVT::v4i16, MVT::v1i64); 593 setOperationAction(ISD::OR, MVT::v2i32, Promote); 594 AddPromotedToType (ISD::OR, MVT::v2i32, MVT::v1i64); 595 setOperationAction(ISD::OR, MVT::v1i64, Legal); 596 597 setOperationAction(ISD::XOR, MVT::v8i8, Promote); 598 AddPromotedToType (ISD::XOR, MVT::v8i8, MVT::v1i64); 599 setOperationAction(ISD::XOR, MVT::v4i16, Promote); 600 AddPromotedToType (ISD::XOR, MVT::v4i16, MVT::v1i64); 601 setOperationAction(ISD::XOR, MVT::v2i32, Promote); 602 AddPromotedToType (ISD::XOR, MVT::v2i32, MVT::v1i64); 603 setOperationAction(ISD::XOR, MVT::v1i64, Legal); 604 605 setOperationAction(ISD::LOAD, MVT::v8i8, Promote); 606 AddPromotedToType (ISD::LOAD, MVT::v8i8, MVT::v1i64); 607 setOperationAction(ISD::LOAD, MVT::v4i16, Promote); 608 AddPromotedToType (ISD::LOAD, MVT::v4i16, MVT::v1i64); 609 setOperationAction(ISD::LOAD, MVT::v2i32, Promote); 610 AddPromotedToType (ISD::LOAD, MVT::v2i32, MVT::v1i64); 611 setOperationAction(ISD::LOAD, MVT::v2f32, Promote); 612 AddPromotedToType (ISD::LOAD, MVT::v2f32, MVT::v1i64); 613 setOperationAction(ISD::LOAD, MVT::v1i64, Legal); 614 615 setOperationAction(ISD::BUILD_VECTOR, MVT::v8i8, Custom); 616 setOperationAction(ISD::BUILD_VECTOR, MVT::v4i16, Custom); 617 setOperationAction(ISD::BUILD_VECTOR, MVT::v2i32, Custom); 618 setOperationAction(ISD::BUILD_VECTOR, MVT::v2f32, Custom); 619 setOperationAction(ISD::BUILD_VECTOR, MVT::v1i64, Custom); 620 621 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i8, Custom); 622 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i16, Custom); 623 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i32, Custom); 624 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v1i64, Custom); 625 626 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2f32, Custom); 627 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i8, Custom); 628 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i16, Custom); 629 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v1i64, Custom); 630 631 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i16, Custom); 632 633 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Expand); 634 setOperationAction(ISD::TRUNCATE, MVT::v8i8, Expand); 635 setOperationAction(ISD::SELECT, MVT::v8i8, Promote); 636 setOperationAction(ISD::SELECT, MVT::v4i16, Promote); 637 setOperationAction(ISD::SELECT, MVT::v2i32, Promote); 638 setOperationAction(ISD::SELECT, MVT::v1i64, Custom); 639 } 640 641 if (!UseSoftFloat && Subtarget->hasSSE1()) { 642 addRegisterClass(MVT::v4f32, X86::VR128RegisterClass); 643 644 setOperationAction(ISD::FADD, MVT::v4f32, Legal); 645 setOperationAction(ISD::FSUB, MVT::v4f32, Legal); 646 setOperationAction(ISD::FMUL, MVT::v4f32, Legal); 647 setOperationAction(ISD::FDIV, MVT::v4f32, Legal); 648 setOperationAction(ISD::FSQRT, MVT::v4f32, Legal); 649 setOperationAction(ISD::FNEG, MVT::v4f32, Custom); 650 setOperationAction(ISD::LOAD, MVT::v4f32, Legal); 651 setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom); 652 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom); 653 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom); 654 setOperationAction(ISD::SELECT, MVT::v4f32, Custom); 655 setOperationAction(ISD::VSETCC, MVT::v4f32, Custom); 656 } 657 658 if (!UseSoftFloat && Subtarget->hasSSE2()) { 659 addRegisterClass(MVT::v2f64, X86::VR128RegisterClass); 660 661 // FIXME: Unfortunately -soft-float and -no-implicit-float means XMM 662 // registers cannot be used even for integer operations. 663 addRegisterClass(MVT::v16i8, X86::VR128RegisterClass); 664 addRegisterClass(MVT::v8i16, X86::VR128RegisterClass); 665 addRegisterClass(MVT::v4i32, X86::VR128RegisterClass); 666 addRegisterClass(MVT::v2i64, X86::VR128RegisterClass); 667 668 setOperationAction(ISD::ADD, MVT::v16i8, Legal); 669 setOperationAction(ISD::ADD, MVT::v8i16, Legal); 670 setOperationAction(ISD::ADD, MVT::v4i32, Legal); 671 setOperationAction(ISD::ADD, MVT::v2i64, Legal); 672 setOperationAction(ISD::MUL, MVT::v2i64, Custom); 673 setOperationAction(ISD::SUB, MVT::v16i8, Legal); 674 setOperationAction(ISD::SUB, MVT::v8i16, Legal); 675 setOperationAction(ISD::SUB, MVT::v4i32, Legal); 676 setOperationAction(ISD::SUB, MVT::v2i64, Legal); 677 setOperationAction(ISD::MUL, MVT::v8i16, Legal); 678 setOperationAction(ISD::FADD, MVT::v2f64, Legal); 679 setOperationAction(ISD::FSUB, MVT::v2f64, Legal); 680 setOperationAction(ISD::FMUL, MVT::v2f64, Legal); 681 setOperationAction(ISD::FDIV, MVT::v2f64, Legal); 682 setOperationAction(ISD::FSQRT, MVT::v2f64, Legal); 683 setOperationAction(ISD::FNEG, MVT::v2f64, Custom); 684 685 setOperationAction(ISD::VSETCC, MVT::v2f64, Custom); 686 setOperationAction(ISD::VSETCC, MVT::v16i8, Custom); 687 setOperationAction(ISD::VSETCC, MVT::v8i16, Custom); 688 setOperationAction(ISD::VSETCC, MVT::v4i32, Custom); 689 690 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16i8, Custom); 691 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i16, Custom); 692 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom); 693 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom); 694 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom); 695 696 // Custom lower build_vector, vector_shuffle, and extract_vector_elt. 697 for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v2i64; ++i) { 698 MVT VT = (MVT::SimpleValueType)i; 699 // Do not attempt to custom lower non-power-of-2 vectors 700 if (!isPowerOf2_32(VT.getVectorNumElements())) 701 continue; 702 setOperationAction(ISD::BUILD_VECTOR, VT, Custom); 703 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); 704 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); 705 } 706 707 setOperationAction(ISD::BUILD_VECTOR, MVT::v2f64, Custom); 708 setOperationAction(ISD::BUILD_VECTOR, MVT::v2i64, Custom); 709 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f64, Custom); 710 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i64, Custom); 711 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f64, Custom); 712 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom); 713 714 if (Subtarget->is64Bit()) { 715 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Custom); 716 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom); 717 } 718 719 // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64. 720 for (unsigned VT = (unsigned)MVT::v16i8; VT != (unsigned)MVT::v2i64; VT++) { 721 setOperationAction(ISD::AND, (MVT::SimpleValueType)VT, Promote); 722 AddPromotedToType (ISD::AND, (MVT::SimpleValueType)VT, MVT::v2i64); 723 setOperationAction(ISD::OR, (MVT::SimpleValueType)VT, Promote); 724 AddPromotedToType (ISD::OR, (MVT::SimpleValueType)VT, MVT::v2i64); 725 setOperationAction(ISD::XOR, (MVT::SimpleValueType)VT, Promote); 726 AddPromotedToType (ISD::XOR, (MVT::SimpleValueType)VT, MVT::v2i64); 727 setOperationAction(ISD::LOAD, (MVT::SimpleValueType)VT, Promote); 728 AddPromotedToType (ISD::LOAD, (MVT::SimpleValueType)VT, MVT::v2i64); 729 setOperationAction(ISD::SELECT, (MVT::SimpleValueType)VT, Promote); 730 AddPromotedToType (ISD::SELECT, (MVT::SimpleValueType)VT, MVT::v2i64); 731 } 732 733 setTruncStoreAction(MVT::f64, MVT::f32, Expand); 734 735 // Custom lower v2i64 and v2f64 selects. 736 setOperationAction(ISD::LOAD, MVT::v2f64, Legal); 737 setOperationAction(ISD::LOAD, MVT::v2i64, Legal); 738 setOperationAction(ISD::SELECT, MVT::v2f64, Custom); 739 setOperationAction(ISD::SELECT, MVT::v2i64, Custom); 740 741 setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal); 742 setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal); 743 if (!DisableMMX && Subtarget->hasMMX()) { 744 setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Custom); 745 setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom); 746 } 747 } 748 749 if (Subtarget->hasSSE41()) { 750 // FIXME: Do we need to handle scalar-to-vector here? 751 setOperationAction(ISD::MUL, MVT::v4i32, Legal); 752 753 // i8 and i16 vectors are custom , because the source register and source 754 // source memory operand types are not the same width. f32 vectors are 755 // custom since the immediate controlling the insert encodes additional 756 // information. 757 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom); 758 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom); 759 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom); 760 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom); 761 762 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i8, Custom); 763 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Custom); 764 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom); 765 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom); 766 767 if (Subtarget->is64Bit()) { 768 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Legal); 769 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Legal); 770 } 771 } 772 773 if (Subtarget->hasSSE42()) { 774 setOperationAction(ISD::VSETCC, MVT::v2i64, Custom); 775 } 776 777 // We want to custom lower some of our intrinsics. 778 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); 779 780 // Add/Sub/Mul with overflow operations are custom lowered. 781 setOperationAction(ISD::SADDO, MVT::i32, Custom); 782 setOperationAction(ISD::SADDO, MVT::i64, Custom); 783 setOperationAction(ISD::UADDO, MVT::i32, Custom); 784 setOperationAction(ISD::UADDO, MVT::i64, Custom); 785 setOperationAction(ISD::SSUBO, MVT::i32, Custom); 786 setOperationAction(ISD::SSUBO, MVT::i64, Custom); 787 setOperationAction(ISD::USUBO, MVT::i32, Custom); 788 setOperationAction(ISD::USUBO, MVT::i64, Custom); 789 setOperationAction(ISD::SMULO, MVT::i32, Custom); 790 setOperationAction(ISD::SMULO, MVT::i64, Custom); 791 setOperationAction(ISD::UMULO, MVT::i32, Custom); 792 setOperationAction(ISD::UMULO, MVT::i64, Custom); 793 794 if (!Subtarget->is64Bit()) { 795 // These libcalls are not available in 32-bit. 796 setLibcallName(RTLIB::SHL_I128, 0); 797 setLibcallName(RTLIB::SRL_I128, 0); 798 setLibcallName(RTLIB::SRA_I128, 0); 799 } 800 801 // We have target-specific dag combine patterns for the following nodes: 802 setTargetDAGCombine(ISD::VECTOR_SHUFFLE); 803 setTargetDAGCombine(ISD::BUILD_VECTOR); 804 setTargetDAGCombine(ISD::SELECT); 805 setTargetDAGCombine(ISD::SHL); 806 setTargetDAGCombine(ISD::SRA); 807 setTargetDAGCombine(ISD::SRL); 808 setTargetDAGCombine(ISD::STORE); 809 if (Subtarget->is64Bit()) 810 setTargetDAGCombine(ISD::MUL); 811 812 computeRegisterProperties(); 813 814 // FIXME: These should be based on subtarget info. Plus, the values should 815 // be smaller when we are in optimizing for size mode. 816 maxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores 817 maxStoresPerMemcpy = 16; // For @llvm.memcpy -> sequence of stores 818 maxStoresPerMemmove = 3; // For @llvm.memmove -> sequence of stores 819 allowUnalignedMemoryAccesses = true; // x86 supports it! 820 setPrefLoopAlignment(16); 821 benefitFromCodePlacementOpt = true; 822} 823 824 825MVT X86TargetLowering::getSetCCResultType(MVT VT) const { 826 return MVT::i8; 827} 828 829 830/// getMaxByValAlign - Helper for getByValTypeAlignment to determine 831/// the desired ByVal argument alignment. 832static void getMaxByValAlign(const Type *Ty, unsigned &MaxAlign) { 833 if (MaxAlign == 16) 834 return; 835 if (const VectorType *VTy = dyn_cast<VectorType>(Ty)) { 836 if (VTy->getBitWidth() == 128) 837 MaxAlign = 16; 838 } else if (const ArrayType *ATy = dyn_cast<ArrayType>(Ty)) { 839 unsigned EltAlign = 0; 840 getMaxByValAlign(ATy->getElementType(), EltAlign); 841 if (EltAlign > MaxAlign) 842 MaxAlign = EltAlign; 843 } else if (const StructType *STy = dyn_cast<StructType>(Ty)) { 844 for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) { 845 unsigned EltAlign = 0; 846 getMaxByValAlign(STy->getElementType(i), EltAlign); 847 if (EltAlign > MaxAlign) 848 MaxAlign = EltAlign; 849 if (MaxAlign == 16) 850 break; 851 } 852 } 853 return; 854} 855 856/// getByValTypeAlignment - Return the desired alignment for ByVal aggregate 857/// function arguments in the caller parameter area. For X86, aggregates 858/// that contain SSE vectors are placed at 16-byte boundaries while the rest 859/// are at 4-byte boundaries. 860unsigned X86TargetLowering::getByValTypeAlignment(const Type *Ty) const { 861 if (Subtarget->is64Bit()) { 862 // Max of 8 and alignment of type. 863 unsigned TyAlign = TD->getABITypeAlignment(Ty); 864 if (TyAlign > 8) 865 return TyAlign; 866 return 8; 867 } 868 869 unsigned Align = 4; 870 if (Subtarget->hasSSE1()) 871 getMaxByValAlign(Ty, Align); 872 return Align; 873} 874 875/// getOptimalMemOpType - Returns the target specific optimal type for load 876/// and store operations as a result of memset, memcpy, and memmove 877/// lowering. It returns MVT::iAny if SelectionDAG should be responsible for 878/// determining it. 879MVT 880X86TargetLowering::getOptimalMemOpType(uint64_t Size, unsigned Align, 881 bool isSrcConst, bool isSrcStr, 882 SelectionDAG &DAG) const { 883 // FIXME: This turns off use of xmm stores for memset/memcpy on targets like 884 // linux. This is because the stack realignment code can't handle certain 885 // cases like PR2962. This should be removed when PR2962 is fixed. 886 const Function *F = DAG.getMachineFunction().getFunction(); 887 bool NoImplicitFloatOps = F->hasFnAttr(Attribute::NoImplicitFloat); 888 if (!NoImplicitFloatOps && Subtarget->getStackAlignment() >= 16) { 889 if ((isSrcConst || isSrcStr) && Subtarget->hasSSE2() && Size >= 16) 890 return MVT::v4i32; 891 if ((isSrcConst || isSrcStr) && Subtarget->hasSSE1() && Size >= 16) 892 return MVT::v4f32; 893 } 894 if (Subtarget->is64Bit() && Size >= 8) 895 return MVT::i64; 896 return MVT::i32; 897} 898 899/// getPICJumpTableRelocaBase - Returns relocation base for the given PIC 900/// jumptable. 901SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table, 902 SelectionDAG &DAG) const { 903 if (usesGlobalOffsetTable()) 904 return DAG.getGLOBAL_OFFSET_TABLE(getPointerTy()); 905 if (!Subtarget->isPICStyleRIPRel()) 906 // This doesn't have DebugLoc associated with it, but is not really the 907 // same as a Register. 908 return DAG.getNode(X86ISD::GlobalBaseReg, DebugLoc::getUnknownLoc(), 909 getPointerTy()); 910 return Table; 911} 912 913//===----------------------------------------------------------------------===// 914// Return Value Calling Convention Implementation 915//===----------------------------------------------------------------------===// 916 917#include "X86GenCallingConv.inc" 918 919/// LowerRET - Lower an ISD::RET node. 920SDValue X86TargetLowering::LowerRET(SDValue Op, SelectionDAG &DAG) { 921 DebugLoc dl = Op.getDebugLoc(); 922 assert((Op.getNumOperands() & 1) == 1 && "ISD::RET should have odd # args"); 923 924 SmallVector<CCValAssign, 16> RVLocs; 925 unsigned CC = DAG.getMachineFunction().getFunction()->getCallingConv(); 926 bool isVarArg = DAG.getMachineFunction().getFunction()->isVarArg(); 927 CCState CCInfo(CC, isVarArg, getTargetMachine(), RVLocs); 928 CCInfo.AnalyzeReturn(Op.getNode(), RetCC_X86); 929 930 // If this is the first return lowered for this function, add the regs to the 931 // liveout set for the function. 932 if (DAG.getMachineFunction().getRegInfo().liveout_empty()) { 933 for (unsigned i = 0; i != RVLocs.size(); ++i) 934 if (RVLocs[i].isRegLoc()) 935 DAG.getMachineFunction().getRegInfo().addLiveOut(RVLocs[i].getLocReg()); 936 } 937 SDValue Chain = Op.getOperand(0); 938 939 // Handle tail call return. 940 Chain = GetPossiblePreceedingTailCall(Chain, X86ISD::TAILCALL); 941 if (Chain.getOpcode() == X86ISD::TAILCALL) { 942 SDValue TailCall = Chain; 943 SDValue TargetAddress = TailCall.getOperand(1); 944 SDValue StackAdjustment = TailCall.getOperand(2); 945 assert(((TargetAddress.getOpcode() == ISD::Register && 946 (cast<RegisterSDNode>(TargetAddress)->getReg() == X86::EAX || 947 cast<RegisterSDNode>(TargetAddress)->getReg() == X86::R9)) || 948 TargetAddress.getOpcode() == ISD::TargetExternalSymbol || 949 TargetAddress.getOpcode() == ISD::TargetGlobalAddress) && 950 "Expecting an global address, external symbol, or register"); 951 assert(StackAdjustment.getOpcode() == ISD::Constant && 952 "Expecting a const value"); 953 954 SmallVector<SDValue,8> Operands; 955 Operands.push_back(Chain.getOperand(0)); 956 Operands.push_back(TargetAddress); 957 Operands.push_back(StackAdjustment); 958 // Copy registers used by the call. Last operand is a flag so it is not 959 // copied. 960 for (unsigned i=3; i < TailCall.getNumOperands()-1; i++) { 961 Operands.push_back(Chain.getOperand(i)); 962 } 963 return DAG.getNode(X86ISD::TC_RETURN, dl, MVT::Other, &Operands[0], 964 Operands.size()); 965 } 966 967 // Regular return. 968 SDValue Flag; 969 970 SmallVector<SDValue, 6> RetOps; 971 RetOps.push_back(Chain); // Operand #0 = Chain (updated below) 972 // Operand #1 = Bytes To Pop 973 RetOps.push_back(DAG.getConstant(getBytesToPopOnReturn(), MVT::i16)); 974 975 // Copy the result values into the output registers. 976 for (unsigned i = 0; i != RVLocs.size(); ++i) { 977 CCValAssign &VA = RVLocs[i]; 978 assert(VA.isRegLoc() && "Can only return in registers!"); 979 SDValue ValToCopy = Op.getOperand(i*2+1); 980 981 // Returns in ST0/ST1 are handled specially: these are pushed as operands to 982 // the RET instruction and handled by the FP Stackifier. 983 if (VA.getLocReg() == X86::ST0 || 984 VA.getLocReg() == X86::ST1) { 985 // If this is a copy from an xmm register to ST(0), use an FPExtend to 986 // change the value to the FP stack register class. 987 if (isScalarFPTypeInSSEReg(VA.getValVT())) 988 ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy); 989 RetOps.push_back(ValToCopy); 990 // Don't emit a copytoreg. 991 continue; 992 } 993 994 // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64 995 // which is returned in RAX / RDX. 996 if (Subtarget->is64Bit()) { 997 MVT ValVT = ValToCopy.getValueType(); 998 if (ValVT.isVector() && ValVT.getSizeInBits() == 64) { 999 ValToCopy = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i64, ValToCopy); 1000 if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) 1001 ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, ValToCopy); 1002 } 1003 } 1004 1005 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), ValToCopy, Flag); 1006 Flag = Chain.getValue(1); 1007 } 1008 1009 // The x86-64 ABI for returning structs by value requires that we copy 1010 // the sret argument into %rax for the return. We saved the argument into 1011 // a virtual register in the entry block, so now we copy the value out 1012 // and into %rax. 1013 if (Subtarget->is64Bit() && 1014 DAG.getMachineFunction().getFunction()->hasStructRetAttr()) { 1015 MachineFunction &MF = DAG.getMachineFunction(); 1016 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 1017 unsigned Reg = FuncInfo->getSRetReturnReg(); 1018 if (!Reg) { 1019 Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(MVT::i64)); 1020 FuncInfo->setSRetReturnReg(Reg); 1021 } 1022 SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg, getPointerTy()); 1023 1024 Chain = DAG.getCopyToReg(Chain, dl, X86::RAX, Val, Flag); 1025 Flag = Chain.getValue(1); 1026 } 1027 1028 RetOps[0] = Chain; // Update chain. 1029 1030 // Add the flag if we have it. 1031 if (Flag.getNode()) 1032 RetOps.push_back(Flag); 1033 1034 return DAG.getNode(X86ISD::RET_FLAG, dl, 1035 MVT::Other, &RetOps[0], RetOps.size()); 1036} 1037 1038 1039/// LowerCallResult - Lower the result values of an ISD::CALL into the 1040/// appropriate copies out of appropriate physical registers. This assumes that 1041/// Chain/InFlag are the input chain/flag to use, and that TheCall is the call 1042/// being lowered. The returns a SDNode with the same number of values as the 1043/// ISD::CALL. 1044SDNode *X86TargetLowering:: 1045LowerCallResult(SDValue Chain, SDValue InFlag, CallSDNode *TheCall, 1046 unsigned CallingConv, SelectionDAG &DAG) { 1047 1048 DebugLoc dl = TheCall->getDebugLoc(); 1049 // Assign locations to each value returned by this call. 1050 SmallVector<CCValAssign, 16> RVLocs; 1051 bool isVarArg = TheCall->isVarArg(); 1052 bool Is64Bit = Subtarget->is64Bit(); 1053 CCState CCInfo(CallingConv, isVarArg, getTargetMachine(), RVLocs); 1054 CCInfo.AnalyzeCallResult(TheCall, RetCC_X86); 1055 1056 SmallVector<SDValue, 8> ResultVals; 1057 1058 // Copy all of the result registers out of their specified physreg. 1059 for (unsigned i = 0; i != RVLocs.size(); ++i) { 1060 CCValAssign &VA = RVLocs[i]; 1061 MVT CopyVT = VA.getValVT(); 1062 1063 // If this is x86-64, and we disabled SSE, we can't return FP values 1064 if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) && 1065 ((Is64Bit || TheCall->isInreg()) && !Subtarget->hasSSE1())) { 1066 cerr << "SSE register return with SSE disabled\n"; 1067 exit(1); 1068 } 1069 1070 // If this is a call to a function that returns an fp value on the floating 1071 // point stack, but where we prefer to use the value in xmm registers, copy 1072 // it out as F80 and use a truncate to move it from fp stack reg to xmm reg. 1073 if ((VA.getLocReg() == X86::ST0 || 1074 VA.getLocReg() == X86::ST1) && 1075 isScalarFPTypeInSSEReg(VA.getValVT())) { 1076 CopyVT = MVT::f80; 1077 } 1078 1079 SDValue Val; 1080 if (Is64Bit && CopyVT.isVector() && CopyVT.getSizeInBits() == 64) { 1081 // For x86-64, MMX values are returned in XMM0 / XMM1 except for v1i64. 1082 if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) { 1083 Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), 1084 MVT::v2i64, InFlag).getValue(1); 1085 Val = Chain.getValue(0); 1086 Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, 1087 Val, DAG.getConstant(0, MVT::i64)); 1088 } else { 1089 Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), 1090 MVT::i64, InFlag).getValue(1); 1091 Val = Chain.getValue(0); 1092 } 1093 Val = DAG.getNode(ISD::BIT_CONVERT, dl, CopyVT, Val); 1094 } else { 1095 Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), 1096 CopyVT, InFlag).getValue(1); 1097 Val = Chain.getValue(0); 1098 } 1099 InFlag = Chain.getValue(2); 1100 1101 if (CopyVT != VA.getValVT()) { 1102 // Round the F80 the right size, which also moves to the appropriate xmm 1103 // register. 1104 Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val, 1105 // This truncation won't change the value. 1106 DAG.getIntPtrConstant(1)); 1107 } 1108 1109 ResultVals.push_back(Val); 1110 } 1111 1112 // Merge everything together with a MERGE_VALUES node. 1113 ResultVals.push_back(Chain); 1114 return DAG.getNode(ISD::MERGE_VALUES, dl, TheCall->getVTList(), 1115 &ResultVals[0], ResultVals.size()).getNode(); 1116} 1117 1118 1119//===----------------------------------------------------------------------===// 1120// C & StdCall & Fast Calling Convention implementation 1121//===----------------------------------------------------------------------===// 1122// StdCall calling convention seems to be standard for many Windows' API 1123// routines and around. It differs from C calling convention just a little: 1124// callee should clean up the stack, not caller. Symbols should be also 1125// decorated in some fancy way :) It doesn't support any vector arguments. 1126// For info on fast calling convention see Fast Calling Convention (tail call) 1127// implementation LowerX86_32FastCCCallTo. 1128 1129/// CallIsStructReturn - Determines whether a CALL node uses struct return 1130/// semantics. 1131static bool CallIsStructReturn(CallSDNode *TheCall) { 1132 unsigned NumOps = TheCall->getNumArgs(); 1133 if (!NumOps) 1134 return false; 1135 1136 return TheCall->getArgFlags(0).isSRet(); 1137} 1138 1139/// ArgsAreStructReturn - Determines whether a FORMAL_ARGUMENTS node uses struct 1140/// return semantics. 1141static bool ArgsAreStructReturn(SDValue Op) { 1142 unsigned NumArgs = Op.getNode()->getNumValues() - 1; 1143 if (!NumArgs) 1144 return false; 1145 1146 return cast<ARG_FLAGSSDNode>(Op.getOperand(3))->getArgFlags().isSRet(); 1147} 1148 1149/// IsCalleePop - Determines whether a CALL or FORMAL_ARGUMENTS node requires 1150/// the callee to pop its own arguments. Callee pop is necessary to support tail 1151/// calls. 1152bool X86TargetLowering::IsCalleePop(bool IsVarArg, unsigned CallingConv) { 1153 if (IsVarArg) 1154 return false; 1155 1156 switch (CallingConv) { 1157 default: 1158 return false; 1159 case CallingConv::X86_StdCall: 1160 return !Subtarget->is64Bit(); 1161 case CallingConv::X86_FastCall: 1162 return !Subtarget->is64Bit(); 1163 case CallingConv::Fast: 1164 return PerformTailCallOpt; 1165 } 1166} 1167 1168/// CCAssignFnForNode - Selects the correct CCAssignFn for a the 1169/// given CallingConvention value. 1170CCAssignFn *X86TargetLowering::CCAssignFnForNode(unsigned CC) const { 1171 if (Subtarget->is64Bit()) { 1172 if (Subtarget->isTargetWin64()) 1173 return CC_X86_Win64_C; 1174 else if (CC == CallingConv::Fast && PerformTailCallOpt) 1175 return CC_X86_64_TailCall; 1176 else 1177 return CC_X86_64_C; 1178 } 1179 1180 if (CC == CallingConv::X86_FastCall) 1181 return CC_X86_32_FastCall; 1182 else if (CC == CallingConv::Fast) 1183 return CC_X86_32_FastCC; 1184 else 1185 return CC_X86_32_C; 1186} 1187 1188/// NameDecorationForFORMAL_ARGUMENTS - Selects the appropriate decoration to 1189/// apply to a MachineFunction containing a given FORMAL_ARGUMENTS node. 1190NameDecorationStyle 1191X86TargetLowering::NameDecorationForFORMAL_ARGUMENTS(SDValue Op) { 1192 unsigned CC = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 1193 if (CC == CallingConv::X86_FastCall) 1194 return FastCall; 1195 else if (CC == CallingConv::X86_StdCall) 1196 return StdCall; 1197 return None; 1198} 1199 1200 1201/// CallRequiresGOTInRegister - Check whether the call requires the GOT pointer 1202/// in a register before calling. 1203bool X86TargetLowering::CallRequiresGOTPtrInReg(bool Is64Bit, bool IsTailCall) { 1204 return !IsTailCall && !Is64Bit && 1205 getTargetMachine().getRelocationModel() == Reloc::PIC_ && 1206 Subtarget->isPICStyleGOT(); 1207} 1208 1209/// CallRequiresFnAddressInReg - Check whether the call requires the function 1210/// address to be loaded in a register. 1211bool 1212X86TargetLowering::CallRequiresFnAddressInReg(bool Is64Bit, bool IsTailCall) { 1213 return !Is64Bit && IsTailCall && 1214 getTargetMachine().getRelocationModel() == Reloc::PIC_ && 1215 Subtarget->isPICStyleGOT(); 1216} 1217 1218/// CreateCopyOfByValArgument - Make a copy of an aggregate at address specified 1219/// by "Src" to address "Dst" with size and alignment information specified by 1220/// the specific parameter attribute. The copy will be passed as a byval 1221/// function parameter. 1222static SDValue 1223CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain, 1224 ISD::ArgFlagsTy Flags, SelectionDAG &DAG, 1225 DebugLoc dl) { 1226 SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i32); 1227 return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(), 1228 /*AlwaysInline=*/true, NULL, 0, NULL, 0); 1229} 1230 1231SDValue X86TargetLowering::LowerMemArgument(SDValue Op, SelectionDAG &DAG, 1232 const CCValAssign &VA, 1233 MachineFrameInfo *MFI, 1234 unsigned CC, 1235 SDValue Root, unsigned i) { 1236 // Create the nodes corresponding to a load from this parameter slot. 1237 ISD::ArgFlagsTy Flags = 1238 cast<ARG_FLAGSSDNode>(Op.getOperand(3 + i))->getArgFlags(); 1239 bool AlwaysUseMutable = (CC==CallingConv::Fast) && PerformTailCallOpt; 1240 bool isImmutable = !AlwaysUseMutable && !Flags.isByVal(); 1241 1242 // FIXME: For now, all byval parameter objects are marked mutable. This can be 1243 // changed with more analysis. 1244 // In case of tail call optimization mark all arguments mutable. Since they 1245 // could be overwritten by lowering of arguments in case of a tail call. 1246 int FI = MFI->CreateFixedObject(VA.getValVT().getSizeInBits()/8, 1247 VA.getLocMemOffset(), isImmutable); 1248 SDValue FIN = DAG.getFrameIndex(FI, getPointerTy()); 1249 if (Flags.isByVal()) 1250 return FIN; 1251 return DAG.getLoad(VA.getValVT(), Op.getDebugLoc(), Root, FIN, 1252 PseudoSourceValue::getFixedStack(FI), 0); 1253} 1254 1255SDValue 1256X86TargetLowering::LowerFORMAL_ARGUMENTS(SDValue Op, SelectionDAG &DAG) { 1257 MachineFunction &MF = DAG.getMachineFunction(); 1258 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 1259 DebugLoc dl = Op.getDebugLoc(); 1260 1261 const Function* Fn = MF.getFunction(); 1262 if (Fn->hasExternalLinkage() && 1263 Subtarget->isTargetCygMing() && 1264 Fn->getName() == "main") 1265 FuncInfo->setForceFramePointer(true); 1266 1267 // Decorate the function name. 1268 FuncInfo->setDecorationStyle(NameDecorationForFORMAL_ARGUMENTS(Op)); 1269 1270 MachineFrameInfo *MFI = MF.getFrameInfo(); 1271 SDValue Root = Op.getOperand(0); 1272 bool isVarArg = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue() != 0; 1273 unsigned CC = MF.getFunction()->getCallingConv(); 1274 bool Is64Bit = Subtarget->is64Bit(); 1275 bool IsWin64 = Subtarget->isTargetWin64(); 1276 1277 assert(!(isVarArg && CC == CallingConv::Fast) && 1278 "Var args not supported with calling convention fastcc"); 1279 1280 // Assign locations to all of the incoming arguments. 1281 SmallVector<CCValAssign, 16> ArgLocs; 1282 CCState CCInfo(CC, isVarArg, getTargetMachine(), ArgLocs); 1283 CCInfo.AnalyzeFormalArguments(Op.getNode(), CCAssignFnForNode(CC)); 1284 1285 SmallVector<SDValue, 8> ArgValues; 1286 unsigned LastVal = ~0U; 1287 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 1288 CCValAssign &VA = ArgLocs[i]; 1289 // TODO: If an arg is passed in two places (e.g. reg and stack), skip later 1290 // places. 1291 assert(VA.getValNo() != LastVal && 1292 "Don't support value assigned to multiple locs yet"); 1293 LastVal = VA.getValNo(); 1294 1295 if (VA.isRegLoc()) { 1296 MVT RegVT = VA.getLocVT(); 1297 TargetRegisterClass *RC = NULL; 1298 if (RegVT == MVT::i32) 1299 RC = X86::GR32RegisterClass; 1300 else if (Is64Bit && RegVT == MVT::i64) 1301 RC = X86::GR64RegisterClass; 1302 else if (RegVT == MVT::f32) 1303 RC = X86::FR32RegisterClass; 1304 else if (RegVT == MVT::f64) 1305 RC = X86::FR64RegisterClass; 1306 else if (RegVT.isVector() && RegVT.getSizeInBits() == 128) 1307 RC = X86::VR128RegisterClass; 1308 else if (RegVT.isVector()) { 1309 assert(RegVT.getSizeInBits() == 64); 1310 if (!Is64Bit) 1311 RC = X86::VR64RegisterClass; // MMX values are passed in MMXs. 1312 else { 1313 // Darwin calling convention passes MMX values in either GPRs or 1314 // XMMs in x86-64. Other targets pass them in memory. 1315 if (RegVT != MVT::v1i64 && Subtarget->hasSSE2()) { 1316 RC = X86::VR128RegisterClass; // MMX values are passed in XMMs. 1317 RegVT = MVT::v2i64; 1318 } else { 1319 RC = X86::GR64RegisterClass; // v1i64 values are passed in GPRs. 1320 RegVT = MVT::i64; 1321 } 1322 } 1323 } else { 1324 assert(0 && "Unknown argument type!"); 1325 } 1326 1327 unsigned Reg = DAG.getMachineFunction().addLiveIn(VA.getLocReg(), RC); 1328 SDValue ArgValue = DAG.getCopyFromReg(Root, dl, Reg, RegVT); 1329 1330 // If this is an 8 or 16-bit value, it is really passed promoted to 32 1331 // bits. Insert an assert[sz]ext to capture this, then truncate to the 1332 // right size. 1333 if (VA.getLocInfo() == CCValAssign::SExt) 1334 ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue, 1335 DAG.getValueType(VA.getValVT())); 1336 else if (VA.getLocInfo() == CCValAssign::ZExt) 1337 ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue, 1338 DAG.getValueType(VA.getValVT())); 1339 1340 if (VA.getLocInfo() != CCValAssign::Full) 1341 ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue); 1342 1343 // Handle MMX values passed in GPRs. 1344 if (Is64Bit && RegVT != VA.getLocVT()) { 1345 if (RegVT.getSizeInBits() == 64 && RC == X86::GR64RegisterClass) 1346 ArgValue = DAG.getNode(ISD::BIT_CONVERT, dl, VA.getLocVT(), ArgValue); 1347 else if (RC == X86::VR128RegisterClass) { 1348 ArgValue = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, 1349 ArgValue, DAG.getConstant(0, MVT::i64)); 1350 ArgValue = DAG.getNode(ISD::BIT_CONVERT, dl, VA.getLocVT(), ArgValue); 1351 } 1352 } 1353 1354 ArgValues.push_back(ArgValue); 1355 } else { 1356 assert(VA.isMemLoc()); 1357 ArgValues.push_back(LowerMemArgument(Op, DAG, VA, MFI, CC, Root, i)); 1358 } 1359 } 1360 1361 // The x86-64 ABI for returning structs by value requires that we copy 1362 // the sret argument into %rax for the return. Save the argument into 1363 // a virtual register so that we can access it from the return points. 1364 if (Is64Bit && DAG.getMachineFunction().getFunction()->hasStructRetAttr()) { 1365 MachineFunction &MF = DAG.getMachineFunction(); 1366 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 1367 unsigned Reg = FuncInfo->getSRetReturnReg(); 1368 if (!Reg) { 1369 Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(MVT::i64)); 1370 FuncInfo->setSRetReturnReg(Reg); 1371 } 1372 SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, ArgValues[0]); 1373 Root = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Root); 1374 } 1375 1376 unsigned StackSize = CCInfo.getNextStackOffset(); 1377 // align stack specially for tail calls 1378 if (PerformTailCallOpt && CC == CallingConv::Fast) 1379 StackSize = GetAlignedArgumentStackSize(StackSize, DAG); 1380 1381 // If the function takes variable number of arguments, make a frame index for 1382 // the start of the first vararg value... for expansion of llvm.va_start. 1383 if (isVarArg) { 1384 if (Is64Bit || CC != CallingConv::X86_FastCall) { 1385 VarArgsFrameIndex = MFI->CreateFixedObject(1, StackSize); 1386 } 1387 if (Is64Bit) { 1388 unsigned TotalNumIntRegs = 0, TotalNumXMMRegs = 0; 1389 1390 // FIXME: We should really autogenerate these arrays 1391 static const unsigned GPR64ArgRegsWin64[] = { 1392 X86::RCX, X86::RDX, X86::R8, X86::R9 1393 }; 1394 static const unsigned XMMArgRegsWin64[] = { 1395 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3 1396 }; 1397 static const unsigned GPR64ArgRegs64Bit[] = { 1398 X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9 1399 }; 1400 static const unsigned XMMArgRegs64Bit[] = { 1401 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, 1402 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7 1403 }; 1404 const unsigned *GPR64ArgRegs, *XMMArgRegs; 1405 1406 if (IsWin64) { 1407 TotalNumIntRegs = 4; TotalNumXMMRegs = 4; 1408 GPR64ArgRegs = GPR64ArgRegsWin64; 1409 XMMArgRegs = XMMArgRegsWin64; 1410 } else { 1411 TotalNumIntRegs = 6; TotalNumXMMRegs = 8; 1412 GPR64ArgRegs = GPR64ArgRegs64Bit; 1413 XMMArgRegs = XMMArgRegs64Bit; 1414 } 1415 unsigned NumIntRegs = CCInfo.getFirstUnallocated(GPR64ArgRegs, 1416 TotalNumIntRegs); 1417 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 1418 TotalNumXMMRegs); 1419 1420 bool NoImplicitFloatOps = Fn->hasFnAttr(Attribute::NoImplicitFloat); 1421 assert(!(NumXMMRegs && !Subtarget->hasSSE1()) && 1422 "SSE register cannot be used when SSE is disabled!"); 1423 assert(!(NumXMMRegs && UseSoftFloat && NoImplicitFloatOps) && 1424 "SSE register cannot be used when SSE is disabled!"); 1425 if (UseSoftFloat || NoImplicitFloatOps || !Subtarget->hasSSE1()) 1426 // Kernel mode asks for SSE to be disabled, so don't push them 1427 // on the stack. 1428 TotalNumXMMRegs = 0; 1429 1430 // For X86-64, if there are vararg parameters that are passed via 1431 // registers, then we must store them to their spots on the stack so they 1432 // may be loaded by deferencing the result of va_next. 1433 VarArgsGPOffset = NumIntRegs * 8; 1434 VarArgsFPOffset = TotalNumIntRegs * 8 + NumXMMRegs * 16; 1435 RegSaveFrameIndex = MFI->CreateStackObject(TotalNumIntRegs * 8 + 1436 TotalNumXMMRegs * 16, 16); 1437 1438 // Store the integer parameter registers. 1439 SmallVector<SDValue, 8> MemOps; 1440 SDValue RSFIN = DAG.getFrameIndex(RegSaveFrameIndex, getPointerTy()); 1441 SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN, 1442 DAG.getIntPtrConstant(VarArgsGPOffset)); 1443 for (; NumIntRegs != TotalNumIntRegs; ++NumIntRegs) { 1444 unsigned VReg = MF.addLiveIn(GPR64ArgRegs[NumIntRegs], 1445 X86::GR64RegisterClass); 1446 SDValue Val = DAG.getCopyFromReg(Root, dl, VReg, MVT::i64); 1447 SDValue Store = 1448 DAG.getStore(Val.getValue(1), dl, Val, FIN, 1449 PseudoSourceValue::getFixedStack(RegSaveFrameIndex), 0); 1450 MemOps.push_back(Store); 1451 FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), FIN, 1452 DAG.getIntPtrConstant(8)); 1453 } 1454 1455 // Now store the XMM (fp + vector) parameter registers. 1456 FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN, 1457 DAG.getIntPtrConstant(VarArgsFPOffset)); 1458 for (; NumXMMRegs != TotalNumXMMRegs; ++NumXMMRegs) { 1459 unsigned VReg = MF.addLiveIn(XMMArgRegs[NumXMMRegs], 1460 X86::VR128RegisterClass); 1461 SDValue Val = DAG.getCopyFromReg(Root, dl, VReg, MVT::v4f32); 1462 SDValue Store = 1463 DAG.getStore(Val.getValue(1), dl, Val, FIN, 1464 PseudoSourceValue::getFixedStack(RegSaveFrameIndex), 0); 1465 MemOps.push_back(Store); 1466 FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), FIN, 1467 DAG.getIntPtrConstant(16)); 1468 } 1469 if (!MemOps.empty()) 1470 Root = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, 1471 &MemOps[0], MemOps.size()); 1472 } 1473 } 1474 1475 ArgValues.push_back(Root); 1476 1477 // Some CCs need callee pop. 1478 if (IsCalleePop(isVarArg, CC)) { 1479 BytesToPopOnReturn = StackSize; // Callee pops everything. 1480 BytesCallerReserves = 0; 1481 } else { 1482 BytesToPopOnReturn = 0; // Callee pops nothing. 1483 // If this is an sret function, the return should pop the hidden pointer. 1484 if (!Is64Bit && CC != CallingConv::Fast && ArgsAreStructReturn(Op)) 1485 BytesToPopOnReturn = 4; 1486 BytesCallerReserves = StackSize; 1487 } 1488 1489 if (!Is64Bit) { 1490 RegSaveFrameIndex = 0xAAAAAAA; // RegSaveFrameIndex is X86-64 only. 1491 if (CC == CallingConv::X86_FastCall) 1492 VarArgsFrameIndex = 0xAAAAAAA; // fastcc functions can't have varargs. 1493 } 1494 1495 FuncInfo->setBytesToPopOnReturn(BytesToPopOnReturn); 1496 1497 // Return the new list of results. 1498 return DAG.getNode(ISD::MERGE_VALUES, dl, Op.getNode()->getVTList(), 1499 &ArgValues[0], ArgValues.size()).getValue(Op.getResNo()); 1500} 1501 1502SDValue 1503X86TargetLowering::LowerMemOpCallTo(CallSDNode *TheCall, SelectionDAG &DAG, 1504 const SDValue &StackPtr, 1505 const CCValAssign &VA, 1506 SDValue Chain, 1507 SDValue Arg, ISD::ArgFlagsTy Flags) { 1508 DebugLoc dl = TheCall->getDebugLoc(); 1509 unsigned LocMemOffset = VA.getLocMemOffset(); 1510 SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset); 1511 PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff); 1512 if (Flags.isByVal()) { 1513 return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl); 1514 } 1515 return DAG.getStore(Chain, dl, Arg, PtrOff, 1516 PseudoSourceValue::getStack(), LocMemOffset); 1517} 1518 1519/// EmitTailCallLoadRetAddr - Emit a load of return address if tail call 1520/// optimization is performed and it is required. 1521SDValue 1522X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG, 1523 SDValue &OutRetAddr, 1524 SDValue Chain, 1525 bool IsTailCall, 1526 bool Is64Bit, 1527 int FPDiff, 1528 DebugLoc dl) { 1529 if (!IsTailCall || FPDiff==0) return Chain; 1530 1531 // Adjust the Return address stack slot. 1532 MVT VT = getPointerTy(); 1533 OutRetAddr = getReturnAddressFrameIndex(DAG); 1534 1535 // Load the "old" Return address. 1536 OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, NULL, 0); 1537 return SDValue(OutRetAddr.getNode(), 1); 1538} 1539 1540/// EmitTailCallStoreRetAddr - Emit a store of the return adress if tail call 1541/// optimization is performed and it is required (FPDiff!=0). 1542static SDValue 1543EmitTailCallStoreRetAddr(SelectionDAG & DAG, MachineFunction &MF, 1544 SDValue Chain, SDValue RetAddrFrIdx, 1545 bool Is64Bit, int FPDiff, DebugLoc dl) { 1546 // Store the return address to the appropriate stack slot. 1547 if (!FPDiff) return Chain; 1548 // Calculate the new stack slot for the return address. 1549 int SlotSize = Is64Bit ? 8 : 4; 1550 int NewReturnAddrFI = 1551 MF.getFrameInfo()->CreateFixedObject(SlotSize, FPDiff-SlotSize); 1552 MVT VT = Is64Bit ? MVT::i64 : MVT::i32; 1553 SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, VT); 1554 Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx, 1555 PseudoSourceValue::getFixedStack(NewReturnAddrFI), 0); 1556 return Chain; 1557} 1558 1559SDValue X86TargetLowering::LowerCALL(SDValue Op, SelectionDAG &DAG) { 1560 MachineFunction &MF = DAG.getMachineFunction(); 1561 CallSDNode *TheCall = cast<CallSDNode>(Op.getNode()); 1562 SDValue Chain = TheCall->getChain(); 1563 unsigned CC = TheCall->getCallingConv(); 1564 bool isVarArg = TheCall->isVarArg(); 1565 bool IsTailCall = TheCall->isTailCall() && 1566 CC == CallingConv::Fast && PerformTailCallOpt; 1567 SDValue Callee = TheCall->getCallee(); 1568 bool Is64Bit = Subtarget->is64Bit(); 1569 bool IsStructRet = CallIsStructReturn(TheCall); 1570 DebugLoc dl = TheCall->getDebugLoc(); 1571 1572 assert(!(isVarArg && CC == CallingConv::Fast) && 1573 "Var args not supported with calling convention fastcc"); 1574 1575 // Analyze operands of the call, assigning locations to each operand. 1576 SmallVector<CCValAssign, 16> ArgLocs; 1577 CCState CCInfo(CC, isVarArg, getTargetMachine(), ArgLocs); 1578 CCInfo.AnalyzeCallOperands(TheCall, CCAssignFnForNode(CC)); 1579 1580 // Get a count of how many bytes are to be pushed on the stack. 1581 unsigned NumBytes = CCInfo.getNextStackOffset(); 1582 if (PerformTailCallOpt && CC == CallingConv::Fast) 1583 NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG); 1584 1585 int FPDiff = 0; 1586 if (IsTailCall) { 1587 // Lower arguments at fp - stackoffset + fpdiff. 1588 unsigned NumBytesCallerPushed = 1589 MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn(); 1590 FPDiff = NumBytesCallerPushed - NumBytes; 1591 1592 // Set the delta of movement of the returnaddr stackslot. 1593 // But only set if delta is greater than previous delta. 1594 if (FPDiff < (MF.getInfo<X86MachineFunctionInfo>()->getTCReturnAddrDelta())) 1595 MF.getInfo<X86MachineFunctionInfo>()->setTCReturnAddrDelta(FPDiff); 1596 } 1597 1598 Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true)); 1599 1600 SDValue RetAddrFrIdx; 1601 // Load return adress for tail calls. 1602 Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, IsTailCall, Is64Bit, 1603 FPDiff, dl); 1604 1605 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass; 1606 SmallVector<SDValue, 8> MemOpChains; 1607 SDValue StackPtr; 1608 1609 // Walk the register/memloc assignments, inserting copies/loads. In the case 1610 // of tail call optimization arguments are handle later. 1611 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 1612 CCValAssign &VA = ArgLocs[i]; 1613 SDValue Arg = TheCall->getArg(i); 1614 ISD::ArgFlagsTy Flags = TheCall->getArgFlags(i); 1615 bool isByVal = Flags.isByVal(); 1616 1617 // Promote the value if needed. 1618 switch (VA.getLocInfo()) { 1619 default: assert(0 && "Unknown loc info!"); 1620 case CCValAssign::Full: break; 1621 case CCValAssign::SExt: 1622 Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg); 1623 break; 1624 case CCValAssign::ZExt: 1625 Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Arg); 1626 break; 1627 case CCValAssign::AExt: 1628 Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg); 1629 break; 1630 } 1631 1632 if (VA.isRegLoc()) { 1633 if (Is64Bit) { 1634 MVT RegVT = VA.getLocVT(); 1635 if (RegVT.isVector() && RegVT.getSizeInBits() == 64) 1636 switch (VA.getLocReg()) { 1637 default: 1638 break; 1639 case X86::RDI: case X86::RSI: case X86::RDX: case X86::RCX: 1640 case X86::R8: { 1641 // Special case: passing MMX values in GPR registers. 1642 Arg = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i64, Arg); 1643 break; 1644 } 1645 case X86::XMM0: case X86::XMM1: case X86::XMM2: case X86::XMM3: 1646 case X86::XMM4: case X86::XMM5: case X86::XMM6: case X86::XMM7: { 1647 // Special case: passing MMX values in XMM registers. 1648 Arg = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i64, Arg); 1649 Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg); 1650 Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg); 1651 break; 1652 } 1653 } 1654 } 1655 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); 1656 } else { 1657 if (!IsTailCall || (IsTailCall && isByVal)) { 1658 assert(VA.isMemLoc()); 1659 if (StackPtr.getNode() == 0) 1660 StackPtr = DAG.getCopyFromReg(Chain, dl, X86StackPtr, getPointerTy()); 1661 1662 MemOpChains.push_back(LowerMemOpCallTo(TheCall, DAG, StackPtr, VA, 1663 Chain, Arg, Flags)); 1664 } 1665 } 1666 } 1667 1668 if (!MemOpChains.empty()) 1669 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, 1670 &MemOpChains[0], MemOpChains.size()); 1671 1672 // Build a sequence of copy-to-reg nodes chained together with token chain 1673 // and flag operands which copy the outgoing args into registers. 1674 SDValue InFlag; 1675 // Tail call byval lowering might overwrite argument registers so in case of 1676 // tail call optimization the copies to registers are lowered later. 1677 if (!IsTailCall) 1678 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { 1679 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, 1680 RegsToPass[i].second, InFlag); 1681 InFlag = Chain.getValue(1); 1682 } 1683 1684 // ELF / PIC requires GOT in the EBX register before function calls via PLT 1685 // GOT pointer. 1686 if (CallRequiresGOTPtrInReg(Is64Bit, IsTailCall)) { 1687 Chain = DAG.getCopyToReg(Chain, dl, X86::EBX, 1688 DAG.getNode(X86ISD::GlobalBaseReg, 1689 DebugLoc::getUnknownLoc(), 1690 getPointerTy()), 1691 InFlag); 1692 InFlag = Chain.getValue(1); 1693 } 1694 // If we are tail calling and generating PIC/GOT style code load the address 1695 // of the callee into ecx. The value in ecx is used as target of the tail 1696 // jump. This is done to circumvent the ebx/callee-saved problem for tail 1697 // calls on PIC/GOT architectures. Normally we would just put the address of 1698 // GOT into ebx and then call target@PLT. But for tail callss ebx would be 1699 // restored (since ebx is callee saved) before jumping to the target@PLT. 1700 if (CallRequiresFnAddressInReg(Is64Bit, IsTailCall)) { 1701 // Note: The actual moving to ecx is done further down. 1702 GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee); 1703 if (G && !G->getGlobal()->hasHiddenVisibility() && 1704 !G->getGlobal()->hasProtectedVisibility()) 1705 Callee = LowerGlobalAddress(Callee, DAG); 1706 else if (isa<ExternalSymbolSDNode>(Callee)) 1707 Callee = LowerExternalSymbol(Callee,DAG); 1708 } 1709 1710 if (Is64Bit && isVarArg) { 1711 // From AMD64 ABI document: 1712 // For calls that may call functions that use varargs or stdargs 1713 // (prototype-less calls or calls to functions containing ellipsis (...) in 1714 // the declaration) %al is used as hidden argument to specify the number 1715 // of SSE registers used. The contents of %al do not need to match exactly 1716 // the number of registers, but must be an ubound on the number of SSE 1717 // registers used and is in the range 0 - 8 inclusive. 1718 1719 // FIXME: Verify this on Win64 1720 // Count the number of XMM registers allocated. 1721 static const unsigned XMMArgRegs[] = { 1722 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, 1723 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7 1724 }; 1725 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8); 1726 assert((Subtarget->hasSSE1() || !NumXMMRegs) 1727 && "SSE registers cannot be used when SSE is disabled"); 1728 1729 Chain = DAG.getCopyToReg(Chain, dl, X86::AL, 1730 DAG.getConstant(NumXMMRegs, MVT::i8), InFlag); 1731 InFlag = Chain.getValue(1); 1732 } 1733 1734 1735 // For tail calls lower the arguments to the 'real' stack slot. 1736 if (IsTailCall) { 1737 SmallVector<SDValue, 8> MemOpChains2; 1738 SDValue FIN; 1739 int FI = 0; 1740 // Do not flag preceeding copytoreg stuff together with the following stuff. 1741 InFlag = SDValue(); 1742 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 1743 CCValAssign &VA = ArgLocs[i]; 1744 if (!VA.isRegLoc()) { 1745 assert(VA.isMemLoc()); 1746 SDValue Arg = TheCall->getArg(i); 1747 ISD::ArgFlagsTy Flags = TheCall->getArgFlags(i); 1748 // Create frame index. 1749 int32_t Offset = VA.getLocMemOffset()+FPDiff; 1750 uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8; 1751 FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset); 1752 FIN = DAG.getFrameIndex(FI, getPointerTy()); 1753 1754 if (Flags.isByVal()) { 1755 // Copy relative to framepointer. 1756 SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset()); 1757 if (StackPtr.getNode() == 0) 1758 StackPtr = DAG.getCopyFromReg(Chain, dl, X86StackPtr, 1759 getPointerTy()); 1760 Source = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, Source); 1761 1762 MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN, Chain, 1763 Flags, DAG, dl)); 1764 } else { 1765 // Store relative to framepointer. 1766 MemOpChains2.push_back( 1767 DAG.getStore(Chain, dl, Arg, FIN, 1768 PseudoSourceValue::getFixedStack(FI), 0)); 1769 } 1770 } 1771 } 1772 1773 if (!MemOpChains2.empty()) 1774 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, 1775 &MemOpChains2[0], MemOpChains2.size()); 1776 1777 // Copy arguments to their registers. 1778 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { 1779 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, 1780 RegsToPass[i].second, InFlag); 1781 InFlag = Chain.getValue(1); 1782 } 1783 InFlag =SDValue(); 1784 1785 // Store the return address to the appropriate stack slot. 1786 Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx, Is64Bit, 1787 FPDiff, dl); 1788 } 1789 1790 // If the callee is a GlobalAddress node (quite common, every direct call is) 1791 // turn it into a TargetGlobalAddress node so that legalize doesn't hack it. 1792 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) { 1793 // We should use extra load for direct calls to dllimported functions in 1794 // non-JIT mode. 1795 if (!Subtarget->GVRequiresExtraLoad(G->getGlobal(), 1796 getTargetMachine(), true)) 1797 Callee = DAG.getTargetGlobalAddress(G->getGlobal(), getPointerTy(), 1798 G->getOffset()); 1799 } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) { 1800 Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy()); 1801 } else if (IsTailCall) { 1802 unsigned Opc = Is64Bit ? X86::R9 : X86::EAX; 1803 1804 Chain = DAG.getCopyToReg(Chain, dl, 1805 DAG.getRegister(Opc, getPointerTy()), 1806 Callee,InFlag); 1807 Callee = DAG.getRegister(Opc, getPointerTy()); 1808 // Add register as live out. 1809 DAG.getMachineFunction().getRegInfo().addLiveOut(Opc); 1810 } 1811 1812 // Returns a chain & a flag for retval copy to use. 1813 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag); 1814 SmallVector<SDValue, 8> Ops; 1815 1816 if (IsTailCall) { 1817 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true), 1818 DAG.getIntPtrConstant(0, true), InFlag); 1819 InFlag = Chain.getValue(1); 1820 1821 // Returns a chain & a flag for retval copy to use. 1822 NodeTys = DAG.getVTList(MVT::Other, MVT::Flag); 1823 Ops.clear(); 1824 } 1825 1826 Ops.push_back(Chain); 1827 Ops.push_back(Callee); 1828 1829 if (IsTailCall) 1830 Ops.push_back(DAG.getConstant(FPDiff, MVT::i32)); 1831 1832 // Add argument registers to the end of the list so that they are known live 1833 // into the call. 1834 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) 1835 Ops.push_back(DAG.getRegister(RegsToPass[i].first, 1836 RegsToPass[i].second.getValueType())); 1837 1838 // Add an implicit use GOT pointer in EBX. 1839 if (!IsTailCall && !Is64Bit && 1840 getTargetMachine().getRelocationModel() == Reloc::PIC_ && 1841 Subtarget->isPICStyleGOT()) 1842 Ops.push_back(DAG.getRegister(X86::EBX, getPointerTy())); 1843 1844 // Add an implicit use of AL for x86 vararg functions. 1845 if (Is64Bit && isVarArg) 1846 Ops.push_back(DAG.getRegister(X86::AL, MVT::i8)); 1847 1848 if (InFlag.getNode()) 1849 Ops.push_back(InFlag); 1850 1851 if (IsTailCall) { 1852 assert(InFlag.getNode() && 1853 "Flag must be set. Depend on flag being set in LowerRET"); 1854 Chain = DAG.getNode(X86ISD::TAILCALL, dl, 1855 TheCall->getVTList(), &Ops[0], Ops.size()); 1856 1857 return SDValue(Chain.getNode(), Op.getResNo()); 1858 } 1859 1860 Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, &Ops[0], Ops.size()); 1861 InFlag = Chain.getValue(1); 1862 1863 // Create the CALLSEQ_END node. 1864 unsigned NumBytesForCalleeToPush; 1865 if (IsCalleePop(isVarArg, CC)) 1866 NumBytesForCalleeToPush = NumBytes; // Callee pops everything 1867 else if (!Is64Bit && CC != CallingConv::Fast && IsStructRet) 1868 // If this is is a call to a struct-return function, the callee 1869 // pops the hidden struct pointer, so we have to push it back. 1870 // This is common for Darwin/X86, Linux & Mingw32 targets. 1871 NumBytesForCalleeToPush = 4; 1872 else 1873 NumBytesForCalleeToPush = 0; // Callee pops nothing. 1874 1875 // Returns a flag for retval copy to use. 1876 Chain = DAG.getCALLSEQ_END(Chain, 1877 DAG.getIntPtrConstant(NumBytes, true), 1878 DAG.getIntPtrConstant(NumBytesForCalleeToPush, 1879 true), 1880 InFlag); 1881 InFlag = Chain.getValue(1); 1882 1883 // Handle result values, copying them out of physregs into vregs that we 1884 // return. 1885 return SDValue(LowerCallResult(Chain, InFlag, TheCall, CC, DAG), 1886 Op.getResNo()); 1887} 1888 1889 1890//===----------------------------------------------------------------------===// 1891// Fast Calling Convention (tail call) implementation 1892//===----------------------------------------------------------------------===// 1893 1894// Like std call, callee cleans arguments, convention except that ECX is 1895// reserved for storing the tail called function address. Only 2 registers are 1896// free for argument passing (inreg). Tail call optimization is performed 1897// provided: 1898// * tailcallopt is enabled 1899// * caller/callee are fastcc 1900// On X86_64 architecture with GOT-style position independent code only local 1901// (within module) calls are supported at the moment. 1902// To keep the stack aligned according to platform abi the function 1903// GetAlignedArgumentStackSize ensures that argument delta is always multiples 1904// of stack alignment. (Dynamic linkers need this - darwin's dyld for example) 1905// If a tail called function callee has more arguments than the caller the 1906// caller needs to make sure that there is room to move the RETADDR to. This is 1907// achieved by reserving an area the size of the argument delta right after the 1908// original REtADDR, but before the saved framepointer or the spilled registers 1909// e.g. caller(arg1, arg2) calls callee(arg1, arg2,arg3,arg4) 1910// stack layout: 1911// arg1 1912// arg2 1913// RETADDR 1914// [ new RETADDR 1915// move area ] 1916// (possible EBP) 1917// ESI 1918// EDI 1919// local1 .. 1920 1921/// GetAlignedArgumentStackSize - Make the stack size align e.g 16n + 12 aligned 1922/// for a 16 byte align requirement. 1923unsigned X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize, 1924 SelectionDAG& DAG) { 1925 MachineFunction &MF = DAG.getMachineFunction(); 1926 const TargetMachine &TM = MF.getTarget(); 1927 const TargetFrameInfo &TFI = *TM.getFrameInfo(); 1928 unsigned StackAlignment = TFI.getStackAlignment(); 1929 uint64_t AlignMask = StackAlignment - 1; 1930 int64_t Offset = StackSize; 1931 uint64_t SlotSize = TD->getPointerSize(); 1932 if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) { 1933 // Number smaller than 12 so just add the difference. 1934 Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask)); 1935 } else { 1936 // Mask out lower bits, add stackalignment once plus the 12 bytes. 1937 Offset = ((~AlignMask) & Offset) + StackAlignment + 1938 (StackAlignment-SlotSize); 1939 } 1940 return Offset; 1941} 1942 1943/// IsEligibleForTailCallElimination - Check to see whether the next instruction 1944/// following the call is a return. A function is eligible if caller/callee 1945/// calling conventions match, currently only fastcc supports tail calls, and 1946/// the function CALL is immediatly followed by a RET. 1947bool X86TargetLowering::IsEligibleForTailCallOptimization(CallSDNode *TheCall, 1948 SDValue Ret, 1949 SelectionDAG& DAG) const { 1950 if (!PerformTailCallOpt) 1951 return false; 1952 1953 if (CheckTailCallReturnConstraints(TheCall, Ret)) { 1954 MachineFunction &MF = DAG.getMachineFunction(); 1955 unsigned CallerCC = MF.getFunction()->getCallingConv(); 1956 unsigned CalleeCC= TheCall->getCallingConv(); 1957 if (CalleeCC == CallingConv::Fast && CallerCC == CalleeCC) { 1958 SDValue Callee = TheCall->getCallee(); 1959 // On x86/32Bit PIC/GOT tail calls are supported. 1960 if (getTargetMachine().getRelocationModel() != Reloc::PIC_ || 1961 !Subtarget->isPICStyleGOT()|| !Subtarget->is64Bit()) 1962 return true; 1963 1964 // Can only do local tail calls (in same module, hidden or protected) on 1965 // x86_64 PIC/GOT at the moment. 1966 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) 1967 return G->getGlobal()->hasHiddenVisibility() 1968 || G->getGlobal()->hasProtectedVisibility(); 1969 } 1970 } 1971 1972 return false; 1973} 1974 1975FastISel * 1976X86TargetLowering::createFastISel(MachineFunction &mf, 1977 MachineModuleInfo *mmo, 1978 DwarfWriter *dw, 1979 DenseMap<const Value *, unsigned> &vm, 1980 DenseMap<const BasicBlock *, 1981 MachineBasicBlock *> &bm, 1982 DenseMap<const AllocaInst *, int> &am 1983#ifndef NDEBUG 1984 , SmallSet<Instruction*, 8> &cil 1985#endif 1986 ) { 1987 return X86::createFastISel(mf, mmo, dw, vm, bm, am 1988#ifndef NDEBUG 1989 , cil 1990#endif 1991 ); 1992} 1993 1994 1995//===----------------------------------------------------------------------===// 1996// Other Lowering Hooks 1997//===----------------------------------------------------------------------===// 1998 1999 2000SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) { 2001 MachineFunction &MF = DAG.getMachineFunction(); 2002 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 2003 int ReturnAddrIndex = FuncInfo->getRAIndex(); 2004 2005 if (ReturnAddrIndex == 0) { 2006 // Set up a frame object for the return address. 2007 uint64_t SlotSize = TD->getPointerSize(); 2008 ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize, -SlotSize); 2009 FuncInfo->setRAIndex(ReturnAddrIndex); 2010 } 2011 2012 return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy()); 2013} 2014 2015 2016/// TranslateX86CC - do a one to one translation of a ISD::CondCode to the X86 2017/// specific condition code, returning the condition code and the LHS/RHS of the 2018/// comparison to make. 2019static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, bool isFP, 2020 SDValue &LHS, SDValue &RHS, SelectionDAG &DAG) { 2021 if (!isFP) { 2022 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) { 2023 if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) { 2024 // X > -1 -> X == 0, jump !sign. 2025 RHS = DAG.getConstant(0, RHS.getValueType()); 2026 return X86::COND_NS; 2027 } else if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) { 2028 // X < 0 -> X == 0, jump on sign. 2029 return X86::COND_S; 2030 } else if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) { 2031 // X < 1 -> X <= 0 2032 RHS = DAG.getConstant(0, RHS.getValueType()); 2033 return X86::COND_LE; 2034 } 2035 } 2036 2037 switch (SetCCOpcode) { 2038 default: assert(0 && "Invalid integer condition!"); 2039 case ISD::SETEQ: return X86::COND_E; 2040 case ISD::SETGT: return X86::COND_G; 2041 case ISD::SETGE: return X86::COND_GE; 2042 case ISD::SETLT: return X86::COND_L; 2043 case ISD::SETLE: return X86::COND_LE; 2044 case ISD::SETNE: return X86::COND_NE; 2045 case ISD::SETULT: return X86::COND_B; 2046 case ISD::SETUGT: return X86::COND_A; 2047 case ISD::SETULE: return X86::COND_BE; 2048 case ISD::SETUGE: return X86::COND_AE; 2049 } 2050 } 2051 2052 // First determine if it is required or is profitable to flip the operands. 2053 2054 // If LHS is a foldable load, but RHS is not, flip the condition. 2055 if ((ISD::isNON_EXTLoad(LHS.getNode()) && LHS.hasOneUse()) && 2056 !(ISD::isNON_EXTLoad(RHS.getNode()) && RHS.hasOneUse())) { 2057 SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode); 2058 std::swap(LHS, RHS); 2059 } 2060 2061 switch (SetCCOpcode) { 2062 default: break; 2063 case ISD::SETOLT: 2064 case ISD::SETOLE: 2065 case ISD::SETUGT: 2066 case ISD::SETUGE: 2067 std::swap(LHS, RHS); 2068 break; 2069 } 2070 2071 // On a floating point condition, the flags are set as follows: 2072 // ZF PF CF op 2073 // 0 | 0 | 0 | X > Y 2074 // 0 | 0 | 1 | X < Y 2075 // 1 | 0 | 0 | X == Y 2076 // 1 | 1 | 1 | unordered 2077 switch (SetCCOpcode) { 2078 default: assert(0 && "Condcode should be pre-legalized away"); 2079 case ISD::SETUEQ: 2080 case ISD::SETEQ: return X86::COND_E; 2081 case ISD::SETOLT: // flipped 2082 case ISD::SETOGT: 2083 case ISD::SETGT: return X86::COND_A; 2084 case ISD::SETOLE: // flipped 2085 case ISD::SETOGE: 2086 case ISD::SETGE: return X86::COND_AE; 2087 case ISD::SETUGT: // flipped 2088 case ISD::SETULT: 2089 case ISD::SETLT: return X86::COND_B; 2090 case ISD::SETUGE: // flipped 2091 case ISD::SETULE: 2092 case ISD::SETLE: return X86::COND_BE; 2093 case ISD::SETONE: 2094 case ISD::SETNE: return X86::COND_NE; 2095 case ISD::SETUO: return X86::COND_P; 2096 case ISD::SETO: return X86::COND_NP; 2097 } 2098} 2099 2100/// hasFPCMov - is there a floating point cmov for the specific X86 condition 2101/// code. Current x86 isa includes the following FP cmov instructions: 2102/// fcmovb, fcomvbe, fcomve, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu. 2103static bool hasFPCMov(unsigned X86CC) { 2104 switch (X86CC) { 2105 default: 2106 return false; 2107 case X86::COND_B: 2108 case X86::COND_BE: 2109 case X86::COND_E: 2110 case X86::COND_P: 2111 case X86::COND_A: 2112 case X86::COND_AE: 2113 case X86::COND_NE: 2114 case X86::COND_NP: 2115 return true; 2116 } 2117} 2118 2119/// isUndefOrInRange - Return true if Val is undef or if its value falls within 2120/// the specified range (L, H]. 2121static bool isUndefOrInRange(int Val, int Low, int Hi) { 2122 return (Val < 0) || (Val >= Low && Val < Hi); 2123} 2124 2125/// isUndefOrEqual - Val is either less than zero (undef) or equal to the 2126/// specified value. 2127static bool isUndefOrEqual(int Val, int CmpVal) { 2128 if (Val < 0 || Val == CmpVal) 2129 return true; 2130 return false; 2131} 2132 2133/// isPSHUFDMask - Return true if the node specifies a shuffle of elements that 2134/// is suitable for input to PSHUFD or PSHUFW. That is, it doesn't reference 2135/// the second operand. 2136static bool isPSHUFDMask(const SmallVectorImpl<int> &Mask, MVT VT) { 2137 if (VT == MVT::v4f32 || VT == MVT::v4i32 || VT == MVT::v4i16) 2138 return (Mask[0] < 4 && Mask[1] < 4 && Mask[2] < 4 && Mask[3] < 4); 2139 if (VT == MVT::v2f64 || VT == MVT::v2i64) 2140 return (Mask[0] < 2 && Mask[1] < 2); 2141 return false; 2142} 2143 2144bool X86::isPSHUFDMask(ShuffleVectorSDNode *N) { 2145 SmallVector<int, 8> M; 2146 N->getMask(M); 2147 return ::isPSHUFDMask(M, N->getValueType(0)); 2148} 2149 2150/// isPSHUFHWMask - Return true if the node specifies a shuffle of elements that 2151/// is suitable for input to PSHUFHW. 2152static bool isPSHUFHWMask(const SmallVectorImpl<int> &Mask, MVT VT) { 2153 if (VT != MVT::v8i16) 2154 return false; 2155 2156 // Lower quadword copied in order or undef. 2157 for (int i = 0; i != 4; ++i) 2158 if (Mask[i] >= 0 && Mask[i] != i) 2159 return false; 2160 2161 // Upper quadword shuffled. 2162 for (int i = 4; i != 8; ++i) 2163 if (Mask[i] >= 0 && (Mask[i] < 4 || Mask[i] > 7)) 2164 return false; 2165 2166 return true; 2167} 2168 2169bool X86::isPSHUFHWMask(ShuffleVectorSDNode *N) { 2170 SmallVector<int, 8> M; 2171 N->getMask(M); 2172 return ::isPSHUFHWMask(M, N->getValueType(0)); 2173} 2174 2175/// isPSHUFLWMask - Return true if the node specifies a shuffle of elements that 2176/// is suitable for input to PSHUFLW. 2177static bool isPSHUFLWMask(const SmallVectorImpl<int> &Mask, MVT VT) { 2178 if (VT != MVT::v8i16) 2179 return false; 2180 2181 // Upper quadword copied in order. 2182 for (int i = 4; i != 8; ++i) 2183 if (Mask[i] >= 0 && Mask[i] != i) 2184 return false; 2185 2186 // Lower quadword shuffled. 2187 for (int i = 0; i != 4; ++i) 2188 if (Mask[i] >= 4) 2189 return false; 2190 2191 return true; 2192} 2193 2194bool X86::isPSHUFLWMask(ShuffleVectorSDNode *N) { 2195 SmallVector<int, 8> M; 2196 N->getMask(M); 2197 return ::isPSHUFLWMask(M, N->getValueType(0)); 2198} 2199 2200/// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand 2201/// specifies a shuffle of elements that is suitable for input to SHUFP*. 2202static bool isSHUFPMask(const SmallVectorImpl<int> &Mask, MVT VT) { 2203 int NumElems = VT.getVectorNumElements(); 2204 if (NumElems != 2 && NumElems != 4) 2205 return false; 2206 2207 int Half = NumElems / 2; 2208 for (int i = 0; i < Half; ++i) 2209 if (!isUndefOrInRange(Mask[i], 0, NumElems)) 2210 return false; 2211 for (int i = Half; i < NumElems; ++i) 2212 if (!isUndefOrInRange(Mask[i], NumElems, NumElems*2)) 2213 return false; 2214 2215 return true; 2216} 2217 2218bool X86::isSHUFPMask(ShuffleVectorSDNode *N) { 2219 SmallVector<int, 8> M; 2220 N->getMask(M); 2221 return ::isSHUFPMask(M, N->getValueType(0)); 2222} 2223 2224/// isCommutedSHUFP - Returns true if the shuffle mask is exactly 2225/// the reverse of what x86 shuffles want. x86 shuffles requires the lower 2226/// half elements to come from vector 1 (which would equal the dest.) and 2227/// the upper half to come from vector 2. 2228static bool isCommutedSHUFPMask(const SmallVectorImpl<int> &Mask, MVT VT) { 2229 int NumElems = VT.getVectorNumElements(); 2230 2231 if (NumElems != 2 && NumElems != 4) 2232 return false; 2233 2234 int Half = NumElems / 2; 2235 for (int i = 0; i < Half; ++i) 2236 if (!isUndefOrInRange(Mask[i], NumElems, NumElems*2)) 2237 return false; 2238 for (int i = Half; i < NumElems; ++i) 2239 if (!isUndefOrInRange(Mask[i], 0, NumElems)) 2240 return false; 2241 return true; 2242} 2243 2244static bool isCommutedSHUFP(ShuffleVectorSDNode *N) { 2245 SmallVector<int, 8> M; 2246 N->getMask(M); 2247 return isCommutedSHUFPMask(M, N->getValueType(0)); 2248} 2249 2250/// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand 2251/// specifies a shuffle of elements that is suitable for input to MOVHLPS. 2252bool X86::isMOVHLPSMask(ShuffleVectorSDNode *N) { 2253 if (N->getValueType(0).getVectorNumElements() != 4) 2254 return false; 2255 2256 // Expect bit0 == 6, bit1 == 7, bit2 == 2, bit3 == 3 2257 return isUndefOrEqual(N->getMaskElt(0), 6) && 2258 isUndefOrEqual(N->getMaskElt(1), 7) && 2259 isUndefOrEqual(N->getMaskElt(2), 2) && 2260 isUndefOrEqual(N->getMaskElt(3), 3); 2261} 2262 2263/// isMOVLPMask - Return true if the specified VECTOR_SHUFFLE operand 2264/// specifies a shuffle of elements that is suitable for input to MOVLP{S|D}. 2265bool X86::isMOVLPMask(ShuffleVectorSDNode *N) { 2266 unsigned NumElems = N->getValueType(0).getVectorNumElements(); 2267 2268 if (NumElems != 2 && NumElems != 4) 2269 return false; 2270 2271 for (unsigned i = 0; i < NumElems/2; ++i) 2272 if (!isUndefOrEqual(N->getMaskElt(i), i + NumElems)) 2273 return false; 2274 2275 for (unsigned i = NumElems/2; i < NumElems; ++i) 2276 if (!isUndefOrEqual(N->getMaskElt(i), i)) 2277 return false; 2278 2279 return true; 2280} 2281 2282/// isMOVHPMask - Return true if the specified VECTOR_SHUFFLE operand 2283/// specifies a shuffle of elements that is suitable for input to MOVHP{S|D} 2284/// and MOVLHPS. 2285bool X86::isMOVHPMask(ShuffleVectorSDNode *N) { 2286 unsigned NumElems = N->getValueType(0).getVectorNumElements(); 2287 2288 if (NumElems != 2 && NumElems != 4) 2289 return false; 2290 2291 for (unsigned i = 0; i < NumElems/2; ++i) 2292 if (!isUndefOrEqual(N->getMaskElt(i), i)) 2293 return false; 2294 2295 for (unsigned i = 0; i < NumElems/2; ++i) 2296 if (!isUndefOrEqual(N->getMaskElt(i + NumElems/2), i + NumElems)) 2297 return false; 2298 2299 return true; 2300} 2301 2302/// isMOVHLPS_v_undef_Mask - Special case of isMOVHLPSMask for canonical form 2303/// of vector_shuffle v, v, <2, 3, 2, 3>, i.e. vector_shuffle v, undef, 2304/// <2, 3, 2, 3> 2305bool X86::isMOVHLPS_v_undef_Mask(ShuffleVectorSDNode *N) { 2306 unsigned NumElems = N->getValueType(0).getVectorNumElements(); 2307 2308 if (NumElems != 4) 2309 return false; 2310 2311 return isUndefOrEqual(N->getMaskElt(0), 2) && 2312 isUndefOrEqual(N->getMaskElt(1), 3) && 2313 isUndefOrEqual(N->getMaskElt(2), 2) && 2314 isUndefOrEqual(N->getMaskElt(3), 3); 2315} 2316 2317/// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand 2318/// specifies a shuffle of elements that is suitable for input to UNPCKL. 2319static bool isUNPCKLMask(const SmallVectorImpl<int> &Mask, MVT VT, 2320 bool V2IsSplat = false) { 2321 int NumElts = VT.getVectorNumElements(); 2322 if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16) 2323 return false; 2324 2325 for (int i = 0, j = 0; i != NumElts; i += 2, ++j) { 2326 int BitI = Mask[i]; 2327 int BitI1 = Mask[i+1]; 2328 if (!isUndefOrEqual(BitI, j)) 2329 return false; 2330 if (V2IsSplat) { 2331 if (!isUndefOrEqual(BitI1, NumElts)) 2332 return false; 2333 } else { 2334 if (!isUndefOrEqual(BitI1, j + NumElts)) 2335 return false; 2336 } 2337 } 2338 return true; 2339} 2340 2341bool X86::isUNPCKLMask(ShuffleVectorSDNode *N, bool V2IsSplat) { 2342 SmallVector<int, 8> M; 2343 N->getMask(M); 2344 return ::isUNPCKLMask(M, N->getValueType(0), V2IsSplat); 2345} 2346 2347/// isUNPCKHMask - Return true if the specified VECTOR_SHUFFLE operand 2348/// specifies a shuffle of elements that is suitable for input to UNPCKH. 2349static bool isUNPCKHMask(const SmallVectorImpl<int> &Mask, MVT VT, 2350 bool V2IsSplat = false) { 2351 int NumElts = VT.getVectorNumElements(); 2352 if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16) 2353 return false; 2354 2355 for (int i = 0, j = 0; i != NumElts; i += 2, ++j) { 2356 int BitI = Mask[i]; 2357 int BitI1 = Mask[i+1]; 2358 if (!isUndefOrEqual(BitI, j + NumElts/2)) 2359 return false; 2360 if (V2IsSplat) { 2361 if (isUndefOrEqual(BitI1, NumElts)) 2362 return false; 2363 } else { 2364 if (!isUndefOrEqual(BitI1, j + NumElts/2 + NumElts)) 2365 return false; 2366 } 2367 } 2368 return true; 2369} 2370 2371bool X86::isUNPCKHMask(ShuffleVectorSDNode *N, bool V2IsSplat) { 2372 SmallVector<int, 8> M; 2373 N->getMask(M); 2374 return ::isUNPCKHMask(M, N->getValueType(0), V2IsSplat); 2375} 2376 2377/// isUNPCKL_v_undef_Mask - Special case of isUNPCKLMask for canonical form 2378/// of vector_shuffle v, v, <0, 4, 1, 5>, i.e. vector_shuffle v, undef, 2379/// <0, 0, 1, 1> 2380static bool isUNPCKL_v_undef_Mask(const SmallVectorImpl<int> &Mask, MVT VT) { 2381 int NumElems = VT.getVectorNumElements(); 2382 if (NumElems != 2 && NumElems != 4 && NumElems != 8 && NumElems != 16) 2383 return false; 2384 2385 for (int i = 0, j = 0; i != NumElems; i += 2, ++j) { 2386 int BitI = Mask[i]; 2387 int BitI1 = Mask[i+1]; 2388 if (!isUndefOrEqual(BitI, j)) 2389 return false; 2390 if (!isUndefOrEqual(BitI1, j)) 2391 return false; 2392 } 2393 return true; 2394} 2395 2396bool X86::isUNPCKL_v_undef_Mask(ShuffleVectorSDNode *N) { 2397 SmallVector<int, 8> M; 2398 N->getMask(M); 2399 return ::isUNPCKL_v_undef_Mask(M, N->getValueType(0)); 2400} 2401 2402/// isUNPCKH_v_undef_Mask - Special case of isUNPCKHMask for canonical form 2403/// of vector_shuffle v, v, <2, 6, 3, 7>, i.e. vector_shuffle v, undef, 2404/// <2, 2, 3, 3> 2405static bool isUNPCKH_v_undef_Mask(const SmallVectorImpl<int> &Mask, MVT VT) { 2406 int NumElems = VT.getVectorNumElements(); 2407 if (NumElems != 2 && NumElems != 4 && NumElems != 8 && NumElems != 16) 2408 return false; 2409 2410 for (int i = 0, j = NumElems / 2; i != NumElems; i += 2, ++j) { 2411 int BitI = Mask[i]; 2412 int BitI1 = Mask[i+1]; 2413 if (!isUndefOrEqual(BitI, j)) 2414 return false; 2415 if (!isUndefOrEqual(BitI1, j)) 2416 return false; 2417 } 2418 return true; 2419} 2420 2421bool X86::isUNPCKH_v_undef_Mask(ShuffleVectorSDNode *N) { 2422 SmallVector<int, 8> M; 2423 N->getMask(M); 2424 return ::isUNPCKH_v_undef_Mask(M, N->getValueType(0)); 2425} 2426 2427/// isMOVLMask - Return true if the specified VECTOR_SHUFFLE operand 2428/// specifies a shuffle of elements that is suitable for input to MOVSS, 2429/// MOVSD, and MOVD, i.e. setting the lowest element. 2430static bool isMOVLMask(const SmallVectorImpl<int> &Mask, MVT VT) { 2431 if (VT.getVectorElementType().getSizeInBits() < 32) 2432 return false; 2433 2434 int NumElts = VT.getVectorNumElements(); 2435 2436 if (!isUndefOrEqual(Mask[0], NumElts)) 2437 return false; 2438 2439 for (int i = 1; i < NumElts; ++i) 2440 if (!isUndefOrEqual(Mask[i], i)) 2441 return false; 2442 2443 return true; 2444} 2445 2446bool X86::isMOVLMask(ShuffleVectorSDNode *N) { 2447 SmallVector<int, 8> M; 2448 N->getMask(M); 2449 return ::isMOVLMask(M, N->getValueType(0)); 2450} 2451 2452/// isCommutedMOVL - Returns true if the shuffle mask is except the reverse 2453/// of what x86 movss want. X86 movs requires the lowest element to be lowest 2454/// element of vector 2 and the other elements to come from vector 1 in order. 2455static bool isCommutedMOVLMask(const SmallVectorImpl<int> &Mask, MVT VT, 2456 bool V2IsSplat = false, bool V2IsUndef = false) { 2457 int NumOps = VT.getVectorNumElements(); 2458 if (NumOps != 2 && NumOps != 4 && NumOps != 8 && NumOps != 16) 2459 return false; 2460 2461 if (!isUndefOrEqual(Mask[0], 0)) 2462 return false; 2463 2464 for (int i = 1; i < NumOps; ++i) 2465 if (!(isUndefOrEqual(Mask[i], i+NumOps) || 2466 (V2IsUndef && isUndefOrInRange(Mask[i], NumOps, NumOps*2)) || 2467 (V2IsSplat && isUndefOrEqual(Mask[i], NumOps)))) 2468 return false; 2469 2470 return true; 2471} 2472 2473static bool isCommutedMOVL(ShuffleVectorSDNode *N, bool V2IsSplat = false, 2474 bool V2IsUndef = false) { 2475 SmallVector<int, 8> M; 2476 N->getMask(M); 2477 return isCommutedMOVLMask(M, N->getValueType(0), V2IsSplat, V2IsUndef); 2478} 2479 2480/// isMOVSHDUPMask - Return true if the specified VECTOR_SHUFFLE operand 2481/// specifies a shuffle of elements that is suitable for input to MOVSHDUP. 2482bool X86::isMOVSHDUPMask(ShuffleVectorSDNode *N) { 2483 if (N->getValueType(0).getVectorNumElements() != 4) 2484 return false; 2485 2486 // Expect 1, 1, 3, 3 2487 for (unsigned i = 0; i < 2; ++i) { 2488 int Elt = N->getMaskElt(i); 2489 if (Elt >= 0 && Elt != 1) 2490 return false; 2491 } 2492 2493 bool HasHi = false; 2494 for (unsigned i = 2; i < 4; ++i) { 2495 int Elt = N->getMaskElt(i); 2496 if (Elt >= 0 && Elt != 3) 2497 return false; 2498 if (Elt == 3) 2499 HasHi = true; 2500 } 2501 // Don't use movshdup if it can be done with a shufps. 2502 // FIXME: verify that matching u, u, 3, 3 is what we want. 2503 return HasHi; 2504} 2505 2506/// isMOVSLDUPMask - Return true if the specified VECTOR_SHUFFLE operand 2507/// specifies a shuffle of elements that is suitable for input to MOVSLDUP. 2508bool X86::isMOVSLDUPMask(ShuffleVectorSDNode *N) { 2509 if (N->getValueType(0).getVectorNumElements() != 4) 2510 return false; 2511 2512 // Expect 0, 0, 2, 2 2513 for (unsigned i = 0; i < 2; ++i) 2514 if (N->getMaskElt(i) > 0) 2515 return false; 2516 2517 bool HasHi = false; 2518 for (unsigned i = 2; i < 4; ++i) { 2519 int Elt = N->getMaskElt(i); 2520 if (Elt >= 0 && Elt != 2) 2521 return false; 2522 if (Elt == 2) 2523 HasHi = true; 2524 } 2525 // Don't use movsldup if it can be done with a shufps. 2526 return HasHi; 2527} 2528 2529/// isMOVDDUPMask - Return true if the specified VECTOR_SHUFFLE operand 2530/// specifies a shuffle of elements that is suitable for input to MOVDDUP. 2531bool X86::isMOVDDUPMask(ShuffleVectorSDNode *N) { 2532 int e = N->getValueType(0).getVectorNumElements() / 2; 2533 2534 for (int i = 0; i < e; ++i) 2535 if (!isUndefOrEqual(N->getMaskElt(i), i)) 2536 return false; 2537 for (int i = 0; i < e; ++i) 2538 if (!isUndefOrEqual(N->getMaskElt(e+i), i)) 2539 return false; 2540 return true; 2541} 2542 2543/// getShuffleSHUFImmediate - Return the appropriate immediate to shuffle 2544/// the specified isShuffleMask VECTOR_SHUFFLE mask with PSHUF* and SHUFP* 2545/// instructions. 2546unsigned X86::getShuffleSHUFImmediate(SDNode *N) { 2547 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 2548 int NumOperands = SVOp->getValueType(0).getVectorNumElements(); 2549 2550 unsigned Shift = (NumOperands == 4) ? 2 : 1; 2551 unsigned Mask = 0; 2552 for (int i = 0; i < NumOperands; ++i) { 2553 int Val = SVOp->getMaskElt(NumOperands-i-1); 2554 if (Val < 0) Val = 0; 2555 if (Val >= NumOperands) Val -= NumOperands; 2556 Mask |= Val; 2557 if (i != NumOperands - 1) 2558 Mask <<= Shift; 2559 } 2560 return Mask; 2561} 2562 2563/// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle 2564/// the specified isShuffleMask VECTOR_SHUFFLE mask with PSHUFHW 2565/// instructions. 2566unsigned X86::getShufflePSHUFHWImmediate(SDNode *N) { 2567 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 2568 unsigned Mask = 0; 2569 // 8 nodes, but we only care about the last 4. 2570 for (unsigned i = 7; i >= 4; --i) { 2571 int Val = SVOp->getMaskElt(i); 2572 if (Val >= 0) 2573 Mask |= (Val - 4); 2574 if (i != 4) 2575 Mask <<= 2; 2576 } 2577 return Mask; 2578} 2579 2580/// getShufflePSHUFLWImmediate - Return the appropriate immediate to shuffle 2581/// the specified isShuffleMask VECTOR_SHUFFLE mask with PSHUFLW 2582/// instructions. 2583unsigned X86::getShufflePSHUFLWImmediate(SDNode *N) { 2584 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 2585 unsigned Mask = 0; 2586 // 8 nodes, but we only care about the first 4. 2587 for (int i = 3; i >= 0; --i) { 2588 int Val = SVOp->getMaskElt(i); 2589 if (Val >= 0) 2590 Mask |= Val; 2591 if (i != 0) 2592 Mask <<= 2; 2593 } 2594 return Mask; 2595} 2596 2597/// CommuteVectorShuffle - Swap vector_shuffle operands as well as values in 2598/// their permute mask. 2599static SDValue CommuteVectorShuffle(ShuffleVectorSDNode *SVOp, 2600 SelectionDAG &DAG) { 2601 MVT VT = SVOp->getValueType(0); 2602 unsigned NumElems = VT.getVectorNumElements(); 2603 SmallVector<int, 8> MaskVec; 2604 2605 for (unsigned i = 0; i != NumElems; ++i) { 2606 int idx = SVOp->getMaskElt(i); 2607 if (idx < 0) 2608 MaskVec.push_back(idx); 2609 else if (idx < (int)NumElems) 2610 MaskVec.push_back(idx + NumElems); 2611 else 2612 MaskVec.push_back(idx - NumElems); 2613 } 2614 return DAG.getVectorShuffle(VT, SVOp->getDebugLoc(), SVOp->getOperand(1), 2615 SVOp->getOperand(0), &MaskVec[0]); 2616} 2617 2618/// CommuteVectorShuffleMask - Change values in a shuffle permute mask assuming 2619/// the two vector operands have swapped position. 2620static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask, MVT VT) { 2621 unsigned NumElems = VT.getVectorNumElements(); 2622 for (unsigned i = 0; i != NumElems; ++i) { 2623 int idx = Mask[i]; 2624 if (idx < 0) 2625 continue; 2626 else if (idx < (int)NumElems) 2627 Mask[i] = idx + NumElems; 2628 else 2629 Mask[i] = idx - NumElems; 2630 } 2631} 2632 2633/// ShouldXformToMOVHLPS - Return true if the node should be transformed to 2634/// match movhlps. The lower half elements should come from upper half of 2635/// V1 (and in order), and the upper half elements should come from the upper 2636/// half of V2 (and in order). 2637static bool ShouldXformToMOVHLPS(ShuffleVectorSDNode *Op) { 2638 if (Op->getValueType(0).getVectorNumElements() != 4) 2639 return false; 2640 for (unsigned i = 0, e = 2; i != e; ++i) 2641 if (!isUndefOrEqual(Op->getMaskElt(i), i+2)) 2642 return false; 2643 for (unsigned i = 2; i != 4; ++i) 2644 if (!isUndefOrEqual(Op->getMaskElt(i), i+4)) 2645 return false; 2646 return true; 2647} 2648 2649/// isScalarLoadToVector - Returns true if the node is a scalar load that 2650/// is promoted to a vector. It also returns the LoadSDNode by reference if 2651/// required. 2652static bool isScalarLoadToVector(SDNode *N, LoadSDNode **LD = NULL) { 2653 if (N->getOpcode() != ISD::SCALAR_TO_VECTOR) 2654 return false; 2655 N = N->getOperand(0).getNode(); 2656 if (!ISD::isNON_EXTLoad(N)) 2657 return false; 2658 if (LD) 2659 *LD = cast<LoadSDNode>(N); 2660 return true; 2661} 2662 2663/// ShouldXformToMOVLP{S|D} - Return true if the node should be transformed to 2664/// match movlp{s|d}. The lower half elements should come from lower half of 2665/// V1 (and in order), and the upper half elements should come from the upper 2666/// half of V2 (and in order). And since V1 will become the source of the 2667/// MOVLP, it must be either a vector load or a scalar load to vector. 2668static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2, 2669 ShuffleVectorSDNode *Op) { 2670 if (!ISD::isNON_EXTLoad(V1) && !isScalarLoadToVector(V1)) 2671 return false; 2672 // Is V2 is a vector load, don't do this transformation. We will try to use 2673 // load folding shufps op. 2674 if (ISD::isNON_EXTLoad(V2)) 2675 return false; 2676 2677 unsigned NumElems = Op->getValueType(0).getVectorNumElements(); 2678 2679 if (NumElems != 2 && NumElems != 4) 2680 return false; 2681 for (unsigned i = 0, e = NumElems/2; i != e; ++i) 2682 if (!isUndefOrEqual(Op->getMaskElt(i), i)) 2683 return false; 2684 for (unsigned i = NumElems/2; i != NumElems; ++i) 2685 if (!isUndefOrEqual(Op->getMaskElt(i), i+NumElems)) 2686 return false; 2687 return true; 2688} 2689 2690/// isSplatVector - Returns true if N is a BUILD_VECTOR node whose elements are 2691/// all the same. 2692static bool isSplatVector(SDNode *N) { 2693 if (N->getOpcode() != ISD::BUILD_VECTOR) 2694 return false; 2695 2696 SDValue SplatValue = N->getOperand(0); 2697 for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i) 2698 if (N->getOperand(i) != SplatValue) 2699 return false; 2700 return true; 2701} 2702 2703/// isZeroNode - Returns true if Elt is a constant zero or a floating point 2704/// constant +0.0. 2705static inline bool isZeroNode(SDValue Elt) { 2706 return ((isa<ConstantSDNode>(Elt) && 2707 cast<ConstantSDNode>(Elt)->getZExtValue() == 0) || 2708 (isa<ConstantFPSDNode>(Elt) && 2709 cast<ConstantFPSDNode>(Elt)->getValueAPF().isPosZero())); 2710} 2711 2712/// isZeroShuffle - Returns true if N is a VECTOR_SHUFFLE that can be resolved 2713/// to an zero vector. 2714/// FIXME: move to dag combiner / method on ShuffleVectorSDNode 2715static bool isZeroShuffle(ShuffleVectorSDNode *N) { 2716 SDValue V1 = N->getOperand(0); 2717 SDValue V2 = N->getOperand(1); 2718 unsigned NumElems = N->getValueType(0).getVectorNumElements(); 2719 for (unsigned i = 0; i != NumElems; ++i) { 2720 int Idx = N->getMaskElt(i); 2721 if (Idx >= (int)NumElems) { 2722 unsigned Opc = V2.getOpcode(); 2723 if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V2.getNode())) 2724 continue; 2725 if (Opc != ISD::BUILD_VECTOR || !isZeroNode(V2.getOperand(Idx-NumElems))) 2726 return false; 2727 } else if (Idx >= 0) { 2728 unsigned Opc = V1.getOpcode(); 2729 if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V1.getNode())) 2730 continue; 2731 if (Opc != ISD::BUILD_VECTOR || !isZeroNode(V1.getOperand(Idx))) 2732 return false; 2733 } 2734 } 2735 return true; 2736} 2737 2738/// getZeroVector - Returns a vector of specified type with all zero elements. 2739/// 2740static SDValue getZeroVector(MVT VT, bool HasSSE2, SelectionDAG &DAG, 2741 DebugLoc dl) { 2742 assert(VT.isVector() && "Expected a vector type"); 2743 2744 // Always build zero vectors as <4 x i32> or <2 x i32> bitcasted to their dest 2745 // type. This ensures they get CSE'd. 2746 SDValue Vec; 2747 if (VT.getSizeInBits() == 64) { // MMX 2748 SDValue Cst = DAG.getTargetConstant(0, MVT::i32); 2749 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i32, Cst, Cst); 2750 } else if (HasSSE2) { // SSE2 2751 SDValue Cst = DAG.getTargetConstant(0, MVT::i32); 2752 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst); 2753 } else { // SSE1 2754 SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32); 2755 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst); 2756 } 2757 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Vec); 2758} 2759 2760/// getOnesVector - Returns a vector of specified type with all bits set. 2761/// 2762static SDValue getOnesVector(MVT VT, SelectionDAG &DAG, DebugLoc dl) { 2763 assert(VT.isVector() && "Expected a vector type"); 2764 2765 // Always build ones vectors as <4 x i32> or <2 x i32> bitcasted to their dest 2766 // type. This ensures they get CSE'd. 2767 SDValue Cst = DAG.getTargetConstant(~0U, MVT::i32); 2768 SDValue Vec; 2769 if (VT.getSizeInBits() == 64) // MMX 2770 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i32, Cst, Cst); 2771 else // SSE 2772 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst); 2773 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Vec); 2774} 2775 2776 2777/// NormalizeMask - V2 is a splat, modify the mask (if needed) so all elements 2778/// that point to V2 points to its first element. 2779static SDValue NormalizeMask(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { 2780 MVT VT = SVOp->getValueType(0); 2781 unsigned NumElems = VT.getVectorNumElements(); 2782 2783 bool Changed = false; 2784 SmallVector<int, 8> MaskVec; 2785 SVOp->getMask(MaskVec); 2786 2787 for (unsigned i = 0; i != NumElems; ++i) { 2788 if (MaskVec[i] > (int)NumElems) { 2789 MaskVec[i] = NumElems; 2790 Changed = true; 2791 } 2792 } 2793 if (Changed) 2794 return DAG.getVectorShuffle(VT, SVOp->getDebugLoc(), SVOp->getOperand(0), 2795 SVOp->getOperand(1), &MaskVec[0]); 2796 return SDValue(SVOp, 0); 2797} 2798 2799/// getMOVLMask - Returns a vector_shuffle mask for an movs{s|d}, movd 2800/// operation of specified width. 2801static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, MVT VT, SDValue V1, 2802 SDValue V2) { 2803 unsigned NumElems = VT.getVectorNumElements(); 2804 SmallVector<int, 8> Mask; 2805 Mask.push_back(NumElems); 2806 for (unsigned i = 1; i != NumElems; ++i) 2807 Mask.push_back(i); 2808 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]); 2809} 2810 2811/// getUnpackl - Returns a vector_shuffle node for an unpackl operation. 2812static SDValue getUnpackl(SelectionDAG &DAG, DebugLoc dl, MVT VT, SDValue V1, 2813 SDValue V2) { 2814 unsigned NumElems = VT.getVectorNumElements(); 2815 SmallVector<int, 8> Mask; 2816 for (unsigned i = 0, e = NumElems/2; i != e; ++i) { 2817 Mask.push_back(i); 2818 Mask.push_back(i + NumElems); 2819 } 2820 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]); 2821} 2822 2823/// getUnpackhMask - Returns a vector_shuffle node for an unpackh operation. 2824static SDValue getUnpackh(SelectionDAG &DAG, DebugLoc dl, MVT VT, SDValue V1, 2825 SDValue V2) { 2826 unsigned NumElems = VT.getVectorNumElements(); 2827 unsigned Half = NumElems/2; 2828 SmallVector<int, 8> Mask; 2829 for (unsigned i = 0; i != Half; ++i) { 2830 Mask.push_back(i + Half); 2831 Mask.push_back(i + NumElems + Half); 2832 } 2833 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]); 2834} 2835 2836/// PromoteSplat - Promote a splat of v4f32, v8i16 or v16i8 to v4i32. 2837static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG, 2838 bool HasSSE2) { 2839 if (SV->getValueType(0).getVectorNumElements() <= 4) 2840 return SDValue(SV, 0); 2841 2842 MVT PVT = MVT::v4f32; 2843 MVT VT = SV->getValueType(0); 2844 DebugLoc dl = SV->getDebugLoc(); 2845 SDValue V1 = SV->getOperand(0); 2846 int NumElems = VT.getVectorNumElements(); 2847 int EltNo = SV->getSplatIndex(); 2848 2849 // unpack elements to the correct location 2850 while (NumElems > 4) { 2851 if (EltNo < NumElems/2) { 2852 V1 = getUnpackl(DAG, dl, VT, V1, V1); 2853 } else { 2854 V1 = getUnpackh(DAG, dl, VT, V1, V1); 2855 EltNo -= NumElems/2; 2856 } 2857 NumElems >>= 1; 2858 } 2859 2860 // Perform the splat. 2861 int SplatMask[4] = { EltNo, EltNo, EltNo, EltNo }; 2862 V1 = DAG.getNode(ISD::BIT_CONVERT, dl, PVT, V1); 2863 V1 = DAG.getVectorShuffle(PVT, dl, V1, DAG.getUNDEF(PVT), &SplatMask[0]); 2864 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, V1); 2865} 2866 2867/// getShuffleVectorZeroOrUndef - Return a vector_shuffle of the specified 2868/// vector of zero or undef vector. This produces a shuffle where the low 2869/// element of V2 is swizzled into the zero/undef vector, landing at element 2870/// Idx. This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3). 2871static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx, 2872 bool isZero, bool HasSSE2, 2873 SelectionDAG &DAG) { 2874 MVT VT = V2.getValueType(); 2875 SDValue V1 = isZero 2876 ? getZeroVector(VT, HasSSE2, DAG, V2.getDebugLoc()) : DAG.getUNDEF(VT); 2877 unsigned NumElems = VT.getVectorNumElements(); 2878 SmallVector<int, 16> MaskVec; 2879 for (unsigned i = 0; i != NumElems; ++i) 2880 // If this is the insertion idx, put the low elt of V2 here. 2881 MaskVec.push_back(i == Idx ? NumElems : i); 2882 return DAG.getVectorShuffle(VT, V2.getDebugLoc(), V1, V2, &MaskVec[0]); 2883} 2884 2885/// getNumOfConsecutiveZeros - Return the number of elements in a result of 2886/// a shuffle that is zero. 2887static 2888unsigned getNumOfConsecutiveZeros(ShuffleVectorSDNode *SVOp, int NumElems, 2889 bool Low, SelectionDAG &DAG) { 2890 unsigned NumZeros = 0; 2891 for (int i = 0; i < NumElems; ++i) { 2892 unsigned Index = Low ? i : NumElems-i-1; 2893 int Idx = SVOp->getMaskElt(Index); 2894 if (Idx < 0) { 2895 ++NumZeros; 2896 continue; 2897 } 2898 SDValue Elt = DAG.getShuffleScalarElt(SVOp, Index); 2899 if (Elt.getNode() && isZeroNode(Elt)) 2900 ++NumZeros; 2901 else 2902 break; 2903 } 2904 return NumZeros; 2905} 2906 2907/// isVectorShift - Returns true if the shuffle can be implemented as a 2908/// logical left or right shift of a vector. 2909/// FIXME: split into pslldqi, psrldqi, palignr variants. 2910static bool isVectorShift(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG, 2911 bool &isLeft, SDValue &ShVal, unsigned &ShAmt) { 2912 int NumElems = SVOp->getValueType(0).getVectorNumElements(); 2913 2914 isLeft = true; 2915 unsigned NumZeros = getNumOfConsecutiveZeros(SVOp, NumElems, true, DAG); 2916 if (!NumZeros) { 2917 isLeft = false; 2918 NumZeros = getNumOfConsecutiveZeros(SVOp, NumElems, false, DAG); 2919 if (!NumZeros) 2920 return false; 2921 } 2922 bool SeenV1 = false; 2923 bool SeenV2 = false; 2924 for (int i = NumZeros; i < NumElems; ++i) { 2925 int Val = isLeft ? (i - NumZeros) : i; 2926 int Idx = SVOp->getMaskElt(isLeft ? i : (i - NumZeros)); 2927 if (Idx < 0) 2928 continue; 2929 if (Idx < NumElems) 2930 SeenV1 = true; 2931 else { 2932 Idx -= NumElems; 2933 SeenV2 = true; 2934 } 2935 if (Idx != Val) 2936 return false; 2937 } 2938 if (SeenV1 && SeenV2) 2939 return false; 2940 2941 ShVal = SeenV1 ? SVOp->getOperand(0) : SVOp->getOperand(1); 2942 ShAmt = NumZeros; 2943 return true; 2944} 2945 2946 2947/// LowerBuildVectorv16i8 - Custom lower build_vector of v16i8. 2948/// 2949static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros, 2950 unsigned NumNonZero, unsigned NumZero, 2951 SelectionDAG &DAG, TargetLowering &TLI) { 2952 if (NumNonZero > 8) 2953 return SDValue(); 2954 2955 DebugLoc dl = Op.getDebugLoc(); 2956 SDValue V(0, 0); 2957 bool First = true; 2958 for (unsigned i = 0; i < 16; ++i) { 2959 bool ThisIsNonZero = (NonZeros & (1 << i)) != 0; 2960 if (ThisIsNonZero && First) { 2961 if (NumZero) 2962 V = getZeroVector(MVT::v8i16, true, DAG, dl); 2963 else 2964 V = DAG.getUNDEF(MVT::v8i16); 2965 First = false; 2966 } 2967 2968 if ((i & 1) != 0) { 2969 SDValue ThisElt(0, 0), LastElt(0, 0); 2970 bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0; 2971 if (LastIsNonZero) { 2972 LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl, 2973 MVT::i16, Op.getOperand(i-1)); 2974 } 2975 if (ThisIsNonZero) { 2976 ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i)); 2977 ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16, 2978 ThisElt, DAG.getConstant(8, MVT::i8)); 2979 if (LastIsNonZero) 2980 ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt); 2981 } else 2982 ThisElt = LastElt; 2983 2984 if (ThisElt.getNode()) 2985 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt, 2986 DAG.getIntPtrConstant(i/2)); 2987 } 2988 } 2989 2990 return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, V); 2991} 2992 2993/// LowerBuildVectorv8i16 - Custom lower build_vector of v8i16. 2994/// 2995static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros, 2996 unsigned NumNonZero, unsigned NumZero, 2997 SelectionDAG &DAG, TargetLowering &TLI) { 2998 if (NumNonZero > 4) 2999 return SDValue(); 3000 3001 DebugLoc dl = Op.getDebugLoc(); 3002 SDValue V(0, 0); 3003 bool First = true; 3004 for (unsigned i = 0; i < 8; ++i) { 3005 bool isNonZero = (NonZeros & (1 << i)) != 0; 3006 if (isNonZero) { 3007 if (First) { 3008 if (NumZero) 3009 V = getZeroVector(MVT::v8i16, true, DAG, dl); 3010 else 3011 V = DAG.getUNDEF(MVT::v8i16); 3012 First = false; 3013 } 3014 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, 3015 MVT::v8i16, V, Op.getOperand(i), 3016 DAG.getIntPtrConstant(i)); 3017 } 3018 } 3019 3020 return V; 3021} 3022 3023/// getVShift - Return a vector logical shift node. 3024/// 3025static SDValue getVShift(bool isLeft, MVT VT, SDValue SrcOp, 3026 unsigned NumBits, SelectionDAG &DAG, 3027 const TargetLowering &TLI, DebugLoc dl) { 3028 bool isMMX = VT.getSizeInBits() == 64; 3029 MVT ShVT = isMMX ? MVT::v1i64 : MVT::v2i64; 3030 unsigned Opc = isLeft ? X86ISD::VSHL : X86ISD::VSRL; 3031 SrcOp = DAG.getNode(ISD::BIT_CONVERT, dl, ShVT, SrcOp); 3032 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, 3033 DAG.getNode(Opc, dl, ShVT, SrcOp, 3034 DAG.getConstant(NumBits, TLI.getShiftAmountTy()))); 3035} 3036 3037SDValue 3038X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) { 3039 DebugLoc dl = Op.getDebugLoc(); 3040 // All zero's are handled with pxor, all one's are handled with pcmpeqd. 3041 if (ISD::isBuildVectorAllZeros(Op.getNode()) 3042 || ISD::isBuildVectorAllOnes(Op.getNode())) { 3043 // Canonicalize this to either <4 x i32> or <2 x i32> (SSE vs MMX) to 3044 // 1) ensure the zero vectors are CSE'd, and 2) ensure that i64 scalars are 3045 // eliminated on x86-32 hosts. 3046 if (Op.getValueType() == MVT::v4i32 || Op.getValueType() == MVT::v2i32) 3047 return Op; 3048 3049 if (ISD::isBuildVectorAllOnes(Op.getNode())) 3050 return getOnesVector(Op.getValueType(), DAG, dl); 3051 return getZeroVector(Op.getValueType(), Subtarget->hasSSE2(), DAG, dl); 3052 } 3053 3054 MVT VT = Op.getValueType(); 3055 MVT EVT = VT.getVectorElementType(); 3056 unsigned EVTBits = EVT.getSizeInBits(); 3057 3058 unsigned NumElems = Op.getNumOperands(); 3059 unsigned NumZero = 0; 3060 unsigned NumNonZero = 0; 3061 unsigned NonZeros = 0; 3062 bool IsAllConstants = true; 3063 SmallSet<SDValue, 8> Values; 3064 for (unsigned i = 0; i < NumElems; ++i) { 3065 SDValue Elt = Op.getOperand(i); 3066 if (Elt.getOpcode() == ISD::UNDEF) 3067 continue; 3068 Values.insert(Elt); 3069 if (Elt.getOpcode() != ISD::Constant && 3070 Elt.getOpcode() != ISD::ConstantFP) 3071 IsAllConstants = false; 3072 if (isZeroNode(Elt)) 3073 NumZero++; 3074 else { 3075 NonZeros |= (1 << i); 3076 NumNonZero++; 3077 } 3078 } 3079 3080 if (NumNonZero == 0) { 3081 // All undef vector. Return an UNDEF. All zero vectors were handled above. 3082 return DAG.getUNDEF(VT); 3083 } 3084 3085 // Special case for single non-zero, non-undef, element. 3086 if (NumNonZero == 1) { 3087 unsigned Idx = CountTrailingZeros_32(NonZeros); 3088 SDValue Item = Op.getOperand(Idx); 3089 3090 // If this is an insertion of an i64 value on x86-32, and if the top bits of 3091 // the value are obviously zero, truncate the value to i32 and do the 3092 // insertion that way. Only do this if the value is non-constant or if the 3093 // value is a constant being inserted into element 0. It is cheaper to do 3094 // a constant pool load than it is to do a movd + shuffle. 3095 if (EVT == MVT::i64 && !Subtarget->is64Bit() && 3096 (!IsAllConstants || Idx == 0)) { 3097 if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) { 3098 // Handle MMX and SSE both. 3099 MVT VecVT = VT == MVT::v2i64 ? MVT::v4i32 : MVT::v2i32; 3100 unsigned VecElts = VT == MVT::v2i64 ? 4 : 2; 3101 3102 // Truncate the value (which may itself be a constant) to i32, and 3103 // convert it to a vector with movd (S2V+shuffle to zero extend). 3104 Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item); 3105 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item); 3106 Item = getShuffleVectorZeroOrUndef(Item, 0, true, 3107 Subtarget->hasSSE2(), DAG); 3108 3109 // Now we have our 32-bit value zero extended in the low element of 3110 // a vector. If Idx != 0, swizzle it into place. 3111 if (Idx != 0) { 3112 SmallVector<int, 4> Mask; 3113 Mask.push_back(Idx); 3114 for (unsigned i = 1; i != VecElts; ++i) 3115 Mask.push_back(i); 3116 Item = DAG.getVectorShuffle(VecVT, dl, Item, 3117 DAG.getUNDEF(Item.getValueType()), 3118 &Mask[0]); 3119 } 3120 return DAG.getNode(ISD::BIT_CONVERT, dl, Op.getValueType(), Item); 3121 } 3122 } 3123 3124 // If we have a constant or non-constant insertion into the low element of 3125 // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into 3126 // the rest of the elements. This will be matched as movd/movq/movss/movsd 3127 // depending on what the source datatype is. 3128 if (Idx == 0) { 3129 if (NumZero == 0) { 3130 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); 3131 } else if (EVT == MVT::i32 || EVT == MVT::f32 || EVT == MVT::f64 || 3132 (EVT == MVT::i64 && Subtarget->is64Bit())) { 3133 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); 3134 // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector. 3135 return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget->hasSSE2(), 3136 DAG); 3137 } else if (EVT == MVT::i16 || EVT == MVT::i8) { 3138 Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item); 3139 MVT MiddleVT = VT.getSizeInBits() == 64 ? MVT::v2i32 : MVT::v4i32; 3140 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MiddleVT, Item); 3141 Item = getShuffleVectorZeroOrUndef(Item, 0, true, 3142 Subtarget->hasSSE2(), DAG); 3143 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Item); 3144 } 3145 } 3146 3147 // Is it a vector logical left shift? 3148 if (NumElems == 2 && Idx == 1 && 3149 isZeroNode(Op.getOperand(0)) && !isZeroNode(Op.getOperand(1))) { 3150 unsigned NumBits = VT.getSizeInBits(); 3151 return getVShift(true, VT, 3152 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, 3153 VT, Op.getOperand(1)), 3154 NumBits/2, DAG, *this, dl); 3155 } 3156 3157 if (IsAllConstants) // Otherwise, it's better to do a constpool load. 3158 return SDValue(); 3159 3160 // Otherwise, if this is a vector with i32 or f32 elements, and the element 3161 // is a non-constant being inserted into an element other than the low one, 3162 // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka 3163 // movd/movss) to move this into the low element, then shuffle it into 3164 // place. 3165 if (EVTBits == 32) { 3166 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); 3167 3168 // Turn it into a shuffle of zero and zero-extended scalar to vector. 3169 Item = getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0, 3170 Subtarget->hasSSE2(), DAG); 3171 SmallVector<int, 8> MaskVec; 3172 for (unsigned i = 0; i < NumElems; i++) 3173 MaskVec.push_back(i == Idx ? 0 : 1); 3174 return DAG.getVectorShuffle(VT, dl, Item, DAG.getUNDEF(VT), &MaskVec[0]); 3175 } 3176 } 3177 3178 // Splat is obviously ok. Let legalizer expand it to a shuffle. 3179 if (Values.size() == 1) 3180 return SDValue(); 3181 3182 // A vector full of immediates; various special cases are already 3183 // handled, so this is best done with a single constant-pool load. 3184 if (IsAllConstants) 3185 return SDValue(); 3186 3187 // Let legalizer expand 2-wide build_vectors. 3188 if (EVTBits == 64) { 3189 if (NumNonZero == 1) { 3190 // One half is zero or undef. 3191 unsigned Idx = CountTrailingZeros_32(NonZeros); 3192 SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, 3193 Op.getOperand(Idx)); 3194 return getShuffleVectorZeroOrUndef(V2, Idx, true, 3195 Subtarget->hasSSE2(), DAG); 3196 } 3197 return SDValue(); 3198 } 3199 3200 // If element VT is < 32 bits, convert it to inserts into a zero vector. 3201 if (EVTBits == 8 && NumElems == 16) { 3202 SDValue V = LowerBuildVectorv16i8(Op, NonZeros,NumNonZero,NumZero, DAG, 3203 *this); 3204 if (V.getNode()) return V; 3205 } 3206 3207 if (EVTBits == 16 && NumElems == 8) { 3208 SDValue V = LowerBuildVectorv8i16(Op, NonZeros,NumNonZero,NumZero, DAG, 3209 *this); 3210 if (V.getNode()) return V; 3211 } 3212 3213 // If element VT is == 32 bits, turn it into a number of shuffles. 3214 SmallVector<SDValue, 8> V; 3215 V.resize(NumElems); 3216 if (NumElems == 4 && NumZero > 0) { 3217 for (unsigned i = 0; i < 4; ++i) { 3218 bool isZero = !(NonZeros & (1 << i)); 3219 if (isZero) 3220 V[i] = getZeroVector(VT, Subtarget->hasSSE2(), DAG, dl); 3221 else 3222 V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i)); 3223 } 3224 3225 for (unsigned i = 0; i < 2; ++i) { 3226 switch ((NonZeros & (0x3 << i*2)) >> (i*2)) { 3227 default: break; 3228 case 0: 3229 V[i] = V[i*2]; // Must be a zero vector. 3230 break; 3231 case 1: 3232 V[i] = getMOVL(DAG, dl, VT, V[i*2+1], V[i*2]); 3233 break; 3234 case 2: 3235 V[i] = getMOVL(DAG, dl, VT, V[i*2], V[i*2+1]); 3236 break; 3237 case 3: 3238 V[i] = getUnpackl(DAG, dl, VT, V[i*2], V[i*2+1]); 3239 break; 3240 } 3241 } 3242 3243 SmallVector<int, 8> MaskVec; 3244 bool Reverse = (NonZeros & 0x3) == 2; 3245 for (unsigned i = 0; i < 2; ++i) 3246 MaskVec.push_back(Reverse ? 1-i : i); 3247 Reverse = ((NonZeros & (0x3 << 2)) >> 2) == 2; 3248 for (unsigned i = 0; i < 2; ++i) 3249 MaskVec.push_back(Reverse ? 1-i+NumElems : i+NumElems); 3250 return DAG.getVectorShuffle(VT, dl, V[0], V[1], &MaskVec[0]); 3251 } 3252 3253 if (Values.size() > 2) { 3254 // If we have SSE 4.1, Expand into a number of inserts unless the number of 3255 // values to be inserted is equal to the number of elements, in which case 3256 // use the unpack code below in the hopes of matching the consecutive elts 3257 // load merge pattern for shuffles. 3258 // FIXME: We could probably just check that here directly. 3259 if (Values.size() < NumElems && VT.getSizeInBits() == 128 && 3260 getSubtarget()->hasSSE41()) { 3261 V[0] = DAG.getUNDEF(VT); 3262 for (unsigned i = 0; i < NumElems; ++i) 3263 if (Op.getOperand(i).getOpcode() != ISD::UNDEF) 3264 V[0] = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, V[0], 3265 Op.getOperand(i), DAG.getIntPtrConstant(i)); 3266 return V[0]; 3267 } 3268 // Expand into a number of unpckl*. 3269 // e.g. for v4f32 3270 // Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0> 3271 // : unpcklps 1, 3 ==> Y: <?, ?, 3, 1> 3272 // Step 2: unpcklps X, Y ==> <3, 2, 1, 0> 3273 for (unsigned i = 0; i < NumElems; ++i) 3274 V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i)); 3275 NumElems >>= 1; 3276 while (NumElems != 0) { 3277 for (unsigned i = 0; i < NumElems; ++i) 3278 V[i] = getUnpackl(DAG, dl, VT, V[i], V[i + NumElems]); 3279 NumElems >>= 1; 3280 } 3281 return V[0]; 3282 } 3283 3284 return SDValue(); 3285} 3286 3287// v8i16 shuffles - Prefer shuffles in the following order: 3288// 1. [all] pshuflw, pshufhw, optional move 3289// 2. [ssse3] 1 x pshufb 3290// 3. [ssse3] 2 x pshufb + 1 x por 3291// 4. [all] mov + pshuflw + pshufhw + N x (pextrw + pinsrw) 3292static 3293SDValue LowerVECTOR_SHUFFLEv8i16(ShuffleVectorSDNode *SVOp, 3294 SelectionDAG &DAG, X86TargetLowering &TLI) { 3295 SDValue V1 = SVOp->getOperand(0); 3296 SDValue V2 = SVOp->getOperand(1); 3297 DebugLoc dl = SVOp->getDebugLoc(); 3298 SmallVector<int, 8> MaskVals; 3299 3300 // Determine if more than 1 of the words in each of the low and high quadwords 3301 // of the result come from the same quadword of one of the two inputs. Undef 3302 // mask values count as coming from any quadword, for better codegen. 3303 SmallVector<unsigned, 4> LoQuad(4); 3304 SmallVector<unsigned, 4> HiQuad(4); 3305 BitVector InputQuads(4); 3306 for (unsigned i = 0; i < 8; ++i) { 3307 SmallVectorImpl<unsigned> &Quad = i < 4 ? LoQuad : HiQuad; 3308 int EltIdx = SVOp->getMaskElt(i); 3309 MaskVals.push_back(EltIdx); 3310 if (EltIdx < 0) { 3311 ++Quad[0]; 3312 ++Quad[1]; 3313 ++Quad[2]; 3314 ++Quad[3]; 3315 continue; 3316 } 3317 ++Quad[EltIdx / 4]; 3318 InputQuads.set(EltIdx / 4); 3319 } 3320 3321 int BestLoQuad = -1; 3322 unsigned MaxQuad = 1; 3323 for (unsigned i = 0; i < 4; ++i) { 3324 if (LoQuad[i] > MaxQuad) { 3325 BestLoQuad = i; 3326 MaxQuad = LoQuad[i]; 3327 } 3328 } 3329 3330 int BestHiQuad = -1; 3331 MaxQuad = 1; 3332 for (unsigned i = 0; i < 4; ++i) { 3333 if (HiQuad[i] > MaxQuad) { 3334 BestHiQuad = i; 3335 MaxQuad = HiQuad[i]; 3336 } 3337 } 3338 3339 // For SSSE3, If all 8 words of the result come from only 1 quadword of each 3340 // of the two input vectors, shuffle them into one input vector so only a 3341 // single pshufb instruction is necessary. If There are more than 2 input 3342 // quads, disable the next transformation since it does not help SSSE3. 3343 bool V1Used = InputQuads[0] || InputQuads[1]; 3344 bool V2Used = InputQuads[2] || InputQuads[3]; 3345 if (TLI.getSubtarget()->hasSSSE3()) { 3346 if (InputQuads.count() == 2 && V1Used && V2Used) { 3347 BestLoQuad = InputQuads.find_first(); 3348 BestHiQuad = InputQuads.find_next(BestLoQuad); 3349 } 3350 if (InputQuads.count() > 2) { 3351 BestLoQuad = -1; 3352 BestHiQuad = -1; 3353 } 3354 } 3355 3356 // If BestLoQuad or BestHiQuad are set, shuffle the quads together and update 3357 // the shuffle mask. If a quad is scored as -1, that means that it contains 3358 // words from all 4 input quadwords. 3359 SDValue NewV; 3360 if (BestLoQuad >= 0 || BestHiQuad >= 0) { 3361 SmallVector<int, 8> MaskV; 3362 MaskV.push_back(BestLoQuad < 0 ? 0 : BestLoQuad); 3363 MaskV.push_back(BestHiQuad < 0 ? 1 : BestHiQuad); 3364 NewV = DAG.getVectorShuffle(MVT::v2i64, dl, 3365 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, V1), 3366 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, V2), &MaskV[0]); 3367 NewV = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, NewV); 3368 3369 // Rewrite the MaskVals and assign NewV to V1 if NewV now contains all the 3370 // source words for the shuffle, to aid later transformations. 3371 bool AllWordsInNewV = true; 3372 bool InOrder[2] = { true, true }; 3373 for (unsigned i = 0; i != 8; ++i) { 3374 int idx = MaskVals[i]; 3375 if (idx != (int)i) 3376 InOrder[i/4] = false; 3377 if (idx < 0 || (idx/4) == BestLoQuad || (idx/4) == BestHiQuad) 3378 continue; 3379 AllWordsInNewV = false; 3380 break; 3381 } 3382 3383 bool pshuflw = AllWordsInNewV, pshufhw = AllWordsInNewV; 3384 if (AllWordsInNewV) { 3385 for (int i = 0; i != 8; ++i) { 3386 int idx = MaskVals[i]; 3387 if (idx < 0) 3388 continue; 3389 idx = MaskVals[i] = (idx / 4) == BestLoQuad ? (idx & 3) : (idx & 3) + 4; 3390 if ((idx != i) && idx < 4) 3391 pshufhw = false; 3392 if ((idx != i) && idx > 3) 3393 pshuflw = false; 3394 } 3395 V1 = NewV; 3396 V2Used = false; 3397 BestLoQuad = 0; 3398 BestHiQuad = 1; 3399 } 3400 3401 // If we've eliminated the use of V2, and the new mask is a pshuflw or 3402 // pshufhw, that's as cheap as it gets. Return the new shuffle. 3403 if ((pshufhw && InOrder[0]) || (pshuflw && InOrder[1])) { 3404 return DAG.getVectorShuffle(MVT::v8i16, dl, NewV, 3405 DAG.getUNDEF(MVT::v8i16), &MaskVals[0]); 3406 } 3407 } 3408 3409 // If we have SSSE3, and all words of the result are from 1 input vector, 3410 // case 2 is generated, otherwise case 3 is generated. If no SSSE3 3411 // is present, fall back to case 4. 3412 if (TLI.getSubtarget()->hasSSSE3()) { 3413 SmallVector<SDValue,16> pshufbMask; 3414 3415 // If we have elements from both input vectors, set the high bit of the 3416 // shuffle mask element to zero out elements that come from V2 in the V1 3417 // mask, and elements that come from V1 in the V2 mask, so that the two 3418 // results can be OR'd together. 3419 bool TwoInputs = V1Used && V2Used; 3420 for (unsigned i = 0; i != 8; ++i) { 3421 int EltIdx = MaskVals[i] * 2; 3422 if (TwoInputs && (EltIdx >= 16)) { 3423 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 3424 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 3425 continue; 3426 } 3427 pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8)); 3428 pshufbMask.push_back(DAG.getConstant(EltIdx+1, MVT::i8)); 3429 } 3430 V1 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, V1); 3431 V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1, 3432 DAG.getNode(ISD::BUILD_VECTOR, dl, 3433 MVT::v16i8, &pshufbMask[0], 16)); 3434 if (!TwoInputs) 3435 return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V1); 3436 3437 // Calculate the shuffle mask for the second input, shuffle it, and 3438 // OR it with the first shuffled input. 3439 pshufbMask.clear(); 3440 for (unsigned i = 0; i != 8; ++i) { 3441 int EltIdx = MaskVals[i] * 2; 3442 if (EltIdx < 16) { 3443 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 3444 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 3445 continue; 3446 } 3447 pshufbMask.push_back(DAG.getConstant(EltIdx - 16, MVT::i8)); 3448 pshufbMask.push_back(DAG.getConstant(EltIdx - 15, MVT::i8)); 3449 } 3450 V2 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, V2); 3451 V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2, 3452 DAG.getNode(ISD::BUILD_VECTOR, dl, 3453 MVT::v16i8, &pshufbMask[0], 16)); 3454 V1 = DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2); 3455 return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V1); 3456 } 3457 3458 // If BestLoQuad >= 0, generate a pshuflw to put the low elements in order, 3459 // and update MaskVals with new element order. 3460 BitVector InOrder(8); 3461 if (BestLoQuad >= 0) { 3462 SmallVector<int, 8> MaskV; 3463 for (int i = 0; i != 4; ++i) { 3464 int idx = MaskVals[i]; 3465 if (idx < 0) { 3466 MaskV.push_back(-1); 3467 InOrder.set(i); 3468 } else if ((idx / 4) == BestLoQuad) { 3469 MaskV.push_back(idx & 3); 3470 InOrder.set(i); 3471 } else { 3472 MaskV.push_back(-1); 3473 } 3474 } 3475 for (unsigned i = 4; i != 8; ++i) 3476 MaskV.push_back(i); 3477 NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16), 3478 &MaskV[0]); 3479 } 3480 3481 // If BestHi >= 0, generate a pshufhw to put the high elements in order, 3482 // and update MaskVals with the new element order. 3483 if (BestHiQuad >= 0) { 3484 SmallVector<int, 8> MaskV; 3485 for (unsigned i = 0; i != 4; ++i) 3486 MaskV.push_back(i); 3487 for (unsigned i = 4; i != 8; ++i) { 3488 int idx = MaskVals[i]; 3489 if (idx < 0) { 3490 MaskV.push_back(-1); 3491 InOrder.set(i); 3492 } else if ((idx / 4) == BestHiQuad) { 3493 MaskV.push_back((idx & 3) + 4); 3494 InOrder.set(i); 3495 } else { 3496 MaskV.push_back(-1); 3497 } 3498 } 3499 NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16), 3500 &MaskV[0]); 3501 } 3502 3503 // In case BestHi & BestLo were both -1, which means each quadword has a word 3504 // from each of the four input quadwords, calculate the InOrder bitvector now 3505 // before falling through to the insert/extract cleanup. 3506 if (BestLoQuad == -1 && BestHiQuad == -1) { 3507 NewV = V1; 3508 for (int i = 0; i != 8; ++i) 3509 if (MaskVals[i] < 0 || MaskVals[i] == i) 3510 InOrder.set(i); 3511 } 3512 3513 // The other elements are put in the right place using pextrw and pinsrw. 3514 for (unsigned i = 0; i != 8; ++i) { 3515 if (InOrder[i]) 3516 continue; 3517 int EltIdx = MaskVals[i]; 3518 if (EltIdx < 0) 3519 continue; 3520 SDValue ExtOp = (EltIdx < 8) 3521 ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V1, 3522 DAG.getIntPtrConstant(EltIdx)) 3523 : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V2, 3524 DAG.getIntPtrConstant(EltIdx - 8)); 3525 NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, ExtOp, 3526 DAG.getIntPtrConstant(i)); 3527 } 3528 return NewV; 3529} 3530 3531// v16i8 shuffles - Prefer shuffles in the following order: 3532// 1. [ssse3] 1 x pshufb 3533// 2. [ssse3] 2 x pshufb + 1 x por 3534// 3. [all] v8i16 shuffle + N x pextrw + rotate + pinsrw 3535static 3536SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp, 3537 SelectionDAG &DAG, X86TargetLowering &TLI) { 3538 SDValue V1 = SVOp->getOperand(0); 3539 SDValue V2 = SVOp->getOperand(1); 3540 DebugLoc dl = SVOp->getDebugLoc(); 3541 SmallVector<int, 16> MaskVals; 3542 SVOp->getMask(MaskVals); 3543 3544 // If we have SSSE3, case 1 is generated when all result bytes come from 3545 // one of the inputs. Otherwise, case 2 is generated. If no SSSE3 is 3546 // present, fall back to case 3. 3547 // FIXME: kill V2Only once shuffles are canonizalized by getNode. 3548 bool V1Only = true; 3549 bool V2Only = true; 3550 for (unsigned i = 0; i < 16; ++i) { 3551 int EltIdx = MaskVals[i]; 3552 if (EltIdx < 0) 3553 continue; 3554 if (EltIdx < 16) 3555 V2Only = false; 3556 else 3557 V1Only = false; 3558 } 3559 3560 // If SSSE3, use 1 pshufb instruction per vector with elements in the result. 3561 if (TLI.getSubtarget()->hasSSSE3()) { 3562 SmallVector<SDValue,16> pshufbMask; 3563 3564 // If all result elements are from one input vector, then only translate 3565 // undef mask values to 0x80 (zero out result) in the pshufb mask. 3566 // 3567 // Otherwise, we have elements from both input vectors, and must zero out 3568 // elements that come from V2 in the first mask, and V1 in the second mask 3569 // so that we can OR them together. 3570 bool TwoInputs = !(V1Only || V2Only); 3571 for (unsigned i = 0; i != 16; ++i) { 3572 int EltIdx = MaskVals[i]; 3573 if (EltIdx < 0 || (TwoInputs && EltIdx >= 16)) { 3574 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 3575 continue; 3576 } 3577 pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8)); 3578 } 3579 // If all the elements are from V2, assign it to V1 and return after 3580 // building the first pshufb. 3581 if (V2Only) 3582 V1 = V2; 3583 V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1, 3584 DAG.getNode(ISD::BUILD_VECTOR, dl, 3585 MVT::v16i8, &pshufbMask[0], 16)); 3586 if (!TwoInputs) 3587 return V1; 3588 3589 // Calculate the shuffle mask for the second input, shuffle it, and 3590 // OR it with the first shuffled input. 3591 pshufbMask.clear(); 3592 for (unsigned i = 0; i != 16; ++i) { 3593 int EltIdx = MaskVals[i]; 3594 if (EltIdx < 16) { 3595 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 3596 continue; 3597 } 3598 pshufbMask.push_back(DAG.getConstant(EltIdx - 16, MVT::i8)); 3599 } 3600 V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2, 3601 DAG.getNode(ISD::BUILD_VECTOR, dl, 3602 MVT::v16i8, &pshufbMask[0], 16)); 3603 return DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2); 3604 } 3605 3606 // No SSSE3 - Calculate in place words and then fix all out of place words 3607 // With 0-16 extracts & inserts. Worst case is 16 bytes out of order from 3608 // the 16 different words that comprise the two doublequadword input vectors. 3609 V1 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V1); 3610 V2 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V2); 3611 SDValue NewV = V2Only ? V2 : V1; 3612 for (int i = 0; i != 8; ++i) { 3613 int Elt0 = MaskVals[i*2]; 3614 int Elt1 = MaskVals[i*2+1]; 3615 3616 // This word of the result is all undef, skip it. 3617 if (Elt0 < 0 && Elt1 < 0) 3618 continue; 3619 3620 // This word of the result is already in the correct place, skip it. 3621 if (V1Only && (Elt0 == i*2) && (Elt1 == i*2+1)) 3622 continue; 3623 if (V2Only && (Elt0 == i*2+16) && (Elt1 == i*2+17)) 3624 continue; 3625 3626 SDValue Elt0Src = Elt0 < 16 ? V1 : V2; 3627 SDValue Elt1Src = Elt1 < 16 ? V1 : V2; 3628 SDValue InsElt; 3629 3630 // If Elt0 and Elt1 are defined, are consecutive, and can be load 3631 // using a single extract together, load it and store it. 3632 if ((Elt0 >= 0) && ((Elt0 + 1) == Elt1) && ((Elt0 & 1) == 0)) { 3633 InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src, 3634 DAG.getIntPtrConstant(Elt1 / 2)); 3635 NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt, 3636 DAG.getIntPtrConstant(i)); 3637 continue; 3638 } 3639 3640 // If Elt1 is defined, extract it from the appropriate source. If the 3641 // source byte is not also odd, shift the extracted word left 8 bits 3642 // otherwise clear the bottom 8 bits if we need to do an or. 3643 if (Elt1 >= 0) { 3644 InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src, 3645 DAG.getIntPtrConstant(Elt1 / 2)); 3646 if ((Elt1 & 1) == 0) 3647 InsElt = DAG.getNode(ISD::SHL, dl, MVT::i16, InsElt, 3648 DAG.getConstant(8, TLI.getShiftAmountTy())); 3649 else if (Elt0 >= 0) 3650 InsElt = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt, 3651 DAG.getConstant(0xFF00, MVT::i16)); 3652 } 3653 // If Elt0 is defined, extract it from the appropriate source. If the 3654 // source byte is not also even, shift the extracted word right 8 bits. If 3655 // Elt1 was also defined, OR the extracted values together before 3656 // inserting them in the result. 3657 if (Elt0 >= 0) { 3658 SDValue InsElt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, 3659 Elt0Src, DAG.getIntPtrConstant(Elt0 / 2)); 3660 if ((Elt0 & 1) != 0) 3661 InsElt0 = DAG.getNode(ISD::SRL, dl, MVT::i16, InsElt0, 3662 DAG.getConstant(8, TLI.getShiftAmountTy())); 3663 else if (Elt1 >= 0) 3664 InsElt0 = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt0, 3665 DAG.getConstant(0x00FF, MVT::i16)); 3666 InsElt = Elt1 >= 0 ? DAG.getNode(ISD::OR, dl, MVT::i16, InsElt, InsElt0) 3667 : InsElt0; 3668 } 3669 NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt, 3670 DAG.getIntPtrConstant(i)); 3671 } 3672 return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, NewV); 3673} 3674 3675/// RewriteAsNarrowerShuffle - Try rewriting v8i16 and v16i8 shuffles as 4 wide 3676/// ones, or rewriting v4i32 / v2f32 as 2 wide ones if possible. This can be 3677/// done when every pair / quad of shuffle mask elements point to elements in 3678/// the right sequence. e.g. 3679/// vector_shuffle <>, <>, < 3, 4, | 10, 11, | 0, 1, | 14, 15> 3680static 3681SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp, 3682 SelectionDAG &DAG, 3683 TargetLowering &TLI, DebugLoc dl) { 3684 MVT VT = SVOp->getValueType(0); 3685 SDValue V1 = SVOp->getOperand(0); 3686 SDValue V2 = SVOp->getOperand(1); 3687 unsigned NumElems = VT.getVectorNumElements(); 3688 unsigned NewWidth = (NumElems == 4) ? 2 : 4; 3689 MVT MaskVT = MVT::getIntVectorWithNumElements(NewWidth); 3690 MVT MaskEltVT = MaskVT.getVectorElementType(); 3691 MVT NewVT = MaskVT; 3692 switch (VT.getSimpleVT()) { 3693 default: assert(false && "Unexpected!"); 3694 case MVT::v4f32: NewVT = MVT::v2f64; break; 3695 case MVT::v4i32: NewVT = MVT::v2i64; break; 3696 case MVT::v8i16: NewVT = MVT::v4i32; break; 3697 case MVT::v16i8: NewVT = MVT::v4i32; break; 3698 } 3699 3700 if (NewWidth == 2) { 3701 if (VT.isInteger()) 3702 NewVT = MVT::v2i64; 3703 else 3704 NewVT = MVT::v2f64; 3705 } 3706 int Scale = NumElems / NewWidth; 3707 SmallVector<int, 8> MaskVec; 3708 for (unsigned i = 0; i < NumElems; i += Scale) { 3709 int StartIdx = -1; 3710 for (int j = 0; j < Scale; ++j) { 3711 int EltIdx = SVOp->getMaskElt(i+j); 3712 if (EltIdx < 0) 3713 continue; 3714 if (StartIdx == -1) 3715 StartIdx = EltIdx - (EltIdx % Scale); 3716 if (EltIdx != StartIdx + j) 3717 return SDValue(); 3718 } 3719 if (StartIdx == -1) 3720 MaskVec.push_back(-1); 3721 else 3722 MaskVec.push_back(StartIdx / Scale); 3723 } 3724 3725 V1 = DAG.getNode(ISD::BIT_CONVERT, dl, NewVT, V1); 3726 V2 = DAG.getNode(ISD::BIT_CONVERT, dl, NewVT, V2); 3727 return DAG.getVectorShuffle(NewVT, dl, V1, V2, &MaskVec[0]); 3728} 3729 3730/// getVZextMovL - Return a zero-extending vector move low node. 3731/// 3732static SDValue getVZextMovL(MVT VT, MVT OpVT, 3733 SDValue SrcOp, SelectionDAG &DAG, 3734 const X86Subtarget *Subtarget, DebugLoc dl) { 3735 if (VT == MVT::v2f64 || VT == MVT::v4f32) { 3736 LoadSDNode *LD = NULL; 3737 if (!isScalarLoadToVector(SrcOp.getNode(), &LD)) 3738 LD = dyn_cast<LoadSDNode>(SrcOp); 3739 if (!LD) { 3740 // movssrr and movsdrr do not clear top bits. Try to use movd, movq 3741 // instead. 3742 MVT EVT = (OpVT == MVT::v2f64) ? MVT::i64 : MVT::i32; 3743 if ((EVT != MVT::i64 || Subtarget->is64Bit()) && 3744 SrcOp.getOpcode() == ISD::SCALAR_TO_VECTOR && 3745 SrcOp.getOperand(0).getOpcode() == ISD::BIT_CONVERT && 3746 SrcOp.getOperand(0).getOperand(0).getValueType() == EVT) { 3747 // PR2108 3748 OpVT = (OpVT == MVT::v2f64) ? MVT::v2i64 : MVT::v4i32; 3749 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, 3750 DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT, 3751 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, 3752 OpVT, 3753 SrcOp.getOperand(0) 3754 .getOperand(0)))); 3755 } 3756 } 3757 } 3758 3759 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, 3760 DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT, 3761 DAG.getNode(ISD::BIT_CONVERT, dl, 3762 OpVT, SrcOp))); 3763} 3764 3765/// LowerVECTOR_SHUFFLE_4wide - Handle all 4 wide cases with a number of 3766/// shuffles. 3767static SDValue 3768LowerVECTOR_SHUFFLE_4wide(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { 3769 SDValue V1 = SVOp->getOperand(0); 3770 SDValue V2 = SVOp->getOperand(1); 3771 DebugLoc dl = SVOp->getDebugLoc(); 3772 MVT VT = SVOp->getValueType(0); 3773 3774 SmallVector<std::pair<int, int>, 8> Locs; 3775 Locs.resize(4); 3776 SmallVector<int, 8> Mask1(4U, -1); 3777 SmallVector<int, 8> PermMask; 3778 SVOp->getMask(PermMask); 3779 3780 unsigned NumHi = 0; 3781 unsigned NumLo = 0; 3782 for (unsigned i = 0; i != 4; ++i) { 3783 int Idx = PermMask[i]; 3784 if (Idx < 0) { 3785 Locs[i] = std::make_pair(-1, -1); 3786 } else { 3787 assert(Idx < 8 && "Invalid VECTOR_SHUFFLE index!"); 3788 if (Idx < 4) { 3789 Locs[i] = std::make_pair(0, NumLo); 3790 Mask1[NumLo] = Idx; 3791 NumLo++; 3792 } else { 3793 Locs[i] = std::make_pair(1, NumHi); 3794 if (2+NumHi < 4) 3795 Mask1[2+NumHi] = Idx; 3796 NumHi++; 3797 } 3798 } 3799 } 3800 3801 if (NumLo <= 2 && NumHi <= 2) { 3802 // If no more than two elements come from either vector. This can be 3803 // implemented with two shuffles. First shuffle gather the elements. 3804 // The second shuffle, which takes the first shuffle as both of its 3805 // vector operands, put the elements into the right order. 3806 V1 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]); 3807 3808 SmallVector<int, 8> Mask2(4U, -1); 3809 3810 for (unsigned i = 0; i != 4; ++i) { 3811 if (Locs[i].first == -1) 3812 continue; 3813 else { 3814 unsigned Idx = (i < 2) ? 0 : 4; 3815 Idx += Locs[i].first * 2 + Locs[i].second; 3816 Mask2[i] = Idx; 3817 } 3818 } 3819 3820 return DAG.getVectorShuffle(VT, dl, V1, V1, &Mask2[0]); 3821 } else if (NumLo == 3 || NumHi == 3) { 3822 // Otherwise, we must have three elements from one vector, call it X, and 3823 // one element from the other, call it Y. First, use a shufps to build an 3824 // intermediate vector with the one element from Y and the element from X 3825 // that will be in the same half in the final destination (the indexes don't 3826 // matter). Then, use a shufps to build the final vector, taking the half 3827 // containing the element from Y from the intermediate, and the other half 3828 // from X. 3829 if (NumHi == 3) { 3830 // Normalize it so the 3 elements come from V1. 3831 CommuteVectorShuffleMask(PermMask, VT); 3832 std::swap(V1, V2); 3833 } 3834 3835 // Find the element from V2. 3836 unsigned HiIndex; 3837 for (HiIndex = 0; HiIndex < 3; ++HiIndex) { 3838 int Val = PermMask[HiIndex]; 3839 if (Val < 0) 3840 continue; 3841 if (Val >= 4) 3842 break; 3843 } 3844 3845 Mask1[0] = PermMask[HiIndex]; 3846 Mask1[1] = -1; 3847 Mask1[2] = PermMask[HiIndex^1]; 3848 Mask1[3] = -1; 3849 V2 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]); 3850 3851 if (HiIndex >= 2) { 3852 Mask1[0] = PermMask[0]; 3853 Mask1[1] = PermMask[1]; 3854 Mask1[2] = HiIndex & 1 ? 6 : 4; 3855 Mask1[3] = HiIndex & 1 ? 4 : 6; 3856 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]); 3857 } else { 3858 Mask1[0] = HiIndex & 1 ? 2 : 0; 3859 Mask1[1] = HiIndex & 1 ? 0 : 2; 3860 Mask1[2] = PermMask[2]; 3861 Mask1[3] = PermMask[3]; 3862 if (Mask1[2] >= 0) 3863 Mask1[2] += 4; 3864 if (Mask1[3] >= 0) 3865 Mask1[3] += 4; 3866 return DAG.getVectorShuffle(VT, dl, V2, V1, &Mask1[0]); 3867 } 3868 } 3869 3870 // Break it into (shuffle shuffle_hi, shuffle_lo). 3871 Locs.clear(); 3872 SmallVector<int,8> LoMask(4U, -1); 3873 SmallVector<int,8> HiMask(4U, -1); 3874 3875 SmallVector<int,8> *MaskPtr = &LoMask; 3876 unsigned MaskIdx = 0; 3877 unsigned LoIdx = 0; 3878 unsigned HiIdx = 2; 3879 for (unsigned i = 0; i != 4; ++i) { 3880 if (i == 2) { 3881 MaskPtr = &HiMask; 3882 MaskIdx = 1; 3883 LoIdx = 0; 3884 HiIdx = 2; 3885 } 3886 int Idx = PermMask[i]; 3887 if (Idx < 0) { 3888 Locs[i] = std::make_pair(-1, -1); 3889 } else if (Idx < 4) { 3890 Locs[i] = std::make_pair(MaskIdx, LoIdx); 3891 (*MaskPtr)[LoIdx] = Idx; 3892 LoIdx++; 3893 } else { 3894 Locs[i] = std::make_pair(MaskIdx, HiIdx); 3895 (*MaskPtr)[HiIdx] = Idx; 3896 HiIdx++; 3897 } 3898 } 3899 3900 SDValue LoShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &LoMask[0]); 3901 SDValue HiShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &HiMask[0]); 3902 SmallVector<int, 8> MaskOps; 3903 for (unsigned i = 0; i != 4; ++i) { 3904 if (Locs[i].first == -1) { 3905 MaskOps.push_back(-1); 3906 } else { 3907 unsigned Idx = Locs[i].first * 4 + Locs[i].second; 3908 MaskOps.push_back(Idx); 3909 } 3910 } 3911 return DAG.getVectorShuffle(VT, dl, LoShuffle, HiShuffle, &MaskOps[0]); 3912} 3913 3914SDValue 3915X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) { 3916 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); 3917 SDValue V1 = Op.getOperand(0); 3918 SDValue V2 = Op.getOperand(1); 3919 MVT VT = Op.getValueType(); 3920 DebugLoc dl = Op.getDebugLoc(); 3921 unsigned NumElems = VT.getVectorNumElements(); 3922 bool isMMX = VT.getSizeInBits() == 64; 3923 bool V1IsUndef = V1.getOpcode() == ISD::UNDEF; 3924 bool V2IsUndef = V2.getOpcode() == ISD::UNDEF; 3925 bool V1IsSplat = false; 3926 bool V2IsSplat = false; 3927 3928 if (isZeroShuffle(SVOp)) 3929 return getZeroVector(VT, Subtarget->hasSSE2(), DAG, dl); 3930 3931 // Promote splats to v4f32. 3932 if (SVOp->isSplat()) { 3933 if (isMMX || NumElems < 4) 3934 return Op; 3935 return PromoteSplat(SVOp, DAG, Subtarget->hasSSE2()); 3936 } 3937 3938 // If the shuffle can be profitably rewritten as a narrower shuffle, then 3939 // do it! 3940 if (VT == MVT::v8i16 || VT == MVT::v16i8) { 3941 SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, *this, dl); 3942 if (NewOp.getNode()) 3943 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, 3944 LowerVECTOR_SHUFFLE(NewOp, DAG)); 3945 } else if ((VT == MVT::v4i32 || (VT == MVT::v4f32 && Subtarget->hasSSE2()))) { 3946 // FIXME: Figure out a cleaner way to do this. 3947 // Try to make use of movq to zero out the top part. 3948 if (ISD::isBuildVectorAllZeros(V2.getNode())) { 3949 SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, *this, dl); 3950 if (NewOp.getNode()) { 3951 if (isCommutedMOVL(cast<ShuffleVectorSDNode>(NewOp), true, false)) 3952 return getVZextMovL(VT, NewOp.getValueType(), NewOp.getOperand(0), 3953 DAG, Subtarget, dl); 3954 } 3955 } else if (ISD::isBuildVectorAllZeros(V1.getNode())) { 3956 SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, *this, dl); 3957 if (NewOp.getNode() && X86::isMOVLMask(cast<ShuffleVectorSDNode>(NewOp))) 3958 return getVZextMovL(VT, NewOp.getValueType(), NewOp.getOperand(1), 3959 DAG, Subtarget, dl); 3960 } 3961 } 3962 3963 if (X86::isPSHUFDMask(SVOp)) 3964 return Op; 3965 3966 // Check if this can be converted into a logical shift. 3967 bool isLeft = false; 3968 unsigned ShAmt = 0; 3969 SDValue ShVal; 3970 bool isShift = getSubtarget()->hasSSE2() && 3971 isVectorShift(SVOp, DAG, isLeft, ShVal, ShAmt); 3972 if (isShift && ShVal.hasOneUse()) { 3973 // If the shifted value has multiple uses, it may be cheaper to use 3974 // v_set0 + movlhps or movhlps, etc. 3975 MVT EVT = VT.getVectorElementType(); 3976 ShAmt *= EVT.getSizeInBits(); 3977 return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl); 3978 } 3979 3980 if (X86::isMOVLMask(SVOp)) { 3981 if (V1IsUndef) 3982 return V2; 3983 if (ISD::isBuildVectorAllZeros(V1.getNode())) 3984 return getVZextMovL(VT, VT, V2, DAG, Subtarget, dl); 3985 if (!isMMX) 3986 return Op; 3987 } 3988 3989 // FIXME: fold these into legal mask. 3990 if (!isMMX && (X86::isMOVSHDUPMask(SVOp) || 3991 X86::isMOVSLDUPMask(SVOp) || 3992 X86::isMOVHLPSMask(SVOp) || 3993 X86::isMOVHPMask(SVOp) || 3994 X86::isMOVLPMask(SVOp))) 3995 return Op; 3996 3997 if (ShouldXformToMOVHLPS(SVOp) || 3998 ShouldXformToMOVLP(V1.getNode(), V2.getNode(), SVOp)) 3999 return CommuteVectorShuffle(SVOp, DAG); 4000 4001 if (isShift) { 4002 // No better options. Use a vshl / vsrl. 4003 MVT EVT = VT.getVectorElementType(); 4004 ShAmt *= EVT.getSizeInBits(); 4005 return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl); 4006 } 4007 4008 bool Commuted = false; 4009 // FIXME: This should also accept a bitcast of a splat? Be careful, not 4010 // 1,1,1,1 -> v8i16 though. 4011 V1IsSplat = isSplatVector(V1.getNode()); 4012 V2IsSplat = isSplatVector(V2.getNode()); 4013 4014 // Canonicalize the splat or undef, if present, to be on the RHS. 4015 if ((V1IsSplat || V1IsUndef) && !(V2IsSplat || V2IsUndef)) { 4016 Op = CommuteVectorShuffle(SVOp, DAG); 4017 SVOp = cast<ShuffleVectorSDNode>(Op); 4018 V1 = SVOp->getOperand(0); 4019 V2 = SVOp->getOperand(1); 4020 std::swap(V1IsSplat, V2IsSplat); 4021 std::swap(V1IsUndef, V2IsUndef); 4022 Commuted = true; 4023 } 4024 4025 if (isCommutedMOVL(SVOp, V2IsSplat, V2IsUndef)) { 4026 // Shuffling low element of v1 into undef, just return v1. 4027 if (V2IsUndef) 4028 return V1; 4029 // If V2 is a splat, the mask may be malformed such as <4,3,3,3>, which 4030 // the instruction selector will not match, so get a canonical MOVL with 4031 // swapped operands to undo the commute. 4032 return getMOVL(DAG, dl, VT, V2, V1); 4033 } 4034 4035 if (X86::isUNPCKL_v_undef_Mask(SVOp) || 4036 X86::isUNPCKH_v_undef_Mask(SVOp) || 4037 X86::isUNPCKLMask(SVOp) || 4038 X86::isUNPCKHMask(SVOp)) 4039 return Op; 4040 4041 if (V2IsSplat) { 4042 // Normalize mask so all entries that point to V2 points to its first 4043 // element then try to match unpck{h|l} again. If match, return a 4044 // new vector_shuffle with the corrected mask. 4045 SDValue NewMask = NormalizeMask(SVOp, DAG); 4046 ShuffleVectorSDNode *NSVOp = cast<ShuffleVectorSDNode>(NewMask); 4047 if (NSVOp != SVOp) { 4048 if (X86::isUNPCKLMask(NSVOp, true)) { 4049 return NewMask; 4050 } else if (X86::isUNPCKHMask(NSVOp, true)) { 4051 return NewMask; 4052 } 4053 } 4054 } 4055 4056 if (Commuted) { 4057 // Commute is back and try unpck* again. 4058 // FIXME: this seems wrong. 4059 SDValue NewOp = CommuteVectorShuffle(SVOp, DAG); 4060 ShuffleVectorSDNode *NewSVOp = cast<ShuffleVectorSDNode>(NewOp); 4061 if (X86::isUNPCKL_v_undef_Mask(NewSVOp) || 4062 X86::isUNPCKH_v_undef_Mask(NewSVOp) || 4063 X86::isUNPCKLMask(NewSVOp) || 4064 X86::isUNPCKHMask(NewSVOp)) 4065 return NewOp; 4066 } 4067 4068 // FIXME: for mmx, bitcast v2i32 to v4i16 for shuffle. 4069 4070 // Normalize the node to match x86 shuffle ops if needed 4071 if (!isMMX && V2.getOpcode() != ISD::UNDEF && isCommutedSHUFP(SVOp)) 4072 return CommuteVectorShuffle(SVOp, DAG); 4073 4074 // Check for legal shuffle and return? 4075 SmallVector<int, 16> PermMask; 4076 SVOp->getMask(PermMask); 4077 if (isShuffleMaskLegal(PermMask, VT)) 4078 return Op; 4079 4080 // Handle v8i16 specifically since SSE can do byte extraction and insertion. 4081 if (VT == MVT::v8i16) { 4082 SDValue NewOp = LowerVECTOR_SHUFFLEv8i16(SVOp, DAG, *this); 4083 if (NewOp.getNode()) 4084 return NewOp; 4085 } 4086 4087 if (VT == MVT::v16i8) { 4088 SDValue NewOp = LowerVECTOR_SHUFFLEv16i8(SVOp, DAG, *this); 4089 if (NewOp.getNode()) 4090 return NewOp; 4091 } 4092 4093 // Handle all 4 wide cases with a number of shuffles except for MMX. 4094 if (NumElems == 4 && !isMMX) 4095 return LowerVECTOR_SHUFFLE_4wide(SVOp, DAG); 4096 4097 return SDValue(); 4098} 4099 4100SDValue 4101X86TargetLowering::LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, 4102 SelectionDAG &DAG) { 4103 MVT VT = Op.getValueType(); 4104 DebugLoc dl = Op.getDebugLoc(); 4105 if (VT.getSizeInBits() == 8) { 4106 SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32, 4107 Op.getOperand(0), Op.getOperand(1)); 4108 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract, 4109 DAG.getValueType(VT)); 4110 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); 4111 } else if (VT.getSizeInBits() == 16) { 4112 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 4113 // If Idx is 0, it's cheaper to do a move instead of a pextrw. 4114 if (Idx == 0) 4115 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, 4116 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, 4117 DAG.getNode(ISD::BIT_CONVERT, dl, 4118 MVT::v4i32, 4119 Op.getOperand(0)), 4120 Op.getOperand(1))); 4121 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32, 4122 Op.getOperand(0), Op.getOperand(1)); 4123 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract, 4124 DAG.getValueType(VT)); 4125 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); 4126 } else if (VT == MVT::f32) { 4127 // EXTRACTPS outputs to a GPR32 register which will require a movd to copy 4128 // the result back to FR32 register. It's only worth matching if the 4129 // result has a single use which is a store or a bitcast to i32. And in 4130 // the case of a store, it's not worth it if the index is a constant 0, 4131 // because a MOVSSmr can be used instead, which is smaller and faster. 4132 if (!Op.hasOneUse()) 4133 return SDValue(); 4134 SDNode *User = *Op.getNode()->use_begin(); 4135 if ((User->getOpcode() != ISD::STORE || 4136 (isa<ConstantSDNode>(Op.getOperand(1)) && 4137 cast<ConstantSDNode>(Op.getOperand(1))->isNullValue())) && 4138 (User->getOpcode() != ISD::BIT_CONVERT || 4139 User->getValueType(0) != MVT::i32)) 4140 return SDValue(); 4141 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, 4142 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4i32, 4143 Op.getOperand(0)), 4144 Op.getOperand(1)); 4145 return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::f32, Extract); 4146 } else if (VT == MVT::i32) { 4147 // ExtractPS works with constant index. 4148 if (isa<ConstantSDNode>(Op.getOperand(1))) 4149 return Op; 4150 } 4151 return SDValue(); 4152} 4153 4154 4155SDValue 4156X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) { 4157 if (!isa<ConstantSDNode>(Op.getOperand(1))) 4158 return SDValue(); 4159 4160 if (Subtarget->hasSSE41()) { 4161 SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG); 4162 if (Res.getNode()) 4163 return Res; 4164 } 4165 4166 MVT VT = Op.getValueType(); 4167 DebugLoc dl = Op.getDebugLoc(); 4168 // TODO: handle v16i8. 4169 if (VT.getSizeInBits() == 16) { 4170 SDValue Vec = Op.getOperand(0); 4171 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 4172 if (Idx == 0) 4173 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, 4174 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, 4175 DAG.getNode(ISD::BIT_CONVERT, dl, 4176 MVT::v4i32, Vec), 4177 Op.getOperand(1))); 4178 // Transform it so it match pextrw which produces a 32-bit result. 4179 MVT EVT = (MVT::SimpleValueType)(VT.getSimpleVT()+1); 4180 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, EVT, 4181 Op.getOperand(0), Op.getOperand(1)); 4182 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, EVT, Extract, 4183 DAG.getValueType(VT)); 4184 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); 4185 } else if (VT.getSizeInBits() == 32) { 4186 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 4187 if (Idx == 0) 4188 return Op; 4189 4190 // SHUFPS the element to the lowest double word, then movss. 4191 int Mask[4] = { Idx, -1, -1, -1 }; 4192 MVT VVT = Op.getOperand(0).getValueType(); 4193 SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0), 4194 DAG.getUNDEF(VVT), Mask); 4195 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec, 4196 DAG.getIntPtrConstant(0)); 4197 } else if (VT.getSizeInBits() == 64) { 4198 // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b 4199 // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught 4200 // to match extract_elt for f64. 4201 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 4202 if (Idx == 0) 4203 return Op; 4204 4205 // UNPCKHPD the element to the lowest double word, then movsd. 4206 // Note if the lower 64 bits of the result of the UNPCKHPD is then stored 4207 // to a f64mem, the whole operation is folded into a single MOVHPDmr. 4208 int Mask[2] = { 1, -1 }; 4209 MVT VVT = Op.getOperand(0).getValueType(); 4210 SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0), 4211 DAG.getUNDEF(VVT), Mask); 4212 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec, 4213 DAG.getIntPtrConstant(0)); 4214 } 4215 4216 return SDValue(); 4217} 4218 4219SDValue 4220X86TargetLowering::LowerINSERT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG){ 4221 MVT VT = Op.getValueType(); 4222 MVT EVT = VT.getVectorElementType(); 4223 DebugLoc dl = Op.getDebugLoc(); 4224 4225 SDValue N0 = Op.getOperand(0); 4226 SDValue N1 = Op.getOperand(1); 4227 SDValue N2 = Op.getOperand(2); 4228 4229 if ((EVT.getSizeInBits() == 8 || EVT.getSizeInBits() == 16) && 4230 isa<ConstantSDNode>(N2)) { 4231 unsigned Opc = (EVT.getSizeInBits() == 8) ? X86ISD::PINSRB 4232 : X86ISD::PINSRW; 4233 // Transform it so it match pinsr{b,w} which expects a GR32 as its second 4234 // argument. 4235 if (N1.getValueType() != MVT::i32) 4236 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1); 4237 if (N2.getValueType() != MVT::i32) 4238 N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue()); 4239 return DAG.getNode(Opc, dl, VT, N0, N1, N2); 4240 } else if (EVT == MVT::f32 && isa<ConstantSDNode>(N2)) { 4241 // Bits [7:6] of the constant are the source select. This will always be 4242 // zero here. The DAG Combiner may combine an extract_elt index into these 4243 // bits. For example (insert (extract, 3), 2) could be matched by putting 4244 // the '3' into bits [7:6] of X86ISD::INSERTPS. 4245 // Bits [5:4] of the constant are the destination select. This is the 4246 // value of the incoming immediate. 4247 // Bits [3:0] of the constant are the zero mask. The DAG Combiner may 4248 // combine either bitwise AND or insert of float 0.0 to set these bits. 4249 N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue() << 4); 4250 return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2); 4251 } else if (EVT == MVT::i32) { 4252 // InsertPS works with constant index. 4253 if (isa<ConstantSDNode>(N2)) 4254 return Op; 4255 } 4256 return SDValue(); 4257} 4258 4259SDValue 4260X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) { 4261 MVT VT = Op.getValueType(); 4262 MVT EVT = VT.getVectorElementType(); 4263 4264 if (Subtarget->hasSSE41()) 4265 return LowerINSERT_VECTOR_ELT_SSE4(Op, DAG); 4266 4267 if (EVT == MVT::i8) 4268 return SDValue(); 4269 4270 DebugLoc dl = Op.getDebugLoc(); 4271 SDValue N0 = Op.getOperand(0); 4272 SDValue N1 = Op.getOperand(1); 4273 SDValue N2 = Op.getOperand(2); 4274 4275 if (EVT.getSizeInBits() == 16 && isa<ConstantSDNode>(N2)) { 4276 // Transform it so it match pinsrw which expects a 16-bit value in a GR32 4277 // as its second argument. 4278 if (N1.getValueType() != MVT::i32) 4279 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1); 4280 if (N2.getValueType() != MVT::i32) 4281 N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue()); 4282 return DAG.getNode(X86ISD::PINSRW, dl, VT, N0, N1, N2); 4283 } 4284 return SDValue(); 4285} 4286 4287SDValue 4288X86TargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) { 4289 DebugLoc dl = Op.getDebugLoc(); 4290 if (Op.getValueType() == MVT::v2f32) 4291 return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f32, 4292 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i32, 4293 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i32, 4294 Op.getOperand(0)))); 4295 4296 SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0)); 4297 MVT VT = MVT::v2i32; 4298 switch (Op.getValueType().getSimpleVT()) { 4299 default: break; 4300 case MVT::v16i8: 4301 case MVT::v8i16: 4302 VT = MVT::v4i32; 4303 break; 4304 } 4305 return DAG.getNode(ISD::BIT_CONVERT, dl, Op.getValueType(), 4306 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, AnyExt)); 4307} 4308 4309// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as 4310// their target countpart wrapped in the X86ISD::Wrapper node. Suppose N is 4311// one of the above mentioned nodes. It has to be wrapped because otherwise 4312// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only 4313// be used to form addressing mode. These wrapped nodes will be selected 4314// into MOV32ri. 4315SDValue 4316X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) { 4317 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op); 4318 // FIXME there isn't really any debug info here, should come from the parent 4319 DebugLoc dl = CP->getDebugLoc(); 4320 SDValue Result = DAG.getTargetConstantPool(CP->getConstVal(), getPointerTy(), 4321 CP->getAlignment()); 4322 Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result); 4323 // With PIC, the address is actually $g + Offset. 4324 if (getTargetMachine().getRelocationModel() == Reloc::PIC_ && 4325 !Subtarget->isPICStyleRIPRel()) { 4326 Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), 4327 DAG.getNode(X86ISD::GlobalBaseReg, 4328 DebugLoc::getUnknownLoc(), 4329 getPointerTy()), 4330 Result); 4331 } 4332 4333 return Result; 4334} 4335 4336SDValue 4337X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, DebugLoc dl, 4338 int64_t Offset, 4339 SelectionDAG &DAG) const { 4340 bool IsPic = getTargetMachine().getRelocationModel() == Reloc::PIC_; 4341 bool ExtraLoadRequired = 4342 Subtarget->GVRequiresExtraLoad(GV, getTargetMachine(), false); 4343 4344 // Create the TargetGlobalAddress node, folding in the constant 4345 // offset if it is legal. 4346 SDValue Result; 4347 if (!IsPic && !ExtraLoadRequired && isInt32(Offset)) { 4348 Result = DAG.getTargetGlobalAddress(GV, getPointerTy(), Offset); 4349 Offset = 0; 4350 } else 4351 Result = DAG.getTargetGlobalAddress(GV, getPointerTy(), 0); 4352 Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result); 4353 4354 // With PIC, the address is actually $g + Offset. 4355 if (IsPic && !Subtarget->isPICStyleRIPRel()) { 4356 Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), 4357 DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()), 4358 Result); 4359 } 4360 4361 // For Darwin & Mingw32, external and weak symbols are indirect, so we want to 4362 // load the value at address GV, not the value of GV itself. This means that 4363 // the GlobalAddress must be in the base or index register of the address, not 4364 // the GV offset field. Platform check is inside GVRequiresExtraLoad() call 4365 // The same applies for external symbols during PIC codegen 4366 if (ExtraLoadRequired) 4367 Result = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Result, 4368 PseudoSourceValue::getGOT(), 0); 4369 4370 // If there was a non-zero offset that we didn't fold, create an explicit 4371 // addition for it. 4372 if (Offset != 0) 4373 Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), Result, 4374 DAG.getConstant(Offset, getPointerTy())); 4375 4376 return Result; 4377} 4378 4379SDValue 4380X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) { 4381 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); 4382 int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset(); 4383 return LowerGlobalAddress(GV, Op.getDebugLoc(), Offset, DAG); 4384} 4385 4386static SDValue 4387GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA, 4388 SDValue *InFlag, const MVT PtrVT, unsigned ReturnReg) { 4389 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag); 4390 DebugLoc dl = GA->getDebugLoc(); 4391 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), 4392 GA->getValueType(0), 4393 GA->getOffset()); 4394 if (InFlag) { 4395 SDValue Ops[] = { Chain, TGA, *InFlag }; 4396 Chain = DAG.getNode(X86ISD::TLSADDR, dl, NodeTys, Ops, 3); 4397 } else { 4398 SDValue Ops[] = { Chain, TGA }; 4399 Chain = DAG.getNode(X86ISD::TLSADDR, dl, NodeTys, Ops, 2); 4400 } 4401 SDValue Flag = Chain.getValue(1); 4402 return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag); 4403} 4404 4405// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit 4406static SDValue 4407LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG, 4408 const MVT PtrVT) { 4409 SDValue InFlag; 4410 DebugLoc dl = GA->getDebugLoc(); // ? function entry point might be better 4411 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX, 4412 DAG.getNode(X86ISD::GlobalBaseReg, 4413 DebugLoc::getUnknownLoc(), 4414 PtrVT), InFlag); 4415 InFlag = Chain.getValue(1); 4416 4417 return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX); 4418} 4419 4420// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit 4421static SDValue 4422LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG, 4423 const MVT PtrVT) { 4424 return GetTLSADDR(DAG, DAG.getEntryNode(), GA, NULL, PtrVT, X86::RAX); 4425} 4426 4427// Lower ISD::GlobalTLSAddress using the "initial exec" (for no-pic) or 4428// "local exec" model. 4429static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG, 4430 const MVT PtrVT, TLSModel::Model model, 4431 bool is64Bit) { 4432 DebugLoc dl = GA->getDebugLoc(); 4433 // Get the Thread Pointer 4434 SDValue Base = DAG.getNode(X86ISD::SegmentBaseAddress, 4435 DebugLoc::getUnknownLoc(), PtrVT, 4436 DAG.getRegister(is64Bit? X86::FS : X86::GS, 4437 MVT::i32)); 4438 4439 SDValue ThreadPointer = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Base, 4440 NULL, 0); 4441 4442 // emit "addl x@ntpoff,%eax" (local exec) or "addl x@indntpoff,%eax" (initial 4443 // exec) 4444 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), 4445 GA->getValueType(0), 4446 GA->getOffset()); 4447 SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, TGA); 4448 4449 if (model == TLSModel::InitialExec) 4450 Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset, 4451 PseudoSourceValue::getGOT(), 0); 4452 4453 // The address of the thread local variable is the add of the thread 4454 // pointer with the offset of the variable. 4455 return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset); 4456} 4457 4458SDValue 4459X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) { 4460 // TODO: implement the "local dynamic" model 4461 // TODO: implement the "initial exec"model for pic executables 4462 assert(Subtarget->isTargetELF() && 4463 "TLS not implemented for non-ELF targets"); 4464 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op); 4465 GlobalValue *GV = GA->getGlobal(); 4466 TLSModel::Model model = 4467 getTLSModel (GV, getTargetMachine().getRelocationModel()); 4468 if (Subtarget->is64Bit()) { 4469 switch (model) { 4470 case TLSModel::GeneralDynamic: 4471 case TLSModel::LocalDynamic: // not implemented 4472 return LowerToTLSGeneralDynamicModel64(GA, DAG, getPointerTy()); 4473 4474 case TLSModel::InitialExec: 4475 case TLSModel::LocalExec: 4476 return LowerToTLSExecModel(GA, DAG, getPointerTy(), model, true); 4477 } 4478 } else { 4479 switch (model) { 4480 case TLSModel::GeneralDynamic: 4481 case TLSModel::LocalDynamic: // not implemented 4482 return LowerToTLSGeneralDynamicModel32(GA, DAG, getPointerTy()); 4483 4484 case TLSModel::InitialExec: 4485 case TLSModel::LocalExec: 4486 return LowerToTLSExecModel(GA, DAG, getPointerTy(), model, false); 4487 } 4488 } 4489 assert(0 && "Unreachable"); 4490 return SDValue(); 4491} 4492 4493SDValue 4494X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) { 4495 // FIXME there isn't really any debug info here 4496 DebugLoc dl = Op.getDebugLoc(); 4497 const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol(); 4498 SDValue Result = DAG.getTargetExternalSymbol(Sym, getPointerTy()); 4499 Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result); 4500 // With PIC, the address is actually $g + Offset. 4501 if (getTargetMachine().getRelocationModel() == Reloc::PIC_ && 4502 !Subtarget->isPICStyleRIPRel()) { 4503 Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), 4504 DAG.getNode(X86ISD::GlobalBaseReg, 4505 DebugLoc::getUnknownLoc(), 4506 getPointerTy()), 4507 Result); 4508 } 4509 4510 return Result; 4511} 4512 4513SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) { 4514 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op); 4515 // FIXME there isn't really any debug into here 4516 DebugLoc dl = JT->getDebugLoc(); 4517 SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), getPointerTy()); 4518 Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result); 4519 // With PIC, the address is actually $g + Offset. 4520 if (getTargetMachine().getRelocationModel() == Reloc::PIC_ && 4521 !Subtarget->isPICStyleRIPRel()) { 4522 Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), 4523 DAG.getNode(X86ISD::GlobalBaseReg, 4524 DebugLoc::getUnknownLoc(), 4525 getPointerTy()), 4526 Result); 4527 } 4528 4529 return Result; 4530} 4531 4532/// LowerShift - Lower SRA_PARTS and friends, which return two i32 values and 4533/// take a 2 x i32 value to shift plus a shift amount. 4534SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) { 4535 assert(Op.getNumOperands() == 3 && "Not a double-shift!"); 4536 MVT VT = Op.getValueType(); 4537 unsigned VTBits = VT.getSizeInBits(); 4538 DebugLoc dl = Op.getDebugLoc(); 4539 bool isSRA = Op.getOpcode() == ISD::SRA_PARTS; 4540 SDValue ShOpLo = Op.getOperand(0); 4541 SDValue ShOpHi = Op.getOperand(1); 4542 SDValue ShAmt = Op.getOperand(2); 4543 SDValue Tmp1 = isSRA ? 4544 DAG.getNode(ISD::SRA, dl, VT, ShOpHi, 4545 DAG.getConstant(VTBits - 1, MVT::i8)) : 4546 DAG.getConstant(0, VT); 4547 4548 SDValue Tmp2, Tmp3; 4549 if (Op.getOpcode() == ISD::SHL_PARTS) { 4550 Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt); 4551 Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt); 4552 } else { 4553 Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt); 4554 Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, ShAmt); 4555 } 4556 4557 SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt, 4558 DAG.getConstant(VTBits, MVT::i8)); 4559 SDValue Cond = DAG.getNode(X86ISD::CMP, dl, VT, 4560 AndNode, DAG.getConstant(0, MVT::i8)); 4561 4562 SDValue Hi, Lo; 4563 SDValue CC = DAG.getConstant(X86::COND_NE, MVT::i8); 4564 SDValue Ops0[4] = { Tmp2, Tmp3, CC, Cond }; 4565 SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond }; 4566 4567 if (Op.getOpcode() == ISD::SHL_PARTS) { 4568 Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4); 4569 Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4); 4570 } else { 4571 Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4); 4572 Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4); 4573 } 4574 4575 SDValue Ops[2] = { Lo, Hi }; 4576 return DAG.getMergeValues(Ops, 2, dl); 4577} 4578 4579SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) { 4580 MVT SrcVT = Op.getOperand(0).getValueType(); 4581 4582 if (SrcVT.isVector()) { 4583 if (SrcVT == MVT::v2i32 && Op.getValueType() == MVT::v2f64) { 4584 return Op; 4585 } 4586 return SDValue(); 4587 } 4588 4589 assert(SrcVT.getSimpleVT() <= MVT::i64 && SrcVT.getSimpleVT() >= MVT::i16 && 4590 "Unknown SINT_TO_FP to lower!"); 4591 4592 // These are really Legal; return the operand so the caller accepts it as 4593 // Legal. 4594 if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(Op.getValueType())) 4595 return Op; 4596 if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) && 4597 Subtarget->is64Bit()) { 4598 return Op; 4599 } 4600 4601 DebugLoc dl = Op.getDebugLoc(); 4602 unsigned Size = SrcVT.getSizeInBits()/8; 4603 MachineFunction &MF = DAG.getMachineFunction(); 4604 int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size); 4605 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 4606 SDValue Chain = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), 4607 StackSlot, 4608 PseudoSourceValue::getFixedStack(SSFI), 0); 4609 return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG); 4610} 4611 4612SDValue X86TargetLowering::BuildFILD(SDValue Op, MVT SrcVT, SDValue Chain, 4613 SDValue StackSlot, 4614 SelectionDAG &DAG) { 4615 // Build the FILD 4616 DebugLoc dl = Op.getDebugLoc(); 4617 SDVTList Tys; 4618 bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType()); 4619 if (useSSE) 4620 Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Flag); 4621 else 4622 Tys = DAG.getVTList(Op.getValueType(), MVT::Other); 4623 SmallVector<SDValue, 8> Ops; 4624 Ops.push_back(Chain); 4625 Ops.push_back(StackSlot); 4626 Ops.push_back(DAG.getValueType(SrcVT)); 4627 SDValue Result = DAG.getNode(useSSE ? X86ISD::FILD_FLAG : X86ISD::FILD, dl, 4628 Tys, &Ops[0], Ops.size()); 4629 4630 if (useSSE) { 4631 Chain = Result.getValue(1); 4632 SDValue InFlag = Result.getValue(2); 4633 4634 // FIXME: Currently the FST is flagged to the FILD_FLAG. This 4635 // shouldn't be necessary except that RFP cannot be live across 4636 // multiple blocks. When stackifier is fixed, they can be uncoupled. 4637 MachineFunction &MF = DAG.getMachineFunction(); 4638 int SSFI = MF.getFrameInfo()->CreateStackObject(8, 8); 4639 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 4640 Tys = DAG.getVTList(MVT::Other); 4641 SmallVector<SDValue, 8> Ops; 4642 Ops.push_back(Chain); 4643 Ops.push_back(Result); 4644 Ops.push_back(StackSlot); 4645 Ops.push_back(DAG.getValueType(Op.getValueType())); 4646 Ops.push_back(InFlag); 4647 Chain = DAG.getNode(X86ISD::FST, dl, Tys, &Ops[0], Ops.size()); 4648 Result = DAG.getLoad(Op.getValueType(), dl, Chain, StackSlot, 4649 PseudoSourceValue::getFixedStack(SSFI), 0); 4650 } 4651 4652 return Result; 4653} 4654 4655// LowerUINT_TO_FP_i64 - 64-bit unsigned integer to double expansion. 4656SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG) { 4657 // This algorithm is not obvious. Here it is in C code, more or less: 4658 /* 4659 double uint64_to_double( uint32_t hi, uint32_t lo ) { 4660 static const __m128i exp = { 0x4330000045300000ULL, 0 }; 4661 static const __m128d bias = { 0x1.0p84, 0x1.0p52 }; 4662 4663 // Copy ints to xmm registers. 4664 __m128i xh = _mm_cvtsi32_si128( hi ); 4665 __m128i xl = _mm_cvtsi32_si128( lo ); 4666 4667 // Combine into low half of a single xmm register. 4668 __m128i x = _mm_unpacklo_epi32( xh, xl ); 4669 __m128d d; 4670 double sd; 4671 4672 // Merge in appropriate exponents to give the integer bits the right 4673 // magnitude. 4674 x = _mm_unpacklo_epi32( x, exp ); 4675 4676 // Subtract away the biases to deal with the IEEE-754 double precision 4677 // implicit 1. 4678 d = _mm_sub_pd( (__m128d) x, bias ); 4679 4680 // All conversions up to here are exact. The correctly rounded result is 4681 // calculated using the current rounding mode using the following 4682 // horizontal add. 4683 d = _mm_add_sd( d, _mm_unpackhi_pd( d, d ) ); 4684 _mm_store_sd( &sd, d ); // Because we are returning doubles in XMM, this 4685 // store doesn't really need to be here (except 4686 // maybe to zero the other double) 4687 return sd; 4688 } 4689 */ 4690 4691 DebugLoc dl = Op.getDebugLoc(); 4692 4693 // Build some magic constants. 4694 std::vector<Constant*> CV0; 4695 CV0.push_back(ConstantInt::get(APInt(32, 0x45300000))); 4696 CV0.push_back(ConstantInt::get(APInt(32, 0x43300000))); 4697 CV0.push_back(ConstantInt::get(APInt(32, 0))); 4698 CV0.push_back(ConstantInt::get(APInt(32, 0))); 4699 Constant *C0 = ConstantVector::get(CV0); 4700 SDValue CPIdx0 = DAG.getConstantPool(C0, getPointerTy(), 16); 4701 4702 std::vector<Constant*> CV1; 4703 CV1.push_back(ConstantFP::get(APFloat(APInt(64, 0x4530000000000000ULL)))); 4704 CV1.push_back(ConstantFP::get(APFloat(APInt(64, 0x4330000000000000ULL)))); 4705 Constant *C1 = ConstantVector::get(CV1); 4706 SDValue CPIdx1 = DAG.getConstantPool(C1, getPointerTy(), 16); 4707 4708 SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, 4709 DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 4710 Op.getOperand(0), 4711 DAG.getIntPtrConstant(1))); 4712 SDValue XR2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, 4713 DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 4714 Op.getOperand(0), 4715 DAG.getIntPtrConstant(0))); 4716 SDValue Unpck1 = getUnpackl(DAG, dl, MVT::v4i32, XR1, XR2); 4717 SDValue CLod0 = DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0, 4718 PseudoSourceValue::getConstantPool(), 0, 4719 false, 16); 4720 SDValue Unpck2 = getUnpackl(DAG, dl, MVT::v4i32, Unpck1, CLod0); 4721 SDValue XR2F = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f64, Unpck2); 4722 SDValue CLod1 = DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1, 4723 PseudoSourceValue::getConstantPool(), 0, 4724 false, 16); 4725 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1); 4726 4727 // Add the halves; easiest way is to swap them into another reg first. 4728 int ShufMask[2] = { 1, -1 }; 4729 SDValue Shuf = DAG.getVectorShuffle(MVT::v2f64, dl, Sub, 4730 DAG.getUNDEF(MVT::v2f64), ShufMask); 4731 SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuf, Sub); 4732 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Add, 4733 DAG.getIntPtrConstant(0)); 4734} 4735 4736// LowerUINT_TO_FP_i32 - 32-bit unsigned integer to float expansion. 4737SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op, SelectionDAG &DAG) { 4738 DebugLoc dl = Op.getDebugLoc(); 4739 // FP constant to bias correct the final result. 4740 SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), 4741 MVT::f64); 4742 4743 // Load the 32-bit value into an XMM register. 4744 SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, 4745 DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 4746 Op.getOperand(0), 4747 DAG.getIntPtrConstant(0))); 4748 4749 Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, 4750 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f64, Load), 4751 DAG.getIntPtrConstant(0)); 4752 4753 // Or the load with the bias. 4754 SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, 4755 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, 4756 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, 4757 MVT::v2f64, Load)), 4758 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, 4759 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, 4760 MVT::v2f64, Bias))); 4761 Or = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, 4762 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f64, Or), 4763 DAG.getIntPtrConstant(0)); 4764 4765 // Subtract the bias. 4766 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias); 4767 4768 // Handle final rounding. 4769 MVT DestVT = Op.getValueType(); 4770 4771 if (DestVT.bitsLT(MVT::f64)) { 4772 return DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub, 4773 DAG.getIntPtrConstant(0)); 4774 } else if (DestVT.bitsGT(MVT::f64)) { 4775 return DAG.getNode(ISD::FP_EXTEND, dl, DestVT, Sub); 4776 } 4777 4778 // Handle final rounding. 4779 return Sub; 4780} 4781 4782SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) { 4783 SDValue N0 = Op.getOperand(0); 4784 DebugLoc dl = Op.getDebugLoc(); 4785 4786 // Now not UINT_TO_FP is legal (it's marked custom), dag combiner won't 4787 // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform 4788 // the optimization here. 4789 if (DAG.SignBitIsZero(N0)) 4790 return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), N0); 4791 4792 MVT SrcVT = N0.getValueType(); 4793 if (SrcVT == MVT::i64) { 4794 // We only handle SSE2 f64 target here; caller can expand the rest. 4795 if (Op.getValueType() != MVT::f64 || !X86ScalarSSEf64) 4796 return SDValue(); 4797 4798 return LowerUINT_TO_FP_i64(Op, DAG); 4799 } else if (SrcVT == MVT::i32 && X86ScalarSSEf64) { 4800 return LowerUINT_TO_FP_i32(Op, DAG); 4801 } 4802 4803 assert(SrcVT == MVT::i32 && "Unknown UINT_TO_FP to lower!"); 4804 4805 // Make a 64-bit buffer, and use it to build an FILD. 4806 SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64); 4807 SDValue WordOff = DAG.getConstant(4, getPointerTy()); 4808 SDValue OffsetSlot = DAG.getNode(ISD::ADD, dl, 4809 getPointerTy(), StackSlot, WordOff); 4810 SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), 4811 StackSlot, NULL, 0); 4812 SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, MVT::i32), 4813 OffsetSlot, NULL, 0); 4814 return BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG); 4815} 4816 4817std::pair<SDValue,SDValue> X86TargetLowering:: 4818FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool IsSigned) { 4819 DebugLoc dl = Op.getDebugLoc(); 4820 4821 MVT DstTy = Op.getValueType(); 4822 4823 if (!IsSigned) { 4824 assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT"); 4825 DstTy = MVT::i64; 4826 } 4827 4828 assert(DstTy.getSimpleVT() <= MVT::i64 && 4829 DstTy.getSimpleVT() >= MVT::i16 && 4830 "Unknown FP_TO_SINT to lower!"); 4831 4832 // These are really Legal. 4833 if (DstTy == MVT::i32 && 4834 isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType())) 4835 return std::make_pair(SDValue(), SDValue()); 4836 if (Subtarget->is64Bit() && 4837 DstTy == MVT::i64 && 4838 isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType())) 4839 return std::make_pair(SDValue(), SDValue()); 4840 4841 // We lower FP->sint64 into FISTP64, followed by a load, all to a temporary 4842 // stack slot. 4843 MachineFunction &MF = DAG.getMachineFunction(); 4844 unsigned MemSize = DstTy.getSizeInBits()/8; 4845 int SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize); 4846 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 4847 4848 unsigned Opc; 4849 switch (DstTy.getSimpleVT()) { 4850 default: assert(0 && "Invalid FP_TO_SINT to lower!"); 4851 case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break; 4852 case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break; 4853 case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break; 4854 } 4855 4856 SDValue Chain = DAG.getEntryNode(); 4857 SDValue Value = Op.getOperand(0); 4858 if (isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType())) { 4859 assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!"); 4860 Chain = DAG.getStore(Chain, dl, Value, StackSlot, 4861 PseudoSourceValue::getFixedStack(SSFI), 0); 4862 SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other); 4863 SDValue Ops[] = { 4864 Chain, StackSlot, DAG.getValueType(Op.getOperand(0).getValueType()) 4865 }; 4866 Value = DAG.getNode(X86ISD::FLD, dl, Tys, Ops, 3); 4867 Chain = Value.getValue(1); 4868 SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize); 4869 StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 4870 } 4871 4872 // Build the FP_TO_INT*_IN_MEM 4873 SDValue Ops[] = { Chain, Value, StackSlot }; 4874 SDValue FIST = DAG.getNode(Opc, dl, MVT::Other, Ops, 3); 4875 4876 return std::make_pair(FIST, StackSlot); 4877} 4878 4879SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) { 4880 if (Op.getValueType().isVector()) { 4881 if (Op.getValueType() == MVT::v2i32 && 4882 Op.getOperand(0).getValueType() == MVT::v2f64) { 4883 return Op; 4884 } 4885 return SDValue(); 4886 } 4887 4888 std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG, true); 4889 SDValue FIST = Vals.first, StackSlot = Vals.second; 4890 // If FP_TO_INTHelper failed, the node is actually supposed to be Legal. 4891 if (FIST.getNode() == 0) return Op; 4892 4893 // Load the result. 4894 return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(), 4895 FIST, StackSlot, NULL, 0); 4896} 4897 4898SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op, SelectionDAG &DAG) { 4899 std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG, false); 4900 SDValue FIST = Vals.first, StackSlot = Vals.second; 4901 assert(FIST.getNode() && "Unexpected failure"); 4902 4903 // Load the result. 4904 return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(), 4905 FIST, StackSlot, NULL, 0); 4906} 4907 4908SDValue X86TargetLowering::LowerFABS(SDValue Op, SelectionDAG &DAG) { 4909 DebugLoc dl = Op.getDebugLoc(); 4910 MVT VT = Op.getValueType(); 4911 MVT EltVT = VT; 4912 if (VT.isVector()) 4913 EltVT = VT.getVectorElementType(); 4914 std::vector<Constant*> CV; 4915 if (EltVT == MVT::f64) { 4916 Constant *C = ConstantFP::get(APFloat(APInt(64, ~(1ULL << 63)))); 4917 CV.push_back(C); 4918 CV.push_back(C); 4919 } else { 4920 Constant *C = ConstantFP::get(APFloat(APInt(32, ~(1U << 31)))); 4921 CV.push_back(C); 4922 CV.push_back(C); 4923 CV.push_back(C); 4924 CV.push_back(C); 4925 } 4926 Constant *C = ConstantVector::get(CV); 4927 SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 4928 SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 4929 PseudoSourceValue::getConstantPool(), 0, 4930 false, 16); 4931 return DAG.getNode(X86ISD::FAND, dl, VT, Op.getOperand(0), Mask); 4932} 4933 4934SDValue X86TargetLowering::LowerFNEG(SDValue Op, SelectionDAG &DAG) { 4935 DebugLoc dl = Op.getDebugLoc(); 4936 MVT VT = Op.getValueType(); 4937 MVT EltVT = VT; 4938 unsigned EltNum = 1; 4939 if (VT.isVector()) { 4940 EltVT = VT.getVectorElementType(); 4941 EltNum = VT.getVectorNumElements(); 4942 } 4943 std::vector<Constant*> CV; 4944 if (EltVT == MVT::f64) { 4945 Constant *C = ConstantFP::get(APFloat(APInt(64, 1ULL << 63))); 4946 CV.push_back(C); 4947 CV.push_back(C); 4948 } else { 4949 Constant *C = ConstantFP::get(APFloat(APInt(32, 1U << 31))); 4950 CV.push_back(C); 4951 CV.push_back(C); 4952 CV.push_back(C); 4953 CV.push_back(C); 4954 } 4955 Constant *C = ConstantVector::get(CV); 4956 SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 4957 SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 4958 PseudoSourceValue::getConstantPool(), 0, 4959 false, 16); 4960 if (VT.isVector()) { 4961 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, 4962 DAG.getNode(ISD::XOR, dl, MVT::v2i64, 4963 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, 4964 Op.getOperand(0)), 4965 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, Mask))); 4966 } else { 4967 return DAG.getNode(X86ISD::FXOR, dl, VT, Op.getOperand(0), Mask); 4968 } 4969} 4970 4971SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) { 4972 SDValue Op0 = Op.getOperand(0); 4973 SDValue Op1 = Op.getOperand(1); 4974 DebugLoc dl = Op.getDebugLoc(); 4975 MVT VT = Op.getValueType(); 4976 MVT SrcVT = Op1.getValueType(); 4977 4978 // If second operand is smaller, extend it first. 4979 if (SrcVT.bitsLT(VT)) { 4980 Op1 = DAG.getNode(ISD::FP_EXTEND, dl, VT, Op1); 4981 SrcVT = VT; 4982 } 4983 // And if it is bigger, shrink it first. 4984 if (SrcVT.bitsGT(VT)) { 4985 Op1 = DAG.getNode(ISD::FP_ROUND, dl, VT, Op1, DAG.getIntPtrConstant(1)); 4986 SrcVT = VT; 4987 } 4988 4989 // At this point the operands and the result should have the same 4990 // type, and that won't be f80 since that is not custom lowered. 4991 4992 // First get the sign bit of second operand. 4993 std::vector<Constant*> CV; 4994 if (SrcVT == MVT::f64) { 4995 CV.push_back(ConstantFP::get(APFloat(APInt(64, 1ULL << 63)))); 4996 CV.push_back(ConstantFP::get(APFloat(APInt(64, 0)))); 4997 } else { 4998 CV.push_back(ConstantFP::get(APFloat(APInt(32, 1U << 31)))); 4999 CV.push_back(ConstantFP::get(APFloat(APInt(32, 0)))); 5000 CV.push_back(ConstantFP::get(APFloat(APInt(32, 0)))); 5001 CV.push_back(ConstantFP::get(APFloat(APInt(32, 0)))); 5002 } 5003 Constant *C = ConstantVector::get(CV); 5004 SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 5005 SDValue Mask1 = DAG.getLoad(SrcVT, dl, DAG.getEntryNode(), CPIdx, 5006 PseudoSourceValue::getConstantPool(), 0, 5007 false, 16); 5008 SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, SrcVT, Op1, Mask1); 5009 5010 // Shift sign bit right or left if the two operands have different types. 5011 if (SrcVT.bitsGT(VT)) { 5012 // Op0 is MVT::f32, Op1 is MVT::f64. 5013 SignBit = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, SignBit); 5014 SignBit = DAG.getNode(X86ISD::FSRL, dl, MVT::v2f64, SignBit, 5015 DAG.getConstant(32, MVT::i32)); 5016 SignBit = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4f32, SignBit); 5017 SignBit = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, SignBit, 5018 DAG.getIntPtrConstant(0)); 5019 } 5020 5021 // Clear first operand sign bit. 5022 CV.clear(); 5023 if (VT == MVT::f64) { 5024 CV.push_back(ConstantFP::get(APFloat(APInt(64, ~(1ULL << 63))))); 5025 CV.push_back(ConstantFP::get(APFloat(APInt(64, 0)))); 5026 } else { 5027 CV.push_back(ConstantFP::get(APFloat(APInt(32, ~(1U << 31))))); 5028 CV.push_back(ConstantFP::get(APFloat(APInt(32, 0)))); 5029 CV.push_back(ConstantFP::get(APFloat(APInt(32, 0)))); 5030 CV.push_back(ConstantFP::get(APFloat(APInt(32, 0)))); 5031 } 5032 C = ConstantVector::get(CV); 5033 CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 5034 SDValue Mask2 = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 5035 PseudoSourceValue::getConstantPool(), 0, 5036 false, 16); 5037 SDValue Val = DAG.getNode(X86ISD::FAND, dl, VT, Op0, Mask2); 5038 5039 // Or the value with the sign bit. 5040 return DAG.getNode(X86ISD::FOR, dl, VT, Val, SignBit); 5041} 5042 5043/// Emit nodes that will be selected as "test Op0,Op0", or something 5044/// equivalent. 5045SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, 5046 SelectionDAG &DAG) { 5047 DebugLoc dl = Op.getDebugLoc(); 5048 5049 // CF and OF aren't always set the way we want. Determine which 5050 // of these we need. 5051 bool NeedCF = false; 5052 bool NeedOF = false; 5053 switch (X86CC) { 5054 case X86::COND_A: case X86::COND_AE: 5055 case X86::COND_B: case X86::COND_BE: 5056 NeedCF = true; 5057 break; 5058 case X86::COND_G: case X86::COND_GE: 5059 case X86::COND_L: case X86::COND_LE: 5060 case X86::COND_O: case X86::COND_NO: 5061 NeedOF = true; 5062 break; 5063 default: break; 5064 } 5065 5066 // See if we can use the EFLAGS value from the operand instead of 5067 // doing a separate TEST. TEST always sets OF and CF to 0, so unless 5068 // we prove that the arithmetic won't overflow, we can't use OF or CF. 5069 if (Op.getResNo() == 0 && !NeedOF && !NeedCF) { 5070 unsigned Opcode = 0; 5071 unsigned NumOperands = 0; 5072 switch (Op.getNode()->getOpcode()) { 5073 case ISD::ADD: 5074 // Due to an isel shortcoming, be conservative if this add is likely to 5075 // be selected as part of a load-modify-store instruction. When the root 5076 // node in a match is a store, isel doesn't know how to remap non-chain 5077 // non-flag uses of other nodes in the match, such as the ADD in this 5078 // case. This leads to the ADD being left around and reselected, with 5079 // the result being two adds in the output. 5080 for (SDNode::use_iterator UI = Op.getNode()->use_begin(), 5081 UE = Op.getNode()->use_end(); UI != UE; ++UI) 5082 if (UI->getOpcode() == ISD::STORE) 5083 goto default_case; 5084 if (ConstantSDNode *C = 5085 dyn_cast<ConstantSDNode>(Op.getNode()->getOperand(1))) { 5086 // An add of one will be selected as an INC. 5087 if (C->getAPIntValue() == 1) { 5088 Opcode = X86ISD::INC; 5089 NumOperands = 1; 5090 break; 5091 } 5092 // An add of negative one (subtract of one) will be selected as a DEC. 5093 if (C->getAPIntValue().isAllOnesValue()) { 5094 Opcode = X86ISD::DEC; 5095 NumOperands = 1; 5096 break; 5097 } 5098 } 5099 // Otherwise use a regular EFLAGS-setting add. 5100 Opcode = X86ISD::ADD; 5101 NumOperands = 2; 5102 break; 5103 case ISD::SUB: 5104 // Due to the ISEL shortcoming noted above, be conservative if this sub is 5105 // likely to be selected as part of a load-modify-store instruction. 5106 for (SDNode::use_iterator UI = Op.getNode()->use_begin(), 5107 UE = Op.getNode()->use_end(); UI != UE; ++UI) 5108 if (UI->getOpcode() == ISD::STORE) 5109 goto default_case; 5110 // Otherwise use a regular EFLAGS-setting sub. 5111 Opcode = X86ISD::SUB; 5112 NumOperands = 2; 5113 break; 5114 case X86ISD::ADD: 5115 case X86ISD::SUB: 5116 case X86ISD::INC: 5117 case X86ISD::DEC: 5118 return SDValue(Op.getNode(), 1); 5119 default: 5120 default_case: 5121 break; 5122 } 5123 if (Opcode != 0) { 5124 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); 5125 SmallVector<SDValue, 4> Ops; 5126 for (unsigned i = 0; i != NumOperands; ++i) 5127 Ops.push_back(Op.getOperand(i)); 5128 SDValue New = DAG.getNode(Opcode, dl, VTs, &Ops[0], NumOperands); 5129 DAG.ReplaceAllUsesWith(Op, New); 5130 return SDValue(New.getNode(), 1); 5131 } 5132 } 5133 5134 // Otherwise just emit a CMP with 0, which is the TEST pattern. 5135 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op, 5136 DAG.getConstant(0, Op.getValueType())); 5137} 5138 5139/// Emit nodes that will be selected as "cmp Op0,Op1", or something 5140/// equivalent. 5141SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC, 5142 SelectionDAG &DAG) { 5143 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op1)) 5144 if (C->getAPIntValue() == 0) 5145 return EmitTest(Op0, X86CC, DAG); 5146 5147 DebugLoc dl = Op0.getDebugLoc(); 5148 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1); 5149} 5150 5151SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) { 5152 assert(Op.getValueType() == MVT::i8 && "SetCC type must be 8-bit integer"); 5153 SDValue Op0 = Op.getOperand(0); 5154 SDValue Op1 = Op.getOperand(1); 5155 DebugLoc dl = Op.getDebugLoc(); 5156 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get(); 5157 5158 // Lower (X & (1 << N)) == 0 to BT(X, N). 5159 // Lower ((X >>u N) & 1) != 0 to BT(X, N). 5160 // Lower ((X >>s N) & 1) != 0 to BT(X, N). 5161 if (Op0.getOpcode() == ISD::AND && 5162 Op0.hasOneUse() && 5163 Op1.getOpcode() == ISD::Constant && 5164 cast<ConstantSDNode>(Op1)->getZExtValue() == 0 && 5165 (CC == ISD::SETEQ || CC == ISD::SETNE)) { 5166 SDValue LHS, RHS; 5167 if (Op0.getOperand(1).getOpcode() == ISD::SHL) { 5168 if (ConstantSDNode *Op010C = 5169 dyn_cast<ConstantSDNode>(Op0.getOperand(1).getOperand(0))) 5170 if (Op010C->getZExtValue() == 1) { 5171 LHS = Op0.getOperand(0); 5172 RHS = Op0.getOperand(1).getOperand(1); 5173 } 5174 } else if (Op0.getOperand(0).getOpcode() == ISD::SHL) { 5175 if (ConstantSDNode *Op000C = 5176 dyn_cast<ConstantSDNode>(Op0.getOperand(0).getOperand(0))) 5177 if (Op000C->getZExtValue() == 1) { 5178 LHS = Op0.getOperand(1); 5179 RHS = Op0.getOperand(0).getOperand(1); 5180 } 5181 } else if (Op0.getOperand(1).getOpcode() == ISD::Constant) { 5182 ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op0.getOperand(1)); 5183 SDValue AndLHS = Op0.getOperand(0); 5184 if (AndRHS->getZExtValue() == 1 && AndLHS.getOpcode() == ISD::SRL) { 5185 LHS = AndLHS.getOperand(0); 5186 RHS = AndLHS.getOperand(1); 5187 } 5188 } 5189 5190 if (LHS.getNode()) { 5191 // If LHS is i8, promote it to i16 with any_extend. There is no i8 BT 5192 // instruction. Since the shift amount is in-range-or-undefined, we know 5193 // that doing a bittest on the i16 value is ok. We extend to i32 because 5194 // the encoding for the i16 version is larger than the i32 version. 5195 if (LHS.getValueType() == MVT::i8) 5196 LHS = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, LHS); 5197 5198 // If the operand types disagree, extend the shift amount to match. Since 5199 // BT ignores high bits (like shifts) we can use anyextend. 5200 if (LHS.getValueType() != RHS.getValueType()) 5201 RHS = DAG.getNode(ISD::ANY_EXTEND, dl, LHS.getValueType(), RHS); 5202 5203 SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, LHS, RHS); 5204 unsigned Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B; 5205 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 5206 DAG.getConstant(Cond, MVT::i8), BT); 5207 } 5208 } 5209 5210 bool isFP = Op.getOperand(1).getValueType().isFloatingPoint(); 5211 unsigned X86CC = TranslateX86CC(CC, isFP, Op0, Op1, DAG); 5212 5213 SDValue Cond = EmitCmp(Op0, Op1, X86CC, DAG); 5214 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 5215 DAG.getConstant(X86CC, MVT::i8), Cond); 5216} 5217 5218SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) { 5219 SDValue Cond; 5220 SDValue Op0 = Op.getOperand(0); 5221 SDValue Op1 = Op.getOperand(1); 5222 SDValue CC = Op.getOperand(2); 5223 MVT VT = Op.getValueType(); 5224 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get(); 5225 bool isFP = Op.getOperand(1).getValueType().isFloatingPoint(); 5226 DebugLoc dl = Op.getDebugLoc(); 5227 5228 if (isFP) { 5229 unsigned SSECC = 8; 5230 MVT VT0 = Op0.getValueType(); 5231 assert(VT0 == MVT::v4f32 || VT0 == MVT::v2f64); 5232 unsigned Opc = VT0 == MVT::v4f32 ? X86ISD::CMPPS : X86ISD::CMPPD; 5233 bool Swap = false; 5234 5235 switch (SetCCOpcode) { 5236 default: break; 5237 case ISD::SETOEQ: 5238 case ISD::SETEQ: SSECC = 0; break; 5239 case ISD::SETOGT: 5240 case ISD::SETGT: Swap = true; // Fallthrough 5241 case ISD::SETLT: 5242 case ISD::SETOLT: SSECC = 1; break; 5243 case ISD::SETOGE: 5244 case ISD::SETGE: Swap = true; // Fallthrough 5245 case ISD::SETLE: 5246 case ISD::SETOLE: SSECC = 2; break; 5247 case ISD::SETUO: SSECC = 3; break; 5248 case ISD::SETUNE: 5249 case ISD::SETNE: SSECC = 4; break; 5250 case ISD::SETULE: Swap = true; 5251 case ISD::SETUGE: SSECC = 5; break; 5252 case ISD::SETULT: Swap = true; 5253 case ISD::SETUGT: SSECC = 6; break; 5254 case ISD::SETO: SSECC = 7; break; 5255 } 5256 if (Swap) 5257 std::swap(Op0, Op1); 5258 5259 // In the two special cases we can't handle, emit two comparisons. 5260 if (SSECC == 8) { 5261 if (SetCCOpcode == ISD::SETUEQ) { 5262 SDValue UNORD, EQ; 5263 UNORD = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(3, MVT::i8)); 5264 EQ = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(0, MVT::i8)); 5265 return DAG.getNode(ISD::OR, dl, VT, UNORD, EQ); 5266 } 5267 else if (SetCCOpcode == ISD::SETONE) { 5268 SDValue ORD, NEQ; 5269 ORD = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(7, MVT::i8)); 5270 NEQ = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(4, MVT::i8)); 5271 return DAG.getNode(ISD::AND, dl, VT, ORD, NEQ); 5272 } 5273 assert(0 && "Illegal FP comparison"); 5274 } 5275 // Handle all other FP comparisons here. 5276 return DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(SSECC, MVT::i8)); 5277 } 5278 5279 // We are handling one of the integer comparisons here. Since SSE only has 5280 // GT and EQ comparisons for integer, swapping operands and multiple 5281 // operations may be required for some comparisons. 5282 unsigned Opc = 0, EQOpc = 0, GTOpc = 0; 5283 bool Swap = false, Invert = false, FlipSigns = false; 5284 5285 switch (VT.getSimpleVT()) { 5286 default: break; 5287 case MVT::v16i8: EQOpc = X86ISD::PCMPEQB; GTOpc = X86ISD::PCMPGTB; break; 5288 case MVT::v8i16: EQOpc = X86ISD::PCMPEQW; GTOpc = X86ISD::PCMPGTW; break; 5289 case MVT::v4i32: EQOpc = X86ISD::PCMPEQD; GTOpc = X86ISD::PCMPGTD; break; 5290 case MVT::v2i64: EQOpc = X86ISD::PCMPEQQ; GTOpc = X86ISD::PCMPGTQ; break; 5291 } 5292 5293 switch (SetCCOpcode) { 5294 default: break; 5295 case ISD::SETNE: Invert = true; 5296 case ISD::SETEQ: Opc = EQOpc; break; 5297 case ISD::SETLT: Swap = true; 5298 case ISD::SETGT: Opc = GTOpc; break; 5299 case ISD::SETGE: Swap = true; 5300 case ISD::SETLE: Opc = GTOpc; Invert = true; break; 5301 case ISD::SETULT: Swap = true; 5302 case ISD::SETUGT: Opc = GTOpc; FlipSigns = true; break; 5303 case ISD::SETUGE: Swap = true; 5304 case ISD::SETULE: Opc = GTOpc; FlipSigns = true; Invert = true; break; 5305 } 5306 if (Swap) 5307 std::swap(Op0, Op1); 5308 5309 // Since SSE has no unsigned integer comparisons, we need to flip the sign 5310 // bits of the inputs before performing those operations. 5311 if (FlipSigns) { 5312 MVT EltVT = VT.getVectorElementType(); 5313 SDValue SignBit = DAG.getConstant(APInt::getSignBit(EltVT.getSizeInBits()), 5314 EltVT); 5315 std::vector<SDValue> SignBits(VT.getVectorNumElements(), SignBit); 5316 SDValue SignVec = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &SignBits[0], 5317 SignBits.size()); 5318 Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SignVec); 5319 Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SignVec); 5320 } 5321 5322 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1); 5323 5324 // If the logical-not of the result is required, perform that now. 5325 if (Invert) 5326 Result = DAG.getNOT(dl, Result, VT); 5327 5328 return Result; 5329} 5330 5331// isX86LogicalCmp - Return true if opcode is a X86 logical comparison. 5332static bool isX86LogicalCmp(SDValue Op) { 5333 unsigned Opc = Op.getNode()->getOpcode(); 5334 if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI) 5335 return true; 5336 if (Op.getResNo() == 1 && 5337 (Opc == X86ISD::ADD || 5338 Opc == X86ISD::SUB || 5339 Opc == X86ISD::SMUL || 5340 Opc == X86ISD::UMUL || 5341 Opc == X86ISD::INC || 5342 Opc == X86ISD::DEC)) 5343 return true; 5344 5345 return false; 5346} 5347 5348SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) { 5349 bool addTest = true; 5350 SDValue Cond = Op.getOperand(0); 5351 DebugLoc dl = Op.getDebugLoc(); 5352 SDValue CC; 5353 5354 if (Cond.getOpcode() == ISD::SETCC) 5355 Cond = LowerSETCC(Cond, DAG); 5356 5357 // If condition flag is set by a X86ISD::CMP, then use it as the condition 5358 // setting operand in place of the X86ISD::SETCC. 5359 if (Cond.getOpcode() == X86ISD::SETCC) { 5360 CC = Cond.getOperand(0); 5361 5362 SDValue Cmp = Cond.getOperand(1); 5363 unsigned Opc = Cmp.getOpcode(); 5364 MVT VT = Op.getValueType(); 5365 5366 bool IllegalFPCMov = false; 5367 if (VT.isFloatingPoint() && !VT.isVector() && 5368 !isScalarFPTypeInSSEReg(VT)) // FPStack? 5369 IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue()); 5370 5371 if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) || 5372 Opc == X86ISD::BT) { // FIXME 5373 Cond = Cmp; 5374 addTest = false; 5375 } 5376 } 5377 5378 if (addTest) { 5379 CC = DAG.getConstant(X86::COND_NE, MVT::i8); 5380 Cond = EmitTest(Cond, X86::COND_NE, DAG); 5381 } 5382 5383 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Flag); 5384 SmallVector<SDValue, 4> Ops; 5385 // X86ISD::CMOV means set the result (which is operand 1) to the RHS if 5386 // condition is true. 5387 Ops.push_back(Op.getOperand(2)); 5388 Ops.push_back(Op.getOperand(1)); 5389 Ops.push_back(CC); 5390 Ops.push_back(Cond); 5391 return DAG.getNode(X86ISD::CMOV, dl, VTs, &Ops[0], Ops.size()); 5392} 5393 5394// isAndOrOfSingleUseSetCCs - Return true if node is an ISD::AND or 5395// ISD::OR of two X86ISD::SETCC nodes each of which has no other use apart 5396// from the AND / OR. 5397static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) { 5398 Opc = Op.getOpcode(); 5399 if (Opc != ISD::OR && Opc != ISD::AND) 5400 return false; 5401 return (Op.getOperand(0).getOpcode() == X86ISD::SETCC && 5402 Op.getOperand(0).hasOneUse() && 5403 Op.getOperand(1).getOpcode() == X86ISD::SETCC && 5404 Op.getOperand(1).hasOneUse()); 5405} 5406 5407// isXor1OfSetCC - Return true if node is an ISD::XOR of a X86ISD::SETCC and 5408// 1 and that the SETCC node has a single use. 5409static bool isXor1OfSetCC(SDValue Op) { 5410 if (Op.getOpcode() != ISD::XOR) 5411 return false; 5412 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(Op.getOperand(1)); 5413 if (N1C && N1C->getAPIntValue() == 1) { 5414 return Op.getOperand(0).getOpcode() == X86ISD::SETCC && 5415 Op.getOperand(0).hasOneUse(); 5416 } 5417 return false; 5418} 5419 5420SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) { 5421 bool addTest = true; 5422 SDValue Chain = Op.getOperand(0); 5423 SDValue Cond = Op.getOperand(1); 5424 SDValue Dest = Op.getOperand(2); 5425 DebugLoc dl = Op.getDebugLoc(); 5426 SDValue CC; 5427 5428 if (Cond.getOpcode() == ISD::SETCC) 5429 Cond = LowerSETCC(Cond, DAG); 5430#if 0 5431 // FIXME: LowerXALUO doesn't handle these!! 5432 else if (Cond.getOpcode() == X86ISD::ADD || 5433 Cond.getOpcode() == X86ISD::SUB || 5434 Cond.getOpcode() == X86ISD::SMUL || 5435 Cond.getOpcode() == X86ISD::UMUL) 5436 Cond = LowerXALUO(Cond, DAG); 5437#endif 5438 5439 // If condition flag is set by a X86ISD::CMP, then use it as the condition 5440 // setting operand in place of the X86ISD::SETCC. 5441 if (Cond.getOpcode() == X86ISD::SETCC) { 5442 CC = Cond.getOperand(0); 5443 5444 SDValue Cmp = Cond.getOperand(1); 5445 unsigned Opc = Cmp.getOpcode(); 5446 // FIXME: WHY THE SPECIAL CASING OF LogicalCmp?? 5447 if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) { 5448 Cond = Cmp; 5449 addTest = false; 5450 } else { 5451 switch (cast<ConstantSDNode>(CC)->getZExtValue()) { 5452 default: break; 5453 case X86::COND_O: 5454 case X86::COND_B: 5455 // These can only come from an arithmetic instruction with overflow, 5456 // e.g. SADDO, UADDO. 5457 Cond = Cond.getNode()->getOperand(1); 5458 addTest = false; 5459 break; 5460 } 5461 } 5462 } else { 5463 unsigned CondOpc; 5464 if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) { 5465 SDValue Cmp = Cond.getOperand(0).getOperand(1); 5466 if (CondOpc == ISD::OR) { 5467 // Also, recognize the pattern generated by an FCMP_UNE. We can emit 5468 // two branches instead of an explicit OR instruction with a 5469 // separate test. 5470 if (Cmp == Cond.getOperand(1).getOperand(1) && 5471 isX86LogicalCmp(Cmp)) { 5472 CC = Cond.getOperand(0).getOperand(0); 5473 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), 5474 Chain, Dest, CC, Cmp); 5475 CC = Cond.getOperand(1).getOperand(0); 5476 Cond = Cmp; 5477 addTest = false; 5478 } 5479 } else { // ISD::AND 5480 // Also, recognize the pattern generated by an FCMP_OEQ. We can emit 5481 // two branches instead of an explicit AND instruction with a 5482 // separate test. However, we only do this if this block doesn't 5483 // have a fall-through edge, because this requires an explicit 5484 // jmp when the condition is false. 5485 if (Cmp == Cond.getOperand(1).getOperand(1) && 5486 isX86LogicalCmp(Cmp) && 5487 Op.getNode()->hasOneUse()) { 5488 X86::CondCode CCode = 5489 (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0); 5490 CCode = X86::GetOppositeBranchCondition(CCode); 5491 CC = DAG.getConstant(CCode, MVT::i8); 5492 SDValue User = SDValue(*Op.getNode()->use_begin(), 0); 5493 // Look for an unconditional branch following this conditional branch. 5494 // We need this because we need to reverse the successors in order 5495 // to implement FCMP_OEQ. 5496 if (User.getOpcode() == ISD::BR) { 5497 SDValue FalseBB = User.getOperand(1); 5498 SDValue NewBR = 5499 DAG.UpdateNodeOperands(User, User.getOperand(0), Dest); 5500 assert(NewBR == User); 5501 Dest = FalseBB; 5502 5503 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), 5504 Chain, Dest, CC, Cmp); 5505 X86::CondCode CCode = 5506 (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0); 5507 CCode = X86::GetOppositeBranchCondition(CCode); 5508 CC = DAG.getConstant(CCode, MVT::i8); 5509 Cond = Cmp; 5510 addTest = false; 5511 } 5512 } 5513 } 5514 } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) { 5515 // Recognize for xorb (setcc), 1 patterns. The xor inverts the condition. 5516 // It should be transformed during dag combiner except when the condition 5517 // is set by a arithmetics with overflow node. 5518 X86::CondCode CCode = 5519 (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0); 5520 CCode = X86::GetOppositeBranchCondition(CCode); 5521 CC = DAG.getConstant(CCode, MVT::i8); 5522 Cond = Cond.getOperand(0).getOperand(1); 5523 addTest = false; 5524 } 5525 } 5526 5527 if (addTest) { 5528 CC = DAG.getConstant(X86::COND_NE, MVT::i8); 5529 Cond = EmitTest(Cond, X86::COND_NE, DAG); 5530 } 5531 return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), 5532 Chain, Dest, CC, Cond); 5533} 5534 5535 5536// Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets. 5537// Calls to _alloca is needed to probe the stack when allocating more than 4k 5538// bytes in one go. Touching the stack at 4K increments is necessary to ensure 5539// that the guard pages used by the OS virtual memory manager are allocated in 5540// correct sequence. 5541SDValue 5542X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, 5543 SelectionDAG &DAG) { 5544 assert(Subtarget->isTargetCygMing() && 5545 "This should be used only on Cygwin/Mingw targets"); 5546 DebugLoc dl = Op.getDebugLoc(); 5547 5548 // Get the inputs. 5549 SDValue Chain = Op.getOperand(0); 5550 SDValue Size = Op.getOperand(1); 5551 // FIXME: Ensure alignment here 5552 5553 SDValue Flag; 5554 5555 MVT IntPtr = getPointerTy(); 5556 MVT SPTy = Subtarget->is64Bit() ? MVT::i64 : MVT::i32; 5557 5558 Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, true)); 5559 5560 Chain = DAG.getCopyToReg(Chain, dl, X86::EAX, Size, Flag); 5561 Flag = Chain.getValue(1); 5562 5563 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag); 5564 SDValue Ops[] = { Chain, 5565 DAG.getTargetExternalSymbol("_alloca", IntPtr), 5566 DAG.getRegister(X86::EAX, IntPtr), 5567 DAG.getRegister(X86StackPtr, SPTy), 5568 Flag }; 5569 Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops, 5); 5570 Flag = Chain.getValue(1); 5571 5572 Chain = DAG.getCALLSEQ_END(Chain, 5573 DAG.getIntPtrConstant(0, true), 5574 DAG.getIntPtrConstant(0, true), 5575 Flag); 5576 5577 Chain = DAG.getCopyFromReg(Chain, dl, X86StackPtr, SPTy).getValue(1); 5578 5579 SDValue Ops1[2] = { Chain.getValue(0), Chain }; 5580 return DAG.getMergeValues(Ops1, 2, dl); 5581} 5582 5583SDValue 5584X86TargetLowering::EmitTargetCodeForMemset(SelectionDAG &DAG, DebugLoc dl, 5585 SDValue Chain, 5586 SDValue Dst, SDValue Src, 5587 SDValue Size, unsigned Align, 5588 const Value *DstSV, 5589 uint64_t DstSVOff) { 5590 ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size); 5591 5592 // If not DWORD aligned or size is more than the threshold, call the library. 5593 // The libc version is likely to be faster for these cases. It can use the 5594 // address value and run time information about the CPU. 5595 if ((Align & 3) != 0 || 5596 !ConstantSize || 5597 ConstantSize->getZExtValue() > 5598 getSubtarget()->getMaxInlineSizeThreshold()) { 5599 SDValue InFlag(0, 0); 5600 5601 // Check to see if there is a specialized entry-point for memory zeroing. 5602 ConstantSDNode *V = dyn_cast<ConstantSDNode>(Src); 5603 5604 if (const char *bzeroEntry = V && 5605 V->isNullValue() ? Subtarget->getBZeroEntry() : 0) { 5606 MVT IntPtr = getPointerTy(); 5607 const Type *IntPtrTy = TD->getIntPtrType(); 5608 TargetLowering::ArgListTy Args; 5609 TargetLowering::ArgListEntry Entry; 5610 Entry.Node = Dst; 5611 Entry.Ty = IntPtrTy; 5612 Args.push_back(Entry); 5613 Entry.Node = Size; 5614 Args.push_back(Entry); 5615 std::pair<SDValue,SDValue> CallResult = 5616 LowerCallTo(Chain, Type::VoidTy, false, false, false, false, 5617 CallingConv::C, false, 5618 DAG.getExternalSymbol(bzeroEntry, IntPtr), Args, DAG, dl); 5619 return CallResult.second; 5620 } 5621 5622 // Otherwise have the target-independent code call memset. 5623 return SDValue(); 5624 } 5625 5626 uint64_t SizeVal = ConstantSize->getZExtValue(); 5627 SDValue InFlag(0, 0); 5628 MVT AVT; 5629 SDValue Count; 5630 ConstantSDNode *ValC = dyn_cast<ConstantSDNode>(Src); 5631 unsigned BytesLeft = 0; 5632 bool TwoRepStos = false; 5633 if (ValC) { 5634 unsigned ValReg; 5635 uint64_t Val = ValC->getZExtValue() & 255; 5636 5637 // If the value is a constant, then we can potentially use larger sets. 5638 switch (Align & 3) { 5639 case 2: // WORD aligned 5640 AVT = MVT::i16; 5641 ValReg = X86::AX; 5642 Val = (Val << 8) | Val; 5643 break; 5644 case 0: // DWORD aligned 5645 AVT = MVT::i32; 5646 ValReg = X86::EAX; 5647 Val = (Val << 8) | Val; 5648 Val = (Val << 16) | Val; 5649 if (Subtarget->is64Bit() && ((Align & 0x7) == 0)) { // QWORD aligned 5650 AVT = MVT::i64; 5651 ValReg = X86::RAX; 5652 Val = (Val << 32) | Val; 5653 } 5654 break; 5655 default: // Byte aligned 5656 AVT = MVT::i8; 5657 ValReg = X86::AL; 5658 Count = DAG.getIntPtrConstant(SizeVal); 5659 break; 5660 } 5661 5662 if (AVT.bitsGT(MVT::i8)) { 5663 unsigned UBytes = AVT.getSizeInBits() / 8; 5664 Count = DAG.getIntPtrConstant(SizeVal / UBytes); 5665 BytesLeft = SizeVal % UBytes; 5666 } 5667 5668 Chain = DAG.getCopyToReg(Chain, dl, ValReg, DAG.getConstant(Val, AVT), 5669 InFlag); 5670 InFlag = Chain.getValue(1); 5671 } else { 5672 AVT = MVT::i8; 5673 Count = DAG.getIntPtrConstant(SizeVal); 5674 Chain = DAG.getCopyToReg(Chain, dl, X86::AL, Src, InFlag); 5675 InFlag = Chain.getValue(1); 5676 } 5677 5678 Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RCX : 5679 X86::ECX, 5680 Count, InFlag); 5681 InFlag = Chain.getValue(1); 5682 Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RDI : 5683 X86::EDI, 5684 Dst, InFlag); 5685 InFlag = Chain.getValue(1); 5686 5687 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag); 5688 SmallVector<SDValue, 8> Ops; 5689 Ops.push_back(Chain); 5690 Ops.push_back(DAG.getValueType(AVT)); 5691 Ops.push_back(InFlag); 5692 Chain = DAG.getNode(X86ISD::REP_STOS, dl, Tys, &Ops[0], Ops.size()); 5693 5694 if (TwoRepStos) { 5695 InFlag = Chain.getValue(1); 5696 Count = Size; 5697 MVT CVT = Count.getValueType(); 5698 SDValue Left = DAG.getNode(ISD::AND, dl, CVT, Count, 5699 DAG.getConstant((AVT == MVT::i64) ? 7 : 3, CVT)); 5700 Chain = DAG.getCopyToReg(Chain, dl, (CVT == MVT::i64) ? X86::RCX : 5701 X86::ECX, 5702 Left, InFlag); 5703 InFlag = Chain.getValue(1); 5704 Tys = DAG.getVTList(MVT::Other, MVT::Flag); 5705 Ops.clear(); 5706 Ops.push_back(Chain); 5707 Ops.push_back(DAG.getValueType(MVT::i8)); 5708 Ops.push_back(InFlag); 5709 Chain = DAG.getNode(X86ISD::REP_STOS, dl, Tys, &Ops[0], Ops.size()); 5710 } else if (BytesLeft) { 5711 // Handle the last 1 - 7 bytes. 5712 unsigned Offset = SizeVal - BytesLeft; 5713 MVT AddrVT = Dst.getValueType(); 5714 MVT SizeVT = Size.getValueType(); 5715 5716 Chain = DAG.getMemset(Chain, dl, 5717 DAG.getNode(ISD::ADD, dl, AddrVT, Dst, 5718 DAG.getConstant(Offset, AddrVT)), 5719 Src, 5720 DAG.getConstant(BytesLeft, SizeVT), 5721 Align, DstSV, DstSVOff + Offset); 5722 } 5723 5724 // TODO: Use a Tokenfactor, as in memcpy, instead of a single chain. 5725 return Chain; 5726} 5727 5728SDValue 5729X86TargetLowering::EmitTargetCodeForMemcpy(SelectionDAG &DAG, DebugLoc dl, 5730 SDValue Chain, SDValue Dst, SDValue Src, 5731 SDValue Size, unsigned Align, 5732 bool AlwaysInline, 5733 const Value *DstSV, uint64_t DstSVOff, 5734 const Value *SrcSV, uint64_t SrcSVOff) { 5735 // This requires the copy size to be a constant, preferrably 5736 // within a subtarget-specific limit. 5737 ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size); 5738 if (!ConstantSize) 5739 return SDValue(); 5740 uint64_t SizeVal = ConstantSize->getZExtValue(); 5741 if (!AlwaysInline && SizeVal > getSubtarget()->getMaxInlineSizeThreshold()) 5742 return SDValue(); 5743 5744 /// If not DWORD aligned, call the library. 5745 if ((Align & 3) != 0) 5746 return SDValue(); 5747 5748 // DWORD aligned 5749 MVT AVT = MVT::i32; 5750 if (Subtarget->is64Bit() && ((Align & 0x7) == 0)) // QWORD aligned 5751 AVT = MVT::i64; 5752 5753 unsigned UBytes = AVT.getSizeInBits() / 8; 5754 unsigned CountVal = SizeVal / UBytes; 5755 SDValue Count = DAG.getIntPtrConstant(CountVal); 5756 unsigned BytesLeft = SizeVal % UBytes; 5757 5758 SDValue InFlag(0, 0); 5759 Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RCX : 5760 X86::ECX, 5761 Count, InFlag); 5762 InFlag = Chain.getValue(1); 5763 Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RDI : 5764 X86::EDI, 5765 Dst, InFlag); 5766 InFlag = Chain.getValue(1); 5767 Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RSI : 5768 X86::ESI, 5769 Src, InFlag); 5770 InFlag = Chain.getValue(1); 5771 5772 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag); 5773 SmallVector<SDValue, 8> Ops; 5774 Ops.push_back(Chain); 5775 Ops.push_back(DAG.getValueType(AVT)); 5776 Ops.push_back(InFlag); 5777 SDValue RepMovs = DAG.getNode(X86ISD::REP_MOVS, dl, Tys, &Ops[0], Ops.size()); 5778 5779 SmallVector<SDValue, 4> Results; 5780 Results.push_back(RepMovs); 5781 if (BytesLeft) { 5782 // Handle the last 1 - 7 bytes. 5783 unsigned Offset = SizeVal - BytesLeft; 5784 MVT DstVT = Dst.getValueType(); 5785 MVT SrcVT = Src.getValueType(); 5786 MVT SizeVT = Size.getValueType(); 5787 Results.push_back(DAG.getMemcpy(Chain, dl, 5788 DAG.getNode(ISD::ADD, dl, DstVT, Dst, 5789 DAG.getConstant(Offset, DstVT)), 5790 DAG.getNode(ISD::ADD, dl, SrcVT, Src, 5791 DAG.getConstant(Offset, SrcVT)), 5792 DAG.getConstant(BytesLeft, SizeVT), 5793 Align, AlwaysInline, 5794 DstSV, DstSVOff + Offset, 5795 SrcSV, SrcSVOff + Offset)); 5796 } 5797 5798 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, 5799 &Results[0], Results.size()); 5800} 5801 5802SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) { 5803 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); 5804 DebugLoc dl = Op.getDebugLoc(); 5805 5806 if (!Subtarget->is64Bit()) { 5807 // vastart just stores the address of the VarArgsFrameIndex slot into the 5808 // memory location argument. 5809 SDValue FR = DAG.getFrameIndex(VarArgsFrameIndex, getPointerTy()); 5810 return DAG.getStore(Op.getOperand(0), dl, FR, Op.getOperand(1), SV, 0); 5811 } 5812 5813 // __va_list_tag: 5814 // gp_offset (0 - 6 * 8) 5815 // fp_offset (48 - 48 + 8 * 16) 5816 // overflow_arg_area (point to parameters coming in memory). 5817 // reg_save_area 5818 SmallVector<SDValue, 8> MemOps; 5819 SDValue FIN = Op.getOperand(1); 5820 // Store gp_offset 5821 SDValue Store = DAG.getStore(Op.getOperand(0), dl, 5822 DAG.getConstant(VarArgsGPOffset, MVT::i32), 5823 FIN, SV, 0); 5824 MemOps.push_back(Store); 5825 5826 // Store fp_offset 5827 FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), 5828 FIN, DAG.getIntPtrConstant(4)); 5829 Store = DAG.getStore(Op.getOperand(0), dl, 5830 DAG.getConstant(VarArgsFPOffset, MVT::i32), 5831 FIN, SV, 0); 5832 MemOps.push_back(Store); 5833 5834 // Store ptr to overflow_arg_area 5835 FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), 5836 FIN, DAG.getIntPtrConstant(4)); 5837 SDValue OVFIN = DAG.getFrameIndex(VarArgsFrameIndex, getPointerTy()); 5838 Store = DAG.getStore(Op.getOperand(0), dl, OVFIN, FIN, SV, 0); 5839 MemOps.push_back(Store); 5840 5841 // Store ptr to reg_save_area. 5842 FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), 5843 FIN, DAG.getIntPtrConstant(8)); 5844 SDValue RSFIN = DAG.getFrameIndex(RegSaveFrameIndex, getPointerTy()); 5845 Store = DAG.getStore(Op.getOperand(0), dl, RSFIN, FIN, SV, 0); 5846 MemOps.push_back(Store); 5847 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, 5848 &MemOps[0], MemOps.size()); 5849} 5850 5851SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) { 5852 // X86-64 va_list is a struct { i32, i32, i8*, i8* }. 5853 assert(Subtarget->is64Bit() && "This code only handles 64-bit va_arg!"); 5854 SDValue Chain = Op.getOperand(0); 5855 SDValue SrcPtr = Op.getOperand(1); 5856 SDValue SrcSV = Op.getOperand(2); 5857 5858 assert(0 && "VAArgInst is not yet implemented for x86-64!"); 5859 abort(); 5860 return SDValue(); 5861} 5862 5863SDValue X86TargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG) { 5864 // X86-64 va_list is a struct { i32, i32, i8*, i8* }. 5865 assert(Subtarget->is64Bit() && "This code only handles 64-bit va_copy!"); 5866 SDValue Chain = Op.getOperand(0); 5867 SDValue DstPtr = Op.getOperand(1); 5868 SDValue SrcPtr = Op.getOperand(2); 5869 const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue(); 5870 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue(); 5871 DebugLoc dl = Op.getDebugLoc(); 5872 5873 return DAG.getMemcpy(Chain, dl, DstPtr, SrcPtr, 5874 DAG.getIntPtrConstant(24), 8, false, 5875 DstSV, 0, SrcSV, 0); 5876} 5877 5878SDValue 5879X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) { 5880 DebugLoc dl = Op.getDebugLoc(); 5881 unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 5882 switch (IntNo) { 5883 default: return SDValue(); // Don't custom lower most intrinsics. 5884 // Comparison intrinsics. 5885 case Intrinsic::x86_sse_comieq_ss: 5886 case Intrinsic::x86_sse_comilt_ss: 5887 case Intrinsic::x86_sse_comile_ss: 5888 case Intrinsic::x86_sse_comigt_ss: 5889 case Intrinsic::x86_sse_comige_ss: 5890 case Intrinsic::x86_sse_comineq_ss: 5891 case Intrinsic::x86_sse_ucomieq_ss: 5892 case Intrinsic::x86_sse_ucomilt_ss: 5893 case Intrinsic::x86_sse_ucomile_ss: 5894 case Intrinsic::x86_sse_ucomigt_ss: 5895 case Intrinsic::x86_sse_ucomige_ss: 5896 case Intrinsic::x86_sse_ucomineq_ss: 5897 case Intrinsic::x86_sse2_comieq_sd: 5898 case Intrinsic::x86_sse2_comilt_sd: 5899 case Intrinsic::x86_sse2_comile_sd: 5900 case Intrinsic::x86_sse2_comigt_sd: 5901 case Intrinsic::x86_sse2_comige_sd: 5902 case Intrinsic::x86_sse2_comineq_sd: 5903 case Intrinsic::x86_sse2_ucomieq_sd: 5904 case Intrinsic::x86_sse2_ucomilt_sd: 5905 case Intrinsic::x86_sse2_ucomile_sd: 5906 case Intrinsic::x86_sse2_ucomigt_sd: 5907 case Intrinsic::x86_sse2_ucomige_sd: 5908 case Intrinsic::x86_sse2_ucomineq_sd: { 5909 unsigned Opc = 0; 5910 ISD::CondCode CC = ISD::SETCC_INVALID; 5911 switch (IntNo) { 5912 default: break; 5913 case Intrinsic::x86_sse_comieq_ss: 5914 case Intrinsic::x86_sse2_comieq_sd: 5915 Opc = X86ISD::COMI; 5916 CC = ISD::SETEQ; 5917 break; 5918 case Intrinsic::x86_sse_comilt_ss: 5919 case Intrinsic::x86_sse2_comilt_sd: 5920 Opc = X86ISD::COMI; 5921 CC = ISD::SETLT; 5922 break; 5923 case Intrinsic::x86_sse_comile_ss: 5924 case Intrinsic::x86_sse2_comile_sd: 5925 Opc = X86ISD::COMI; 5926 CC = ISD::SETLE; 5927 break; 5928 case Intrinsic::x86_sse_comigt_ss: 5929 case Intrinsic::x86_sse2_comigt_sd: 5930 Opc = X86ISD::COMI; 5931 CC = ISD::SETGT; 5932 break; 5933 case Intrinsic::x86_sse_comige_ss: 5934 case Intrinsic::x86_sse2_comige_sd: 5935 Opc = X86ISD::COMI; 5936 CC = ISD::SETGE; 5937 break; 5938 case Intrinsic::x86_sse_comineq_ss: 5939 case Intrinsic::x86_sse2_comineq_sd: 5940 Opc = X86ISD::COMI; 5941 CC = ISD::SETNE; 5942 break; 5943 case Intrinsic::x86_sse_ucomieq_ss: 5944 case Intrinsic::x86_sse2_ucomieq_sd: 5945 Opc = X86ISD::UCOMI; 5946 CC = ISD::SETEQ; 5947 break; 5948 case Intrinsic::x86_sse_ucomilt_ss: 5949 case Intrinsic::x86_sse2_ucomilt_sd: 5950 Opc = X86ISD::UCOMI; 5951 CC = ISD::SETLT; 5952 break; 5953 case Intrinsic::x86_sse_ucomile_ss: 5954 case Intrinsic::x86_sse2_ucomile_sd: 5955 Opc = X86ISD::UCOMI; 5956 CC = ISD::SETLE; 5957 break; 5958 case Intrinsic::x86_sse_ucomigt_ss: 5959 case Intrinsic::x86_sse2_ucomigt_sd: 5960 Opc = X86ISD::UCOMI; 5961 CC = ISD::SETGT; 5962 break; 5963 case Intrinsic::x86_sse_ucomige_ss: 5964 case Intrinsic::x86_sse2_ucomige_sd: 5965 Opc = X86ISD::UCOMI; 5966 CC = ISD::SETGE; 5967 break; 5968 case Intrinsic::x86_sse_ucomineq_ss: 5969 case Intrinsic::x86_sse2_ucomineq_sd: 5970 Opc = X86ISD::UCOMI; 5971 CC = ISD::SETNE; 5972 break; 5973 } 5974 5975 SDValue LHS = Op.getOperand(1); 5976 SDValue RHS = Op.getOperand(2); 5977 unsigned X86CC = TranslateX86CC(CC, true, LHS, RHS, DAG); 5978 SDValue Cond = DAG.getNode(Opc, dl, MVT::i32, LHS, RHS); 5979 SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 5980 DAG.getConstant(X86CC, MVT::i8), Cond); 5981 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); 5982 } 5983 5984 // Fix vector shift instructions where the last operand is a non-immediate 5985 // i32 value. 5986 case Intrinsic::x86_sse2_pslli_w: 5987 case Intrinsic::x86_sse2_pslli_d: 5988 case Intrinsic::x86_sse2_pslli_q: 5989 case Intrinsic::x86_sse2_psrli_w: 5990 case Intrinsic::x86_sse2_psrli_d: 5991 case Intrinsic::x86_sse2_psrli_q: 5992 case Intrinsic::x86_sse2_psrai_w: 5993 case Intrinsic::x86_sse2_psrai_d: 5994 case Intrinsic::x86_mmx_pslli_w: 5995 case Intrinsic::x86_mmx_pslli_d: 5996 case Intrinsic::x86_mmx_pslli_q: 5997 case Intrinsic::x86_mmx_psrli_w: 5998 case Intrinsic::x86_mmx_psrli_d: 5999 case Intrinsic::x86_mmx_psrli_q: 6000 case Intrinsic::x86_mmx_psrai_w: 6001 case Intrinsic::x86_mmx_psrai_d: { 6002 SDValue ShAmt = Op.getOperand(2); 6003 if (isa<ConstantSDNode>(ShAmt)) 6004 return SDValue(); 6005 6006 unsigned NewIntNo = 0; 6007 MVT ShAmtVT = MVT::v4i32; 6008 switch (IntNo) { 6009 case Intrinsic::x86_sse2_pslli_w: 6010 NewIntNo = Intrinsic::x86_sse2_psll_w; 6011 break; 6012 case Intrinsic::x86_sse2_pslli_d: 6013 NewIntNo = Intrinsic::x86_sse2_psll_d; 6014 break; 6015 case Intrinsic::x86_sse2_pslli_q: 6016 NewIntNo = Intrinsic::x86_sse2_psll_q; 6017 break; 6018 case Intrinsic::x86_sse2_psrli_w: 6019 NewIntNo = Intrinsic::x86_sse2_psrl_w; 6020 break; 6021 case Intrinsic::x86_sse2_psrli_d: 6022 NewIntNo = Intrinsic::x86_sse2_psrl_d; 6023 break; 6024 case Intrinsic::x86_sse2_psrli_q: 6025 NewIntNo = Intrinsic::x86_sse2_psrl_q; 6026 break; 6027 case Intrinsic::x86_sse2_psrai_w: 6028 NewIntNo = Intrinsic::x86_sse2_psra_w; 6029 break; 6030 case Intrinsic::x86_sse2_psrai_d: 6031 NewIntNo = Intrinsic::x86_sse2_psra_d; 6032 break; 6033 default: { 6034 ShAmtVT = MVT::v2i32; 6035 switch (IntNo) { 6036 case Intrinsic::x86_mmx_pslli_w: 6037 NewIntNo = Intrinsic::x86_mmx_psll_w; 6038 break; 6039 case Intrinsic::x86_mmx_pslli_d: 6040 NewIntNo = Intrinsic::x86_mmx_psll_d; 6041 break; 6042 case Intrinsic::x86_mmx_pslli_q: 6043 NewIntNo = Intrinsic::x86_mmx_psll_q; 6044 break; 6045 case Intrinsic::x86_mmx_psrli_w: 6046 NewIntNo = Intrinsic::x86_mmx_psrl_w; 6047 break; 6048 case Intrinsic::x86_mmx_psrli_d: 6049 NewIntNo = Intrinsic::x86_mmx_psrl_d; 6050 break; 6051 case Intrinsic::x86_mmx_psrli_q: 6052 NewIntNo = Intrinsic::x86_mmx_psrl_q; 6053 break; 6054 case Intrinsic::x86_mmx_psrai_w: 6055 NewIntNo = Intrinsic::x86_mmx_psra_w; 6056 break; 6057 case Intrinsic::x86_mmx_psrai_d: 6058 NewIntNo = Intrinsic::x86_mmx_psra_d; 6059 break; 6060 default: abort(); // Can't reach here. 6061 } 6062 break; 6063 } 6064 } 6065 MVT VT = Op.getValueType(); 6066 ShAmt = DAG.getNode(ISD::BIT_CONVERT, dl, VT, 6067 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShAmtVT, ShAmt)); 6068 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 6069 DAG.getConstant(NewIntNo, MVT::i32), 6070 Op.getOperand(1), ShAmt); 6071 } 6072 } 6073} 6074 6075SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) { 6076 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 6077 DebugLoc dl = Op.getDebugLoc(); 6078 6079 if (Depth > 0) { 6080 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG); 6081 SDValue Offset = 6082 DAG.getConstant(TD->getPointerSize(), 6083 Subtarget->is64Bit() ? MVT::i64 : MVT::i32); 6084 return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), 6085 DAG.getNode(ISD::ADD, dl, getPointerTy(), 6086 FrameAddr, Offset), 6087 NULL, 0); 6088 } 6089 6090 // Just load the return address. 6091 SDValue RetAddrFI = getReturnAddressFrameIndex(DAG); 6092 return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), 6093 RetAddrFI, NULL, 0); 6094} 6095 6096SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) { 6097 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 6098 MFI->setFrameAddressIsTaken(true); 6099 MVT VT = Op.getValueType(); 6100 DebugLoc dl = Op.getDebugLoc(); // FIXME probably not meaningful 6101 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 6102 unsigned FrameReg = Subtarget->is64Bit() ? X86::RBP : X86::EBP; 6103 SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT); 6104 while (Depth--) 6105 FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr, NULL, 0); 6106 return FrameAddr; 6107} 6108 6109SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op, 6110 SelectionDAG &DAG) { 6111 return DAG.getIntPtrConstant(2*TD->getPointerSize()); 6112} 6113 6114SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) 6115{ 6116 MachineFunction &MF = DAG.getMachineFunction(); 6117 SDValue Chain = Op.getOperand(0); 6118 SDValue Offset = Op.getOperand(1); 6119 SDValue Handler = Op.getOperand(2); 6120 DebugLoc dl = Op.getDebugLoc(); 6121 6122 SDValue Frame = DAG.getRegister(Subtarget->is64Bit() ? X86::RBP : X86::EBP, 6123 getPointerTy()); 6124 unsigned StoreAddrReg = (Subtarget->is64Bit() ? X86::RCX : X86::ECX); 6125 6126 SDValue StoreAddr = DAG.getNode(ISD::SUB, dl, getPointerTy(), Frame, 6127 DAG.getIntPtrConstant(-TD->getPointerSize())); 6128 StoreAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), StoreAddr, Offset); 6129 Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, NULL, 0); 6130 Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr); 6131 MF.getRegInfo().addLiveOut(StoreAddrReg); 6132 6133 return DAG.getNode(X86ISD::EH_RETURN, dl, 6134 MVT::Other, 6135 Chain, DAG.getRegister(StoreAddrReg, getPointerTy())); 6136} 6137 6138SDValue X86TargetLowering::LowerTRAMPOLINE(SDValue Op, 6139 SelectionDAG &DAG) { 6140 SDValue Root = Op.getOperand(0); 6141 SDValue Trmp = Op.getOperand(1); // trampoline 6142 SDValue FPtr = Op.getOperand(2); // nested function 6143 SDValue Nest = Op.getOperand(3); // 'nest' parameter value 6144 DebugLoc dl = Op.getDebugLoc(); 6145 6146 const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue(); 6147 6148 const X86InstrInfo *TII = 6149 ((X86TargetMachine&)getTargetMachine()).getInstrInfo(); 6150 6151 if (Subtarget->is64Bit()) { 6152 SDValue OutChains[6]; 6153 6154 // Large code-model. 6155 6156 const unsigned char JMP64r = TII->getBaseOpcodeFor(X86::JMP64r); 6157 const unsigned char MOV64ri = TII->getBaseOpcodeFor(X86::MOV64ri); 6158 6159 const unsigned char N86R10 = RegInfo->getX86RegNum(X86::R10); 6160 const unsigned char N86R11 = RegInfo->getX86RegNum(X86::R11); 6161 6162 const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix 6163 6164 // Load the pointer to the nested function into R11. 6165 unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11 6166 SDValue Addr = Trmp; 6167 OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16), 6168 Addr, TrmpAddr, 0); 6169 6170 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 6171 DAG.getConstant(2, MVT::i64)); 6172 OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr, TrmpAddr, 2, false, 2); 6173 6174 // Load the 'nest' parameter value into R10. 6175 // R10 is specified in X86CallingConv.td 6176 OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10 6177 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 6178 DAG.getConstant(10, MVT::i64)); 6179 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16), 6180 Addr, TrmpAddr, 10); 6181 6182 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 6183 DAG.getConstant(12, MVT::i64)); 6184 OutChains[3] = DAG.getStore(Root, dl, Nest, Addr, TrmpAddr, 12, false, 2); 6185 6186 // Jump to the nested function. 6187 OpCode = (JMP64r << 8) | REX_WB; // jmpq *... 6188 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 6189 DAG.getConstant(20, MVT::i64)); 6190 OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16), 6191 Addr, TrmpAddr, 20); 6192 6193 unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11 6194 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 6195 DAG.getConstant(22, MVT::i64)); 6196 OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, MVT::i8), Addr, 6197 TrmpAddr, 22); 6198 6199 SDValue Ops[] = 6200 { Trmp, DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 6) }; 6201 return DAG.getMergeValues(Ops, 2, dl); 6202 } else { 6203 const Function *Func = 6204 cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue()); 6205 unsigned CC = Func->getCallingConv(); 6206 unsigned NestReg; 6207 6208 switch (CC) { 6209 default: 6210 assert(0 && "Unsupported calling convention"); 6211 case CallingConv::C: 6212 case CallingConv::X86_StdCall: { 6213 // Pass 'nest' parameter in ECX. 6214 // Must be kept in sync with X86CallingConv.td 6215 NestReg = X86::ECX; 6216 6217 // Check that ECX wasn't needed by an 'inreg' parameter. 6218 const FunctionType *FTy = Func->getFunctionType(); 6219 const AttrListPtr &Attrs = Func->getAttributes(); 6220 6221 if (!Attrs.isEmpty() && !Func->isVarArg()) { 6222 unsigned InRegCount = 0; 6223 unsigned Idx = 1; 6224 6225 for (FunctionType::param_iterator I = FTy->param_begin(), 6226 E = FTy->param_end(); I != E; ++I, ++Idx) 6227 if (Attrs.paramHasAttr(Idx, Attribute::InReg)) 6228 // FIXME: should only count parameters that are lowered to integers. 6229 InRegCount += (TD->getTypeSizeInBits(*I) + 31) / 32; 6230 6231 if (InRegCount > 2) { 6232 cerr << "Nest register in use - reduce number of inreg parameters!\n"; 6233 abort(); 6234 } 6235 } 6236 break; 6237 } 6238 case CallingConv::X86_FastCall: 6239 case CallingConv::Fast: 6240 // Pass 'nest' parameter in EAX. 6241 // Must be kept in sync with X86CallingConv.td 6242 NestReg = X86::EAX; 6243 break; 6244 } 6245 6246 SDValue OutChains[4]; 6247 SDValue Addr, Disp; 6248 6249 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, 6250 DAG.getConstant(10, MVT::i32)); 6251 Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr); 6252 6253 const unsigned char MOV32ri = TII->getBaseOpcodeFor(X86::MOV32ri); 6254 const unsigned char N86Reg = RegInfo->getX86RegNum(NestReg); 6255 OutChains[0] = DAG.getStore(Root, dl, 6256 DAG.getConstant(MOV32ri|N86Reg, MVT::i8), 6257 Trmp, TrmpAddr, 0); 6258 6259 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, 6260 DAG.getConstant(1, MVT::i32)); 6261 OutChains[1] = DAG.getStore(Root, dl, Nest, Addr, TrmpAddr, 1, false, 1); 6262 6263 const unsigned char JMP = TII->getBaseOpcodeFor(X86::JMP); 6264 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, 6265 DAG.getConstant(5, MVT::i32)); 6266 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, MVT::i8), Addr, 6267 TrmpAddr, 5, false, 1); 6268 6269 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, 6270 DAG.getConstant(6, MVT::i32)); 6271 OutChains[3] = DAG.getStore(Root, dl, Disp, Addr, TrmpAddr, 6, false, 1); 6272 6273 SDValue Ops[] = 6274 { Trmp, DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 4) }; 6275 return DAG.getMergeValues(Ops, 2, dl); 6276 } 6277} 6278 6279SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG) { 6280 /* 6281 The rounding mode is in bits 11:10 of FPSR, and has the following 6282 settings: 6283 00 Round to nearest 6284 01 Round to -inf 6285 10 Round to +inf 6286 11 Round to 0 6287 6288 FLT_ROUNDS, on the other hand, expects the following: 6289 -1 Undefined 6290 0 Round to 0 6291 1 Round to nearest 6292 2 Round to +inf 6293 3 Round to -inf 6294 6295 To perform the conversion, we do: 6296 (((((FPSR & 0x800) >> 11) | ((FPSR & 0x400) >> 9)) + 1) & 3) 6297 */ 6298 6299 MachineFunction &MF = DAG.getMachineFunction(); 6300 const TargetMachine &TM = MF.getTarget(); 6301 const TargetFrameInfo &TFI = *TM.getFrameInfo(); 6302 unsigned StackAlignment = TFI.getStackAlignment(); 6303 MVT VT = Op.getValueType(); 6304 DebugLoc dl = Op.getDebugLoc(); 6305 6306 // Save FP Control Word to stack slot 6307 int SSFI = MF.getFrameInfo()->CreateStackObject(2, StackAlignment); 6308 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 6309 6310 SDValue Chain = DAG.getNode(X86ISD::FNSTCW16m, dl, MVT::Other, 6311 DAG.getEntryNode(), StackSlot); 6312 6313 // Load FP Control Word from stack slot 6314 SDValue CWD = DAG.getLoad(MVT::i16, dl, Chain, StackSlot, NULL, 0); 6315 6316 // Transform as necessary 6317 SDValue CWD1 = 6318 DAG.getNode(ISD::SRL, dl, MVT::i16, 6319 DAG.getNode(ISD::AND, dl, MVT::i16, 6320 CWD, DAG.getConstant(0x800, MVT::i16)), 6321 DAG.getConstant(11, MVT::i8)); 6322 SDValue CWD2 = 6323 DAG.getNode(ISD::SRL, dl, MVT::i16, 6324 DAG.getNode(ISD::AND, dl, MVT::i16, 6325 CWD, DAG.getConstant(0x400, MVT::i16)), 6326 DAG.getConstant(9, MVT::i8)); 6327 6328 SDValue RetVal = 6329 DAG.getNode(ISD::AND, dl, MVT::i16, 6330 DAG.getNode(ISD::ADD, dl, MVT::i16, 6331 DAG.getNode(ISD::OR, dl, MVT::i16, CWD1, CWD2), 6332 DAG.getConstant(1, MVT::i16)), 6333 DAG.getConstant(3, MVT::i16)); 6334 6335 6336 return DAG.getNode((VT.getSizeInBits() < 16 ? 6337 ISD::TRUNCATE : ISD::ZERO_EXTEND), dl, VT, RetVal); 6338} 6339 6340SDValue X86TargetLowering::LowerCTLZ(SDValue Op, SelectionDAG &DAG) { 6341 MVT VT = Op.getValueType(); 6342 MVT OpVT = VT; 6343 unsigned NumBits = VT.getSizeInBits(); 6344 DebugLoc dl = Op.getDebugLoc(); 6345 6346 Op = Op.getOperand(0); 6347 if (VT == MVT::i8) { 6348 // Zero extend to i32 since there is not an i8 bsr. 6349 OpVT = MVT::i32; 6350 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op); 6351 } 6352 6353 // Issue a bsr (scan bits in reverse) which also sets EFLAGS. 6354 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32); 6355 Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op); 6356 6357 // If src is zero (i.e. bsr sets ZF), returns NumBits. 6358 SmallVector<SDValue, 4> Ops; 6359 Ops.push_back(Op); 6360 Ops.push_back(DAG.getConstant(NumBits+NumBits-1, OpVT)); 6361 Ops.push_back(DAG.getConstant(X86::COND_E, MVT::i8)); 6362 Ops.push_back(Op.getValue(1)); 6363 Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, &Ops[0], 4); 6364 6365 // Finally xor with NumBits-1. 6366 Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits-1, OpVT)); 6367 6368 if (VT == MVT::i8) 6369 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op); 6370 return Op; 6371} 6372 6373SDValue X86TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) { 6374 MVT VT = Op.getValueType(); 6375 MVT OpVT = VT; 6376 unsigned NumBits = VT.getSizeInBits(); 6377 DebugLoc dl = Op.getDebugLoc(); 6378 6379 Op = Op.getOperand(0); 6380 if (VT == MVT::i8) { 6381 OpVT = MVT::i32; 6382 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op); 6383 } 6384 6385 // Issue a bsf (scan bits forward) which also sets EFLAGS. 6386 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32); 6387 Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op); 6388 6389 // If src is zero (i.e. bsf sets ZF), returns NumBits. 6390 SmallVector<SDValue, 4> Ops; 6391 Ops.push_back(Op); 6392 Ops.push_back(DAG.getConstant(NumBits, OpVT)); 6393 Ops.push_back(DAG.getConstant(X86::COND_E, MVT::i8)); 6394 Ops.push_back(Op.getValue(1)); 6395 Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, &Ops[0], 4); 6396 6397 if (VT == MVT::i8) 6398 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op); 6399 return Op; 6400} 6401 6402SDValue X86TargetLowering::LowerMUL_V2I64(SDValue Op, SelectionDAG &DAG) { 6403 MVT VT = Op.getValueType(); 6404 assert(VT == MVT::v2i64 && "Only know how to lower V2I64 multiply"); 6405 DebugLoc dl = Op.getDebugLoc(); 6406 6407 // ulong2 Ahi = __builtin_ia32_psrlqi128( a, 32); 6408 // ulong2 Bhi = __builtin_ia32_psrlqi128( b, 32); 6409 // ulong2 AloBlo = __builtin_ia32_pmuludq128( a, b ); 6410 // ulong2 AloBhi = __builtin_ia32_pmuludq128( a, Bhi ); 6411 // ulong2 AhiBlo = __builtin_ia32_pmuludq128( Ahi, b ); 6412 // 6413 // AloBhi = __builtin_ia32_psllqi128( AloBhi, 32 ); 6414 // AhiBlo = __builtin_ia32_psllqi128( AhiBlo, 32 ); 6415 // return AloBlo + AloBhi + AhiBlo; 6416 6417 SDValue A = Op.getOperand(0); 6418 SDValue B = Op.getOperand(1); 6419 6420 SDValue Ahi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 6421 DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32), 6422 A, DAG.getConstant(32, MVT::i32)); 6423 SDValue Bhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 6424 DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32), 6425 B, DAG.getConstant(32, MVT::i32)); 6426 SDValue AloBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 6427 DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32), 6428 A, B); 6429 SDValue AloBhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 6430 DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32), 6431 A, Bhi); 6432 SDValue AhiBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 6433 DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32), 6434 Ahi, B); 6435 AloBhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 6436 DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32), 6437 AloBhi, DAG.getConstant(32, MVT::i32)); 6438 AhiBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 6439 DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32), 6440 AhiBlo, DAG.getConstant(32, MVT::i32)); 6441 SDValue Res = DAG.getNode(ISD::ADD, dl, VT, AloBlo, AloBhi); 6442 Res = DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo); 6443 return Res; 6444} 6445 6446 6447SDValue X86TargetLowering::LowerXALUO(SDValue Op, SelectionDAG &DAG) { 6448 // Lower the "add/sub/mul with overflow" instruction into a regular ins plus 6449 // a "setcc" instruction that checks the overflow flag. The "brcond" lowering 6450 // looks for this combo and may remove the "setcc" instruction if the "setcc" 6451 // has only one use. 6452 SDNode *N = Op.getNode(); 6453 SDValue LHS = N->getOperand(0); 6454 SDValue RHS = N->getOperand(1); 6455 unsigned BaseOp = 0; 6456 unsigned Cond = 0; 6457 DebugLoc dl = Op.getDebugLoc(); 6458 6459 switch (Op.getOpcode()) { 6460 default: assert(0 && "Unknown ovf instruction!"); 6461 case ISD::SADDO: 6462 // A subtract of one will be selected as a INC. Note that INC doesn't 6463 // set CF, so we can't do this for UADDO. 6464 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) 6465 if (C->getAPIntValue() == 1) { 6466 BaseOp = X86ISD::INC; 6467 Cond = X86::COND_O; 6468 break; 6469 } 6470 BaseOp = X86ISD::ADD; 6471 Cond = X86::COND_O; 6472 break; 6473 case ISD::UADDO: 6474 BaseOp = X86ISD::ADD; 6475 Cond = X86::COND_B; 6476 break; 6477 case ISD::SSUBO: 6478 // A subtract of one will be selected as a DEC. Note that DEC doesn't 6479 // set CF, so we can't do this for USUBO. 6480 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) 6481 if (C->getAPIntValue() == 1) { 6482 BaseOp = X86ISD::DEC; 6483 Cond = X86::COND_O; 6484 break; 6485 } 6486 BaseOp = X86ISD::SUB; 6487 Cond = X86::COND_O; 6488 break; 6489 case ISD::USUBO: 6490 BaseOp = X86ISD::SUB; 6491 Cond = X86::COND_B; 6492 break; 6493 case ISD::SMULO: 6494 BaseOp = X86ISD::SMUL; 6495 Cond = X86::COND_O; 6496 break; 6497 case ISD::UMULO: 6498 BaseOp = X86ISD::UMUL; 6499 Cond = X86::COND_B; 6500 break; 6501 } 6502 6503 // Also sets EFLAGS. 6504 SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32); 6505 SDValue Sum = DAG.getNode(BaseOp, dl, VTs, LHS, RHS); 6506 6507 SDValue SetCC = 6508 DAG.getNode(X86ISD::SETCC, dl, N->getValueType(1), 6509 DAG.getConstant(Cond, MVT::i32), SDValue(Sum.getNode(), 1)); 6510 6511 DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), SetCC); 6512 return Sum; 6513} 6514 6515SDValue X86TargetLowering::LowerCMP_SWAP(SDValue Op, SelectionDAG &DAG) { 6516 MVT T = Op.getValueType(); 6517 DebugLoc dl = Op.getDebugLoc(); 6518 unsigned Reg = 0; 6519 unsigned size = 0; 6520 switch(T.getSimpleVT()) { 6521 default: 6522 assert(false && "Invalid value type!"); 6523 case MVT::i8: Reg = X86::AL; size = 1; break; 6524 case MVT::i16: Reg = X86::AX; size = 2; break; 6525 case MVT::i32: Reg = X86::EAX; size = 4; break; 6526 case MVT::i64: 6527 assert(Subtarget->is64Bit() && "Node not type legal!"); 6528 Reg = X86::RAX; size = 8; 6529 break; 6530 } 6531 SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), dl, Reg, 6532 Op.getOperand(2), SDValue()); 6533 SDValue Ops[] = { cpIn.getValue(0), 6534 Op.getOperand(1), 6535 Op.getOperand(3), 6536 DAG.getTargetConstant(size, MVT::i8), 6537 cpIn.getValue(1) }; 6538 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag); 6539 SDValue Result = DAG.getNode(X86ISD::LCMPXCHG_DAG, dl, Tys, Ops, 5); 6540 SDValue cpOut = 6541 DAG.getCopyFromReg(Result.getValue(0), dl, Reg, T, Result.getValue(1)); 6542 return cpOut; 6543} 6544 6545SDValue X86TargetLowering::LowerREADCYCLECOUNTER(SDValue Op, 6546 SelectionDAG &DAG) { 6547 assert(Subtarget->is64Bit() && "Result not type legalized?"); 6548 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag); 6549 SDValue TheChain = Op.getOperand(0); 6550 DebugLoc dl = Op.getDebugLoc(); 6551 SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1); 6552 SDValue rax = DAG.getCopyFromReg(rd, dl, X86::RAX, MVT::i64, rd.getValue(1)); 6553 SDValue rdx = DAG.getCopyFromReg(rax.getValue(1), dl, X86::RDX, MVT::i64, 6554 rax.getValue(2)); 6555 SDValue Tmp = DAG.getNode(ISD::SHL, dl, MVT::i64, rdx, 6556 DAG.getConstant(32, MVT::i8)); 6557 SDValue Ops[] = { 6558 DAG.getNode(ISD::OR, dl, MVT::i64, rax, Tmp), 6559 rdx.getValue(1) 6560 }; 6561 return DAG.getMergeValues(Ops, 2, dl); 6562} 6563 6564SDValue X86TargetLowering::LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) { 6565 SDNode *Node = Op.getNode(); 6566 DebugLoc dl = Node->getDebugLoc(); 6567 MVT T = Node->getValueType(0); 6568 SDValue negOp = DAG.getNode(ISD::SUB, dl, T, 6569 DAG.getConstant(0, T), Node->getOperand(2)); 6570 return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, dl, 6571 cast<AtomicSDNode>(Node)->getMemoryVT(), 6572 Node->getOperand(0), 6573 Node->getOperand(1), negOp, 6574 cast<AtomicSDNode>(Node)->getSrcValue(), 6575 cast<AtomicSDNode>(Node)->getAlignment()); 6576} 6577 6578/// LowerOperation - Provide custom lowering hooks for some operations. 6579/// 6580SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) { 6581 switch (Op.getOpcode()) { 6582 default: assert(0 && "Should not custom lower this!"); 6583 case ISD::ATOMIC_CMP_SWAP: return LowerCMP_SWAP(Op,DAG); 6584 case ISD::ATOMIC_LOAD_SUB: return LowerLOAD_SUB(Op,DAG); 6585 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG); 6586 case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG); 6587 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG); 6588 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG); 6589 case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, DAG); 6590 case ISD::ConstantPool: return LowerConstantPool(Op, DAG); 6591 case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG); 6592 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG); 6593 case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG); 6594 case ISD::SHL_PARTS: 6595 case ISD::SRA_PARTS: 6596 case ISD::SRL_PARTS: return LowerShift(Op, DAG); 6597 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG); 6598 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG); 6599 case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG); 6600 case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG); 6601 case ISD::FABS: return LowerFABS(Op, DAG); 6602 case ISD::FNEG: return LowerFNEG(Op, DAG); 6603 case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG); 6604 case ISD::SETCC: return LowerSETCC(Op, DAG); 6605 case ISD::VSETCC: return LowerVSETCC(Op, DAG); 6606 case ISD::SELECT: return LowerSELECT(Op, DAG); 6607 case ISD::BRCOND: return LowerBRCOND(Op, DAG); 6608 case ISD::JumpTable: return LowerJumpTable(Op, DAG); 6609 case ISD::CALL: return LowerCALL(Op, DAG); 6610 case ISD::RET: return LowerRET(Op, DAG); 6611 case ISD::FORMAL_ARGUMENTS: return LowerFORMAL_ARGUMENTS(Op, DAG); 6612 case ISD::VASTART: return LowerVASTART(Op, DAG); 6613 case ISD::VAARG: return LowerVAARG(Op, DAG); 6614 case ISD::VACOPY: return LowerVACOPY(Op, DAG); 6615 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG); 6616 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG); 6617 case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG); 6618 case ISD::FRAME_TO_ARGS_OFFSET: 6619 return LowerFRAME_TO_ARGS_OFFSET(Op, DAG); 6620 case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG); 6621 case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG); 6622 case ISD::TRAMPOLINE: return LowerTRAMPOLINE(Op, DAG); 6623 case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG); 6624 case ISD::CTLZ: return LowerCTLZ(Op, DAG); 6625 case ISD::CTTZ: return LowerCTTZ(Op, DAG); 6626 case ISD::MUL: return LowerMUL_V2I64(Op, DAG); 6627 case ISD::SADDO: 6628 case ISD::UADDO: 6629 case ISD::SSUBO: 6630 case ISD::USUBO: 6631 case ISD::SMULO: 6632 case ISD::UMULO: return LowerXALUO(Op, DAG); 6633 case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, DAG); 6634 } 6635} 6636 6637void X86TargetLowering:: 6638ReplaceATOMIC_BINARY_64(SDNode *Node, SmallVectorImpl<SDValue>&Results, 6639 SelectionDAG &DAG, unsigned NewOp) { 6640 MVT T = Node->getValueType(0); 6641 DebugLoc dl = Node->getDebugLoc(); 6642 assert (T == MVT::i64 && "Only know how to expand i64 atomics"); 6643 6644 SDValue Chain = Node->getOperand(0); 6645 SDValue In1 = Node->getOperand(1); 6646 SDValue In2L = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 6647 Node->getOperand(2), DAG.getIntPtrConstant(0)); 6648 SDValue In2H = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 6649 Node->getOperand(2), DAG.getIntPtrConstant(1)); 6650 // This is a generalized SDNode, not an AtomicSDNode, so it doesn't 6651 // have a MemOperand. Pass the info through as a normal operand. 6652 SDValue LSI = DAG.getMemOperand(cast<MemSDNode>(Node)->getMemOperand()); 6653 SDValue Ops[] = { Chain, In1, In2L, In2H, LSI }; 6654 SDVTList Tys = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other); 6655 SDValue Result = DAG.getNode(NewOp, dl, Tys, Ops, 5); 6656 SDValue OpsF[] = { Result.getValue(0), Result.getValue(1)}; 6657 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF, 2)); 6658 Results.push_back(Result.getValue(2)); 6659} 6660 6661/// ReplaceNodeResults - Replace a node with an illegal result type 6662/// with a new node built out of custom code. 6663void X86TargetLowering::ReplaceNodeResults(SDNode *N, 6664 SmallVectorImpl<SDValue>&Results, 6665 SelectionDAG &DAG) { 6666 DebugLoc dl = N->getDebugLoc(); 6667 switch (N->getOpcode()) { 6668 default: 6669 assert(false && "Do not know how to custom type legalize this operation!"); 6670 return; 6671 case ISD::FP_TO_SINT: { 6672 std::pair<SDValue,SDValue> Vals = 6673 FP_TO_INTHelper(SDValue(N, 0), DAG, true); 6674 SDValue FIST = Vals.first, StackSlot = Vals.second; 6675 if (FIST.getNode() != 0) { 6676 MVT VT = N->getValueType(0); 6677 // Return a load from the stack slot. 6678 Results.push_back(DAG.getLoad(VT, dl, FIST, StackSlot, NULL, 0)); 6679 } 6680 return; 6681 } 6682 case ISD::READCYCLECOUNTER: { 6683 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag); 6684 SDValue TheChain = N->getOperand(0); 6685 SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1); 6686 SDValue eax = DAG.getCopyFromReg(rd, dl, X86::EAX, MVT::i32, 6687 rd.getValue(1)); 6688 SDValue edx = DAG.getCopyFromReg(eax.getValue(1), dl, X86::EDX, MVT::i32, 6689 eax.getValue(2)); 6690 // Use a buildpair to merge the two 32-bit values into a 64-bit one. 6691 SDValue Ops[] = { eax, edx }; 6692 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Ops, 2)); 6693 Results.push_back(edx.getValue(1)); 6694 return; 6695 } 6696 case ISD::ATOMIC_CMP_SWAP: { 6697 MVT T = N->getValueType(0); 6698 assert (T == MVT::i64 && "Only know how to expand i64 Cmp and Swap"); 6699 SDValue cpInL, cpInH; 6700 cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(2), 6701 DAG.getConstant(0, MVT::i32)); 6702 cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(2), 6703 DAG.getConstant(1, MVT::i32)); 6704 cpInL = DAG.getCopyToReg(N->getOperand(0), dl, X86::EAX, cpInL, SDValue()); 6705 cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl, X86::EDX, cpInH, 6706 cpInL.getValue(1)); 6707 SDValue swapInL, swapInH; 6708 swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(3), 6709 DAG.getConstant(0, MVT::i32)); 6710 swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(3), 6711 DAG.getConstant(1, MVT::i32)); 6712 swapInL = DAG.getCopyToReg(cpInH.getValue(0), dl, X86::EBX, swapInL, 6713 cpInH.getValue(1)); 6714 swapInH = DAG.getCopyToReg(swapInL.getValue(0), dl, X86::ECX, swapInH, 6715 swapInL.getValue(1)); 6716 SDValue Ops[] = { swapInH.getValue(0), 6717 N->getOperand(1), 6718 swapInH.getValue(1) }; 6719 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag); 6720 SDValue Result = DAG.getNode(X86ISD::LCMPXCHG8_DAG, dl, Tys, Ops, 3); 6721 SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl, X86::EAX, 6722 MVT::i32, Result.getValue(1)); 6723 SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl, X86::EDX, 6724 MVT::i32, cpOutL.getValue(2)); 6725 SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)}; 6726 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF, 2)); 6727 Results.push_back(cpOutH.getValue(1)); 6728 return; 6729 } 6730 case ISD::ATOMIC_LOAD_ADD: 6731 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMADD64_DAG); 6732 return; 6733 case ISD::ATOMIC_LOAD_AND: 6734 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMAND64_DAG); 6735 return; 6736 case ISD::ATOMIC_LOAD_NAND: 6737 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMNAND64_DAG); 6738 return; 6739 case ISD::ATOMIC_LOAD_OR: 6740 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMOR64_DAG); 6741 return; 6742 case ISD::ATOMIC_LOAD_SUB: 6743 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMSUB64_DAG); 6744 return; 6745 case ISD::ATOMIC_LOAD_XOR: 6746 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMXOR64_DAG); 6747 return; 6748 case ISD::ATOMIC_SWAP: 6749 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMSWAP64_DAG); 6750 return; 6751 } 6752} 6753 6754const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { 6755 switch (Opcode) { 6756 default: return NULL; 6757 case X86ISD::BSF: return "X86ISD::BSF"; 6758 case X86ISD::BSR: return "X86ISD::BSR"; 6759 case X86ISD::SHLD: return "X86ISD::SHLD"; 6760 case X86ISD::SHRD: return "X86ISD::SHRD"; 6761 case X86ISD::FAND: return "X86ISD::FAND"; 6762 case X86ISD::FOR: return "X86ISD::FOR"; 6763 case X86ISD::FXOR: return "X86ISD::FXOR"; 6764 case X86ISD::FSRL: return "X86ISD::FSRL"; 6765 case X86ISD::FILD: return "X86ISD::FILD"; 6766 case X86ISD::FILD_FLAG: return "X86ISD::FILD_FLAG"; 6767 case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM"; 6768 case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM"; 6769 case X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM"; 6770 case X86ISD::FLD: return "X86ISD::FLD"; 6771 case X86ISD::FST: return "X86ISD::FST"; 6772 case X86ISD::CALL: return "X86ISD::CALL"; 6773 case X86ISD::TAILCALL: return "X86ISD::TAILCALL"; 6774 case X86ISD::RDTSC_DAG: return "X86ISD::RDTSC_DAG"; 6775 case X86ISD::BT: return "X86ISD::BT"; 6776 case X86ISD::CMP: return "X86ISD::CMP"; 6777 case X86ISD::COMI: return "X86ISD::COMI"; 6778 case X86ISD::UCOMI: return "X86ISD::UCOMI"; 6779 case X86ISD::SETCC: return "X86ISD::SETCC"; 6780 case X86ISD::CMOV: return "X86ISD::CMOV"; 6781 case X86ISD::BRCOND: return "X86ISD::BRCOND"; 6782 case X86ISD::RET_FLAG: return "X86ISD::RET_FLAG"; 6783 case X86ISD::REP_STOS: return "X86ISD::REP_STOS"; 6784 case X86ISD::REP_MOVS: return "X86ISD::REP_MOVS"; 6785 case X86ISD::GlobalBaseReg: return "X86ISD::GlobalBaseReg"; 6786 case X86ISD::Wrapper: return "X86ISD::Wrapper"; 6787 case X86ISD::PEXTRB: return "X86ISD::PEXTRB"; 6788 case X86ISD::PEXTRW: return "X86ISD::PEXTRW"; 6789 case X86ISD::INSERTPS: return "X86ISD::INSERTPS"; 6790 case X86ISD::PINSRB: return "X86ISD::PINSRB"; 6791 case X86ISD::PINSRW: return "X86ISD::PINSRW"; 6792 case X86ISD::PSHUFB: return "X86ISD::PSHUFB"; 6793 case X86ISD::FMAX: return "X86ISD::FMAX"; 6794 case X86ISD::FMIN: return "X86ISD::FMIN"; 6795 case X86ISD::FRSQRT: return "X86ISD::FRSQRT"; 6796 case X86ISD::FRCP: return "X86ISD::FRCP"; 6797 case X86ISD::TLSADDR: return "X86ISD::TLSADDR"; 6798 case X86ISD::SegmentBaseAddress: return "X86ISD::SegmentBaseAddress"; 6799 case X86ISD::EH_RETURN: return "X86ISD::EH_RETURN"; 6800 case X86ISD::TC_RETURN: return "X86ISD::TC_RETURN"; 6801 case X86ISD::FNSTCW16m: return "X86ISD::FNSTCW16m"; 6802 case X86ISD::LCMPXCHG_DAG: return "X86ISD::LCMPXCHG_DAG"; 6803 case X86ISD::LCMPXCHG8_DAG: return "X86ISD::LCMPXCHG8_DAG"; 6804 case X86ISD::ATOMADD64_DAG: return "X86ISD::ATOMADD64_DAG"; 6805 case X86ISD::ATOMSUB64_DAG: return "X86ISD::ATOMSUB64_DAG"; 6806 case X86ISD::ATOMOR64_DAG: return "X86ISD::ATOMOR64_DAG"; 6807 case X86ISD::ATOMXOR64_DAG: return "X86ISD::ATOMXOR64_DAG"; 6808 case X86ISD::ATOMAND64_DAG: return "X86ISD::ATOMAND64_DAG"; 6809 case X86ISD::ATOMNAND64_DAG: return "X86ISD::ATOMNAND64_DAG"; 6810 case X86ISD::VZEXT_MOVL: return "X86ISD::VZEXT_MOVL"; 6811 case X86ISD::VZEXT_LOAD: return "X86ISD::VZEXT_LOAD"; 6812 case X86ISD::VSHL: return "X86ISD::VSHL"; 6813 case X86ISD::VSRL: return "X86ISD::VSRL"; 6814 case X86ISD::CMPPD: return "X86ISD::CMPPD"; 6815 case X86ISD::CMPPS: return "X86ISD::CMPPS"; 6816 case X86ISD::PCMPEQB: return "X86ISD::PCMPEQB"; 6817 case X86ISD::PCMPEQW: return "X86ISD::PCMPEQW"; 6818 case X86ISD::PCMPEQD: return "X86ISD::PCMPEQD"; 6819 case X86ISD::PCMPEQQ: return "X86ISD::PCMPEQQ"; 6820 case X86ISD::PCMPGTB: return "X86ISD::PCMPGTB"; 6821 case X86ISD::PCMPGTW: return "X86ISD::PCMPGTW"; 6822 case X86ISD::PCMPGTD: return "X86ISD::PCMPGTD"; 6823 case X86ISD::PCMPGTQ: return "X86ISD::PCMPGTQ"; 6824 case X86ISD::ADD: return "X86ISD::ADD"; 6825 case X86ISD::SUB: return "X86ISD::SUB"; 6826 case X86ISD::SMUL: return "X86ISD::SMUL"; 6827 case X86ISD::UMUL: return "X86ISD::UMUL"; 6828 case X86ISD::INC: return "X86ISD::INC"; 6829 case X86ISD::DEC: return "X86ISD::DEC"; 6830 case X86ISD::MUL_IMM: return "X86ISD::MUL_IMM"; 6831 } 6832} 6833 6834// isLegalAddressingMode - Return true if the addressing mode represented 6835// by AM is legal for this target, for a load/store of the specified type. 6836bool X86TargetLowering::isLegalAddressingMode(const AddrMode &AM, 6837 const Type *Ty) const { 6838 // X86 supports extremely general addressing modes. 6839 6840 // X86 allows a sign-extended 32-bit immediate field as a displacement. 6841 if (AM.BaseOffs <= -(1LL << 32) || AM.BaseOffs >= (1LL << 32)-1) 6842 return false; 6843 6844 if (AM.BaseGV) { 6845 // We can only fold this if we don't need an extra load. 6846 if (Subtarget->GVRequiresExtraLoad(AM.BaseGV, getTargetMachine(), false)) 6847 return false; 6848 // If BaseGV requires a register, we cannot also have a BaseReg. 6849 if (Subtarget->GVRequiresRegister(AM.BaseGV, getTargetMachine(), false) && 6850 AM.HasBaseReg) 6851 return false; 6852 6853 // X86-64 only supports addr of globals in small code model. 6854 if (Subtarget->is64Bit()) { 6855 if (getTargetMachine().getCodeModel() != CodeModel::Small) 6856 return false; 6857 // If lower 4G is not available, then we must use rip-relative addressing. 6858 if (AM.BaseOffs || AM.Scale > 1) 6859 return false; 6860 } 6861 } 6862 6863 switch (AM.Scale) { 6864 case 0: 6865 case 1: 6866 case 2: 6867 case 4: 6868 case 8: 6869 // These scales always work. 6870 break; 6871 case 3: 6872 case 5: 6873 case 9: 6874 // These scales are formed with basereg+scalereg. Only accept if there is 6875 // no basereg yet. 6876 if (AM.HasBaseReg) 6877 return false; 6878 break; 6879 default: // Other stuff never works. 6880 return false; 6881 } 6882 6883 return true; 6884} 6885 6886 6887bool X86TargetLowering::isTruncateFree(const Type *Ty1, const Type *Ty2) const { 6888 if (!Ty1->isInteger() || !Ty2->isInteger()) 6889 return false; 6890 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits(); 6891 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits(); 6892 if (NumBits1 <= NumBits2) 6893 return false; 6894 return Subtarget->is64Bit() || NumBits1 < 64; 6895} 6896 6897bool X86TargetLowering::isTruncateFree(MVT VT1, MVT VT2) const { 6898 if (!VT1.isInteger() || !VT2.isInteger()) 6899 return false; 6900 unsigned NumBits1 = VT1.getSizeInBits(); 6901 unsigned NumBits2 = VT2.getSizeInBits(); 6902 if (NumBits1 <= NumBits2) 6903 return false; 6904 return Subtarget->is64Bit() || NumBits1 < 64; 6905} 6906 6907bool X86TargetLowering::isZExtFree(const Type *Ty1, const Type *Ty2) const { 6908 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers. 6909 return Ty1 == Type::Int32Ty && Ty2 == Type::Int64Ty && Subtarget->is64Bit(); 6910} 6911 6912bool X86TargetLowering::isZExtFree(MVT VT1, MVT VT2) const { 6913 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers. 6914 return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget->is64Bit(); 6915} 6916 6917bool X86TargetLowering::isNarrowingProfitable(MVT VT1, MVT VT2) const { 6918 // i16 instructions are longer (0x66 prefix) and potentially slower. 6919 return !(VT1 == MVT::i32 && VT2 == MVT::i16); 6920} 6921 6922/// isShuffleMaskLegal - Targets can use this to indicate that they only 6923/// support *some* VECTOR_SHUFFLE operations, those with specific masks. 6924/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values 6925/// are assumed to be legal. 6926bool 6927X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M, 6928 MVT VT) const { 6929 // Only do shuffles on 128-bit vector types for now. 6930 if (VT.getSizeInBits() == 64) 6931 return false; 6932 6933 // FIXME: pshufb, blends, palignr, shifts. 6934 return (VT.getVectorNumElements() == 2 || 6935 ShuffleVectorSDNode::isSplatMask(&M[0], VT) || 6936 isMOVLMask(M, VT) || 6937 isSHUFPMask(M, VT) || 6938 isPSHUFDMask(M, VT) || 6939 isPSHUFHWMask(M, VT) || 6940 isPSHUFLWMask(M, VT) || 6941 isUNPCKLMask(M, VT) || 6942 isUNPCKHMask(M, VT) || 6943 isUNPCKL_v_undef_Mask(M, VT) || 6944 isUNPCKH_v_undef_Mask(M, VT)); 6945} 6946 6947bool 6948X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask, 6949 MVT VT) const { 6950 unsigned NumElts = VT.getVectorNumElements(); 6951 // FIXME: This collection of masks seems suspect. 6952 if (NumElts == 2) 6953 return true; 6954 if (NumElts == 4 && VT.getSizeInBits() == 128) { 6955 return (isMOVLMask(Mask, VT) || 6956 isCommutedMOVLMask(Mask, VT, true) || 6957 isSHUFPMask(Mask, VT) || 6958 isCommutedSHUFPMask(Mask, VT)); 6959 } 6960 return false; 6961} 6962 6963//===----------------------------------------------------------------------===// 6964// X86 Scheduler Hooks 6965//===----------------------------------------------------------------------===// 6966 6967// private utility function 6968MachineBasicBlock * 6969X86TargetLowering::EmitAtomicBitwiseWithCustomInserter(MachineInstr *bInstr, 6970 MachineBasicBlock *MBB, 6971 unsigned regOpc, 6972 unsigned immOpc, 6973 unsigned LoadOpc, 6974 unsigned CXchgOpc, 6975 unsigned copyOpc, 6976 unsigned notOpc, 6977 unsigned EAXreg, 6978 TargetRegisterClass *RC, 6979 bool invSrc) const { 6980 // For the atomic bitwise operator, we generate 6981 // thisMBB: 6982 // newMBB: 6983 // ld t1 = [bitinstr.addr] 6984 // op t2 = t1, [bitinstr.val] 6985 // mov EAX = t1 6986 // lcs dest = [bitinstr.addr], t2 [EAX is implicit] 6987 // bz newMBB 6988 // fallthrough -->nextMBB 6989 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 6990 const BasicBlock *LLVM_BB = MBB->getBasicBlock(); 6991 MachineFunction::iterator MBBIter = MBB; 6992 ++MBBIter; 6993 6994 /// First build the CFG 6995 MachineFunction *F = MBB->getParent(); 6996 MachineBasicBlock *thisMBB = MBB; 6997 MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB); 6998 MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB); 6999 F->insert(MBBIter, newMBB); 7000 F->insert(MBBIter, nextMBB); 7001 7002 // Move all successors to thisMBB to nextMBB 7003 nextMBB->transferSuccessors(thisMBB); 7004 7005 // Update thisMBB to fall through to newMBB 7006 thisMBB->addSuccessor(newMBB); 7007 7008 // newMBB jumps to itself and fall through to nextMBB 7009 newMBB->addSuccessor(nextMBB); 7010 newMBB->addSuccessor(newMBB); 7011 7012 // Insert instructions into newMBB based on incoming instruction 7013 assert(bInstr->getNumOperands() < X86AddrNumOperands + 4 && 7014 "unexpected number of operands"); 7015 DebugLoc dl = bInstr->getDebugLoc(); 7016 MachineOperand& destOper = bInstr->getOperand(0); 7017 MachineOperand* argOpers[2 + X86AddrNumOperands]; 7018 int numArgs = bInstr->getNumOperands() - 1; 7019 for (int i=0; i < numArgs; ++i) 7020 argOpers[i] = &bInstr->getOperand(i+1); 7021 7022 // x86 address has 4 operands: base, index, scale, and displacement 7023 int lastAddrIndx = X86AddrNumOperands - 1; // [0,3] 7024 int valArgIndx = lastAddrIndx + 1; 7025 7026 unsigned t1 = F->getRegInfo().createVirtualRegister(RC); 7027 MachineInstrBuilder MIB = BuildMI(newMBB, dl, TII->get(LoadOpc), t1); 7028 for (int i=0; i <= lastAddrIndx; ++i) 7029 (*MIB).addOperand(*argOpers[i]); 7030 7031 unsigned tt = F->getRegInfo().createVirtualRegister(RC); 7032 if (invSrc) { 7033 MIB = BuildMI(newMBB, dl, TII->get(notOpc), tt).addReg(t1); 7034 } 7035 else 7036 tt = t1; 7037 7038 unsigned t2 = F->getRegInfo().createVirtualRegister(RC); 7039 assert((argOpers[valArgIndx]->isReg() || 7040 argOpers[valArgIndx]->isImm()) && 7041 "invalid operand"); 7042 if (argOpers[valArgIndx]->isReg()) 7043 MIB = BuildMI(newMBB, dl, TII->get(regOpc), t2); 7044 else 7045 MIB = BuildMI(newMBB, dl, TII->get(immOpc), t2); 7046 MIB.addReg(tt); 7047 (*MIB).addOperand(*argOpers[valArgIndx]); 7048 7049 MIB = BuildMI(newMBB, dl, TII->get(copyOpc), EAXreg); 7050 MIB.addReg(t1); 7051 7052 MIB = BuildMI(newMBB, dl, TII->get(CXchgOpc)); 7053 for (int i=0; i <= lastAddrIndx; ++i) 7054 (*MIB).addOperand(*argOpers[i]); 7055 MIB.addReg(t2); 7056 assert(bInstr->hasOneMemOperand() && "Unexpected number of memoperand"); 7057 (*MIB).addMemOperand(*F, *bInstr->memoperands_begin()); 7058 7059 MIB = BuildMI(newMBB, dl, TII->get(copyOpc), destOper.getReg()); 7060 MIB.addReg(EAXreg); 7061 7062 // insert branch 7063 BuildMI(newMBB, dl, TII->get(X86::JNE)).addMBB(newMBB); 7064 7065 F->DeleteMachineInstr(bInstr); // The pseudo instruction is gone now. 7066 return nextMBB; 7067} 7068 7069// private utility function: 64 bit atomics on 32 bit host. 7070MachineBasicBlock * 7071X86TargetLowering::EmitAtomicBit6432WithCustomInserter(MachineInstr *bInstr, 7072 MachineBasicBlock *MBB, 7073 unsigned regOpcL, 7074 unsigned regOpcH, 7075 unsigned immOpcL, 7076 unsigned immOpcH, 7077 bool invSrc) const { 7078 // For the atomic bitwise operator, we generate 7079 // thisMBB (instructions are in pairs, except cmpxchg8b) 7080 // ld t1,t2 = [bitinstr.addr] 7081 // newMBB: 7082 // out1, out2 = phi (thisMBB, t1/t2) (newMBB, t3/t4) 7083 // op t5, t6 <- out1, out2, [bitinstr.val] 7084 // (for SWAP, substitute: mov t5, t6 <- [bitinstr.val]) 7085 // mov ECX, EBX <- t5, t6 7086 // mov EAX, EDX <- t1, t2 7087 // cmpxchg8b [bitinstr.addr] [EAX, EDX, EBX, ECX implicit] 7088 // mov t3, t4 <- EAX, EDX 7089 // bz newMBB 7090 // result in out1, out2 7091 // fallthrough -->nextMBB 7092 7093 const TargetRegisterClass *RC = X86::GR32RegisterClass; 7094 const unsigned LoadOpc = X86::MOV32rm; 7095 const unsigned copyOpc = X86::MOV32rr; 7096 const unsigned NotOpc = X86::NOT32r; 7097 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 7098 const BasicBlock *LLVM_BB = MBB->getBasicBlock(); 7099 MachineFunction::iterator MBBIter = MBB; 7100 ++MBBIter; 7101 7102 /// First build the CFG 7103 MachineFunction *F = MBB->getParent(); 7104 MachineBasicBlock *thisMBB = MBB; 7105 MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB); 7106 MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB); 7107 F->insert(MBBIter, newMBB); 7108 F->insert(MBBIter, nextMBB); 7109 7110 // Move all successors to thisMBB to nextMBB 7111 nextMBB->transferSuccessors(thisMBB); 7112 7113 // Update thisMBB to fall through to newMBB 7114 thisMBB->addSuccessor(newMBB); 7115 7116 // newMBB jumps to itself and fall through to nextMBB 7117 newMBB->addSuccessor(nextMBB); 7118 newMBB->addSuccessor(newMBB); 7119 7120 DebugLoc dl = bInstr->getDebugLoc(); 7121 // Insert instructions into newMBB based on incoming instruction 7122 // There are 8 "real" operands plus 9 implicit def/uses, ignored here. 7123 assert(bInstr->getNumOperands() < X86AddrNumOperands + 14 && 7124 "unexpected number of operands"); 7125 MachineOperand& dest1Oper = bInstr->getOperand(0); 7126 MachineOperand& dest2Oper = bInstr->getOperand(1); 7127 MachineOperand* argOpers[2 + X86AddrNumOperands]; 7128 for (int i=0; i < 2 + X86AddrNumOperands; ++i) 7129 argOpers[i] = &bInstr->getOperand(i+2); 7130 7131 // x86 address has 4 operands: base, index, scale, and displacement 7132 int lastAddrIndx = X86AddrNumOperands - 1; // [0,3] 7133 7134 unsigned t1 = F->getRegInfo().createVirtualRegister(RC); 7135 MachineInstrBuilder MIB = BuildMI(thisMBB, dl, TII->get(LoadOpc), t1); 7136 for (int i=0; i <= lastAddrIndx; ++i) 7137 (*MIB).addOperand(*argOpers[i]); 7138 unsigned t2 = F->getRegInfo().createVirtualRegister(RC); 7139 MIB = BuildMI(thisMBB, dl, TII->get(LoadOpc), t2); 7140 // add 4 to displacement. 7141 for (int i=0; i <= lastAddrIndx-2; ++i) 7142 (*MIB).addOperand(*argOpers[i]); 7143 MachineOperand newOp3 = *(argOpers[3]); 7144 if (newOp3.isImm()) 7145 newOp3.setImm(newOp3.getImm()+4); 7146 else 7147 newOp3.setOffset(newOp3.getOffset()+4); 7148 (*MIB).addOperand(newOp3); 7149 (*MIB).addOperand(*argOpers[lastAddrIndx]); 7150 7151 // t3/4 are defined later, at the bottom of the loop 7152 unsigned t3 = F->getRegInfo().createVirtualRegister(RC); 7153 unsigned t4 = F->getRegInfo().createVirtualRegister(RC); 7154 BuildMI(newMBB, dl, TII->get(X86::PHI), dest1Oper.getReg()) 7155 .addReg(t1).addMBB(thisMBB).addReg(t3).addMBB(newMBB); 7156 BuildMI(newMBB, dl, TII->get(X86::PHI), dest2Oper.getReg()) 7157 .addReg(t2).addMBB(thisMBB).addReg(t4).addMBB(newMBB); 7158 7159 unsigned tt1 = F->getRegInfo().createVirtualRegister(RC); 7160 unsigned tt2 = F->getRegInfo().createVirtualRegister(RC); 7161 if (invSrc) { 7162 MIB = BuildMI(newMBB, dl, TII->get(NotOpc), tt1).addReg(t1); 7163 MIB = BuildMI(newMBB, dl, TII->get(NotOpc), tt2).addReg(t2); 7164 } else { 7165 tt1 = t1; 7166 tt2 = t2; 7167 } 7168 7169 int valArgIndx = lastAddrIndx + 1; 7170 assert((argOpers[valArgIndx]->isReg() || 7171 argOpers[valArgIndx]->isImm()) && 7172 "invalid operand"); 7173 unsigned t5 = F->getRegInfo().createVirtualRegister(RC); 7174 unsigned t6 = F->getRegInfo().createVirtualRegister(RC); 7175 if (argOpers[valArgIndx]->isReg()) 7176 MIB = BuildMI(newMBB, dl, TII->get(regOpcL), t5); 7177 else 7178 MIB = BuildMI(newMBB, dl, TII->get(immOpcL), t5); 7179 if (regOpcL != X86::MOV32rr) 7180 MIB.addReg(tt1); 7181 (*MIB).addOperand(*argOpers[valArgIndx]); 7182 assert(argOpers[valArgIndx + 1]->isReg() == 7183 argOpers[valArgIndx]->isReg()); 7184 assert(argOpers[valArgIndx + 1]->isImm() == 7185 argOpers[valArgIndx]->isImm()); 7186 if (argOpers[valArgIndx + 1]->isReg()) 7187 MIB = BuildMI(newMBB, dl, TII->get(regOpcH), t6); 7188 else 7189 MIB = BuildMI(newMBB, dl, TII->get(immOpcH), t6); 7190 if (regOpcH != X86::MOV32rr) 7191 MIB.addReg(tt2); 7192 (*MIB).addOperand(*argOpers[valArgIndx + 1]); 7193 7194 MIB = BuildMI(newMBB, dl, TII->get(copyOpc), X86::EAX); 7195 MIB.addReg(t1); 7196 MIB = BuildMI(newMBB, dl, TII->get(copyOpc), X86::EDX); 7197 MIB.addReg(t2); 7198 7199 MIB = BuildMI(newMBB, dl, TII->get(copyOpc), X86::EBX); 7200 MIB.addReg(t5); 7201 MIB = BuildMI(newMBB, dl, TII->get(copyOpc), X86::ECX); 7202 MIB.addReg(t6); 7203 7204 MIB = BuildMI(newMBB, dl, TII->get(X86::LCMPXCHG8B)); 7205 for (int i=0; i <= lastAddrIndx; ++i) 7206 (*MIB).addOperand(*argOpers[i]); 7207 7208 assert(bInstr->hasOneMemOperand() && "Unexpected number of memoperand"); 7209 (*MIB).addMemOperand(*F, *bInstr->memoperands_begin()); 7210 7211 MIB = BuildMI(newMBB, dl, TII->get(copyOpc), t3); 7212 MIB.addReg(X86::EAX); 7213 MIB = BuildMI(newMBB, dl, TII->get(copyOpc), t4); 7214 MIB.addReg(X86::EDX); 7215 7216 // insert branch 7217 BuildMI(newMBB, dl, TII->get(X86::JNE)).addMBB(newMBB); 7218 7219 F->DeleteMachineInstr(bInstr); // The pseudo instruction is gone now. 7220 return nextMBB; 7221} 7222 7223// private utility function 7224MachineBasicBlock * 7225X86TargetLowering::EmitAtomicMinMaxWithCustomInserter(MachineInstr *mInstr, 7226 MachineBasicBlock *MBB, 7227 unsigned cmovOpc) const { 7228 // For the atomic min/max operator, we generate 7229 // thisMBB: 7230 // newMBB: 7231 // ld t1 = [min/max.addr] 7232 // mov t2 = [min/max.val] 7233 // cmp t1, t2 7234 // cmov[cond] t2 = t1 7235 // mov EAX = t1 7236 // lcs dest = [bitinstr.addr], t2 [EAX is implicit] 7237 // bz newMBB 7238 // fallthrough -->nextMBB 7239 // 7240 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 7241 const BasicBlock *LLVM_BB = MBB->getBasicBlock(); 7242 MachineFunction::iterator MBBIter = MBB; 7243 ++MBBIter; 7244 7245 /// First build the CFG 7246 MachineFunction *F = MBB->getParent(); 7247 MachineBasicBlock *thisMBB = MBB; 7248 MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB); 7249 MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB); 7250 F->insert(MBBIter, newMBB); 7251 F->insert(MBBIter, nextMBB); 7252 7253 // Move all successors to thisMBB to nextMBB 7254 nextMBB->transferSuccessors(thisMBB); 7255 7256 // Update thisMBB to fall through to newMBB 7257 thisMBB->addSuccessor(newMBB); 7258 7259 // newMBB jumps to newMBB and fall through to nextMBB 7260 newMBB->addSuccessor(nextMBB); 7261 newMBB->addSuccessor(newMBB); 7262 7263 DebugLoc dl = mInstr->getDebugLoc(); 7264 // Insert instructions into newMBB based on incoming instruction 7265 assert(mInstr->getNumOperands() < X86AddrNumOperands + 4 && 7266 "unexpected number of operands"); 7267 MachineOperand& destOper = mInstr->getOperand(0); 7268 MachineOperand* argOpers[2 + X86AddrNumOperands]; 7269 int numArgs = mInstr->getNumOperands() - 1; 7270 for (int i=0; i < numArgs; ++i) 7271 argOpers[i] = &mInstr->getOperand(i+1); 7272 7273 // x86 address has 4 operands: base, index, scale, and displacement 7274 int lastAddrIndx = X86AddrNumOperands - 1; // [0,3] 7275 int valArgIndx = lastAddrIndx + 1; 7276 7277 unsigned t1 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass); 7278 MachineInstrBuilder MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rm), t1); 7279 for (int i=0; i <= lastAddrIndx; ++i) 7280 (*MIB).addOperand(*argOpers[i]); 7281 7282 // We only support register and immediate values 7283 assert((argOpers[valArgIndx]->isReg() || 7284 argOpers[valArgIndx]->isImm()) && 7285 "invalid operand"); 7286 7287 unsigned t2 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass); 7288 if (argOpers[valArgIndx]->isReg()) 7289 MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rr), t2); 7290 else 7291 MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rr), t2); 7292 (*MIB).addOperand(*argOpers[valArgIndx]); 7293 7294 MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rr), X86::EAX); 7295 MIB.addReg(t1); 7296 7297 MIB = BuildMI(newMBB, dl, TII->get(X86::CMP32rr)); 7298 MIB.addReg(t1); 7299 MIB.addReg(t2); 7300 7301 // Generate movc 7302 unsigned t3 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass); 7303 MIB = BuildMI(newMBB, dl, TII->get(cmovOpc),t3); 7304 MIB.addReg(t2); 7305 MIB.addReg(t1); 7306 7307 // Cmp and exchange if none has modified the memory location 7308 MIB = BuildMI(newMBB, dl, TII->get(X86::LCMPXCHG32)); 7309 for (int i=0; i <= lastAddrIndx; ++i) 7310 (*MIB).addOperand(*argOpers[i]); 7311 MIB.addReg(t3); 7312 assert(mInstr->hasOneMemOperand() && "Unexpected number of memoperand"); 7313 (*MIB).addMemOperand(*F, *mInstr->memoperands_begin()); 7314 7315 MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rr), destOper.getReg()); 7316 MIB.addReg(X86::EAX); 7317 7318 // insert branch 7319 BuildMI(newMBB, dl, TII->get(X86::JNE)).addMBB(newMBB); 7320 7321 F->DeleteMachineInstr(mInstr); // The pseudo instruction is gone now. 7322 return nextMBB; 7323} 7324 7325 7326MachineBasicBlock * 7327X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, 7328 MachineBasicBlock *BB) const { 7329 DebugLoc dl = MI->getDebugLoc(); 7330 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 7331 switch (MI->getOpcode()) { 7332 default: assert(false && "Unexpected instr type to insert"); 7333 case X86::CMOV_V1I64: 7334 case X86::CMOV_FR32: 7335 case X86::CMOV_FR64: 7336 case X86::CMOV_V4F32: 7337 case X86::CMOV_V2F64: 7338 case X86::CMOV_V2I64: { 7339 // To "insert" a SELECT_CC instruction, we actually have to insert the 7340 // diamond control-flow pattern. The incoming instruction knows the 7341 // destination vreg to set, the condition code register to branch on, the 7342 // true/false values to select between, and a branch opcode to use. 7343 const BasicBlock *LLVM_BB = BB->getBasicBlock(); 7344 MachineFunction::iterator It = BB; 7345 ++It; 7346 7347 // thisMBB: 7348 // ... 7349 // TrueVal = ... 7350 // cmpTY ccX, r1, r2 7351 // bCC copy1MBB 7352 // fallthrough --> copy0MBB 7353 MachineBasicBlock *thisMBB = BB; 7354 MachineFunction *F = BB->getParent(); 7355 MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB); 7356 MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB); 7357 unsigned Opc = 7358 X86::GetCondBranchFromCond((X86::CondCode)MI->getOperand(3).getImm()); 7359 BuildMI(BB, dl, TII->get(Opc)).addMBB(sinkMBB); 7360 F->insert(It, copy0MBB); 7361 F->insert(It, sinkMBB); 7362 // Update machine-CFG edges by transferring all successors of the current 7363 // block to the new block which will contain the Phi node for the select. 7364 sinkMBB->transferSuccessors(BB); 7365 7366 // Add the true and fallthrough blocks as its successors. 7367 BB->addSuccessor(copy0MBB); 7368 BB->addSuccessor(sinkMBB); 7369 7370 // copy0MBB: 7371 // %FalseValue = ... 7372 // # fallthrough to sinkMBB 7373 BB = copy0MBB; 7374 7375 // Update machine-CFG edges 7376 BB->addSuccessor(sinkMBB); 7377 7378 // sinkMBB: 7379 // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ] 7380 // ... 7381 BB = sinkMBB; 7382 BuildMI(BB, dl, TII->get(X86::PHI), MI->getOperand(0).getReg()) 7383 .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB) 7384 .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB); 7385 7386 F->DeleteMachineInstr(MI); // The pseudo instruction is gone now. 7387 return BB; 7388 } 7389 7390 case X86::FP32_TO_INT16_IN_MEM: 7391 case X86::FP32_TO_INT32_IN_MEM: 7392 case X86::FP32_TO_INT64_IN_MEM: 7393 case X86::FP64_TO_INT16_IN_MEM: 7394 case X86::FP64_TO_INT32_IN_MEM: 7395 case X86::FP64_TO_INT64_IN_MEM: 7396 case X86::FP80_TO_INT16_IN_MEM: 7397 case X86::FP80_TO_INT32_IN_MEM: 7398 case X86::FP80_TO_INT64_IN_MEM: { 7399 // Change the floating point control register to use "round towards zero" 7400 // mode when truncating to an integer value. 7401 MachineFunction *F = BB->getParent(); 7402 int CWFrameIdx = F->getFrameInfo()->CreateStackObject(2, 2); 7403 addFrameReference(BuildMI(BB, dl, TII->get(X86::FNSTCW16m)), CWFrameIdx); 7404 7405 // Load the old value of the high byte of the control word... 7406 unsigned OldCW = 7407 F->getRegInfo().createVirtualRegister(X86::GR16RegisterClass); 7408 addFrameReference(BuildMI(BB, dl, TII->get(X86::MOV16rm), OldCW), 7409 CWFrameIdx); 7410 7411 // Set the high part to be round to zero... 7412 addFrameReference(BuildMI(BB, dl, TII->get(X86::MOV16mi)), CWFrameIdx) 7413 .addImm(0xC7F); 7414 7415 // Reload the modified control word now... 7416 addFrameReference(BuildMI(BB, dl, TII->get(X86::FLDCW16m)), CWFrameIdx); 7417 7418 // Restore the memory image of control word to original value 7419 addFrameReference(BuildMI(BB, dl, TII->get(X86::MOV16mr)), CWFrameIdx) 7420 .addReg(OldCW); 7421 7422 // Get the X86 opcode to use. 7423 unsigned Opc; 7424 switch (MI->getOpcode()) { 7425 default: assert(0 && "illegal opcode!"); 7426 case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break; 7427 case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break; 7428 case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break; 7429 case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break; 7430 case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break; 7431 case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break; 7432 case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break; 7433 case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break; 7434 case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break; 7435 } 7436 7437 X86AddressMode AM; 7438 MachineOperand &Op = MI->getOperand(0); 7439 if (Op.isReg()) { 7440 AM.BaseType = X86AddressMode::RegBase; 7441 AM.Base.Reg = Op.getReg(); 7442 } else { 7443 AM.BaseType = X86AddressMode::FrameIndexBase; 7444 AM.Base.FrameIndex = Op.getIndex(); 7445 } 7446 Op = MI->getOperand(1); 7447 if (Op.isImm()) 7448 AM.Scale = Op.getImm(); 7449 Op = MI->getOperand(2); 7450 if (Op.isImm()) 7451 AM.IndexReg = Op.getImm(); 7452 Op = MI->getOperand(3); 7453 if (Op.isGlobal()) { 7454 AM.GV = Op.getGlobal(); 7455 } else { 7456 AM.Disp = Op.getImm(); 7457 } 7458 addFullAddress(BuildMI(BB, dl, TII->get(Opc)), AM) 7459 .addReg(MI->getOperand(X86AddrNumOperands).getReg()); 7460 7461 // Reload the original control word now. 7462 addFrameReference(BuildMI(BB, dl, TII->get(X86::FLDCW16m)), CWFrameIdx); 7463 7464 F->DeleteMachineInstr(MI); // The pseudo instruction is gone now. 7465 return BB; 7466 } 7467 case X86::ATOMAND32: 7468 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND32rr, 7469 X86::AND32ri, X86::MOV32rm, 7470 X86::LCMPXCHG32, X86::MOV32rr, 7471 X86::NOT32r, X86::EAX, 7472 X86::GR32RegisterClass); 7473 case X86::ATOMOR32: 7474 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR32rr, 7475 X86::OR32ri, X86::MOV32rm, 7476 X86::LCMPXCHG32, X86::MOV32rr, 7477 X86::NOT32r, X86::EAX, 7478 X86::GR32RegisterClass); 7479 case X86::ATOMXOR32: 7480 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR32rr, 7481 X86::XOR32ri, X86::MOV32rm, 7482 X86::LCMPXCHG32, X86::MOV32rr, 7483 X86::NOT32r, X86::EAX, 7484 X86::GR32RegisterClass); 7485 case X86::ATOMNAND32: 7486 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND32rr, 7487 X86::AND32ri, X86::MOV32rm, 7488 X86::LCMPXCHG32, X86::MOV32rr, 7489 X86::NOT32r, X86::EAX, 7490 X86::GR32RegisterClass, true); 7491 case X86::ATOMMIN32: 7492 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL32rr); 7493 case X86::ATOMMAX32: 7494 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG32rr); 7495 case X86::ATOMUMIN32: 7496 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB32rr); 7497 case X86::ATOMUMAX32: 7498 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA32rr); 7499 7500 case X86::ATOMAND16: 7501 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND16rr, 7502 X86::AND16ri, X86::MOV16rm, 7503 X86::LCMPXCHG16, X86::MOV16rr, 7504 X86::NOT16r, X86::AX, 7505 X86::GR16RegisterClass); 7506 case X86::ATOMOR16: 7507 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR16rr, 7508 X86::OR16ri, X86::MOV16rm, 7509 X86::LCMPXCHG16, X86::MOV16rr, 7510 X86::NOT16r, X86::AX, 7511 X86::GR16RegisterClass); 7512 case X86::ATOMXOR16: 7513 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR16rr, 7514 X86::XOR16ri, X86::MOV16rm, 7515 X86::LCMPXCHG16, X86::MOV16rr, 7516 X86::NOT16r, X86::AX, 7517 X86::GR16RegisterClass); 7518 case X86::ATOMNAND16: 7519 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND16rr, 7520 X86::AND16ri, X86::MOV16rm, 7521 X86::LCMPXCHG16, X86::MOV16rr, 7522 X86::NOT16r, X86::AX, 7523 X86::GR16RegisterClass, true); 7524 case X86::ATOMMIN16: 7525 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL16rr); 7526 case X86::ATOMMAX16: 7527 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG16rr); 7528 case X86::ATOMUMIN16: 7529 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB16rr); 7530 case X86::ATOMUMAX16: 7531 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA16rr); 7532 7533 case X86::ATOMAND8: 7534 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND8rr, 7535 X86::AND8ri, X86::MOV8rm, 7536 X86::LCMPXCHG8, X86::MOV8rr, 7537 X86::NOT8r, X86::AL, 7538 X86::GR8RegisterClass); 7539 case X86::ATOMOR8: 7540 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR8rr, 7541 X86::OR8ri, X86::MOV8rm, 7542 X86::LCMPXCHG8, X86::MOV8rr, 7543 X86::NOT8r, X86::AL, 7544 X86::GR8RegisterClass); 7545 case X86::ATOMXOR8: 7546 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR8rr, 7547 X86::XOR8ri, X86::MOV8rm, 7548 X86::LCMPXCHG8, X86::MOV8rr, 7549 X86::NOT8r, X86::AL, 7550 X86::GR8RegisterClass); 7551 case X86::ATOMNAND8: 7552 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND8rr, 7553 X86::AND8ri, X86::MOV8rm, 7554 X86::LCMPXCHG8, X86::MOV8rr, 7555 X86::NOT8r, X86::AL, 7556 X86::GR8RegisterClass, true); 7557 // FIXME: There are no CMOV8 instructions; MIN/MAX need some other way. 7558 // This group is for 64-bit host. 7559 case X86::ATOMAND64: 7560 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND64rr, 7561 X86::AND64ri32, X86::MOV64rm, 7562 X86::LCMPXCHG64, X86::MOV64rr, 7563 X86::NOT64r, X86::RAX, 7564 X86::GR64RegisterClass); 7565 case X86::ATOMOR64: 7566 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR64rr, 7567 X86::OR64ri32, X86::MOV64rm, 7568 X86::LCMPXCHG64, X86::MOV64rr, 7569 X86::NOT64r, X86::RAX, 7570 X86::GR64RegisterClass); 7571 case X86::ATOMXOR64: 7572 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR64rr, 7573 X86::XOR64ri32, X86::MOV64rm, 7574 X86::LCMPXCHG64, X86::MOV64rr, 7575 X86::NOT64r, X86::RAX, 7576 X86::GR64RegisterClass); 7577 case X86::ATOMNAND64: 7578 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND64rr, 7579 X86::AND64ri32, X86::MOV64rm, 7580 X86::LCMPXCHG64, X86::MOV64rr, 7581 X86::NOT64r, X86::RAX, 7582 X86::GR64RegisterClass, true); 7583 case X86::ATOMMIN64: 7584 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL64rr); 7585 case X86::ATOMMAX64: 7586 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG64rr); 7587 case X86::ATOMUMIN64: 7588 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB64rr); 7589 case X86::ATOMUMAX64: 7590 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA64rr); 7591 7592 // This group does 64-bit operations on a 32-bit host. 7593 case X86::ATOMAND6432: 7594 return EmitAtomicBit6432WithCustomInserter(MI, BB, 7595 X86::AND32rr, X86::AND32rr, 7596 X86::AND32ri, X86::AND32ri, 7597 false); 7598 case X86::ATOMOR6432: 7599 return EmitAtomicBit6432WithCustomInserter(MI, BB, 7600 X86::OR32rr, X86::OR32rr, 7601 X86::OR32ri, X86::OR32ri, 7602 false); 7603 case X86::ATOMXOR6432: 7604 return EmitAtomicBit6432WithCustomInserter(MI, BB, 7605 X86::XOR32rr, X86::XOR32rr, 7606 X86::XOR32ri, X86::XOR32ri, 7607 false); 7608 case X86::ATOMNAND6432: 7609 return EmitAtomicBit6432WithCustomInserter(MI, BB, 7610 X86::AND32rr, X86::AND32rr, 7611 X86::AND32ri, X86::AND32ri, 7612 true); 7613 case X86::ATOMADD6432: 7614 return EmitAtomicBit6432WithCustomInserter(MI, BB, 7615 X86::ADD32rr, X86::ADC32rr, 7616 X86::ADD32ri, X86::ADC32ri, 7617 false); 7618 case X86::ATOMSUB6432: 7619 return EmitAtomicBit6432WithCustomInserter(MI, BB, 7620 X86::SUB32rr, X86::SBB32rr, 7621 X86::SUB32ri, X86::SBB32ri, 7622 false); 7623 case X86::ATOMSWAP6432: 7624 return EmitAtomicBit6432WithCustomInserter(MI, BB, 7625 X86::MOV32rr, X86::MOV32rr, 7626 X86::MOV32ri, X86::MOV32ri, 7627 false); 7628 } 7629} 7630 7631//===----------------------------------------------------------------------===// 7632// X86 Optimization Hooks 7633//===----------------------------------------------------------------------===// 7634 7635void X86TargetLowering::computeMaskedBitsForTargetNode(const SDValue Op, 7636 const APInt &Mask, 7637 APInt &KnownZero, 7638 APInt &KnownOne, 7639 const SelectionDAG &DAG, 7640 unsigned Depth) const { 7641 unsigned Opc = Op.getOpcode(); 7642 assert((Opc >= ISD::BUILTIN_OP_END || 7643 Opc == ISD::INTRINSIC_WO_CHAIN || 7644 Opc == ISD::INTRINSIC_W_CHAIN || 7645 Opc == ISD::INTRINSIC_VOID) && 7646 "Should use MaskedValueIsZero if you don't know whether Op" 7647 " is a target node!"); 7648 7649 KnownZero = KnownOne = APInt(Mask.getBitWidth(), 0); // Don't know anything. 7650 switch (Opc) { 7651 default: break; 7652 case X86ISD::ADD: 7653 case X86ISD::SUB: 7654 case X86ISD::SMUL: 7655 case X86ISD::UMUL: 7656 case X86ISD::INC: 7657 case X86ISD::DEC: 7658 // These nodes' second result is a boolean. 7659 if (Op.getResNo() == 0) 7660 break; 7661 // Fallthrough 7662 case X86ISD::SETCC: 7663 KnownZero |= APInt::getHighBitsSet(Mask.getBitWidth(), 7664 Mask.getBitWidth() - 1); 7665 break; 7666 } 7667} 7668 7669/// isGAPlusOffset - Returns true (and the GlobalValue and the offset) if the 7670/// node is a GlobalAddress + offset. 7671bool X86TargetLowering::isGAPlusOffset(SDNode *N, 7672 GlobalValue* &GA, int64_t &Offset) const{ 7673 if (N->getOpcode() == X86ISD::Wrapper) { 7674 if (isa<GlobalAddressSDNode>(N->getOperand(0))) { 7675 GA = cast<GlobalAddressSDNode>(N->getOperand(0))->getGlobal(); 7676 Offset = cast<GlobalAddressSDNode>(N->getOperand(0))->getOffset(); 7677 return true; 7678 } 7679 } 7680 return TargetLowering::isGAPlusOffset(N, GA, Offset); 7681} 7682 7683static bool isBaseAlignmentOfN(unsigned N, SDNode *Base, 7684 const TargetLowering &TLI) { 7685 GlobalValue *GV; 7686 int64_t Offset = 0; 7687 if (TLI.isGAPlusOffset(Base, GV, Offset)) 7688 return (GV->getAlignment() >= N && (Offset % N) == 0); 7689 // DAG combine handles the stack object case. 7690 return false; 7691} 7692 7693static bool EltsFromConsecutiveLoads(ShuffleVectorSDNode *N, unsigned NumElems,
|