R600ISelLowering.cpp revision 309124
//===-- R600ISelLowering.cpp - R600 DAG Lowering Implementation ----------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief Custom DAG lowering for R600
//
//===----------------------------------------------------------------------===//

#include "R600ISelLowering.h"
#include "AMDGPUFrameLowering.h"
#include "AMDGPUIntrinsicInfo.h"
#include "AMDGPUSubtarget.h"
#include "R600Defines.h"
#include "R600InstrInfo.h"
#include "R600MachineFunctionInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/IR/Argument.h"
#include "llvm/IR/Function.h"

using namespace llvm;

R600TargetLowering::R600TargetLowering(const TargetMachine &TM,
                                       const R600Subtarget &STI)
    : AMDGPUTargetLowering(TM, STI), Gen(STI.getGeneration()) {
  addRegisterClass(MVT::f32, &AMDGPU::R600_Reg32RegClass);
  addRegisterClass(MVT::i32, &AMDGPU::R600_Reg32RegClass);
  addRegisterClass(MVT::v2f32, &AMDGPU::R600_Reg64RegClass);
  addRegisterClass(MVT::v2i32, &AMDGPU::R600_Reg64RegClass);
  addRegisterClass(MVT::v4f32, &AMDGPU::R600_Reg128RegClass);
  addRegisterClass(MVT::v4i32, &AMDGPU::R600_Reg128RegClass);

  computeRegisterProperties(STI.getRegisterInfo());

  // Legalize loads and stores to the private address space.
  setOperationAction(ISD::LOAD, MVT::i32, Custom);
  setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
  setOperationAction(ISD::LOAD, MVT::v4i32, Custom);

  // EXTLOAD should be the same as ZEXTLOAD. It is legal for some address
  // spaces, so it is custom lowered to handle those where it isn't.
  for (MVT VT : MVT::integer_valuetypes()) {
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Custom);
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i16, Custom);

    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i8, Custom);
    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i16, Custom);

    setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote);
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::i8, Custom);
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::i16, Custom);
  }

  // Workaround for LegalizeDAG asserting on expansion of i1 vector loads.
  setLoadExtAction(ISD::EXTLOAD, MVT::v2i32, MVT::v2i1, Expand);
  setLoadExtAction(ISD::SEXTLOAD, MVT::v2i32, MVT::v2i1, Expand);
  setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i32, MVT::v2i1, Expand);

  setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i1, Expand);
  setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i1, Expand);
  setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i1, Expand);

  setOperationAction(ISD::STORE, MVT::i8, Custom);
  setOperationAction(ISD::STORE, MVT::i32, Custom);
  setOperationAction(ISD::STORE, MVT::v2i32, Custom);
  setOperationAction(ISD::STORE, MVT::v4i32, Custom);

  setTruncStoreAction(MVT::i32, MVT::i8, Custom);
  setTruncStoreAction(MVT::i32, MVT::i16, Custom);

  // Workaround for LegalizeDAG asserting on expansion of i1 vector stores.
  setTruncStoreAction(MVT::v2i32, MVT::v2i1, Expand);
  setTruncStoreAction(MVT::v4i32, MVT::v4i1, Expand);

  // Set condition code actions.
  setCondCodeAction(ISD::SETO, MVT::f32, Expand);
  setCondCodeAction(ISD::SETUO, MVT::f32, Expand);
  setCondCodeAction(ISD::SETLT, MVT::f32, Expand);
  setCondCodeAction(ISD::SETLE, MVT::f32, Expand);
  setCondCodeAction(ISD::SETOLT, MVT::f32, Expand);
  setCondCodeAction(ISD::SETOLE, MVT::f32, Expand);
  setCondCodeAction(ISD::SETONE, MVT::f32, Expand);
  setCondCodeAction(ISD::SETUEQ, MVT::f32, Expand);
  setCondCodeAction(ISD::SETUGE, MVT::f32, Expand);
  setCondCodeAction(ISD::SETUGT, MVT::f32, Expand);
  setCondCodeAction(ISD::SETULT, MVT::f32, Expand);
  setCondCodeAction(ISD::SETULE, MVT::f32, Expand);

  setCondCodeAction(ISD::SETLE, MVT::i32, Expand);
  setCondCodeAction(ISD::SETLT, MVT::i32, Expand);
  setCondCodeAction(ISD::SETULE, MVT::i32, Expand);
  setCondCodeAction(ISD::SETULT, MVT::i32, Expand);

  setOperationAction(ISD::FCOS, MVT::f32, Custom);
  setOperationAction(ISD::FSIN, MVT::f32, Custom);

  setOperationAction(ISD::SETCC, MVT::v4i32, Expand);
  setOperationAction(ISD::SETCC, MVT::v2i32, Expand);

  setOperationAction(ISD::BR_CC, MVT::i32, Expand);
  setOperationAction(ISD::BR_CC, MVT::f32, Expand);
  setOperationAction(ISD::BRCOND, MVT::Other, Custom);

  setOperationAction(ISD::FSUB, MVT::f32, Expand);

  setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);

  setOperationAction(ISD::SETCC, MVT::i32, Expand);
  setOperationAction(ISD::SETCC, MVT::f32, Expand);
  setOperationAction(ISD::FP_TO_UINT, MVT::i1, Custom);
  setOperationAction(ISD::FP_TO_SINT, MVT::i1, Custom);
  setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
  setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);

  setOperationAction(ISD::SELECT, MVT::i32, Expand);
  setOperationAction(ISD::SELECT, MVT::f32, Expand);
  setOperationAction(ISD::SELECT, MVT::v2i32, Expand);
  setOperationAction(ISD::SELECT, MVT::v4i32, Expand);

  // ADD, SUB overflow.
  // TODO: turn these into Legal?
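  // (When the subtarget exposes CARRY/BORROW ALU results, LowerUADDSUBO
  // below lowers UADDO/USUBO to them instead of expanding.)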
  if (Subtarget->hasCARRY())
    setOperationAction(ISD::UADDO, MVT::i32, Custom);

  if (Subtarget->hasBORROW())
    setOperationAction(ISD::USUBO, MVT::i32, Custom);

  // Expand sign extension of vectors.
  if (!Subtarget->hasBFE())
    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);

  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Expand);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Expand);

  if (!Subtarget->hasBFE())
    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Expand);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Expand);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Expand);

  if (!Subtarget->hasBFE())
    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Expand);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Expand);

  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i32, Expand);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i32, Expand);

  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Expand);

  setOperationAction(ISD::FrameIndex, MVT::i32, Custom);

  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i32, Custom);
  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f32, Custom);
  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);

  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i32, Custom);
  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f32, Custom);
  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);

  // We don't have 64-bit shifts. Thus we need either SHX i64 or SHX_PARTS i32
  // to be Legal/Custom in order to avoid library calls.
  setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);

  setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);

  const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
  for (MVT VT : ScalarIntVTs) {
    setOperationAction(ISD::ADDC, VT, Expand);
    setOperationAction(ISD::SUBC, VT, Expand);
    setOperationAction(ISD::ADDE, VT, Expand);
    setOperationAction(ISD::SUBE, VT, Expand);
  }

  setSchedulingPreference(Sched::Source);

  setTargetDAGCombine(ISD::FP_ROUND);
  setTargetDAGCombine(ISD::FP_TO_SINT);
  setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
  setTargetDAGCombine(ISD::SELECT_CC);
  setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
}

const R600Subtarget *R600TargetLowering::getSubtarget() const {
  return static_cast<const R600Subtarget *>(Subtarget);
}

static inline bool isEOP(MachineBasicBlock::iterator I) {
  return std::next(I)->getOpcode() == AMDGPU::RETURN;
}

MachineBasicBlock *
R600TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
                                                MachineBasicBlock *BB) const {
  MachineFunction *MF = BB->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  MachineBasicBlock::iterator I = MI;
  const R600InstrInfo *TII = getSubtarget()->getInstrInfo();

  switch (MI.getOpcode()) {
  default:
    // Replace LDS_*_RET instructions that don't have any uses with the
    // equivalent LDS_*_NORET instruction.
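    // (The _RET forms produce the old memory value in their dst operand;
    // when that result has no uses, the _NORET form performs the same LDS
    // operation without the register write-back.)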
    if (TII->isLDSRetInstr(MI.getOpcode())) {
      int DstIdx = TII->getOperandIdx(MI.getOpcode(), AMDGPU::OpName::dst);
      assert(DstIdx != -1);
      MachineInstrBuilder NewMI;
      // FIXME: getLDSNoRetOp method only handles LDS_1A1D LDS ops. Add
      // LDS_1A2D support and remove this special case.
      if (!MRI.use_empty(MI.getOperand(DstIdx).getReg()) ||
          MI.getOpcode() == AMDGPU::LDS_CMPST_RET)
        return BB;

      NewMI = BuildMI(*BB, I, BB->findDebugLoc(I),
                      TII->get(AMDGPU::getLDSNoRetOp(MI.getOpcode())));
      for (unsigned i = 1, e = MI.getNumOperands(); i < e; ++i) {
        NewMI.addOperand(MI.getOperand(i));
      }
    } else {
      return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
    }
    break;
  case AMDGPU::CLAMP_R600: {
    MachineInstr *NewMI = TII->buildDefaultInstruction(
        *BB, I, AMDGPU::MOV, MI.getOperand(0).getReg(),
        MI.getOperand(1).getReg());
    TII->addFlag(*NewMI, 0, MO_FLAG_CLAMP);
    break;
  }

  case AMDGPU::FABS_R600: {
    MachineInstr *NewMI = TII->buildDefaultInstruction(
        *BB, I, AMDGPU::MOV, MI.getOperand(0).getReg(),
        MI.getOperand(1).getReg());
    TII->addFlag(*NewMI, 0, MO_FLAG_ABS);
    break;
  }

  case AMDGPU::FNEG_R600: {
    MachineInstr *NewMI = TII->buildDefaultInstruction(
        *BB, I, AMDGPU::MOV, MI.getOperand(0).getReg(),
        MI.getOperand(1).getReg());
    TII->addFlag(*NewMI, 0, MO_FLAG_NEG);
    break;
  }

  case AMDGPU::MASK_WRITE: {
    unsigned maskedRegister = MI.getOperand(0).getReg();
    assert(TargetRegisterInfo::isVirtualRegister(maskedRegister));
    MachineInstr *defInstr = MRI.getVRegDef(maskedRegister);
    TII->addFlag(*defInstr, 0, MO_FLAG_MASK);
    break;
  }

  case AMDGPU::MOV_IMM_F32:
    TII->buildMovImm(*BB, I, MI.getOperand(0).getReg(), MI.getOperand(1)
                                                            .getFPImm()
                                                            ->getValueAPF()
                                                            .bitcastToAPInt()
                                                            .getZExtValue());
    break;
  case AMDGPU::MOV_IMM_I32:
    TII->buildMovImm(*BB, I, MI.getOperand(0).getReg(),
                     MI.getOperand(1).getImm());
    break;
  case AMDGPU::MOV_IMM_GLOBAL_ADDR: {
    // TODO: Perhaps combine this instruction with the next if possible.
    auto MIB = TII->buildDefaultInstruction(
        *BB, MI, AMDGPU::MOV, MI.getOperand(0).getReg(), AMDGPU::ALU_LITERAL_X);
    int Idx = TII->getOperandIdx(*MIB, AMDGPU::OpName::literal);
    // TODO: Ugh this is rather ugly.
    MIB->getOperand(Idx) = MI.getOperand(1);
    break;
  }
  case AMDGPU::CONST_COPY: {
    MachineInstr *NewMI = TII->buildDefaultInstruction(
        *BB, MI, AMDGPU::MOV, MI.getOperand(0).getReg(), AMDGPU::ALU_CONST);
    TII->setImmOperand(*NewMI, AMDGPU::OpName::src0_sel,
                       MI.getOperand(1).getImm());
    break;
  }

  case AMDGPU::RAT_WRITE_CACHELESS_32_eg:
  case AMDGPU::RAT_WRITE_CACHELESS_64_eg:
  case AMDGPU::RAT_WRITE_CACHELESS_128_eg: {
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI.getOpcode()))
        .addOperand(MI.getOperand(0))
        .addOperand(MI.getOperand(1))
        .addImm(isEOP(I)); // Set End of program bit
    break;
  }
  case AMDGPU::RAT_STORE_TYPED_eg: {
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI.getOpcode()))
        .addOperand(MI.getOperand(0))
        .addOperand(MI.getOperand(1))
        .addOperand(MI.getOperand(2))
        .addImm(isEOP(I)); // Set End of program bit
    break;
  }

  case AMDGPU::TXD: {
    unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
    unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
    MachineOperand &RID = MI.getOperand(4);
    MachineOperand &SID = MI.getOperand(5);
    unsigned TextureId = MI.getOperand(6).getImm();
    unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3;
    unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1;

    switch (TextureId) {
    case 5: // Rect
      CTX = CTY = 0;
      break;
    case 6: // Shadow1D
      SrcW = SrcZ;
      break;
    case 7: // Shadow2D
      SrcW = SrcZ;
      break;
    case 8: // ShadowRect
      CTX = CTY = 0;
      SrcW = SrcZ;
      break;
    case 9: // 1DArray
      SrcZ = SrcY;
      CTZ = 0;
      break;
    case 10: // 2DArray
      CTZ = 0;
      break;
    case 11: // Shadow1DArray
      SrcZ = SrcY;
      CTZ = 0;
      break;
    case 12: // Shadow2DArray
      CTZ = 0;
      break;
    }
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H),
            T0)
        .addOperand(MI.getOperand(3))
        .addImm(SrcX)
        .addImm(SrcY)
        .addImm(SrcZ)
        .addImm(SrcW)
        .addImm(0)
        .addImm(0)
        .addImm(0)
        .addImm(0)
        .addImm(1)
        .addImm(2)
        .addImm(3)
        .addOperand(RID)
        .addOperand(SID)
        .addImm(CTX)
        .addImm(CTY)
        .addImm(CTZ)
        .addImm(CTW);
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V),
            T1)
        .addOperand(MI.getOperand(2))
        .addImm(SrcX)
        .addImm(SrcY)
        .addImm(SrcZ)
        .addImm(SrcW)
        .addImm(0)
        .addImm(0)
        .addImm(0)
        .addImm(0)
        .addImm(1)
        .addImm(2)
        .addImm(3)
        .addOperand(RID)
        .addOperand(SID)
        .addImm(CTX)
        .addImm(CTY)
        .addImm(CTZ)
        .addImm(CTW);
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_G))
        .addOperand(MI.getOperand(0))
        .addOperand(MI.getOperand(1))
        .addImm(SrcX)
        .addImm(SrcY)
        .addImm(SrcZ)
        .addImm(SrcW)
        .addImm(0)
        .addImm(0)
        .addImm(0)
        .addImm(0)
        .addImm(1)
        .addImm(2)
        .addImm(3)
        .addOperand(RID)
        .addOperand(SID)
        .addImm(CTX)
        .addImm(CTY)
        .addImm(CTZ)
        .addImm(CTW)
        .addReg(T0, RegState::Implicit)
        .addReg(T1, RegState::Implicit);
    break;
  }

  case AMDGPU::TXD_SHADOW: {
    unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
    unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
    MachineOperand &RID = MI.getOperand(4);
    MachineOperand &SID = MI.getOperand(5);
    unsigned TextureId = MI.getOperand(6).getImm();
    unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3;
    unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1;

    switch (TextureId) {
    case 5: // Rect
      CTX = CTY = 0;
      break;
    case 6: // Shadow1D
      SrcW = SrcZ;
      break;
    case 7: // Shadow2D
      SrcW = SrcZ;
      break;
    case 8: // ShadowRect
      CTX = CTY = 0;
      SrcW = SrcZ;
      break;
    case 9: // 1DArray
      SrcZ = SrcY;
      CTZ = 0;
      break;
    case 10: // 2DArray
      CTZ = 0;
      break;
    case 11: // Shadow1DArray
      SrcZ = SrcY;
      CTZ = 0;
      break;
    case 12: // Shadow2DArray
      CTZ = 0;
      break;
    }

    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H),
            T0)
        .addOperand(MI.getOperand(3))
        .addImm(SrcX)
        .addImm(SrcY)
        .addImm(SrcZ)
        .addImm(SrcW)
        .addImm(0)
        .addImm(0)
        .addImm(0)
        .addImm(0)
        .addImm(1)
        .addImm(2)
        .addImm(3)
        .addOperand(RID)
        .addOperand(SID)
        .addImm(CTX)
        .addImm(CTY)
        .addImm(CTZ)
        .addImm(CTW);
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V),
            T1)
        .addOperand(MI.getOperand(2))
        .addImm(SrcX)
        .addImm(SrcY)
        .addImm(SrcZ)
        .addImm(SrcW)
        .addImm(0)
        .addImm(0)
        .addImm(0)
        .addImm(0)
        .addImm(1)
        .addImm(2)
        .addImm(3)
        .addOperand(RID)
        .addOperand(SID)
        .addImm(CTX)
        .addImm(CTY)
        .addImm(CTZ)
        .addImm(CTW);
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_C_G))
        .addOperand(MI.getOperand(0))
        .addOperand(MI.getOperand(1))
        .addImm(SrcX)
        .addImm(SrcY)
        .addImm(SrcZ)
        .addImm(SrcW)
        .addImm(0)
        .addImm(0)
        .addImm(0)
        .addImm(0)
        .addImm(1)
        .addImm(2)
        .addImm(3)
        .addOperand(RID)
        .addOperand(SID)
        .addImm(CTX)
        .addImm(CTY)
        .addImm(CTZ)
        .addImm(CTW)
        .addReg(T0, RegState::Implicit)
        .addReg(T1, RegState::Implicit);
    break;
  }

  case AMDGPU::BRANCH:
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP))
        .addOperand(MI.getOperand(0));
    break;

  case AMDGPU::BRANCH_COND_f32: {
    MachineInstr *NewMI =
        BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
                AMDGPU::PREDICATE_BIT)
            .addOperand(MI.getOperand(1))
            .addImm(OPCODE_IS_NOT_ZERO)
            .addImm(0); // Flags
    TII->addFlag(*NewMI, 0, MO_FLAG_PUSH);
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
        .addOperand(MI.getOperand(0))
        .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
    break;
  }

  case AMDGPU::BRANCH_COND_i32: {
    MachineInstr *NewMI =
        BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
                AMDGPU::PREDICATE_BIT)
            .addOperand(MI.getOperand(1))
            .addImm(OPCODE_IS_NOT_ZERO_INT)
            .addImm(0); // Flags
    TII->addFlag(*NewMI, 0, MO_FLAG_PUSH);
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
        .addOperand(MI.getOperand(0))
        .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
    break;
  }

  case AMDGPU::EG_ExportSwz:
  case AMDGPU::R600_ExportSwz: {
    // Instruction is left unmodified if it's not the last one of its type.
    bool isLastInstructionOfItsType = true;
    unsigned InstExportType = MI.getOperand(1).getImm();
    for (MachineBasicBlock::iterator NextExportInst = std::next(I),
         EndBlock = BB->end(); NextExportInst != EndBlock;
         NextExportInst = std::next(NextExportInst)) {
      if (NextExportInst->getOpcode() == AMDGPU::EG_ExportSwz ||
          NextExportInst->getOpcode() == AMDGPU::R600_ExportSwz) {
        unsigned CurrentInstExportType = NextExportInst->getOperand(1)
                                             .getImm();
        if (CurrentInstExportType == InstExportType) {
          isLastInstructionOfItsType = false;
          break;
        }
      }
    }
    bool EOP = isEOP(I);
    if (!EOP && !isLastInstructionOfItsType)
      return BB;
    unsigned CfInst = (MI.getOpcode() == AMDGPU::EG_ExportSwz) ? 84 : 40;
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI.getOpcode()))
        .addOperand(MI.getOperand(0))
        .addOperand(MI.getOperand(1))
        .addOperand(MI.getOperand(2))
        .addOperand(MI.getOperand(3))
        .addOperand(MI.getOperand(4))
        .addOperand(MI.getOperand(5))
        .addOperand(MI.getOperand(6))
        .addImm(CfInst)
        .addImm(EOP);
    break;
  }
  case AMDGPU::RETURN: {
    // RETURN instructions must have the live-out registers as implicit uses,
    // otherwise they appear dead.
    R600MachineFunctionInfo *MFI = MF->getInfo<R600MachineFunctionInfo>();
    MachineInstrBuilder MIB(*MF, MI);
    for (unsigned i = 0, e = MFI->LiveOuts.size(); i != e; ++i)
      MIB.addReg(MFI->LiveOuts[i], RegState::Implicit);
    return BB;
  }
  }

  MI.eraseFromParent();
  return BB;
}

//===----------------------------------------------------------------------===//
// Custom DAG Lowering Operations
//===----------------------------------------------------------------------===//

SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
  switch (Op.getOpcode()) {
  default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
  case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
  case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
  case ISD::SHL_PARTS: return LowerSHLParts(Op, DAG);
  case ISD::SRA_PARTS:
  case ISD::SRL_PARTS: return LowerSRXParts(Op, DAG);
  case ISD::UADDO: return LowerUADDSUBO(Op, DAG, ISD::ADD, AMDGPUISD::CARRY);
  case ISD::USUBO: return LowerUADDSUBO(Op, DAG, ISD::SUB, AMDGPUISD::BORROW);
  case ISD::FCOS:
  case ISD::FSIN: return LowerTrig(Op, DAG);
  case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
  case ISD::STORE: return LowerSTORE(Op, DAG);
  case ISD::LOAD: {
    SDValue Result = LowerLOAD(Op, DAG);
    assert((!Result.getNode() ||
            Result.getNode()->getNumValues() == 2) &&
           "Load should return a value and a chain");
    return Result;
  }

  case ISD::BRCOND: return LowerBRCOND(Op, DAG);
  case ISD::GlobalAddress: return LowerGlobalAddress(MFI, Op, DAG);
  case ISD::FrameIndex: return lowerFrameIndex(Op, DAG);
  case ISD::INTRINSIC_VOID: {
    SDValue Chain = Op.getOperand(0);
    unsigned IntrinsicID =
        cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
    switch (IntrinsicID) {
    case AMDGPUIntrinsic::r600_store_swizzle: {
      SDLoc DL(Op);
      const SDValue Args[8] = {
        Chain,
        Op.getOperand(2), // Export Value
        Op.getOperand(3), // ArrayBase
        Op.getOperand(4), // Type
        DAG.getConstant(0, DL, MVT::i32), // SWZ_X
        DAG.getConstant(1, DL, MVT::i32), // SWZ_Y
        DAG.getConstant(2, DL, MVT::i32), // SWZ_Z
        DAG.getConstant(3, DL, MVT::i32)  // SWZ_W
      };
      return DAG.getNode(AMDGPUISD::EXPORT, DL, Op.getValueType(), Args);
    }

    // default for switch(IntrinsicID)
    default: break;
    }
    // break out of case ISD::INTRINSIC_VOID in switch(Op.getOpcode())
    break;
  }
  case ISD::INTRINSIC_WO_CHAIN: {
    unsigned IntrinsicID =
        cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
    EVT VT = Op.getValueType();
    SDLoc DL(Op);
    switch (IntrinsicID) {
    default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
    case AMDGPUIntrinsic::r600_tex:
    case AMDGPUIntrinsic::r600_texc:
    case AMDGPUIntrinsic::r600_txl:
    case AMDGPUIntrinsic::r600_txlc:
    case AMDGPUIntrinsic::r600_txb:
    case AMDGPUIntrinsic::r600_txbc:
    case AMDGPUIntrinsic::r600_txf:
    case AMDGPUIntrinsic::r600_txq:
    case AMDGPUIntrinsic::r600_ddx:
    case AMDGPUIntrinsic::r600_ddy: {
      unsigned TextureOp;
      switch (IntrinsicID) {
      case AMDGPUIntrinsic::r600_tex:
        TextureOp = 0;
        break;
      case AMDGPUIntrinsic::r600_texc:
        TextureOp = 1;
        break;
      case AMDGPUIntrinsic::r600_txl:
        TextureOp = 2;
        break;
      case AMDGPUIntrinsic::r600_txlc:
        TextureOp = 3;
        break;
      case AMDGPUIntrinsic::r600_txb:
        TextureOp = 4;
        break;
      case AMDGPUIntrinsic::r600_txbc:
        TextureOp = 5;
        break;
      case AMDGPUIntrinsic::r600_txf:
        TextureOp = 6;
        break;
      case AMDGPUIntrinsic::r600_txq:
        TextureOp = 7;
        break;
      case AMDGPUIntrinsic::r600_ddx:
        TextureOp = 8;
        break;
      case AMDGPUIntrinsic::r600_ddy:
        TextureOp = 9;
        break;
      default:
        llvm_unreachable("Unknown texture operation");
      }

      SDValue TexArgs[19] = {
        DAG.getConstant(TextureOp, DL, MVT::i32),
        Op.getOperand(1),
        DAG.getConstant(0, DL, MVT::i32),
        DAG.getConstant(1, DL, MVT::i32),
        DAG.getConstant(2, DL, MVT::i32),
        DAG.getConstant(3, DL, MVT::i32),
        Op.getOperand(2),
        Op.getOperand(3),
        Op.getOperand(4),
        DAG.getConstant(0, DL, MVT::i32),
        DAG.getConstant(1, DL, MVT::i32),
        DAG.getConstant(2, DL, MVT::i32),
        DAG.getConstant(3, DL, MVT::i32),
        Op.getOperand(5),
        Op.getOperand(6),
        Op.getOperand(7),
        Op.getOperand(8),
        Op.getOperand(9),
        Op.getOperand(10)
      };
      return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, MVT::v4f32, TexArgs);
    }
    case AMDGPUIntrinsic::r600_dot4: {
      SDValue Args[8] = {
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
                    DAG.getConstant(0, DL, MVT::i32)),
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
                    DAG.getConstant(0, DL, MVT::i32)),
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
                    DAG.getConstant(1, DL, MVT::i32)),
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
                    DAG.getConstant(1, DL, MVT::i32)),
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
                    DAG.getConstant(2, DL, MVT::i32)),
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
                    DAG.getConstant(2, DL, MVT::i32)),
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
                    DAG.getConstant(3, DL, MVT::i32)),
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
                    DAG.getConstant(3, DL, MVT::i32))
      };
      return DAG.getNode(AMDGPUISD::DOT4, DL, MVT::f32, Args);
    }

    case Intrinsic::r600_implicitarg_ptr: {
      MVT PtrVT = getPointerTy(DAG.getDataLayout(), AMDGPUAS::PARAM_I_ADDRESS);
      uint32_t ByteOffset = getImplicitParameterOffset(MFI, FIRST_IMPLICIT);
      return DAG.getConstant(ByteOffset, DL, PtrVT);
    }
    case Intrinsic::r600_read_ngroups_x:
      return LowerImplicitParameter(DAG, VT, DL, 0);
    case Intrinsic::r600_read_ngroups_y:
      return LowerImplicitParameter(DAG, VT, DL, 1);
    case Intrinsic::r600_read_ngroups_z:
      return LowerImplicitParameter(DAG, VT, DL, 2);
    case Intrinsic::r600_read_global_size_x:
      return LowerImplicitParameter(DAG, VT, DL, 3);
    case Intrinsic::r600_read_global_size_y:
      return LowerImplicitParameter(DAG, VT, DL, 4);
    case Intrinsic::r600_read_global_size_z:
      return LowerImplicitParameter(DAG, VT, DL, 5);
    case Intrinsic::r600_read_local_size_x:
      return LowerImplicitParameter(DAG, VT, DL, 6);
    case Intrinsic::r600_read_local_size_y:
      return LowerImplicitParameter(DAG, VT, DL, 7);
    case Intrinsic::r600_read_local_size_z:
      return LowerImplicitParameter(DAG, VT, DL, 8);

    case Intrinsic::r600_read_workdim:
    case AMDGPUIntrinsic::AMDGPU_read_workdim: { // Legacy name.
      uint32_t ByteOffset = getImplicitParameterOffset(MFI, GRID_DIM);
      return LowerImplicitParameter(DAG, VT, DL, ByteOffset / 4);
    }

    case Intrinsic::r600_read_tgid_x:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T1_X, VT);
    case Intrinsic::r600_read_tgid_y:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T1_Y, VT);
    case Intrinsic::r600_read_tgid_z:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T1_Z, VT);
    case Intrinsic::r600_read_tidig_x:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T0_X, VT);
    case Intrinsic::r600_read_tidig_y:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T0_Y, VT);
    case Intrinsic::r600_read_tidig_z:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T0_Z, VT);

    case Intrinsic::r600_recipsqrt_ieee:
      return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));

    case Intrinsic::r600_recipsqrt_clamped:
      return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1));
    }

    // break out of case ISD::INTRINSIC_WO_CHAIN in switch(Op.getOpcode())
    break;
  }
  } // end switch(Op.getOpcode())
  return SDValue();
}

void R600TargetLowering::ReplaceNodeResults(SDNode *N,
                                            SmallVectorImpl<SDValue> &Results,
                                            SelectionDAG &DAG) const {
  switch (N->getOpcode()) {
  default:
    AMDGPUTargetLowering::ReplaceNodeResults(N, Results, DAG);
    return;
  case ISD::FP_TO_UINT:
    if (N->getValueType(0) == MVT::i1) {
      Results.push_back(lowerFP_TO_UINT(N->getOperand(0), DAG));
      return;
    }
    // Fall-through. Since we don't care about out of bounds values we can
    // use FP_TO_SINT for uints too. The DAGLegalizer code for uint considers
    // some extra cases which are not necessary here.
  case ISD::FP_TO_SINT: {
    if (N->getValueType(0) == MVT::i1) {
      Results.push_back(lowerFP_TO_SINT(N->getOperand(0), DAG));
      return;
    }

    SDValue Result;
    if (expandFP_TO_SINT(N, Result, DAG))
      Results.push_back(Result);
    return;
  }
  case ISD::SDIVREM: {
    SDValue Op = SDValue(N, 1);
    SDValue RES = LowerSDIVREM(Op, DAG);
    Results.push_back(RES);
    Results.push_back(RES.getValue(1));
    break;
  }
  case ISD::UDIVREM: {
    SDValue Op = SDValue(N, 0);
    LowerUDIVREM64(Op, DAG, Results);
    break;
  }
  }
}

SDValue R600TargetLowering::vectorToVerticalVector(SelectionDAG &DAG,
                                                   SDValue Vector) const {
  SDLoc DL(Vector);
  EVT VecVT = Vector.getValueType();
  EVT EltVT = VecVT.getVectorElementType();
  SmallVector<SDValue, 8> Args;

  for (unsigned i = 0, e = VecVT.getVectorNumElements(); i != e; ++i) {
    Args.push_back(DAG.getNode(
        ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Vector,
        DAG.getConstant(i, DL, getVectorIdxTy(DAG.getDataLayout()))));
  }

  return DAG.getNode(AMDGPUISD::BUILD_VERTICAL_VECTOR, DL, VecVT, Args);
}

SDValue R600TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
                                                    SelectionDAG &DAG) const {
  SDLoc DL(Op);
  SDValue Vector = Op.getOperand(0);
  SDValue Index = Op.getOperand(1);

  if (isa<ConstantSDNode>(Index) ||
      Vector.getOpcode() == AMDGPUISD::BUILD_VERTICAL_VECTOR)
    return Op;

  Vector = vectorToVerticalVector(DAG, Vector);
  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getValueType(),
                     Vector, Index);
}

SDValue R600TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
                                                   SelectionDAG &DAG) const {
  SDLoc DL(Op);
  SDValue Vector = Op.getOperand(0);
  SDValue Value = Op.getOperand(1);
  SDValue Index = Op.getOperand(2);

  if (isa<ConstantSDNode>(Index) ||
      Vector.getOpcode() == AMDGPUISD::BUILD_VERTICAL_VECTOR)
    return Op;

  Vector = vectorToVerticalVector(DAG, Vector);
  SDValue Insert = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, Op.getValueType(),
                               Vector, Value, Index);
  return vectorToVerticalVector(DAG, Insert);
}

SDValue R600TargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
                                               SDValue Op,
                                               SelectionDAG &DAG) const {
  GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
  if (GSD->getAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS)
    return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);

  const DataLayout &DL = DAG.getDataLayout();
  const GlobalValue *GV = GSD->getGlobal();
  MVT ConstPtrVT = getPointerTy(DL, AMDGPUAS::CONSTANT_ADDRESS);

  SDValue GA = DAG.getTargetGlobalAddress(GV, SDLoc(GSD), ConstPtrVT);
  return DAG.getNode(AMDGPUISD::CONST_DATA_PTR, SDLoc(GSD), ConstPtrVT, GA);
}

SDValue R600TargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
  // On hw >= R700, COS/SIN input must be between -1.0 and 1.0.
  // Thus we lower them to TRIG(FRACT(x / 2Pi + 0.5) - 0.5).
  EVT VT = Op.getValueType();
  SDValue Arg = Op.getOperand(0);
  SDLoc DL(Op);

  // TODO: Should this propagate fast-math-flags?
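  // 0.15915494309 is 1/(2*Pi): the FMUL rescales the argument to fractions
  // of a full period, and the surrounding FADDs of +0.5 and -0.5 recentre
  // the FRACT result into [-0.5, 0.5) for the hardware TRIG ops.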
  SDValue FractPart = DAG.getNode(AMDGPUISD::FRACT, DL, VT,
      DAG.getNode(ISD::FADD, DL, VT,
          DAG.getNode(ISD::FMUL, DL, VT, Arg,
              DAG.getConstantFP(0.15915494309, DL, MVT::f32)),
          DAG.getConstantFP(0.5, DL, MVT::f32)));
  unsigned TrigNode;
  switch (Op.getOpcode()) {
  case ISD::FCOS:
    TrigNode = AMDGPUISD::COS_HW;
    break;
  case ISD::FSIN:
    TrigNode = AMDGPUISD::SIN_HW;
    break;
  default:
    llvm_unreachable("Wrong trig opcode");
  }
  SDValue TrigVal = DAG.getNode(TrigNode, DL, VT,
      DAG.getNode(ISD::FADD, DL, VT, FractPart,
                  DAG.getConstantFP(-0.5, DL, MVT::f32)));
  if (Gen >= R600Subtarget::R700)
    return TrigVal;
  // On R600 hw, COS/SIN input must be between -Pi and Pi.
  return DAG.getNode(ISD::FMUL, DL, VT, TrigVal,
                     DAG.getConstantFP(3.14159265359, DL, MVT::f32));
}

SDValue R600TargetLowering::LowerSHLParts(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  EVT VT = Op.getValueType();

  SDValue Lo = Op.getOperand(0);
  SDValue Hi = Op.getOperand(1);
  SDValue Shift = Op.getOperand(2);
  SDValue Zero = DAG.getConstant(0, DL, VT);
  SDValue One = DAG.getConstant(1, DL, VT);

  SDValue Width = DAG.getConstant(VT.getSizeInBits(), DL, VT);
  SDValue Width1 = DAG.getConstant(VT.getSizeInBits() - 1, DL, VT);
  SDValue BigShift = DAG.getNode(ISD::SUB, DL, VT, Shift, Width);
  SDValue CompShift = DAG.getNode(ISD::SUB, DL, VT, Width1, Shift);

  // The dance around Width1 is necessary for the 0 special case. Without it,
  // CompShift might be 32, producing incorrect results in Overflow. So we do
  // the shift in two steps; the alternative is to add a conditional to filter
  // out the special case.

  SDValue Overflow = DAG.getNode(ISD::SRL, DL, VT, Lo, CompShift);
  Overflow = DAG.getNode(ISD::SRL, DL, VT, Overflow, One);

  SDValue HiSmall = DAG.getNode(ISD::SHL, DL, VT, Hi, Shift);
  HiSmall = DAG.getNode(ISD::OR, DL, VT, HiSmall, Overflow);
  SDValue LoSmall = DAG.getNode(ISD::SHL, DL, VT, Lo, Shift);

  SDValue HiBig = DAG.getNode(ISD::SHL, DL, VT, Lo, BigShift);
  SDValue LoBig = Zero;

  Hi = DAG.getSelectCC(DL, Shift, Width, HiSmall, HiBig, ISD::SETULT);
  Lo = DAG.getSelectCC(DL, Shift, Width, LoSmall, LoBig, ISD::SETULT);

  return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT, VT), Lo, Hi);
}

SDValue R600TargetLowering::LowerSRXParts(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  EVT VT = Op.getValueType();

  SDValue Lo = Op.getOperand(0);
  SDValue Hi = Op.getOperand(1);
  SDValue Shift = Op.getOperand(2);
  SDValue Zero = DAG.getConstant(0, DL, VT);
  SDValue One = DAG.getConstant(1, DL, VT);

  const bool SRA = Op.getOpcode() == ISD::SRA_PARTS;

  SDValue Width = DAG.getConstant(VT.getSizeInBits(), DL, VT);
  SDValue Width1 = DAG.getConstant(VT.getSizeInBits() - 1, DL, VT);
  SDValue BigShift = DAG.getNode(ISD::SUB, DL, VT, Shift, Width);
  SDValue CompShift = DAG.getNode(ISD::SUB, DL, VT, Width1, Shift);

  // The dance around Width1 is necessary for the 0 special case. Without it,
  // CompShift might be 32, producing incorrect results in Overflow. So we do
  // the shift in two steps; the alternative is to add a conditional to filter
  // out the special case.

  SDValue Overflow = DAG.getNode(ISD::SHL, DL, VT, Hi, CompShift);
  Overflow = DAG.getNode(ISD::SHL, DL, VT, Overflow, One);

  SDValue HiSmall =
      DAG.getNode(SRA ? ISD::SRA : ISD::SRL, DL, VT, Hi, Shift);
  SDValue LoSmall = DAG.getNode(ISD::SRL, DL, VT, Lo, Shift);
  LoSmall = DAG.getNode(ISD::OR, DL, VT, LoSmall, Overflow);

  SDValue LoBig = DAG.getNode(SRA ? ISD::SRA : ISD::SRL, DL, VT, Hi, BigShift);
  SDValue HiBig = SRA ? DAG.getNode(ISD::SRA, DL, VT, Hi, Width1) : Zero;

  Hi = DAG.getSelectCC(DL, Shift, Width, HiSmall, HiBig, ISD::SETULT);
  Lo = DAG.getSelectCC(DL, Shift, Width, LoSmall, LoBig, ISD::SETULT);

  return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT, VT), Lo, Hi);
}

SDValue R600TargetLowering::LowerUADDSUBO(SDValue Op, SelectionDAG &DAG,
                                          unsigned mainop, unsigned ovf) const {
  SDLoc DL(Op);
  EVT VT = Op.getValueType();

  SDValue Lo = Op.getOperand(0);
  SDValue Hi = Op.getOperand(1);

  SDValue OVF = DAG.getNode(ovf, DL, VT, Lo, Hi);
  // Extend sign.
  OVF = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, OVF,
                    DAG.getValueType(MVT::i1));

  SDValue Res = DAG.getNode(mainop, DL, VT, Lo, Hi);

  return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT, VT), Res, OVF);
}

SDValue R600TargetLowering::lowerFP_TO_UINT(SDValue Op,
                                            SelectionDAG &DAG) const {
  SDLoc DL(Op);
  return DAG.getNode(ISD::SETCC, DL, MVT::i1, Op,
                     DAG.getConstantFP(1.0f, DL, MVT::f32),
                     DAG.getCondCode(ISD::SETEQ));
}

SDValue R600TargetLowering::lowerFP_TO_SINT(SDValue Op,
                                            SelectionDAG &DAG) const {
  SDLoc DL(Op);
  return DAG.getNode(ISD::SETCC, DL, MVT::i1, Op,
                     DAG.getConstantFP(-1.0f, DL, MVT::f32),
                     DAG.getCondCode(ISD::SETEQ));
}

SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT,
                                                   const SDLoc &DL,
                                                   unsigned DwordOffset) const {
  unsigned ByteOffset = DwordOffset * 4;
  PointerType *PtrType = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
                                          AMDGPUAS::CONSTANT_BUFFER_0);

  // We shouldn't be using an offset wider than 16 bits for implicit
  // parameters.
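  // (Implicit parameters live at small fixed dword offsets at the start of
  // CONSTANT_BUFFER_0, so they always fit in 16 bits.)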
  assert(isInt<16>(ByteOffset));

  return DAG.getLoad(VT, DL, DAG.getEntryNode(),
                     DAG.getConstant(ByteOffset, DL, MVT::i32), // PTR
                     MachinePointerInfo(ConstantPointerNull::get(PtrType)));
}

bool R600TargetLowering::isZero(SDValue Op) const {
  if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
    return Cst->isNullValue();
  } else if (ConstantFPSDNode *CstFP = dyn_cast<ConstantFPSDNode>(Op)) {
    return CstFP->isZero();
  } else {
    return false;
  }
}

bool R600TargetLowering::isHWTrueValue(SDValue Op) const {
  if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
    return CFP->isExactlyValue(1.0);
  }
  return isAllOnesConstant(Op);
}

bool R600TargetLowering::isHWFalseValue(SDValue Op) const {
  if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
    return CFP->getValueAPF().isZero();
  }
  return isNullConstant(Op);
}

SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op,
                                           SelectionDAG &DAG) const {
  SDLoc DL(Op);
  EVT VT = Op.getValueType();

  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  SDValue True = Op.getOperand(2);
  SDValue False = Op.getOperand(3);
  SDValue CC = Op.getOperand(4);
  SDValue Temp;

  if (VT == MVT::f32) {
    DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);
    SDValue MinMax = CombineFMinMaxLegacy(DL, VT, LHS, RHS, True, False, CC, DCI);
    if (MinMax)
      return MinMax;
  }

  // LHS and RHS are guaranteed to be the same value type.
  EVT CompareVT = LHS.getValueType();

  // Check if we can lower this to a native operation.

  // Try to lower to a SET* instruction:
  //
  // SET* can match the following patterns:
  //
  // select_cc f32, f32, -1, 0, cc_supported
  // select_cc f32, f32, 1.0f, 0.0f, cc_supported
  // select_cc i32, i32, -1, 0, cc_supported
  //

  // Move hardware True/False values to the correct operand.
  ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
  ISD::CondCode InverseCC =
      ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
  if (isHWTrueValue(False) && isHWFalseValue(True)) {
    if (isCondCodeLegal(InverseCC, CompareVT.getSimpleVT())) {
      std::swap(False, True);
      CC = DAG.getCondCode(InverseCC);
    } else {
      ISD::CondCode SwapInvCC = ISD::getSetCCSwappedOperands(InverseCC);
      if (isCondCodeLegal(SwapInvCC, CompareVT.getSimpleVT())) {
        std::swap(False, True);
        std::swap(LHS, RHS);
        CC = DAG.getCondCode(SwapInvCC);
      }
    }
  }

  if (isHWTrueValue(True) && isHWFalseValue(False) &&
      (CompareVT == VT || VT == MVT::i32)) {
    // This can be matched by a SET* instruction.
    return DAG.getNode(ISD::SELECT_CC, DL, VT, LHS, RHS, True, False, CC);
  }

  // Try to lower to a CND* instruction:
  //
  // CND* can match the following patterns:
  //
  // select_cc f32, 0.0, f32, f32, cc_supported
  // select_cc f32, 0.0, i32, i32, cc_supported
  // select_cc i32, 0, f32, f32, cc_supported
  // select_cc i32, 0, i32, i32, cc_supported
  //

  // Try to move the zero value to the RHS.
  if (isZero(LHS)) {
    ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
    // Try swapping the operands.
    ISD::CondCode CCSwapped = ISD::getSetCCSwappedOperands(CCOpcode);
    if (isCondCodeLegal(CCSwapped, CompareVT.getSimpleVT())) {
      std::swap(LHS, RHS);
      CC = DAG.getCondCode(CCSwapped);
    } else {
      // Try inverting the condition and then swapping the operands.
      ISD::CondCode CCInv = ISD::getSetCCInverse(CCOpcode, CompareVT.isInteger());
      CCSwapped = ISD::getSetCCSwappedOperands(CCInv);
      if (isCondCodeLegal(CCSwapped, CompareVT.getSimpleVT())) {
        std::swap(True, False);
        std::swap(LHS, RHS);
        CC = DAG.getCondCode(CCSwapped);
      }
    }
  }
  if (isZero(RHS)) {
    SDValue Cond = LHS;
    SDValue Zero = RHS;
    ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
    if (CompareVT != VT) {
      // Bitcast True / False to the correct types. This will end up being
      // a nop, but it allows us to define only a single pattern in the
      // .TD files for each CND* instruction rather than having to have
      // one pattern for integer True/False and one for fp True/False.
      True = DAG.getNode(ISD::BITCAST, DL, CompareVT, True);
      False = DAG.getNode(ISD::BITCAST, DL, CompareVT, False);
    }

    switch (CCOpcode) {
    case ISD::SETONE:
    case ISD::SETUNE:
    case ISD::SETNE:
      CCOpcode = ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
      Temp = True;
      True = False;
      False = Temp;
      break;
    default:
      break;
    }
    SDValue SelectNode = DAG.getNode(ISD::SELECT_CC, DL, CompareVT,
                                     Cond, Zero,
                                     True, False,
                                     DAG.getCondCode(CCOpcode));
    return DAG.getNode(ISD::BITCAST, DL, VT, SelectNode);
  }

  // If we make it this far, it means we have no native instructions to handle
  // this SELECT_CC, so we must lower it.
  SDValue HWTrue, HWFalse;

  if (CompareVT == MVT::f32) {
    HWTrue = DAG.getConstantFP(1.0f, DL, CompareVT);
    HWFalse = DAG.getConstantFP(0.0f, DL, CompareVT);
  } else if (CompareVT == MVT::i32) {
    HWTrue = DAG.getConstant(-1, DL, CompareVT);
    HWFalse = DAG.getConstant(0, DL, CompareVT);
  } else {
    llvm_unreachable("Unhandled value type in LowerSELECT_CC");
  }

  // Lower this unsupported SELECT_CC into a combination of two supported
  // SELECT_CC operations.
  SDValue Cond = DAG.getNode(ISD::SELECT_CC, DL, CompareVT, LHS, RHS,
                             HWTrue, HWFalse, CC);

  return DAG.getNode(ISD::SELECT_CC, DL, VT,
                     Cond, HWFalse,
                     True, False,
                     DAG.getCondCode(ISD::SETNE));
}

/// LLVM generates byte-addressed pointers. For indirect addressing, we need to
/// convert these pointers to a register index. Each register holds
/// 16 bytes, (4 x 32bit sub-registers), but we need to take into account the
/// \p StackWidth, which tells us how many of the 4 sub-registers will be used
/// for indirect addressing.
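/// With a stack width of 1, 2 or 4 channels, each row of the stack occupies
/// 4, 8 or 16 bytes, which is why the byte address is shifted right by 2, 3
/// or 4 respectively below.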
SDValue R600TargetLowering::stackPtrToRegIndex(SDValue Ptr,
                                               unsigned StackWidth,
                                               SelectionDAG &DAG) const {
  unsigned SRLPad;
  switch (StackWidth) {
  case 1:
    SRLPad = 2;
    break;
  case 2:
    SRLPad = 3;
    break;
  case 4:
    SRLPad = 4;
    break;
  default: llvm_unreachable("Invalid stack width");
  }

  SDLoc DL(Ptr);
  return DAG.getNode(ISD::SRL, DL, Ptr.getValueType(), Ptr,
                     DAG.getConstant(SRLPad, DL, MVT::i32));
}

void R600TargetLowering::getStackAddress(unsigned StackWidth,
                                         unsigned ElemIdx,
                                         unsigned &Channel,
                                         unsigned &PtrIncr) const {
  switch (StackWidth) {
  default:
  case 1:
    Channel = 0;
    if (ElemIdx > 0) {
      PtrIncr = 1;
    } else {
      PtrIncr = 0;
    }
    break;
  case 2:
    Channel = ElemIdx % 2;
    if (ElemIdx == 2) {
      PtrIncr = 1;
    } else {
      PtrIncr = 0;
    }
    break;
  case 4:
    Channel = ElemIdx;
    PtrIncr = 0;
    break;
  }
}

SDValue R600TargetLowering::lowerPrivateTruncStore(StoreSDNode *Store,
                                                   SelectionDAG &DAG) const {
  SDLoc DL(Store);

  unsigned Mask = 0;
  if (Store->getMemoryVT() == MVT::i8) {
    Mask = 0xff;
  } else if (Store->getMemoryVT() == MVT::i16) {
    Mask = 0xffff;
  }

  SDValue Chain = Store->getChain();
  SDValue BasePtr = Store->getBasePtr();
  EVT MemVT = Store->getMemoryVT();

  SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, BasePtr,
                            DAG.getConstant(2, DL, MVT::i32));
  SDValue Dst = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, MVT::i32,
                            Chain, Ptr,
                            DAG.getTargetConstant(0, DL, MVT::i32));

  SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32, BasePtr,
                                DAG.getConstant(0x3, DL, MVT::i32));

  SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx,
                                 DAG.getConstant(3, DL, MVT::i32));

  SDValue SExtValue = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i32,
                                  Store->getValue());

  SDValue MaskedValue = DAG.getZeroExtendInReg(SExtValue, DL, MemVT);

  SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, MVT::i32,
                                     MaskedValue, ShiftAmt);

  SDValue DstMask = DAG.getNode(ISD::SHL, DL, MVT::i32,
                                DAG.getConstant(Mask, DL, MVT::i32),
                                ShiftAmt);
  DstMask = DAG.getNode(ISD::XOR, DL, MVT::i32, DstMask,
                        DAG.getConstant(0xffffffff, DL, MVT::i32));
  Dst = DAG.getNode(ISD::AND, DL, MVT::i32, Dst, DstMask);

  SDValue Value = DAG.getNode(ISD::OR, DL, MVT::i32, Dst, ShiftedValue);
  return DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other,
                     Chain, Value, Ptr,
                     DAG.getTargetConstant(0, DL, MVT::i32));
}

SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
  if (SDValue Result = AMDGPUTargetLowering::MergeVectorStore(Op, DAG))
    return Result;

  StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
  unsigned AS = StoreNode->getAddressSpace();
  SDValue Value = StoreNode->getValue();
  EVT ValueVT = Value.getValueType();

  if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS) &&
      ValueVT.isVector()) {
    return SplitVectorStore(Op, DAG);
  }

  SDLoc DL(Op);
  SDValue Chain = StoreNode->getChain();
  SDValue Ptr = StoreNode->getBasePtr();

  if (AS == AMDGPUAS::GLOBAL_ADDRESS) {
    if (StoreNode->isTruncatingStore()) {
      EVT VT = Value.getValueType();
      assert(VT.bitsLE(MVT::i32));
      EVT MemVT =
          StoreNode->getMemoryVT();
      SDValue MaskConstant;
      if (MemVT == MVT::i8) {
        MaskConstant = DAG.getConstant(0xFF, DL, MVT::i32);
      } else {
        assert(MemVT == MVT::i16);
        MaskConstant = DAG.getConstant(0xFFFF, DL, MVT::i32);
      }
      SDValue DWordAddr = DAG.getNode(ISD::SRL, DL, VT, Ptr,
                                      DAG.getConstant(2, DL, MVT::i32));
      SDValue ByteIndex = DAG.getNode(ISD::AND, DL, Ptr.getValueType(), Ptr,
                                      DAG.getConstant(0x00000003, DL, VT));
      SDValue TruncValue = DAG.getNode(ISD::AND, DL, VT, Value, MaskConstant);
      SDValue Shift = DAG.getNode(ISD::SHL, DL, VT, ByteIndex,
                                  DAG.getConstant(3, DL, VT));
      SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, VT, TruncValue, Shift);
      SDValue Mask = DAG.getNode(ISD::SHL, DL, VT, MaskConstant, Shift);
      // XXX: If we add a 64-bit ZW register class, then we could use a 2 x i32
      // vector instead.
      SDValue Src[4] = {
        ShiftedValue,
        DAG.getConstant(0, DL, MVT::i32),
        DAG.getConstant(0, DL, MVT::i32),
        Mask
      };
      SDValue Input = DAG.getBuildVector(MVT::v4i32, DL, Src);
      SDValue Args[3] = { Chain, Input, DWordAddr };
      return DAG.getMemIntrinsicNode(AMDGPUISD::STORE_MSKOR, DL,
                                     Op->getVTList(), Args, MemVT,
                                     StoreNode->getMemOperand());
    } else if (Ptr->getOpcode() != AMDGPUISD::DWORDADDR &&
               ValueVT.bitsGE(MVT::i32)) {
      // Convert pointer from byte address to dword address.
      Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, Ptr.getValueType(),
                        DAG.getNode(ISD::SRL, DL, Ptr.getValueType(),
                                    Ptr, DAG.getConstant(2, DL, MVT::i32)));

      if (StoreNode->isTruncatingStore() || StoreNode->isIndexed()) {
        llvm_unreachable("Truncated and indexed stores not supported yet");
      } else {
        Chain = DAG.getStore(Chain, DL, Value, Ptr, StoreNode->getMemOperand());
      }
      return Chain;
    }
  }

  if (AS != AMDGPUAS::PRIVATE_ADDRESS)
    return SDValue();

  EVT MemVT = StoreNode->getMemoryVT();
  if (MemVT.bitsLT(MVT::i32))
    return lowerPrivateTruncStore(StoreNode, DAG);

  // Lowering for indirect addressing.
  const MachineFunction &MF = DAG.getMachineFunction();
  const R600FrameLowering *TFL = getSubtarget()->getFrameLowering();
  unsigned StackWidth = TFL->getStackWidth(MF);

  Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);

  if (ValueVT.isVector()) {
    unsigned NumElemVT = ValueVT.getVectorNumElements();
    EVT ElemVT = ValueVT.getVectorElementType();
    SmallVector<SDValue, 4> Stores(NumElemVT);

    assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
                                      "vector width in load");

    for (unsigned i = 0; i < NumElemVT; ++i) {
      unsigned Channel, PtrIncr;
      getStackAddress(StackWidth, i, Channel, PtrIncr);
      Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
                        DAG.getConstant(PtrIncr, DL, MVT::i32));
      SDValue Elem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT,
                                 Value, DAG.getConstant(i, DL, MVT::i32));

      Stores[i] = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other,
                              Chain, Elem, Ptr,
                              DAG.getTargetConstant(Channel, DL, MVT::i32));
    }
    Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
  } else {
    if (ValueVT == MVT::i8) {
      Value = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Value);
    }
    Chain = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other, Chain,
                        Value, Ptr,
                        DAG.getTargetConstant(0, DL, MVT::i32)); // Channel
  }

  return Chain;
}

// Returns 512 + (kc_bank << 12).
static int
ConstantAddressBlock(unsigned AddressSpace) {
  switch (AddressSpace) {
  case AMDGPUAS::CONSTANT_BUFFER_0:
    return 512;
  case AMDGPUAS::CONSTANT_BUFFER_1:
    return 512 + 4096;
  case AMDGPUAS::CONSTANT_BUFFER_2:
    return 512 + 4096 * 2;
  case AMDGPUAS::CONSTANT_BUFFER_3:
    return 512 + 4096 * 3;
  case AMDGPUAS::CONSTANT_BUFFER_4:
    return 512 + 4096 * 4;
  case AMDGPUAS::CONSTANT_BUFFER_5:
    return 512 + 4096 * 5;
  case AMDGPUAS::CONSTANT_BUFFER_6:
    return 512 + 4096 * 6;
  case AMDGPUAS::CONSTANT_BUFFER_7:
    return 512 + 4096 * 7;
  case AMDGPUAS::CONSTANT_BUFFER_8:
    return 512 + 4096 * 8;
  case AMDGPUAS::CONSTANT_BUFFER_9:
    return 512 + 4096 * 9;
  case AMDGPUAS::CONSTANT_BUFFER_10:
    return 512 + 4096 * 10;
  case AMDGPUAS::CONSTANT_BUFFER_11:
    return 512 + 4096 * 11;
  case AMDGPUAS::CONSTANT_BUFFER_12:
    return 512 + 4096 * 12;
  case AMDGPUAS::CONSTANT_BUFFER_13:
    return 512 + 4096 * 13;
  case AMDGPUAS::CONSTANT_BUFFER_14:
    return 512 + 4096 * 14;
  case AMDGPUAS::CONSTANT_BUFFER_15:
    return 512 + 4096 * 15;
  default:
    return -1;
  }
}

SDValue R600TargetLowering::lowerPrivateExtLoad(SDValue Op,
                                                SelectionDAG &DAG) const {
  SDLoc DL(Op);
  LoadSDNode *Load = cast<LoadSDNode>(Op);
  ISD::LoadExtType ExtType = Load->getExtensionType();
  EVT MemVT = Load->getMemoryVT();

  // <SI && AS=PRIVATE && EXTLOAD && size < 32bit,
  // register (2-)byte extract.

  // Get Register holding the target.
  SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Load->getBasePtr(),
                            DAG.getConstant(2, DL, MVT::i32));
  // Load the Register.
  SDValue Ret = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op.getValueType(),
                            Load->getChain(),
                            Ptr,
                            DAG.getTargetConstant(0, DL, MVT::i32),
                            Op.getOperand(2));

  // Get offset within the register.
  SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32,
                                Load->getBasePtr(),
                                DAG.getConstant(0x3, DL, MVT::i32));

  // Bit offset of target byte (byteIdx * 8).
  SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx,
                                 DAG.getConstant(3, DL, MVT::i32));

  // Shift to the right.
  Ret = DAG.getNode(ISD::SRL, DL, MVT::i32, Ret, ShiftAmt);

  // Eliminate the upper bits by setting them to ...
  EVT MemEltVT = MemVT.getScalarType();

  // ... ones.
  if (ExtType == ISD::SEXTLOAD) {
    SDValue MemEltVTNode = DAG.getValueType(MemEltVT);

    SDValue Ops[] = {
      DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, Ret, MemEltVTNode),
      Load->getChain()
    };

    return DAG.getMergeValues(Ops, DL);
  }

  // ... or zeros.
  SDValue Ops[] = {
    DAG.getZeroExtendInReg(Ret, DL, MemEltVT),
    Load->getChain()
  };

  return DAG.getMergeValues(Ops, DL);
}

SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
  LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
  unsigned AS = LoadNode->getAddressSpace();
  EVT MemVT = LoadNode->getMemoryVT();
  ISD::LoadExtType ExtType = LoadNode->getExtensionType();

  if (AS == AMDGPUAS::PRIVATE_ADDRESS &&
      ExtType != ISD::NON_EXTLOAD && MemVT.bitsLT(MVT::i32)) {
    return lowerPrivateExtLoad(Op, DAG);
  }

  SDLoc DL(Op);
  EVT VT = Op.getValueType();
  SDValue Chain = LoadNode->getChain();
  SDValue Ptr = LoadNode->getBasePtr();

  if (LoadNode->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS && VT.isVector()) {
    SDValue MergedValues[2] = {
      scalarizeVectorLoad(LoadNode, DAG),
      Chain
    };
    return DAG.getMergeValues(MergedValues, DL);
  }

  int ConstantBlock = ConstantAddressBlock(LoadNode->getAddressSpace());
  if (ConstantBlock > -1 &&
      ((LoadNode->getExtensionType() == ISD::NON_EXTLOAD) ||
       (LoadNode->getExtensionType() == ISD::ZEXTLOAD))) {
    SDValue Result;
    if (isa<ConstantExpr>(LoadNode->getMemOperand()->getValue()) ||
        isa<Constant>(LoadNode->getMemOperand()->getValue()) ||
        isa<ConstantSDNode>(Ptr)) {
      SDValue Slots[4];
      for (unsigned i = 0; i < 4; i++) {
        // We want Const position encoded with the following formula:
        // (((512 + (kc_bank << 12) + const_index) << 2) + chan)
        // const_index is Ptr computed by llvm using an alignment of 16.
        // Thus we add (((512 + (kc_bank << 12)) + chan) * 4 here and
        // then div by 4 at the ISel step.
        SDValue NewPtr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
            DAG.getConstant(4 * i + ConstantBlock * 16, DL, MVT::i32));
        Slots[i] = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::i32, NewPtr);
      }
      EVT NewVT = MVT::v4i32;
      unsigned NumElements = 4;
      if (VT.isVector()) {
        NewVT = VT;
        NumElements = VT.getVectorNumElements();
      }
      Result = DAG.getBuildVector(NewVT, DL, makeArrayRef(Slots, NumElements));
    } else {
      // A non-constant ptr can't be folded; keep it as a v4f32 load.
      Result = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::v4i32,
          DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr,
                      DAG.getConstant(4, DL, MVT::i32)),
          DAG.getConstant(LoadNode->getAddressSpace() -
                          AMDGPUAS::CONSTANT_BUFFER_0, DL, MVT::i32));
    }

    if (!VT.isVector()) {
      Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Result,
                           DAG.getConstant(0, DL, MVT::i32));
    }

    SDValue MergedValues[2] = {
      Result,
      Chain
    };
    return DAG.getMergeValues(MergedValues, DL);
  }

  SDValue LoweredLoad;

  // For most operations returning SDValue() will result in the node being
  // expanded by the DAG Legalizer. This is not the case for ISD::LOAD, so we
  // need to manually expand loads that may be legal in some address spaces and
  // illegal in others. SEXT loads from CONSTANT_BUFFER_0 are supported for
  // compute shaders, since the data is sign extended when it is uploaded to
  // the buffer. However SEXT loads from other address spaces are not
  // supported, so we need to expand them here.
  if (LoadNode->getExtensionType() == ISD::SEXTLOAD) {
    EVT MemVT = LoadNode->getMemoryVT();
    assert(!MemVT.isVector() && (MemVT == MVT::i16 || MemVT == MVT::i8));
    SDValue NewLoad = DAG.getExtLoad(
        ISD::EXTLOAD, DL, VT, Chain, Ptr, LoadNode->getPointerInfo(), MemVT,
        LoadNode->getAlignment(), LoadNode->getMemOperand()->getFlags());
    SDValue Res = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, NewLoad,
                              DAG.getValueType(MemVT));

    SDValue MergedValues[2] = { Res, Chain };
    return DAG.getMergeValues(MergedValues, DL);
  }

  if (LoadNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
    return SDValue();
  }

  // Lowering for indirect addressing
  const MachineFunction &MF = DAG.getMachineFunction();
  const R600FrameLowering *TFL = getSubtarget()->getFrameLowering();
  unsigned StackWidth = TFL->getStackWidth(MF);

  Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);

  if (VT.isVector()) {
    unsigned NumElemVT = VT.getVectorNumElements();
    EVT ElemVT = VT.getVectorElementType();
    SDValue Loads[4];

    assert(NumElemVT <= 4);
    assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
                                      "vector width in load");

    for (unsigned i = 0; i < NumElemVT; ++i) {
      unsigned Channel, PtrIncr;
      getStackAddress(StackWidth, i, Channel, PtrIncr);
      Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
                        DAG.getConstant(PtrIncr, DL, MVT::i32));
      Loads[i] = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, ElemVT,
                             Chain, Ptr,
                             DAG.getTargetConstant(Channel, DL, MVT::i32),
                             Op.getOperand(2));
    }
    EVT TargetVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, NumElemVT);
    LoweredLoad = DAG.getBuildVector(TargetVT, DL,
                                     makeArrayRef(Loads, NumElemVT));
  } else {
    LoweredLoad = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, VT,
                              Chain, Ptr,
                              DAG.getTargetConstant(0, DL, MVT::i32), // Channel
                              Op.getOperand(2));
  }

  SDValue Ops[2] = {
    LoweredLoad,
    Chain
  };

  return DAG.getMergeValues(Ops, DL);
}

SDValue R600TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
  SDValue Chain = Op.getOperand(0);
  SDValue Cond = Op.getOperand(1);
  SDValue Jump = Op.getOperand(2);

  return DAG.getNode(AMDGPUISD::BRANCH_COND, SDLoc(Op), Op.getValueType(),
                     Chain, Jump, Cond);
}

SDValue R600TargetLowering::lowerFrameIndex(SDValue Op,
                                            SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  const R600FrameLowering *TFL = getSubtarget()->getFrameLowering();

  FrameIndexSDNode *FIN = cast<FrameIndexSDNode>(Op);

  unsigned FrameIndex = FIN->getIndex();
  unsigned IgnoredFrameReg;
  unsigned Offset =
      TFL->getFrameIndexReference(MF, FrameIndex, IgnoredFrameReg);
  return DAG.getConstant(Offset * 4 * TFL->getStackWidth(MF), SDLoc(Op),
                         Op.getValueType());
}

/// XXX Only kernel functions are supported, so we can assume for now that
/// every function is a kernel function, but in the future we should use
/// separate calling conventions for kernel and non-kernel functions.
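///
/// Kernel arguments are fetched below from CONSTANT_BUFFER_0; their byte
/// offsets are shifted by 36 because the first 36 bytes of the buffer carry
/// the thread group and global size information.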
SDValue R600TargetLowering::LowerFormalArguments(
    SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
                 *DAG.getContext());
  MachineFunction &MF = DAG.getMachineFunction();
  R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();

  SmallVector<ISD::InputArg, 8> LocalIns;

  getOriginalFunctionArgs(DAG, MF.getFunction(), Ins, LocalIns);

  AnalyzeFormalArguments(CCInfo, LocalIns);

  for (unsigned i = 0, e = Ins.size(); i < e; ++i) {
    CCValAssign &VA = ArgLocs[i];
    const ISD::InputArg &In = Ins[i];
    EVT VT = In.VT;
    EVT MemVT = VA.getLocVT();
    if (!VT.isVector() && MemVT.isVector()) {
      // Get load source type if scalarized.
      MemVT = MemVT.getVectorElementType();
    }

    if (AMDGPU::isShader(CallConv)) {
      unsigned Reg = MF.addLiveIn(VA.getLocReg(),
                                  &AMDGPU::R600_Reg128RegClass);
      SDValue Register = DAG.getCopyFromReg(Chain, DL, Reg, VT);
      InVals.push_back(Register);
      continue;
    }

    PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
                                          AMDGPUAS::CONSTANT_BUFFER_0);

    // i64 isn't a legal type, so the register type used ends up as i32, which
    // isn't expected here. It attempts to create this sextload, but it ends up
    // being invalid. Somehow this seems to work with i64 arguments, but breaks
    // for <1 x i64>.

    // The first 36 bytes of the input buffer contain information about
    // thread group and global sizes.
    ISD::LoadExtType Ext = ISD::NON_EXTLOAD;
    if (MemVT.getScalarSizeInBits() != VT.getScalarSizeInBits()) {
      // FIXME: This should really check the extload type, but the handling of
      // extload vector parameters seems to be broken.

      // Ext = In.Flags.isSExt() ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
      Ext = ISD::SEXTLOAD;
    }

    // Compute the offset from the value.
    // XXX - I think PartOffset should give you this, but it seems to give the
    // size of the register which isn't useful.

    unsigned ValBase = ArgLocs[In.getOrigArgIndex()].getLocMemOffset();
    unsigned PartOffset = VA.getLocMemOffset();
    unsigned Offset = 36 + VA.getLocMemOffset();

    MachinePointerInfo PtrInfo(UndefValue::get(PtrTy), PartOffset - ValBase);
    SDValue Arg = DAG.getLoad(
        ISD::UNINDEXED, Ext, VT, DL, Chain,
        DAG.getConstant(Offset, DL, MVT::i32), DAG.getUNDEF(MVT::i32), PtrInfo,
        MemVT, /* Alignment = */ 4,
        MachineMemOperand::MONonTemporal | MachineMemOperand::MOInvariant);

    // 4 is the preferred alignment for the CONSTANT memory space.
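    // ABIArgOffset (updated below) records where the last formal argument
    // ends in the buffer, presumably so implicit data can be placed after it.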
    InVals.push_back(Arg);
    MFI->ABIArgOffset = Offset + MemVT.getStoreSize();
  }
  return Chain;
}

EVT R600TargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &,
                                           EVT VT) const {
  if (!VT.isVector())
    return MVT::i32;
  return VT.changeVectorElementTypeToInteger();
}

bool R600TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
                                                        unsigned AddrSpace,
                                                        unsigned Align,
                                                        bool *IsFast) const {
  if (IsFast)
    *IsFast = false;

  if (!VT.isSimple() || VT == MVT::Other)
    return false;

  if (VT.bitsLT(MVT::i32))
    return false;

  // TODO: This is a rough estimate.
  if (IsFast)
    *IsFast = true;

  return VT.bitsGT(MVT::i32) && Align % 4 == 0;
}

static SDValue CompactSwizzlableVector(
    SelectionDAG &DAG, SDValue VectorEntry,
    DenseMap<unsigned, unsigned> &RemapSwizzle) {
  assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR);
  assert(RemapSwizzle.empty());
  SDValue NewBldVec[4] = {
    VectorEntry.getOperand(0),
    VectorEntry.getOperand(1),
    VectorEntry.getOperand(2),
    VectorEntry.getOperand(3)
  };

  for (unsigned i = 0; i < 4; i++) {
    if (NewBldVec[i].isUndef())
      // We mask the write here to teach later passes that the ith element of
      // this vector is undef. Thus we can use it to reduce 128-bit register
      // usage, break false dependencies, and additionally make the assembly
      // easier to read.
      RemapSwizzle[i] = 7; // SEL_MASK_WRITE
    if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(NewBldVec[i])) {
      if (C->isZero()) {
        RemapSwizzle[i] = 4; // SEL_0
        NewBldVec[i] = DAG.getUNDEF(MVT::f32);
      } else if (C->isExactlyValue(1.0)) {
        RemapSwizzle[i] = 5; // SEL_1
        NewBldVec[i] = DAG.getUNDEF(MVT::f32);
      }
    }

    if (NewBldVec[i].isUndef())
      continue;
    for (unsigned j = 0; j < i; j++) {
      if (NewBldVec[i] == NewBldVec[j]) {
        NewBldVec[i] = DAG.getUNDEF(NewBldVec[i].getValueType());
        RemapSwizzle[i] = j;
        break;
      }
    }
  }

  return DAG.getBuildVector(VectorEntry.getValueType(), SDLoc(VectorEntry),
                            NewBldVec);
}

static SDValue ReorganizeVector(SelectionDAG &DAG, SDValue VectorEntry,
                                DenseMap<unsigned, unsigned> &RemapSwizzle) {
  assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR);
  assert(RemapSwizzle.empty());
  SDValue NewBldVec[4] = {
    VectorEntry.getOperand(0),
    VectorEntry.getOperand(1),
    VectorEntry.getOperand(2),
    VectorEntry.getOperand(3)
  };
  bool isUnmovable[4] = { false, false, false, false };
  for (unsigned i = 0; i < 4; i++) {
    RemapSwizzle[i] = i;
    if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
      unsigned Idx = cast<ConstantSDNode>(NewBldVec[i].getOperand(1))
                         ->getZExtValue();
      if (i == Idx)
        isUnmovable[Idx] = true;
    }
  }

  for (unsigned i = 0; i < 4; i++) {
    if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
      unsigned Idx = cast<ConstantSDNode>(NewBldVec[i].getOperand(1))
                         ->getZExtValue();
      if (isUnmovable[Idx])
        continue;
      // Swap i and Idx
      std::swap(NewBldVec[Idx], NewBldVec[i]);
      std::swap(RemapSwizzle[i], RemapSwizzle[Idx]);
      break;
    }
  }

  return DAG.getBuildVector(VectorEntry.getValueType(), SDLoc(VectorEntry),
                            NewBldVec);
}

SDValue R600TargetLowering::OptimizeSwizzle(SDValue BuildVector, SDValue Swz[4],
                                            SelectionDAG &DAG,
                                            const SDLoc &DL) const {
  assert(BuildVector.getOpcode() == ISD::BUILD_VECTOR);
  // Old -> New swizzle values
  DenseMap<unsigned, unsigned> SwizzleRemap;

  BuildVector = CompactSwizzlableVector(DAG, BuildVector, SwizzleRemap);
  for (unsigned i = 0; i < 4; i++) {
    unsigned Idx = cast<ConstantSDNode>(Swz[i])->getZExtValue();
    if (SwizzleRemap.find(Idx) != SwizzleRemap.end())
      Swz[i] = DAG.getConstant(SwizzleRemap[Idx], DL, MVT::i32);
  }

  SwizzleRemap.clear();
  BuildVector = ReorganizeVector(DAG, BuildVector, SwizzleRemap);
  for (unsigned i = 0; i < 4; i++) {
    unsigned Idx = cast<ConstantSDNode>(Swz[i])->getZExtValue();
    if (SwizzleRemap.find(Idx) != SwizzleRemap.end())
      Swz[i] = DAG.getConstant(SwizzleRemap[Idx], DL, MVT::i32);
  }

  return BuildVector;
}

//===----------------------------------------------------------------------===//
// Custom DAG Optimizations
//===----------------------------------------------------------------------===//

SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
                                              DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;

  switch (N->getOpcode()) {
  default: return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
  // (f32 fp_round (f64 uint_to_fp a)) -> (f32 uint_to_fp a)
  case ISD::FP_ROUND: {
    SDValue Arg = N->getOperand(0);
    if (Arg.getOpcode() == ISD::UINT_TO_FP && Arg.getValueType() == MVT::f64) {
      return DAG.getNode(ISD::UINT_TO_FP, SDLoc(N), N->getValueType(0),
                         Arg.getOperand(0));
    }
    break;
  }

  // (i32 fp_to_sint (fneg (select_cc f32, f32, 1.0, 0.0 cc))) ->
  // (i32 select_cc f32, f32, -1, 0 cc)
  //
  // Mesa's GLSL frontend generates the above pattern a lot and we can lower
  // this to one of the SET*_DX10 instructions.
  case ISD::FP_TO_SINT: {
    SDValue FNeg = N->getOperand(0);
    if (FNeg.getOpcode() != ISD::FNEG) {
      return SDValue();
    }
    SDValue SelectCC = FNeg.getOperand(0);
    if (SelectCC.getOpcode() != ISD::SELECT_CC ||
        SelectCC.getOperand(0).getValueType() != MVT::f32 || // LHS
        SelectCC.getOperand(2).getValueType() != MVT::f32 || // True
        !isHWTrueValue(SelectCC.getOperand(2)) ||
        !isHWFalseValue(SelectCC.getOperand(3))) {
      return SDValue();
    }

    SDLoc dl(N);
    return DAG.getNode(ISD::SELECT_CC, dl, N->getValueType(0),
                       SelectCC.getOperand(0), // LHS
                       SelectCC.getOperand(1), // RHS
                       DAG.getConstant(-1, dl, MVT::i32), // True
                       DAG.getConstant(0, dl, MVT::i32),  // False
                       SelectCC.getOperand(4)); // CC
  }

  // insert_vector_elt (build_vector elt0, ... , eltN), NewEltIdx, idx
  // => build_vector elt0, ... , NewEltIdx, ... , eltN
  case ISD::INSERT_VECTOR_ELT: {
    SDValue InVec = N->getOperand(0);
    SDValue InVal = N->getOperand(1);
    SDValue EltNo = N->getOperand(2);
    SDLoc dl(N);

    // If the inserted element is an UNDEF, just use the input vector.
    if (InVal.isUndef())
      return InVec;

    EVT VT = InVec.getValueType();

    // If we can't generate a legal BUILD_VECTOR, exit
    if (!isOperationLegal(ISD::BUILD_VECTOR, VT))
      return SDValue();

    // Check that we know which element is being inserted
    if (!isa<ConstantSDNode>(EltNo))
      return SDValue();
    unsigned Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();

    // Check that the operand is a BUILD_VECTOR (or UNDEF, which can
    // essentially be converted to a BUILD_VECTOR). Fill in the Ops vector
    // with the vector elements.
    SmallVector<SDValue, 8> Ops;
    if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
      Ops.append(InVec.getNode()->op_begin(),
                 InVec.getNode()->op_end());
    } else if (InVec.isUndef()) {
      unsigned NElts = VT.getVectorNumElements();
      Ops.append(NElts, DAG.getUNDEF(InVal.getValueType()));
    } else {
      return SDValue();
    }

    // Insert the element
    if (Elt < Ops.size()) {
      // All the operands of BUILD_VECTOR must have the same type;
      // we enforce that here.
      EVT OpVT = Ops[0].getValueType();
      if (InVal.getValueType() != OpVT)
        InVal = OpVT.bitsGT(InVal.getValueType()) ?
            DAG.getNode(ISD::ANY_EXTEND, dl, OpVT, InVal) :
            DAG.getNode(ISD::TRUNCATE, dl, OpVT, InVal);
      Ops[Elt] = InVal;
    }

    // Return the new vector
    return DAG.getBuildVector(VT, dl, Ops);
  }

  // An extract_vector_elt of a build_vector generated by custom lowering
  // also needs a custom combine here.
  case ISD::EXTRACT_VECTOR_ELT: {
    SDValue Arg = N->getOperand(0);
    if (Arg.getOpcode() == ISD::BUILD_VECTOR) {
      if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
        unsigned Element = Const->getZExtValue();
        return Arg->getOperand(Element);
      }
    }
    if (Arg.getOpcode() == ISD::BITCAST &&
        Arg.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
      if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
        unsigned Element = Const->getZExtValue();
        return DAG.getNode(ISD::BITCAST, SDLoc(N), N->getVTList(),
                           Arg->getOperand(0).getOperand(Element));
      }
    }
    break;
  }

  case ISD::SELECT_CC: {
    // Try common optimizations
    if (SDValue Ret = AMDGPUTargetLowering::PerformDAGCombine(N, DCI))
      return Ret;

    // fold selectcc (selectcc x, y, a, b, cc), b, a, b, seteq ->
    //      selectcc x, y, a, b, inv(cc)
    //
    // fold selectcc (selectcc x, y, a, b, cc), b, a, b, setne ->
    //      selectcc x, y, a, b, cc
    SDValue LHS = N->getOperand(0);
    if (LHS.getOpcode() != ISD::SELECT_CC) {
      return SDValue();
    }

    SDValue RHS = N->getOperand(1);
    SDValue True = N->getOperand(2);
    SDValue False = N->getOperand(3);
    ISD::CondCode NCC = cast<CondCodeSDNode>(N->getOperand(4))->get();

    if (LHS.getOperand(2).getNode() != True.getNode() ||
        LHS.getOperand(3).getNode() != False.getNode() ||
        RHS.getNode() != False.getNode()) {
      return SDValue();
    }

    switch (NCC) {
    default: return SDValue();
    case ISD::SETNE: return LHS;
    case ISD::SETEQ: {
      ISD::CondCode LHSCC = cast<CondCodeSDNode>(LHS.getOperand(4))->get();
      LHSCC = ISD::getSetCCInverse(LHSCC,
                                   LHS.getOperand(0).getValueType().isInteger());
      if (DCI.isBeforeLegalizeOps() ||
          isCondCodeLegal(LHSCC, LHS.getOperand(0).getSimpleValueType()))
        return DAG.getSelectCC(SDLoc(N),
                               LHS.getOperand(0),
                               LHS.getOperand(1),
                               LHS.getOperand(2),
                               LHS.getOperand(3),
                               LHSCC);
      break;
    }
    }
    return SDValue();
  }

  case AMDGPUISD::EXPORT: {
    SDValue Arg = N->getOperand(1);
    if (Arg.getOpcode() != ISD::BUILD_VECTOR)
      break;

    SDValue NewArgs[8] = {
      N->getOperand(0), // Chain
      SDValue(),
      N->getOperand(2), // ArrayBase
      N->getOperand(3), // Type
      N->getOperand(4), // SWZ_X
      N->getOperand(5), // SWZ_Y
      N->getOperand(6), // SWZ_Z
      N->getOperand(7)  // SWZ_W
    };
    SDLoc DL(N);
    NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[4], DAG, DL);
    return DAG.getNode(AMDGPUISD::EXPORT, DL, N->getVTList(), NewArgs);
  }
  case AMDGPUISD::TEXTURE_FETCH: {
    SDValue Arg = N->getOperand(1);
    if (Arg.getOpcode() != ISD::BUILD_VECTOR)
      break;

    SDValue NewArgs[19] = {
      N->getOperand(0),
      N->getOperand(1),
      N->getOperand(2),
      N->getOperand(3),
      N->getOperand(4),
      N->getOperand(5),
      N->getOperand(6),
      N->getOperand(7),
      N->getOperand(8),
      N->getOperand(9),
      N->getOperand(10),
      N->getOperand(11),
      N->getOperand(12),
      N->getOperand(13),
      N->getOperand(14),
      N->getOperand(15),
      N->getOperand(16),
      N->getOperand(17),
      N->getOperand(18),
    };
    SDLoc DL(N);
    NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[2], DAG, DL);
    return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, N->getVTList(), NewArgs);
  }
  }

  return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
}

bool R600TargetLowering::FoldOperand(SDNode *ParentNode, unsigned SrcIdx,
                                     SDValue &Src, SDValue &Neg, SDValue &Abs,
                                     SDValue &Sel, SDValue &Imm,
                                     SelectionDAG &DAG) const {
  const R600InstrInfo *TII = getSubtarget()->getInstrInfo();
  if (!Src.isMachineOpcode())
    return false;

  switch (Src.getMachineOpcode()) {
  case AMDGPU::FNEG_R600:
    if (!Neg.getNode())
      return false;
    Src = Src.getOperand(0);
    Neg = DAG.getTargetConstant(1, SDLoc(ParentNode), MVT::i32);
    return true;
  case AMDGPU::FABS_R600:
    if (!Abs.getNode())
      return false;
    Src = Src.getOperand(0);
    Abs = DAG.getTargetConstant(1, SDLoc(ParentNode), MVT::i32);
    return true;
  case AMDGPU::CONST_COPY: {
    unsigned Opcode = ParentNode->getMachineOpcode();
    bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;

    if (!Sel.getNode())
      return false;

    SDValue CstOffset = Src.getOperand(0);
    if (ParentNode->getValueType(0).isVector())
      return false;

    // Gather constant values
    int SrcIndices[] = {
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src2),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W)
    };
    std::vector<unsigned> Consts;
    for (int OtherSrcIdx : SrcIndices) {
      int OtherSelIdx = TII->getSelIdx(Opcode, OtherSrcIdx);
      if (OtherSrcIdx < 0 || OtherSelIdx < 0)
        continue;
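      // The instruction-description indices count the explicit dst operand,
      // while the SDNode operand list does not include the result, so shift
      // the indices down by one when a dst is present.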
      if (HasDst) {
        OtherSrcIdx--;
        OtherSelIdx--;
      }
      if (RegisterSDNode *Reg =
              dyn_cast<RegisterSDNode>(ParentNode->getOperand(OtherSrcIdx))) {
        if (Reg->getReg() == AMDGPU::ALU_CONST) {
          ConstantSDNode *Cst
              = cast<ConstantSDNode>(ParentNode->getOperand(OtherSelIdx));
          Consts.push_back(Cst->getZExtValue());
        }
      }
    }

    ConstantSDNode *Cst = cast<ConstantSDNode>(CstOffset);
    Consts.push_back(Cst->getZExtValue());
    if (!TII->fitsConstReadLimitations(Consts)) {
      return false;
    }

    Sel = CstOffset;
    Src = DAG.getRegister(AMDGPU::ALU_CONST, MVT::f32);
    return true;
  }
  case AMDGPU::MOV_IMM_GLOBAL_ADDR:
    // Check if the Imm slot is used. Taken from below.
    if (cast<ConstantSDNode>(Imm)->getZExtValue())
      return false;
    Imm = Src.getOperand(0);
    Src = DAG.getRegister(AMDGPU::ALU_LITERAL_X, MVT::i32);
    return true;
  case AMDGPU::MOV_IMM_I32:
  case AMDGPU::MOV_IMM_F32: {
    unsigned ImmReg = AMDGPU::ALU_LITERAL_X;
    uint64_t ImmValue = 0;

    if (Src.getMachineOpcode() == AMDGPU::MOV_IMM_F32) {
      ConstantFPSDNode *FPC = cast<ConstantFPSDNode>(Src.getOperand(0));
      float FloatValue = FPC->getValueAPF().convertToFloat();
      if (FloatValue == 0.0) {
        ImmReg = AMDGPU::ZERO;
      } else if (FloatValue == 0.5) {
        ImmReg = AMDGPU::HALF;
      } else if (FloatValue == 1.0) {
        ImmReg = AMDGPU::ONE;
      } else {
        ImmValue = FPC->getValueAPF().bitcastToAPInt().getZExtValue();
      }
    } else {
      ConstantSDNode *C = cast<ConstantSDNode>(Src.getOperand(0));
      uint64_t Value = C->getZExtValue();
      if (Value == 0) {
        ImmReg = AMDGPU::ZERO;
      } else if (Value == 1) {
        ImmReg = AMDGPU::ONE_INT;
      } else {
        ImmValue = Value;
      }
    }

    // Check that we aren't already using an immediate.
    // XXX: It's possible for an instruction to have more than one
    // immediate operand, but this is not supported yet.
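    // If the value matched none of the inline constants above (ZERO, HALF,
    // ONE, ONE_INT), it has to occupy the single literal slot; the code below
    // bails out if that slot already holds a different (nonzero) value.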
    if (ImmReg == AMDGPU::ALU_LITERAL_X) {
      if (!Imm.getNode())
        return false;
      ConstantSDNode *C = cast<ConstantSDNode>(Imm);
      if (C->getZExtValue())
        return false;
      Imm = DAG.getTargetConstant(ImmValue, SDLoc(ParentNode), MVT::i32);
    }
    Src = DAG.getRegister(ImmReg, MVT::i32);
    return true;
  }
  default:
    return false;
  }
}

/// \brief Fold the instructions after selecting them
SDNode *R600TargetLowering::PostISelFolding(MachineSDNode *Node,
                                            SelectionDAG &DAG) const {
  const R600InstrInfo *TII = getSubtarget()->getInstrInfo();
  if (!Node->isMachineOpcode())
    return Node;

  unsigned Opcode = Node->getMachineOpcode();
  SDValue FakeOp;

  std::vector<SDValue> Ops(Node->op_begin(), Node->op_end());

  if (Opcode == AMDGPU::DOT_4) {
    int OperandIdx[] = {
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W)
    };
    int NegIdx[] = {
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_X),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Y),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Z),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_W),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_X),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Y),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Z),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_W)
    };
    int AbsIdx[] = {
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_X),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Y),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Z),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_W),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_X),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Y),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Z),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_W)
    };
    for (unsigned i = 0; i < 8; i++) {
      if (OperandIdx[i] < 0)
        return Node;
      SDValue &Src = Ops[OperandIdx[i] - 1];
      SDValue &Neg = Ops[NegIdx[i] - 1];
      SDValue &Abs = Ops[AbsIdx[i] - 1];
      bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
      int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]);
      if (HasDst)
        SelIdx--;
      SDValue &Sel = (SelIdx > -1) ? Ops[SelIdx] : FakeOp;
      if (FoldOperand(Node, i, Src, Neg, Abs, Sel, FakeOp, DAG))
        return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
    }
  } else if (Opcode == AMDGPU::REG_SEQUENCE) {
    for (unsigned i = 1, e = Node->getNumOperands(); i < e; i += 2) {
      SDValue &Src = Ops[i];
      if (FoldOperand(Node, i, Src, FakeOp, FakeOp, FakeOp, FakeOp, DAG))
        return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
    }
  } else if (Opcode == AMDGPU::CLAMP_R600) {
    SDValue Src = Node->getOperand(0);
    if (!Src.isMachineOpcode() ||
        !TII->hasInstrModifiers(Src.getMachineOpcode()))
      return Node;
    int ClampIdx = TII->getOperandIdx(Src.getMachineOpcode(),
                                      AMDGPU::OpName::clamp);
    if (ClampIdx < 0)
      return Node;
    SDLoc DL(Node);
    std::vector<SDValue> Ops(Src->op_begin(), Src->op_end());
    Ops[ClampIdx - 1] = DAG.getTargetConstant(1, DL, MVT::i32);
    return DAG.getMachineNode(Src.getMachineOpcode(), DL,
                              Node->getVTList(), Ops);
  } else {
    if (!TII->hasInstrModifiers(Opcode))
      return Node;
    int OperandIdx[] = {
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src2)
    };
    int NegIdx[] = {
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src2_neg)
    };
    int AbsIdx[] = {
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs),
      -1
    };
    for (unsigned i = 0; i < 3; i++) {
      if (OperandIdx[i] < 0)
        return Node;
      SDValue &Src = Ops[OperandIdx[i] - 1];
      SDValue &Neg = Ops[NegIdx[i] - 1];
      SDValue FakeAbs;
      SDValue &Abs = (AbsIdx[i] > -1) ? Ops[AbsIdx[i] - 1] : FakeAbs;
      bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
      int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]);
      int ImmIdx = TII->getOperandIdx(Opcode, AMDGPU::OpName::literal);
      if (HasDst) {
        SelIdx--;
        ImmIdx--;
      }
      SDValue &Sel = (SelIdx > -1) ? Ops[SelIdx] : FakeOp;
      SDValue &Imm = Ops[ImmIdx];
      if (FoldOperand(Node, i, Src, Neg, Abs, Sel, Imm, DAG))
        return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
    }
  }

  return Node;
}