//===-- R600ISelLowering.cpp - R600 DAG Lowering Implementation -----------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief Custom DAG lowering for R600
//
//===----------------------------------------------------------------------===//

#include "R600ISelLowering.h"
#include "R600Defines.h"
#include "R600InstrInfo.h"
#include "R600MachineFunctionInfo.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/IR/Argument.h"
#include "llvm/IR/Function.h"

using namespace llvm;

R600TargetLowering::R600TargetLowering(TargetMachine &TM) :
    AMDGPUTargetLowering(TM),
    Gen(TM.getSubtarget<AMDGPUSubtarget>().getGeneration()) {
  addRegisterClass(MVT::v4f32, &AMDGPU::R600_Reg128RegClass);
  addRegisterClass(MVT::f32, &AMDGPU::R600_Reg32RegClass);
  addRegisterClass(MVT::v4i32, &AMDGPU::R600_Reg128RegClass);
  addRegisterClass(MVT::i32, &AMDGPU::R600_Reg32RegClass);
  addRegisterClass(MVT::v2f32, &AMDGPU::R600_Reg64RegClass);
  addRegisterClass(MVT::v2i32, &AMDGPU::R600_Reg64RegClass);

  computeRegisterProperties();

  // Set condition code actions
  setCondCodeAction(ISD::SETO, MVT::f32, Expand);
  setCondCodeAction(ISD::SETUO, MVT::f32, Expand);
  setCondCodeAction(ISD::SETLT, MVT::f32, Expand);
  setCondCodeAction(ISD::SETLE, MVT::f32, Expand);
  setCondCodeAction(ISD::SETOLT, MVT::f32, Expand);
  setCondCodeAction(ISD::SETOLE, MVT::f32, Expand);
  setCondCodeAction(ISD::SETONE, MVT::f32, Expand);
  setCondCodeAction(ISD::SETUEQ, MVT::f32, Expand);
  setCondCodeAction(ISD::SETUGE, MVT::f32, Expand);
  setCondCodeAction(ISD::SETUGT, MVT::f32, Expand);
  setCondCodeAction(ISD::SETULT, MVT::f32, Expand);
  setCondCodeAction(ISD::SETULE, MVT::f32, Expand);

  setCondCodeAction(ISD::SETLE, MVT::i32, Expand);
  setCondCodeAction(ISD::SETLT, MVT::i32, Expand);
  setCondCodeAction(ISD::SETULE, MVT::i32, Expand);
  setCondCodeAction(ISD::SETULT, MVT::i32, Expand);

  setOperationAction(ISD::FCOS, MVT::f32, Custom);
  setOperationAction(ISD::FSIN, MVT::f32, Custom);

  setOperationAction(ISD::SETCC, MVT::v4i32, Expand);
  setOperationAction(ISD::SETCC, MVT::v2i32, Expand);

  setOperationAction(ISD::BR_CC, MVT::i32, Expand);
  setOperationAction(ISD::BR_CC, MVT::f32, Expand);

  setOperationAction(ISD::FSUB, MVT::f32, Expand);

  setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i1, Custom);

  setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);

  setOperationAction(ISD::SETCC, MVT::i32, Expand);
  setOperationAction(ISD::SETCC, MVT::f32, Expand);
  setOperationAction(ISD::FP_TO_UINT, MVT::i1, Custom);

  setOperationAction(ISD::SELECT, MVT::i32, Expand);
  setOperationAction(ISD::SELECT, MVT::f32, Expand);
  setOperationAction(ISD::SELECT, MVT::v2i32, Expand);
  setOperationAction(ISD::SELECT, MVT::v2f32, Expand);
  setOperationAction(ISD::SELECT, MVT::v4i32, Expand);
  setOperationAction(ISD::SELECT, MVT::v4f32, Expand);

  // Legalize loads and stores to the private address space.
  setOperationAction(ISD::LOAD, MVT::i32, Custom);
  setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
  setOperationAction(ISD::LOAD, MVT::v4i32, Custom);

  // EXTLOAD should be the same as ZEXTLOAD. It is legal for some address
  // spaces, so it is custom lowered to handle those where it isn't.
  setLoadExtAction(ISD::SEXTLOAD, MVT::i8, Custom);
  setLoadExtAction(ISD::SEXTLOAD, MVT::i16, Custom);
  setLoadExtAction(ISD::ZEXTLOAD, MVT::i8, Custom);
  setLoadExtAction(ISD::ZEXTLOAD, MVT::i16, Custom);
  setLoadExtAction(ISD::EXTLOAD, MVT::i8, Custom);
  setLoadExtAction(ISD::EXTLOAD, MVT::i16, Custom);

  setOperationAction(ISD::STORE, MVT::i8, Custom);
  setOperationAction(ISD::STORE, MVT::i32, Custom);
  setOperationAction(ISD::STORE, MVT::v2i32, Custom);
  setOperationAction(ISD::STORE, MVT::v4i32, Custom);
  setTruncStoreAction(MVT::i32, MVT::i8, Custom);
  setTruncStoreAction(MVT::i32, MVT::i16, Custom);

  setOperationAction(ISD::LOAD, MVT::i32, Custom);
  setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
  setOperationAction(ISD::FrameIndex, MVT::i32, Custom);

  setTargetDAGCombine(ISD::FP_ROUND);
  setTargetDAGCombine(ISD::FP_TO_SINT);
  setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
  setTargetDAGCombine(ISD::SELECT_CC);
  setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);

  setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);

  setBooleanContents(ZeroOrNegativeOneBooleanContent);
  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
  setSchedulingPreference(Sched::Source);
}

MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
    MachineInstr * MI, MachineBasicBlock * BB) const {
  MachineFunction * MF = BB->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  MachineBasicBlock::iterator I = *MI;
  const R600InstrInfo *TII =
      static_cast<const R600InstrInfo*>(MF->getTarget().getInstrInfo());

  switch (MI->getOpcode()) {
  default:
    // Replace LDS_*_RET instructions that don't have any uses with the
    // equivalent LDS_*_NORET instruction.
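    // (The _RET forms also return the loaded value through the LDS output
    // queue; when that result register has no uses the read-back is dead,
    // so the cheaper _NORET form suffices.)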
    if (TII->isLDSRetInstr(MI->getOpcode())) {
      int DstIdx = TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::dst);
      assert(DstIdx != -1);
      MachineInstrBuilder NewMI;
      if (!MRI.use_empty(MI->getOperand(DstIdx).getReg()))
        return BB;

      NewMI = BuildMI(*BB, I, BB->findDebugLoc(I),
                      TII->get(AMDGPU::getLDSNoRetOp(MI->getOpcode())));
      for (unsigned i = 1, e = MI->getNumOperands(); i < e; ++i) {
        NewMI.addOperand(MI->getOperand(i));
      }
    } else {
      return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
    }
    break;
  case AMDGPU::CLAMP_R600: {
    MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
                                                       AMDGPU::MOV,
                                                       MI->getOperand(0).getReg(),
                                                       MI->getOperand(1).getReg());
    TII->addFlag(NewMI, 0, MO_FLAG_CLAMP);
    break;
  }

  case AMDGPU::FABS_R600: {
    MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
                                                       AMDGPU::MOV,
                                                       MI->getOperand(0).getReg(),
                                                       MI->getOperand(1).getReg());
    TII->addFlag(NewMI, 0, MO_FLAG_ABS);
    break;
  }

  case AMDGPU::FNEG_R600: {
    MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
                                                       AMDGPU::MOV,
                                                       MI->getOperand(0).getReg(),
                                                       MI->getOperand(1).getReg());
    TII->addFlag(NewMI, 0, MO_FLAG_NEG);
    break;
  }

  case AMDGPU::MASK_WRITE: {
    unsigned maskedRegister = MI->getOperand(0).getReg();
    assert(TargetRegisterInfo::isVirtualRegister(maskedRegister));
    MachineInstr * defInstr = MRI.getVRegDef(maskedRegister);
    TII->addFlag(defInstr, 0, MO_FLAG_MASK);
    break;
  }

  case AMDGPU::MOV_IMM_F32:
    TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
                     MI->getOperand(1).getFPImm()->getValueAPF()
                         .bitcastToAPInt().getZExtValue());
    break;
  case AMDGPU::MOV_IMM_I32:
    TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
                     MI->getOperand(1).getImm());
    break;
  case AMDGPU::CONST_COPY: {
    MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, MI, AMDGPU::MOV,
        MI->getOperand(0).getReg(), AMDGPU::ALU_CONST);
    TII->setImmOperand(NewMI, AMDGPU::OpName::src0_sel,
        MI->getOperand(1).getImm());
    break;
  }

  case AMDGPU::RAT_WRITE_CACHELESS_32_eg:
  case AMDGPU::RAT_WRITE_CACHELESS_64_eg:
  case AMDGPU::RAT_WRITE_CACHELESS_128_eg: {
    unsigned EOP = (llvm::next(I)->getOpcode() == AMDGPU::RETURN) ? 1 : 0;

    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
        .addOperand(MI->getOperand(0))
        .addOperand(MI->getOperand(1))
        .addImm(EOP); // Set End of program bit
    break;
  }

  case AMDGPU::TXD: {
    unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
    unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
    MachineOperand &RID = MI->getOperand(4);
    MachineOperand &SID = MI->getOperand(5);
    unsigned TextureId = MI->getOperand(6).getImm();
    unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3;
    unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1;

    switch (TextureId) {
    case 5: // Rect
      CTX = CTY = 0;
      break;
    case 6: // Shadow1D
      SrcW = SrcZ;
      break;
    case 7: // Shadow2D
      SrcW = SrcZ;
      break;
    case 8: // ShadowRect
      CTX = CTY = 0;
      SrcW = SrcZ;
      break;
    case 9: // 1DArray
      SrcZ = SrcY;
      CTZ = 0;
      break;
    case 10: // 2DArray
      CTZ = 0;
      break;
    case 11: // Shadow1DArray
      SrcZ = SrcY;
      CTZ = 0;
      break;
    case 12: // Shadow2DArray
      CTZ = 0;
      break;
    }
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0)
        .addOperand(MI->getOperand(3))
        .addImm(SrcX)
        .addImm(SrcY)
        .addImm(SrcZ)
        .addImm(SrcW)
        .addImm(0)
        .addImm(0)
        .addImm(0)
        .addImm(0)
        .addImm(1)
        .addImm(2)
        .addImm(3)
        .addOperand(RID)
        .addOperand(SID)
        .addImm(CTX)
        .addImm(CTY)
        .addImm(CTZ)
        .addImm(CTW);
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1)
        .addOperand(MI->getOperand(2))
        .addImm(SrcX)
        .addImm(SrcY)
        .addImm(SrcZ)
        .addImm(SrcW)
        .addImm(0)
        .addImm(0)
        .addImm(0)
        .addImm(0)
        .addImm(1)
        .addImm(2)
        .addImm(3)
        .addOperand(RID)
        .addOperand(SID)
        .addImm(CTX)
        .addImm(CTY)
        .addImm(CTZ)
        .addImm(CTW);
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_G))
        .addOperand(MI->getOperand(0))
        .addOperand(MI->getOperand(1))
        .addImm(SrcX)
        .addImm(SrcY)
        .addImm(SrcZ)
        .addImm(SrcW)
        .addImm(0)
        .addImm(0)
        .addImm(0)
        .addImm(0)
        .addImm(1)
        .addImm(2)
        .addImm(3)
        .addOperand(RID)
        .addOperand(SID)
        .addImm(CTX)
        .addImm(CTY)
        .addImm(CTZ)
        .addImm(CTW)
        .addReg(T0, RegState::Implicit)
        .addReg(T1, RegState::Implicit);
    break;
  }

  case AMDGPU::TXD_SHADOW: {
    unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
    unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
    MachineOperand &RID = MI->getOperand(4);
    MachineOperand &SID = MI->getOperand(5);
    unsigned TextureId = MI->getOperand(6).getImm();
    unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3;
    unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1;

    switch (TextureId) {
    case 5: // Rect
      CTX = CTY = 0;
      break;
    case 6: // Shadow1D
      SrcW = SrcZ;
      break;
    case 7: // Shadow2D
      SrcW = SrcZ;
      break;
    case 8: // ShadowRect
      CTX = CTY = 0;
      SrcW = SrcZ;
      break;
    case 9: // 1DArray
      SrcZ = SrcY;
      CTZ = 0;
      break;
    case 10: // 2DArray
      CTZ = 0;
      break;
    case 11: // Shadow1DArray
      SrcZ = SrcY;
      CTZ = 0;
      break;
    case 12: // Shadow2DArray
      CTZ = 0;
      break;
    }

    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0)
        .addOperand(MI->getOperand(3))
        .addImm(SrcX)
        .addImm(SrcY)
        .addImm(SrcZ)
        .addImm(SrcW)
        .addImm(0)
        .addImm(0)
        .addImm(0)
        .addImm(0)
        .addImm(1)
        .addImm(2)
        .addImm(3)
        .addOperand(RID)
        .addOperand(SID)
        .addImm(CTX)
        .addImm(CTY)
        .addImm(CTZ)
        .addImm(CTW);
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1)
        .addOperand(MI->getOperand(2))
        .addImm(SrcX)
        .addImm(SrcY)
        .addImm(SrcZ)
        .addImm(SrcW)
        .addImm(0)
        .addImm(0)
        .addImm(0)
        .addImm(0)
        .addImm(1)
        .addImm(2)
        .addImm(3)
        .addOperand(RID)
        .addOperand(SID)
        .addImm(CTX)
        .addImm(CTY)
        .addImm(CTZ)
        .addImm(CTW);
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_C_G))
        .addOperand(MI->getOperand(0))
        .addOperand(MI->getOperand(1))
        .addImm(SrcX)
        .addImm(SrcY)
        .addImm(SrcZ)
        .addImm(SrcW)
        .addImm(0)
        .addImm(0)
        .addImm(0)
        .addImm(0)
        .addImm(1)
        .addImm(2)
        .addImm(3)
        .addOperand(RID)
        .addOperand(SID)
        .addImm(CTX)
        .addImm(CTY)
        .addImm(CTZ)
        .addImm(CTW)
        .addReg(T0, RegState::Implicit)
        .addReg(T1, RegState::Implicit);
    break;
  }

  case AMDGPU::BRANCH:
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP))
        .addOperand(MI->getOperand(0));
    break;

  case AMDGPU::BRANCH_COND_f32: {
    MachineInstr *NewMI =
        BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
                AMDGPU::PREDICATE_BIT)
            .addOperand(MI->getOperand(1))
            .addImm(OPCODE_IS_NOT_ZERO)
            .addImm(0); // Flags
    TII->addFlag(NewMI, 0, MO_FLAG_PUSH);
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
        .addOperand(MI->getOperand(0))
        .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
    break;
  }

  case AMDGPU::BRANCH_COND_i32: {
    MachineInstr *NewMI =
        BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
                AMDGPU::PREDICATE_BIT)
            .addOperand(MI->getOperand(1))
            .addImm(OPCODE_IS_NOT_ZERO_INT)
            .addImm(0); // Flags
    TII->addFlag(NewMI, 0, MO_FLAG_PUSH);
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
        .addOperand(MI->getOperand(0))
        .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
    break;
  }

  case AMDGPU::EG_ExportSwz:
  case AMDGPU::R600_ExportSwz: {
    // Instruction is left unmodified if it's not the last one of its type
    bool isLastInstructionOfItsType = true;
    unsigned InstExportType = MI->getOperand(1).getImm();
    for (MachineBasicBlock::iterator NextExportInst = llvm::next(I),
         EndBlock = BB->end(); NextExportInst != EndBlock;
         NextExportInst = llvm::next(NextExportInst)) {
      if (NextExportInst->getOpcode() == AMDGPU::EG_ExportSwz ||
          NextExportInst->getOpcode() == AMDGPU::R600_ExportSwz) {
        unsigned CurrentInstExportType = NextExportInst->getOperand(1)
            .getImm();
        if (CurrentInstExportType == InstExportType) {
          isLastInstructionOfItsType = false;
          break;
        }
      }
    }
    bool EOP = (llvm::next(I)->getOpcode() == AMDGPU::RETURN) ? 1 : 0;
    if (!EOP && !isLastInstructionOfItsType)
      return BB;
    unsigned CfInst = (MI->getOpcode() == AMDGPU::EG_ExportSwz) ? 84 : 40;
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
        .addOperand(MI->getOperand(0))
        .addOperand(MI->getOperand(1))
        .addOperand(MI->getOperand(2))
        .addOperand(MI->getOperand(3))
        .addOperand(MI->getOperand(4))
        .addOperand(MI->getOperand(5))
        .addOperand(MI->getOperand(6))
        .addImm(CfInst)
        .addImm(EOP);
    break;
  }
  case AMDGPU::RETURN: {
    // RETURN instructions must have the live-out registers as implicit uses,
    // otherwise they appear dead.
    R600MachineFunctionInfo *MFI = MF->getInfo<R600MachineFunctionInfo>();
    MachineInstrBuilder MIB(*MF, MI);
    for (unsigned i = 0, e = MFI->LiveOuts.size(); i != e; ++i)
      MIB.addReg(MFI->LiveOuts[i], RegState::Implicit);
    return BB;
  }
  }

  MI->eraseFromParent();
  return BB;
}

//===----------------------------------------------------------------------===//
// Custom DAG Lowering Operations
//===----------------------------------------------------------------------===//

SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
  switch (Op.getOpcode()) {
  default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
  case ISD::FCOS:
  case ISD::FSIN: return LowerTrig(Op, DAG);
  case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
  case ISD::STORE: return LowerSTORE(Op, DAG);
  case ISD::LOAD: return LowerLOAD(Op, DAG);
  case ISD::GlobalAddress: return LowerGlobalAddress(MFI, Op, DAG);
  case ISD::INTRINSIC_VOID: {
    SDValue Chain = Op.getOperand(0);
    unsigned IntrinsicID =
        cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
    switch (IntrinsicID) {
    case AMDGPUIntrinsic::AMDGPU_store_output: {
      int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
      unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
      MFI->LiveOuts.push_back(Reg);
      return DAG.getCopyToReg(Chain, SDLoc(Op), Reg, Op.getOperand(2));
    }
    case AMDGPUIntrinsic::R600_store_swizzle: {
      const SDValue Args[8] = {
        Chain,
        Op.getOperand(2), // Export Value
        Op.getOperand(3), // ArrayBase
        Op.getOperand(4), // Type
        DAG.getConstant(0, MVT::i32), // SWZ_X
        DAG.getConstant(1, MVT::i32), // SWZ_Y
        DAG.getConstant(2, MVT::i32), // SWZ_Z
        DAG.getConstant(3, MVT::i32)  // SWZ_W
      };
      return DAG.getNode(AMDGPUISD::EXPORT, SDLoc(Op), Op.getValueType(),
                         Args, 8);
    }

    // default for switch(IntrinsicID)
    default: break;
    }
    // break out of case ISD::INTRINSIC_VOID in switch(Op.getOpcode())
    break;
  }
  case ISD::INTRINSIC_WO_CHAIN: {
    unsigned IntrinsicID =
        cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
    EVT VT = Op.getValueType();
    SDLoc DL(Op);
    switch(IntrinsicID) {
    default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
    case AMDGPUIntrinsic::R600_load_input: {
      int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
      unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
      MachineFunction &MF = DAG.getMachineFunction();
      MachineRegisterInfo &MRI = MF.getRegInfo();
      MRI.addLiveIn(Reg);
      return DAG.getCopyFromReg(DAG.getEntryNode(),
          SDLoc(DAG.getEntryNode()), Reg, VT);
    }

    case AMDGPUIntrinsic::R600_interp_input: {
      int slot = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
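      // slot encodes both the interpolation parameter (slot / 4) and the
      // channel within it (slot % 4); a negative ijb (checked just below)
      // selects the flat/constant path, which reads the parameter with
      // INTERP_VEC_LOAD instead of interpolating with the I/J inputs.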
      int ijb = cast<ConstantSDNode>(Op.getOperand(2))->getSExtValue();
      MachineSDNode *interp;
      if (ijb < 0) {
        const MachineFunction &MF = DAG.getMachineFunction();
        const R600InstrInfo *TII =
            static_cast<const R600InstrInfo*>(MF.getTarget().getInstrInfo());
        interp = DAG.getMachineNode(AMDGPU::INTERP_VEC_LOAD, DL,
            MVT::v4f32, DAG.getTargetConstant(slot / 4, MVT::i32));
        return DAG.getTargetExtractSubreg(
            TII->getRegisterInfo().getSubRegFromChannel(slot % 4),
            DL, MVT::f32, SDValue(interp, 0));
      }
      MachineFunction &MF = DAG.getMachineFunction();
      MachineRegisterInfo &MRI = MF.getRegInfo();
      unsigned RegisterI = AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb);
      unsigned RegisterJ = AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb + 1);
      MRI.addLiveIn(RegisterI);
      MRI.addLiveIn(RegisterJ);
      SDValue RegisterINode = DAG.getCopyFromReg(DAG.getEntryNode(),
          SDLoc(DAG.getEntryNode()), RegisterI, MVT::f32);
      SDValue RegisterJNode = DAG.getCopyFromReg(DAG.getEntryNode(),
          SDLoc(DAG.getEntryNode()), RegisterJ, MVT::f32);

      if (slot % 4 < 2)
        interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_XY, DL,
            MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4, MVT::i32),
            RegisterJNode, RegisterINode);
      else
        interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_ZW, DL,
            MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4, MVT::i32),
            RegisterJNode, RegisterINode);
      return SDValue(interp, slot % 2);
    }
    case AMDGPUIntrinsic::R600_interp_xy:
    case AMDGPUIntrinsic::R600_interp_zw: {
      int slot = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
      MachineSDNode *interp;
      SDValue RegisterINode = Op.getOperand(2);
      SDValue RegisterJNode = Op.getOperand(3);

      if (IntrinsicID == AMDGPUIntrinsic::R600_interp_xy)
        interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_XY, DL,
            MVT::f32, MVT::f32, DAG.getTargetConstant(slot, MVT::i32),
            RegisterJNode, RegisterINode);
      else
        interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_ZW, DL,
            MVT::f32, MVT::f32, DAG.getTargetConstant(slot, MVT::i32),
            RegisterJNode, RegisterINode);
      return DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32,
          SDValue(interp, 0), SDValue(interp, 1));
    }
    case AMDGPUIntrinsic::R600_tex:
    case AMDGPUIntrinsic::R600_texc:
    case AMDGPUIntrinsic::R600_txl:
    case AMDGPUIntrinsic::R600_txlc:
    case AMDGPUIntrinsic::R600_txb:
    case AMDGPUIntrinsic::R600_txbc:
    case AMDGPUIntrinsic::R600_txf:
    case AMDGPUIntrinsic::R600_txq:
    case AMDGPUIntrinsic::R600_ddx:
    case AMDGPUIntrinsic::R600_ddy:
    case AMDGPUIntrinsic::R600_ldptr: {
      unsigned TextureOp;
      switch (IntrinsicID) {
      case AMDGPUIntrinsic::R600_tex:
        TextureOp = 0;
        break;
      case AMDGPUIntrinsic::R600_texc:
        TextureOp = 1;
        break;
      case AMDGPUIntrinsic::R600_txl:
        TextureOp = 2;
        break;
      case AMDGPUIntrinsic::R600_txlc:
        TextureOp = 3;
        break;
      case AMDGPUIntrinsic::R600_txb:
        TextureOp = 4;
        break;
      case AMDGPUIntrinsic::R600_txbc:
        TextureOp = 5;
        break;
      case AMDGPUIntrinsic::R600_txf:
        TextureOp = 6;
        break;
      case AMDGPUIntrinsic::R600_txq:
        TextureOp = 7;
        break;
      case AMDGPUIntrinsic::R600_ddx:
        TextureOp = 8;
        break;
      case AMDGPUIntrinsic::R600_ddy:
        TextureOp = 9;
        break;
      case AMDGPUIntrinsic::R600_ldptr:
        TextureOp = 10;
        break;
      default:
        llvm_unreachable("Unknown Texture Operation");
      }

      SDValue TexArgs[19] = {
        DAG.getConstant(TextureOp, MVT::i32),
        Op.getOperand(1),
        DAG.getConstant(0, MVT::i32),
        DAG.getConstant(1, MVT::i32),
        DAG.getConstant(2, MVT::i32),
        DAG.getConstant(3, MVT::i32),
        Op.getOperand(2),
        Op.getOperand(3),
        Op.getOperand(4),
        DAG.getConstant(0, MVT::i32),
        DAG.getConstant(1, MVT::i32),
        DAG.getConstant(2, MVT::i32),
        DAG.getConstant(3, MVT::i32),
        Op.getOperand(5),
        Op.getOperand(6),
        Op.getOperand(7),
        Op.getOperand(8),
        Op.getOperand(9),
        Op.getOperand(10)
      };
      return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, MVT::v4f32, TexArgs, 19);
    }
    case AMDGPUIntrinsic::AMDGPU_dp4: {
      SDValue Args[8] = {
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
            DAG.getConstant(0, MVT::i32)),
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
            DAG.getConstant(0, MVT::i32)),
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
            DAG.getConstant(1, MVT::i32)),
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
            DAG.getConstant(1, MVT::i32)),
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
            DAG.getConstant(2, MVT::i32)),
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
            DAG.getConstant(2, MVT::i32)),
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
            DAG.getConstant(3, MVT::i32)),
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
            DAG.getConstant(3, MVT::i32))
      };
      return DAG.getNode(AMDGPUISD::DOT4, DL, MVT::f32, Args, 8);
    }

    case Intrinsic::r600_read_ngroups_x:
      return LowerImplicitParameter(DAG, VT, DL, 0);
    case Intrinsic::r600_read_ngroups_y:
      return LowerImplicitParameter(DAG, VT, DL, 1);
    case Intrinsic::r600_read_ngroups_z:
      return LowerImplicitParameter(DAG, VT, DL, 2);
    case Intrinsic::r600_read_global_size_x:
      return LowerImplicitParameter(DAG, VT, DL, 3);
    case Intrinsic::r600_read_global_size_y:
      return LowerImplicitParameter(DAG, VT, DL, 4);
    case Intrinsic::r600_read_global_size_z:
      return LowerImplicitParameter(DAG, VT, DL, 5);
    case Intrinsic::r600_read_local_size_x:
      return LowerImplicitParameter(DAG, VT, DL, 6);
    case Intrinsic::r600_read_local_size_y:
      return LowerImplicitParameter(DAG, VT, DL, 7);
    case Intrinsic::r600_read_local_size_z:
      return LowerImplicitParameter(DAG, VT, DL, 8);

    case Intrinsic::r600_read_tgid_x:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T1_X, VT);
    case Intrinsic::r600_read_tgid_y:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T1_Y, VT);
    case Intrinsic::r600_read_tgid_z:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T1_Z, VT);
    case Intrinsic::r600_read_tidig_x:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T0_X, VT);
    case Intrinsic::r600_read_tidig_y:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T0_Y, VT);
    case Intrinsic::r600_read_tidig_z:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T0_Z, VT);
    }
    // break out of case ISD::INTRINSIC_WO_CHAIN in switch(Op.getOpcode())
    break;
  }
  } // end switch(Op.getOpcode())
  return SDValue();
}

void R600TargetLowering::ReplaceNodeResults(SDNode *N,
                                            SmallVectorImpl<SDValue> &Results,
                                            SelectionDAG &DAG) const {
  switch (N->getOpcode()) {
  default: return;
  case ISD::FP_TO_UINT: Results.push_back(LowerFPTOUINT(N->getOperand(0), DAG));
    return;
  case ISD::LOAD: {
    SDNode *Node = LowerLOAD(SDValue(N, 0), DAG).getNode();
    Results.push_back(SDValue(Node, 0));
    Results.push_back(SDValue(Node, 1));
    // XXX: LLVM seems not to replace Chain Value inside CustomWidenLowerNode
    // function
    DAG.ReplaceAllUsesOfValueWith(SDValue(N,1), SDValue(Node, 1));
    return;
  }
  case ISD::STORE:
    SDNode *Node = LowerSTORE(SDValue(N, 0), DAG).getNode();
    Results.push_back(SDValue(Node, 0));
    return;
  }
}

SDValue R600TargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
  // On hw >= R700, COS/SIN input must be between -1. and 1.
  // Thus we lower them to TRIG ( FRACT ( x / 2Pi + 0.5) - 0.5)
  EVT VT = Op.getValueType();
  SDValue Arg = Op.getOperand(0);
  SDValue FractPart = DAG.getNode(AMDGPUISD::FRACT, SDLoc(Op), VT,
      DAG.getNode(ISD::FADD, SDLoc(Op), VT,
          DAG.getNode(ISD::FMUL, SDLoc(Op), VT, Arg,
              DAG.getConstantFP(0.15915494309, MVT::f32)), // 1 / (2 * Pi)
          DAG.getConstantFP(0.5, MVT::f32)));
  unsigned TrigNode;
  switch (Op.getOpcode()) {
  case ISD::FCOS:
    TrigNode = AMDGPUISD::COS_HW;
    break;
  case ISD::FSIN:
    TrigNode = AMDGPUISD::SIN_HW;
    break;
  default:
    llvm_unreachable("Wrong trig opcode");
  }
  SDValue TrigVal = DAG.getNode(TrigNode, SDLoc(Op), VT,
      DAG.getNode(ISD::FADD, SDLoc(Op), VT, FractPart,
          DAG.getConstantFP(-0.5, MVT::f32)));
  if (Gen >= AMDGPUSubtarget::R700)
    return TrigVal;
  // On R600 hw, COS/SIN input must be between -Pi and Pi.
  return DAG.getNode(ISD::FMUL, SDLoc(Op), VT, TrigVal,
      DAG.getConstantFP(3.14159265359, MVT::f32));
}

SDValue R600TargetLowering::LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const {
  return DAG.getNode(
      ISD::SETCC,
      SDLoc(Op),
      MVT::i1,
      Op, DAG.getConstantFP(0.0f, MVT::f32),
      DAG.getCondCode(ISD::SETNE)
      );
}

SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT,
                                                   SDLoc DL,
                                                   unsigned DwordOffset) const {
  unsigned ByteOffset = DwordOffset * 4;
  PointerType * PtrType = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
                                           AMDGPUAS::CONSTANT_BUFFER_0);

  // We shouldn't be using an offset wider than 16 bits for implicit parameters.
  assert(isInt<16>(ByteOffset));

  return DAG.getLoad(VT, DL, DAG.getEntryNode(),
                     DAG.getConstant(ByteOffset, MVT::i32), // PTR
                     MachinePointerInfo(ConstantPointerNull::get(PtrType)),
                     false, false, false, 0);
}

bool R600TargetLowering::isZero(SDValue Op) const {
  if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
    return Cst->isNullValue();
  } else if (ConstantFPSDNode *CstFP = dyn_cast<ConstantFPSDNode>(Op)) {
    return CstFP->isZero();
  } else {
    return false;
  }
}

SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  EVT VT = Op.getValueType();

  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  SDValue True = Op.getOperand(2);
  SDValue False = Op.getOperand(3);
  SDValue CC = Op.getOperand(4);
  SDValue Temp;

  // LHS and RHS are guaranteed to be the same value type
  EVT CompareVT = LHS.getValueType();

  // Check if we can lower this to a native operation.
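  // (Roughly: SET* is a compare whose result is itself the hardware
  // true/false value, and CND* is a conditional move keyed on a comparison
  // of one operand against zero.)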

  // Try to lower to a SET* instruction:
  //
  // SET* can match the following patterns:
  //
  // select_cc f32, f32, -1, 0, cc_supported
  // select_cc f32, f32, 1.0f, 0.0f, cc_supported
  // select_cc i32, i32, -1, 0, cc_supported
  //

  // Move hardware True/False values to the correct operand.
  ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
  ISD::CondCode InverseCC =
      ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
  if (isHWTrueValue(False) && isHWFalseValue(True)) {
    if (isCondCodeLegal(InverseCC, CompareVT.getSimpleVT())) {
      std::swap(False, True);
      CC = DAG.getCondCode(InverseCC);
    } else {
      ISD::CondCode SwapInvCC = ISD::getSetCCSwappedOperands(InverseCC);
      if (isCondCodeLegal(SwapInvCC, CompareVT.getSimpleVT())) {
        std::swap(False, True);
        std::swap(LHS, RHS);
        CC = DAG.getCondCode(SwapInvCC);
      }
    }
  }

  if (isHWTrueValue(True) && isHWFalseValue(False) &&
      (CompareVT == VT || VT == MVT::i32)) {
    // This can be matched by a SET* instruction.
    return DAG.getNode(ISD::SELECT_CC, DL, VT, LHS, RHS, True, False, CC);
  }

  // Try to lower to a CND* instruction:
  //
  // CND* can match the following patterns:
  //
  // select_cc f32, 0.0, f32, f32, cc_supported
  // select_cc f32, 0.0, i32, i32, cc_supported
  // select_cc i32, 0, f32, f32, cc_supported
  // select_cc i32, 0, i32, i32, cc_supported
  //

  // Try to move the zero value to the RHS
  if (isZero(LHS)) {
    ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
    // Try swapping the operands
    ISD::CondCode CCSwapped = ISD::getSetCCSwappedOperands(CCOpcode);
    if (isCondCodeLegal(CCSwapped, CompareVT.getSimpleVT())) {
      std::swap(LHS, RHS);
      CC = DAG.getCondCode(CCSwapped);
    } else {
      // Try inverting the condition and then swapping the operands
      ISD::CondCode CCInv = ISD::getSetCCInverse(CCOpcode, CompareVT.isInteger());
      CCSwapped = ISD::getSetCCSwappedOperands(CCInv);
      if (isCondCodeLegal(CCSwapped, CompareVT.getSimpleVT())) {
        std::swap(True, False);
        std::swap(LHS, RHS);
        CC = DAG.getCondCode(CCSwapped);
      }
    }
  }
  if (isZero(RHS)) {
    SDValue Cond = LHS;
    SDValue Zero = RHS;
    ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
    if (CompareVT != VT) {
      // Bitcast True / False to the correct types. This will end up being
      // a nop, but it allows us to define only a single pattern in the
      // .TD files for each CND* instruction rather than having to have
      // one pattern for integer True/False and one for fp True/False
      True = DAG.getNode(ISD::BITCAST, DL, CompareVT, True);
      False = DAG.getNode(ISD::BITCAST, DL, CompareVT, False);
    }

    switch (CCOpcode) {
    case ISD::SETONE:
    case ISD::SETUNE:
    case ISD::SETNE:
      CCOpcode = ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
      Temp = True;
      True = False;
      False = Temp;
      break;
    default:
      break;
    }
    SDValue SelectNode = DAG.getNode(ISD::SELECT_CC, DL, CompareVT,
        Cond, Zero,
        True, False,
        DAG.getCondCode(CCOpcode));
    return DAG.getNode(ISD::BITCAST, DL, VT, SelectNode);
  }

  // Possible Min/Max pattern
  SDValue MinMax = LowerMinMax(Op, DAG);
  if (MinMax.getNode()) {
    return MinMax;
  }

  // If we make it this far, it means we have no native instructions to handle
  // this SELECT_CC, so we must lower it.
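  // For example (roughly), an unsupported "select_cc a, b, x, y, cc" becomes:
  //   Cond   = select_cc a, b, HWTrue, HWFalse, cc    ; a native SET*
  //   Result = select_cc Cond, HWFalse, x, y, setne   ; a native CND*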
  SDValue HWTrue, HWFalse;

  if (CompareVT == MVT::f32) {
    HWTrue = DAG.getConstantFP(1.0f, CompareVT);
    HWFalse = DAG.getConstantFP(0.0f, CompareVT);
  } else if (CompareVT == MVT::i32) {
    HWTrue = DAG.getConstant(-1, CompareVT);
    HWFalse = DAG.getConstant(0, CompareVT);
  }
  else {
    assert(!"Unhandled value type in LowerSELECT_CC");
  }

  // Lower this unsupported SELECT_CC into a combination of two supported
  // SELECT_CC operations.
  SDValue Cond = DAG.getNode(ISD::SELECT_CC, DL, CompareVT, LHS, RHS, HWTrue, HWFalse, CC);

  return DAG.getNode(ISD::SELECT_CC, DL, VT,
      Cond, HWFalse,
      True, False,
      DAG.getCondCode(ISD::SETNE));
}

/// LLVM generates byte-addressed pointers. For indirect addressing, we need to
/// convert these pointers to a register index. Each register holds
/// 16 bytes (4 x 32-bit sub-registers), but we need to take into account the
/// \p StackWidth, which tells us how many of the 4 sub-registers will be used
/// for indirect addressing.
SDValue R600TargetLowering::stackPtrToRegIndex(SDValue Ptr,
                                               unsigned StackWidth,
                                               SelectionDAG &DAG) const {
  unsigned SRLPad;
  switch(StackWidth) {
  case 1:
    SRLPad = 2;
    break;
  case 2:
    SRLPad = 3;
    break;
  case 4:
    SRLPad = 4;
    break;
  default: llvm_unreachable("Invalid stack width");
  }

  return DAG.getNode(ISD::SRL, SDLoc(Ptr), Ptr.getValueType(), Ptr,
                     DAG.getConstant(SRLPad, MVT::i32));
}

void R600TargetLowering::getStackAddress(unsigned StackWidth,
                                         unsigned ElemIdx,
                                         unsigned &Channel,
                                         unsigned &PtrIncr) const {
  switch (StackWidth) {
  default:
  case 1:
    Channel = 0;
    if (ElemIdx > 0) {
      PtrIncr = 1;
    } else {
      PtrIncr = 0;
    }
    break;
  case 2:
    Channel = ElemIdx % 2;
    if (ElemIdx == 2) {
      PtrIncr = 1;
    } else {
      PtrIncr = 0;
    }
    break;
  case 4:
    Channel = ElemIdx;
    PtrIncr = 0;
    break;
  }
}

SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
  SDValue Chain = Op.getOperand(0);
  SDValue Value = Op.getOperand(1);
  SDValue Ptr = Op.getOperand(2);

  SDValue Result = AMDGPUTargetLowering::LowerSTORE(Op, DAG);
  if (Result.getNode()) {
    return Result;
  }

  if (StoreNode->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS) {
    if (StoreNode->isTruncatingStore()) {
      EVT VT = Value.getValueType();
      assert(VT.bitsLE(MVT::i32));
      EVT MemVT = StoreNode->getMemoryVT();
      SDValue MaskConstant;
      if (MemVT == MVT::i8) {
        MaskConstant = DAG.getConstant(0xFF, MVT::i32);
      } else {
        assert(MemVT == MVT::i16);
        MaskConstant = DAG.getConstant(0xFFFF, MVT::i32);
      }
      SDValue DWordAddr = DAG.getNode(ISD::SRL, DL, VT, Ptr,
                                      DAG.getConstant(2, MVT::i32));
      SDValue ByteIndex = DAG.getNode(ISD::AND, DL, Ptr.getValueType(), Ptr,
                                      DAG.getConstant(0x00000003, VT));
      SDValue TruncValue = DAG.getNode(ISD::AND, DL, VT, Value, MaskConstant);
      SDValue Shift = DAG.getNode(ISD::SHL, DL, VT, ByteIndex,
                                  DAG.getConstant(3, VT));
      SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, VT, TruncValue, Shift);
      SDValue Mask = DAG.getNode(ISD::SHL, DL, VT, MaskConstant, Shift);
      // XXX: If we add a 64-bit ZW register class, then we could use a 2 x i32
      // vector instead.
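      // (Illustrative: for an i8 store to byte address P, DWordAddr is P >> 2
      // and Shift is (P & 3) * 8, so STORE_MSKOR receives the shifted value
      // together with the mask 0xFF << Shift and rewrites only the addressed
      // byte of that dword.)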
      SDValue Src[4] = {
        ShiftedValue,
        DAG.getConstant(0, MVT::i32),
        DAG.getConstant(0, MVT::i32),
        Mask
      };
      SDValue Input = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i32, Src, 4);
      SDValue Args[3] = { Chain, Input, DWordAddr };
      return DAG.getMemIntrinsicNode(AMDGPUISD::STORE_MSKOR, DL,
                                     Op->getVTList(), Args, 3, MemVT,
                                     StoreNode->getMemOperand());
    } else if (Ptr->getOpcode() != AMDGPUISD::DWORDADDR &&
               Value.getValueType().bitsGE(MVT::i32)) {
      // Convert pointer from byte address to dword address.
      Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, Ptr.getValueType(),
                        DAG.getNode(ISD::SRL, DL, Ptr.getValueType(),
                                    Ptr, DAG.getConstant(2, MVT::i32)));

      if (StoreNode->isTruncatingStore() || StoreNode->isIndexed()) {
        assert(!"Truncated and indexed stores not supported yet");
      } else {
        Chain = DAG.getStore(Chain, DL, Value, Ptr, StoreNode->getMemOperand());
      }
      return Chain;
    }
  }

  EVT ValueVT = Value.getValueType();

  if (StoreNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
    return SDValue();
  }

  // Lowering for indirect addressing

  const MachineFunction &MF = DAG.getMachineFunction();
  const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering*>(
      getTargetMachine().getFrameLowering());
  unsigned StackWidth = TFL->getStackWidth(MF);

  Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);

  if (ValueVT.isVector()) {
    unsigned NumElemVT = ValueVT.getVectorNumElements();
    EVT ElemVT = ValueVT.getVectorElementType();
    SDValue Stores[4];

    assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
                                      "vector width in load");

    for (unsigned i = 0; i < NumElemVT; ++i) {
      unsigned Channel, PtrIncr;
      getStackAddress(StackWidth, i, Channel, PtrIncr);
      Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
                        DAG.getConstant(PtrIncr, MVT::i32));
      SDValue Elem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT,
                                 Value, DAG.getConstant(i, MVT::i32));

      Stores[i] = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other,
                              Chain, Elem, Ptr,
                              DAG.getTargetConstant(Channel, MVT::i32));
    }
    Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores, NumElemVT);
  } else {
    if (ValueVT == MVT::i8) {
      Value = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Value);
    }
    Chain = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other, Chain, Value, Ptr,
                        DAG.getTargetConstant(0, MVT::i32)); // Channel
  }

  return Chain;
}

// return (512 + (kc_bank << 12))
static int
ConstantAddressBlock(unsigned AddressSpace) {
  switch (AddressSpace) {
  case AMDGPUAS::CONSTANT_BUFFER_0:
    return 512;
  case AMDGPUAS::CONSTANT_BUFFER_1:
    return 512 + 4096;
  case AMDGPUAS::CONSTANT_BUFFER_2:
    return 512 + 4096 * 2;
  case AMDGPUAS::CONSTANT_BUFFER_3:
    return 512 + 4096 * 3;
  case AMDGPUAS::CONSTANT_BUFFER_4:
    return 512 + 4096 * 4;
  case AMDGPUAS::CONSTANT_BUFFER_5:
    return 512 + 4096 * 5;
  case AMDGPUAS::CONSTANT_BUFFER_6:
    return 512 + 4096 * 6;
  case AMDGPUAS::CONSTANT_BUFFER_7:
    return 512 + 4096 * 7;
  case AMDGPUAS::CONSTANT_BUFFER_8:
    return 512 + 4096 * 8;
  case AMDGPUAS::CONSTANT_BUFFER_9:
    return 512 + 4096 * 9;
  case AMDGPUAS::CONSTANT_BUFFER_10:
    return 512 + 4096 * 10;
  case AMDGPUAS::CONSTANT_BUFFER_11:
    return 512 + 4096 * 11;
  case AMDGPUAS::CONSTANT_BUFFER_12:
    return 512 + 4096 * 12;
  case AMDGPUAS::CONSTANT_BUFFER_13:
    return 512 + 4096 * 13;
  case AMDGPUAS::CONSTANT_BUFFER_14:
    return 512 + 4096 * 14;
  case AMDGPUAS::CONSTANT_BUFFER_15:
    return 512 + 4096 * 15;
  default:
    return -1;
  }
}

SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const
{
  EVT VT = Op.getValueType();
  SDLoc DL(Op);
  LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
  SDValue Chain = Op.getOperand(0);
  SDValue Ptr = Op.getOperand(1);
  SDValue LoweredLoad;

  if (LoadNode->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS && VT.isVector()) {
    SDValue MergedValues[2] = {
      SplitVectorLoad(Op, DAG),
      Chain
    };
    return DAG.getMergeValues(MergedValues, 2, DL);
  }

  int ConstantBlock = ConstantAddressBlock(LoadNode->getAddressSpace());
  if (ConstantBlock > -1 &&
      ((LoadNode->getExtensionType() == ISD::NON_EXTLOAD) ||
       (LoadNode->getExtensionType() == ISD::ZEXTLOAD))) {
    SDValue Result;
    if (isa<ConstantExpr>(LoadNode->getSrcValue()) ||
        isa<Constant>(LoadNode->getSrcValue()) ||
        isa<ConstantSDNode>(Ptr)) {
      SDValue Slots[4];
      for (unsigned i = 0; i < 4; i++) {
        // We want Const position encoded with the following formula :
        // (((512 + (kc_bank << 12) + const_index) << 2) + chan)
        // const_index is Ptr computed by llvm using an alignment of 16.
        // Thus we add (((512 + (kc_bank << 12)) + chan ) * 4 here and
        // then div by 4 at the ISel step
        SDValue NewPtr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
            DAG.getConstant(4 * i + ConstantBlock * 16, MVT::i32));
        Slots[i] = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::i32, NewPtr);
      }
      EVT NewVT = MVT::v4i32;
      unsigned NumElements = 4;
      if (VT.isVector()) {
        NewVT = VT;
        NumElements = VT.getVectorNumElements();
      }
      Result = DAG.getNode(ISD::BUILD_VECTOR, DL, NewVT, Slots, NumElements);
    } else {
      // A non-constant ptr can't be folded; keep it as a v4f32 load.
      Result = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::v4i32,
          DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr, DAG.getConstant(4, MVT::i32)),
          DAG.getConstant(LoadNode->getAddressSpace() -
                          AMDGPUAS::CONSTANT_BUFFER_0, MVT::i32)
          );
    }

    if (!VT.isVector()) {
      Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Result,
                           DAG.getConstant(0, MVT::i32));
    }

    SDValue MergedValues[2] = {
      Result,
      Chain
    };
    return DAG.getMergeValues(MergedValues, 2, DL);
  }

  // For most operations returning SDValue() will result in the node being
  // expanded by the DAG Legalizer. This is not the case for ISD::LOAD, so we
  // need to manually expand loads that may be legal in some address spaces and
  // illegal in others. SEXT loads from CONSTANT_BUFFER_0 are supported for
  // compute shaders, since the data is sign extended when it is uploaded to the
  // buffer. However SEXT loads from other address spaces are not supported, so
  // we need to expand them here.
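  // (The expansion below uses the usual shift identity on the wider result
  // type, e.g. for an i8 load into i32: sext(x) == (x << 24) >> 24 with an
  // arithmetic right shift.)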
  if (LoadNode->getExtensionType() == ISD::SEXTLOAD) {
    EVT MemVT = LoadNode->getMemoryVT();
    assert(!MemVT.isVector() && (MemVT == MVT::i16 || MemVT == MVT::i8));
    SDValue ShiftAmount =
        DAG.getConstant(VT.getSizeInBits() - MemVT.getSizeInBits(), MVT::i32);
    SDValue NewLoad = DAG.getExtLoad(ISD::EXTLOAD, DL, VT, Chain, Ptr,
                                     LoadNode->getPointerInfo(), MemVT,
                                     LoadNode->isVolatile(),
                                     LoadNode->isNonTemporal(),
                                     LoadNode->getAlignment());
    SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, NewLoad, ShiftAmount);
    SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, Shl, ShiftAmount);

    SDValue MergedValues[2] = { Sra, Chain };
    return DAG.getMergeValues(MergedValues, 2, DL);
  }

  if (LoadNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
    return SDValue();
  }

  // Lowering for indirect addressing
  const MachineFunction &MF = DAG.getMachineFunction();
  const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering*>(
      getTargetMachine().getFrameLowering());
  unsigned StackWidth = TFL->getStackWidth(MF);

  Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);

  if (VT.isVector()) {
    unsigned NumElemVT = VT.getVectorNumElements();
    EVT ElemVT = VT.getVectorElementType();
    SDValue Loads[4];

    assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
                                      "vector width in load");

    for (unsigned i = 0; i < NumElemVT; ++i) {
      unsigned Channel, PtrIncr;
      getStackAddress(StackWidth, i, Channel, PtrIncr);
      Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
                        DAG.getConstant(PtrIncr, MVT::i32));
      Loads[i] = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, ElemVT,
                             Chain, Ptr,
                             DAG.getTargetConstant(Channel, MVT::i32),
                             Op.getOperand(2));
    }
    for (unsigned i = NumElemVT; i < 4; ++i) {
      Loads[i] = DAG.getUNDEF(ElemVT);
    }
    EVT TargetVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, 4);
    LoweredLoad = DAG.getNode(ISD::BUILD_VECTOR, DL, TargetVT, Loads, 4);
  } else {
    LoweredLoad = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, VT,
                              Chain, Ptr,
                              DAG.getTargetConstant(0, MVT::i32), // Channel
                              Op.getOperand(2));
  }

  SDValue Ops[2];
  Ops[0] = LoweredLoad;
  Ops[1] = Chain;

  return DAG.getMergeValues(Ops, 2, DL);
}

/// XXX Only kernel functions are supported, so we can assume for now that
/// every function is a kernel function, but in the future we should use
/// separate calling conventions for kernel and non-kernel functions.
SDValue R600TargetLowering::LowerFormalArguments(
    SDValue Chain,
    CallingConv::ID CallConv,
    bool isVarArg,
    const SmallVectorImpl<ISD::InputArg> &Ins,
    SDLoc DL, SelectionDAG &DAG,
    SmallVectorImpl<SDValue> &InVals) const {
  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
                 getTargetMachine(), ArgLocs, *DAG.getContext());
  MachineFunction &MF = DAG.getMachineFunction();
  unsigned ShaderType = MF.getInfo<R600MachineFunctionInfo>()->ShaderType;

  SmallVector<ISD::InputArg, 8> LocalIns;

  getOriginalFunctionArgs(DAG, DAG.getMachineFunction().getFunction(), Ins,
                          LocalIns);

  AnalyzeFormalArguments(CCInfo, LocalIns);

  for (unsigned i = 0, e = Ins.size(); i < e; ++i) {
    CCValAssign &VA = ArgLocs[i];
    EVT VT = Ins[i].VT;
    EVT MemVT = LocalIns[i].VT;

    if (ShaderType != ShaderType::COMPUTE) {
      unsigned Reg = MF.addLiveIn(VA.getLocReg(), &AMDGPU::R600_Reg128RegClass);
      SDValue Register = DAG.getCopyFromReg(Chain, DL, Reg, VT);
      InVals.push_back(Register);
      continue;
    }

    PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
                                          AMDGPUAS::CONSTANT_BUFFER_0);

    // The first 36 bytes of the input buffer contain information about
    // thread group and global sizes.
    SDValue Arg = DAG.getExtLoad(ISD::SEXTLOAD, DL, VT, Chain,
                                 DAG.getConstant(36 + VA.getLocMemOffset(), MVT::i32),
                                 MachinePointerInfo(UndefValue::get(PtrTy)),
                                 MemVT, false, false, 4);
    // 4 is the preferred alignment for the CONSTANT memory space.
    InVals.push_back(Arg);
  }
  return Chain;
}

EVT R600TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
  if (!VT.isVector()) return MVT::i32;
  return VT.changeVectorElementTypeToInteger();
}

static SDValue
CompactSwizzlableVector(SelectionDAG &DAG, SDValue VectorEntry,
                        DenseMap<unsigned, unsigned> &RemapSwizzle) {
  assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR);
  assert(RemapSwizzle.empty());
  SDValue NewBldVec[4] = {
    VectorEntry.getOperand(0),
    VectorEntry.getOperand(1),
    VectorEntry.getOperand(2),
    VectorEntry.getOperand(3)
  };

  for (unsigned i = 0; i < 4; i++) {
    if (NewBldVec[i].getOpcode() == ISD::UNDEF)
      // We mask write here to teach later passes that the ith element of this
      // vector is undef. Thus we can use it to reduce 128 bits reg usage,
      // break false dependencies and additionally make assembly easier to read.
      RemapSwizzle[i] = 7; // SEL_MASK_WRITE
    if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(NewBldVec[i])) {
      if (C->isZero()) {
        RemapSwizzle[i] = 4; // SEL_0
        NewBldVec[i] = DAG.getUNDEF(MVT::f32);
      } else if (C->isExactlyValue(1.0)) {
        RemapSwizzle[i] = 5; // SEL_1
        NewBldVec[i] = DAG.getUNDEF(MVT::f32);
      }
    }

    if (NewBldVec[i].getOpcode() == ISD::UNDEF)
      continue;
    for (unsigned j = 0; j < i; j++) {
      if (NewBldVec[i] == NewBldVec[j]) {
        NewBldVec[i] = DAG.getUNDEF(NewBldVec[i].getValueType());
        RemapSwizzle[i] = j;
        break;
      }
    }
  }

  return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(VectorEntry),
                     VectorEntry.getValueType(), NewBldVec, 4);
}

static SDValue ReorganizeVector(SelectionDAG &DAG, SDValue VectorEntry,
                                DenseMap<unsigned, unsigned> &RemapSwizzle) {
  assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR);
  assert(RemapSwizzle.empty());
  SDValue NewBldVec[4] = {
    VectorEntry.getOperand(0),
    VectorEntry.getOperand(1),
    VectorEntry.getOperand(2),
    VectorEntry.getOperand(3)
  };
  bool isUnmovable[4] = { false, false, false, false };
  for (unsigned i = 0; i < 4; i++)
    RemapSwizzle[i] = i;

  for (unsigned i = 0; i < 4; i++) {
    if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
      unsigned Idx = dyn_cast<ConstantSDNode>(NewBldVec[i].getOperand(1))
          ->getZExtValue();
      if (i == Idx) {
        isUnmovable[Idx] = true;
        continue;
      }
      if (isUnmovable[Idx])
        continue;
      // Swap i and Idx
      std::swap(NewBldVec[Idx], NewBldVec[i]);
      std::swap(RemapSwizzle[i], RemapSwizzle[Idx]);
      break;
    }
  }

  return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(VectorEntry),
                     VectorEntry.getValueType(), NewBldVec, 4);
}


SDValue R600TargetLowering::OptimizeSwizzle(SDValue BuildVector,
                                            SDValue Swz[4],
                                            SelectionDAG &DAG) const {
  assert(BuildVector.getOpcode() == ISD::BUILD_VECTOR);
  // Old -> New swizzle values
  DenseMap<unsigned, unsigned> SwizzleRemap;

  BuildVector = CompactSwizzlableVector(DAG, BuildVector, SwizzleRemap);
  for (unsigned i = 0; i < 4; i++) {
    unsigned Idx = dyn_cast<ConstantSDNode>(Swz[i])->getZExtValue();
    if (SwizzleRemap.find(Idx) != SwizzleRemap.end())
      Swz[i] = DAG.getConstant(SwizzleRemap[Idx], MVT::i32);
  }

  SwizzleRemap.clear();
  BuildVector = ReorganizeVector(DAG, BuildVector, SwizzleRemap);
  for (unsigned i = 0; i < 4; i++) {
    unsigned Idx = dyn_cast<ConstantSDNode>(Swz[i])->getZExtValue();
    if (SwizzleRemap.find(Idx) != SwizzleRemap.end())
      Swz[i] = DAG.getConstant(SwizzleRemap[Idx], MVT::i32);
  }

  return BuildVector;
}


//===----------------------------------------------------------------------===//
// Custom DAG Optimizations
//===----------------------------------------------------------------------===//

SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
                                              DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;

  switch (N->getOpcode()) {
  // (f32 fp_round (f64 uint_to_fp a)) -> (f32 uint_to_fp a)
  case ISD::FP_ROUND: {
    SDValue Arg = N->getOperand(0);
    if (Arg.getOpcode() == ISD::UINT_TO_FP && Arg.getValueType() == MVT::f64) {
      return DAG.getNode(ISD::UINT_TO_FP, SDLoc(N), N->getValueType(0),
                         Arg.getOperand(0));
    }
    break;
  }

  // (i32 fp_to_sint (fneg (select_cc f32, f32, 1.0, 0.0 cc))) ->
  // (i32 select_cc f32, f32, -1, 0 cc)
  //
  // Mesa's GLSL frontend generates the above pattern a lot and we can lower
  // this to one of the SET*_DX10 instructions.
  case ISD::FP_TO_SINT: {
    SDValue FNeg = N->getOperand(0);
    if (FNeg.getOpcode() != ISD::FNEG) {
      return SDValue();
    }
    SDValue SelectCC = FNeg.getOperand(0);
    if (SelectCC.getOpcode() != ISD::SELECT_CC ||
        SelectCC.getOperand(0).getValueType() != MVT::f32 || // LHS
        SelectCC.getOperand(2).getValueType() != MVT::f32 || // True
        !isHWTrueValue(SelectCC.getOperand(2)) ||
        !isHWFalseValue(SelectCC.getOperand(3))) {
      return SDValue();
    }

    return DAG.getNode(ISD::SELECT_CC, SDLoc(N), N->getValueType(0),
                       SelectCC.getOperand(0), // LHS
                       SelectCC.getOperand(1), // RHS
                       DAG.getConstant(-1, MVT::i32), // True
                       DAG.getConstant(0, MVT::i32),  // False
                       SelectCC.getOperand(4)); // CC

    break;
  }

  // insert_vector_elt (build_vector elt0, ... , eltN), NewEltIdx, idx
  // => build_vector elt0, ... , NewEltIdx, ... , eltN
  case ISD::INSERT_VECTOR_ELT: {
    SDValue InVec = N->getOperand(0);
    SDValue InVal = N->getOperand(1);
    SDValue EltNo = N->getOperand(2);
    SDLoc dl(N);

    // If the inserted element is an UNDEF, just use the input vector.
    if (InVal.getOpcode() == ISD::UNDEF)
      return InVec;

    EVT VT = InVec.getValueType();

    // If we can't generate a legal BUILD_VECTOR, exit
    if (!isOperationLegal(ISD::BUILD_VECTOR, VT))
      return SDValue();

    // Check that we know which element is being inserted
    if (!isa<ConstantSDNode>(EltNo))
      return SDValue();
    unsigned Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();

    // Check that the operand is a BUILD_VECTOR (or UNDEF, which can essentially
    // be converted to a BUILD_VECTOR). Fill in the Ops vector with the
    // vector elements.
    SmallVector<SDValue, 8> Ops;
    if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
      Ops.append(InVec.getNode()->op_begin(),
                 InVec.getNode()->op_end());
    } else if (InVec.getOpcode() == ISD::UNDEF) {
      unsigned NElts = VT.getVectorNumElements();
      Ops.append(NElts, DAG.getUNDEF(InVal.getValueType()));
    } else {
      return SDValue();
    }

    // Insert the element
    if (Elt < Ops.size()) {
      // All the operands of BUILD_VECTOR must have the same type;
      // we enforce that here.
      EVT OpVT = Ops[0].getValueType();
      if (InVal.getValueType() != OpVT)
        InVal = OpVT.bitsGT(InVal.getValueType()) ?
            DAG.getNode(ISD::ANY_EXTEND, dl, OpVT, InVal) :
            DAG.getNode(ISD::TRUNCATE, dl, OpVT, InVal);
      Ops[Elt] = InVal;
    }

    // Return the new vector
    return DAG.getNode(ISD::BUILD_VECTOR, dl,
                       VT, &Ops[0], Ops.size());
  }

  // Extract_vec (Build_vector) generated by custom lowering
  // also needs to be custom combined
  case ISD::EXTRACT_VECTOR_ELT: {
    SDValue Arg = N->getOperand(0);
    if (Arg.getOpcode() == ISD::BUILD_VECTOR) {
      if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
        unsigned Element = Const->getZExtValue();
        return Arg->getOperand(Element);
      }
    }
    if (Arg.getOpcode() == ISD::BITCAST &&
        Arg.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
      if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
        unsigned Element = Const->getZExtValue();
        return DAG.getNode(ISD::BITCAST, SDLoc(N), N->getVTList(),
                           Arg->getOperand(0).getOperand(Element));
      }
    }
  }

  case ISD::SELECT_CC: {
    // fold selectcc (selectcc x, y, a, b, cc), b, a, b, seteq ->
    // selectcc x, y, a, b, inv(cc)
    //
    // fold selectcc (selectcc x, y, a, b, cc), b, a, b, setne ->
    // selectcc x, y, a, b, cc
    SDValue LHS = N->getOperand(0);
    if (LHS.getOpcode() != ISD::SELECT_CC) {
      return SDValue();
    }

    SDValue RHS = N->getOperand(1);
    SDValue True = N->getOperand(2);
    SDValue False = N->getOperand(3);
    ISD::CondCode NCC = cast<CondCodeSDNode>(N->getOperand(4))->get();

    if (LHS.getOperand(2).getNode() != True.getNode() ||
        LHS.getOperand(3).getNode() != False.getNode() ||
        RHS.getNode() != False.getNode()) {
      return SDValue();
    }

    switch (NCC) {
    default: return SDValue();
    case ISD::SETNE: return LHS;
    case ISD::SETEQ: {
      ISD::CondCode LHSCC = cast<CondCodeSDNode>(LHS.getOperand(4))->get();
      LHSCC = ISD::getSetCCInverse(LHSCC,
                                   LHS.getOperand(0).getValueType().isInteger());
      if (DCI.isBeforeLegalizeOps() ||
          isCondCodeLegal(LHSCC, LHS.getOperand(0).getSimpleValueType()))
        return DAG.getSelectCC(SDLoc(N),
                               LHS.getOperand(0),
                               LHS.getOperand(1),
                               LHS.getOperand(2),
                               LHS.getOperand(3),
                               LHSCC);
      break;
    }
    }
    return SDValue();
  }

  case AMDGPUISD::EXPORT: {
    SDValue Arg = N->getOperand(1);
    if (Arg.getOpcode() != ISD::BUILD_VECTOR)
      break;

    SDValue NewArgs[8] = {
      N->getOperand(0), // Chain
      SDValue(),
      N->getOperand(2), // ArrayBase
      N->getOperand(3), // Type
      N->getOperand(4), // SWZ_X
      N->getOperand(5), // SWZ_Y
      N->getOperand(6), // SWZ_Z
      N->getOperand(7)  // SWZ_W
    };
    SDLoc DL(N);
    NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[4], DAG);
    return DAG.getNode(AMDGPUISD::EXPORT, DL, N->getVTList(), NewArgs, 8);
  }
  case AMDGPUISD::TEXTURE_FETCH: {
    SDValue Arg = N->getOperand(1);
    if (Arg.getOpcode() != ISD::BUILD_VECTOR)
      break;

    SDValue NewArgs[19] = {
      N->getOperand(0),
      N->getOperand(1),
      N->getOperand(2),
      N->getOperand(3),
      N->getOperand(4),
      N->getOperand(5),
      N->getOperand(6),
      N->getOperand(7),
      N->getOperand(8),
      N->getOperand(9),
      N->getOperand(10),
      N->getOperand(11),
      N->getOperand(12),
      N->getOperand(13),
      N->getOperand(14),
      N->getOperand(15),
      N->getOperand(16),
      N->getOperand(17),
  case AMDGPUISD::TEXTURE_FETCH: {
    SDValue Arg = N->getOperand(1);
    if (Arg.getOpcode() != ISD::BUILD_VECTOR)
      break;

    SDValue NewArgs[19] = {
      N->getOperand(0),
      N->getOperand(1),
      N->getOperand(2),
      N->getOperand(3),
      N->getOperand(4),
      N->getOperand(5),
      N->getOperand(6),
      N->getOperand(7),
      N->getOperand(8),
      N->getOperand(9),
      N->getOperand(10),
      N->getOperand(11),
      N->getOperand(12),
      N->getOperand(13),
      N->getOperand(14),
      N->getOperand(15),
      N->getOperand(16),
      N->getOperand(17),
      N->getOperand(18)
    };
    NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[2], DAG);
    return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, SDLoc(N), N->getVTList(),
                       NewArgs, 19);
  }
  }
  return SDValue();
}

static bool
FoldOperand(SDNode *ParentNode, unsigned SrcIdx, SDValue &Src, SDValue &Neg,
            SDValue &Abs, SDValue &Sel, SDValue &Imm, SelectionDAG &DAG) {
  const R600InstrInfo *TII =
      static_cast<const R600InstrInfo *>(DAG.getTarget().getInstrInfo());
  if (!Src.isMachineOpcode())
    return false;
  switch (Src.getMachineOpcode()) {
  case AMDGPU::FNEG_R600:
    if (!Neg.getNode())
      return false;
    Src = Src.getOperand(0);
    Neg = DAG.getTargetConstant(1, MVT::i32);
    return true;
  case AMDGPU::FABS_R600:
    if (!Abs.getNode())
      return false;
    Src = Src.getOperand(0);
    Abs = DAG.getTargetConstant(1, MVT::i32);
    return true;
  case AMDGPU::CONST_COPY: {
    unsigned Opcode = ParentNode->getMachineOpcode();
    bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;

    if (!Sel.getNode())
      return false;

    SDValue CstOffset = Src.getOperand(0);
    if (ParentNode->getValueType(0).isVector())
      return false;

    // Gather constant values
    int SrcIndices[] = {
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src2),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W)
    };
    std::vector<unsigned> Consts;
    for (unsigned i = 0; i < sizeof(SrcIndices) / sizeof(int); i++) {
      int OtherSrcIdx = SrcIndices[i];
      int OtherSelIdx = TII->getSelIdx(Opcode, OtherSrcIdx);
      if (OtherSrcIdx < 0 || OtherSelIdx < 0)
        continue;
      if (HasDst) {
        OtherSrcIdx--;
        OtherSelIdx--;
      }
      if (RegisterSDNode *Reg =
              dyn_cast<RegisterSDNode>(ParentNode->getOperand(OtherSrcIdx))) {
        if (Reg->getReg() == AMDGPU::ALU_CONST) {
          ConstantSDNode *Cst =
              cast<ConstantSDNode>(ParentNode->getOperand(OtherSelIdx));
          Consts.push_back(Cst->getZExtValue());
        }
      }
    }

    ConstantSDNode *Cst = cast<ConstantSDNode>(CstOffset);
    Consts.push_back(Cst->getZExtValue());
    if (!TII->fitsConstReadLimitations(Consts)) {
      return false;
    }

    Sel = CstOffset;
    Src = DAG.getRegister(AMDGPU::ALU_CONST, MVT::f32);
    return true;
  }
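  // Illustrative note (not from the original source): the MOV_IMM cases below
  // fold an immediate source directly into the consuming ALU instruction.
  // A few special values (0.0, 0.5, 1.0, integer 0 and 1) map to dedicated
  // inline-constant registers (ZERO, HALF, ONE, ONE_INT) and cost nothing;
  // any other value must occupy the single ALU_LITERAL_X slot, so at most
  // one such literal can be folded per instruction.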
  case AMDGPU::MOV_IMM_I32:
  case AMDGPU::MOV_IMM_F32: {
    unsigned ImmReg = AMDGPU::ALU_LITERAL_X;
    uint64_t ImmValue = 0;

    if (Src.getMachineOpcode() == AMDGPU::MOV_IMM_F32) {
      ConstantFPSDNode *FPC = cast<ConstantFPSDNode>(Src.getOperand(0));
      float FloatValue = FPC->getValueAPF().convertToFloat();
      if (FloatValue == 0.0) {
        ImmReg = AMDGPU::ZERO;
      } else if (FloatValue == 0.5) {
        ImmReg = AMDGPU::HALF;
      } else if (FloatValue == 1.0) {
        ImmReg = AMDGPU::ONE;
      } else {
        ImmValue = FPC->getValueAPF().bitcastToAPInt().getZExtValue();
      }
    } else {
      ConstantSDNode *C = cast<ConstantSDNode>(Src.getOperand(0));
      uint64_t Value = C->getZExtValue();
      if (Value == 0) {
        ImmReg = AMDGPU::ZERO;
      } else if (Value == 1) {
        ImmReg = AMDGPU::ONE_INT;
      } else {
        ImmValue = Value;
      }
    }

    // Check that we aren't already using an immediate.
    // XXX: It's possible for an instruction to have more than one
    // immediate operand, but this is not supported yet.
    if (ImmReg == AMDGPU::ALU_LITERAL_X) {
      if (!Imm.getNode())
        return false;
      ConstantSDNode *C = cast<ConstantSDNode>(Imm);
      if (C->getZExtValue())
        return false;
      Imm = DAG.getTargetConstant(ImmValue, MVT::i32);
    }
    Src = DAG.getRegister(ImmReg, MVT::i32);
    return true;
  }
  default:
    return false;
  }
}

/// \brief Fold the instructions after selecting them
SDNode *R600TargetLowering::PostISelFolding(MachineSDNode *Node,
                                            SelectionDAG &DAG) const {
  const R600InstrInfo *TII =
      static_cast<const R600InstrInfo *>(DAG.getTarget().getInstrInfo());
  if (!Node->isMachineOpcode())
    return Node;
  unsigned Opcode = Node->getMachineOpcode();
  SDValue FakeOp;

  std::vector<SDValue> Ops;
  for (SDNode::op_iterator I = Node->op_begin(), E = Node->op_end();
       I != E; ++I)
    Ops.push_back(*I);

  if (Opcode == AMDGPU::DOT_4) {
    int OperandIdx[] = {
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W)
    };
    int NegIdx[] = {
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_X),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Y),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Z),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_W),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_X),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Y),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Z),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_W)
    };
    int AbsIdx[] = {
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_X),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Y),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Z),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_W),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_X),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Y),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Z),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_W)
    };
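    // Illustrative note (not from the original source): DOT_4 computes a
    // four-channel dot product, so each of its two vector sources is split
    // into four scalar channel operands (src0_X..src0_W, src1_X..src1_W),
    // each with its own neg/abs/sel modifier slots gathered in the arrays
    // above. The loop below attempts FoldOperand on each of the eight
    // channels and rebuilds the machine node as soon as one fold succeeds;
    // the caller may invoke this folding again on the new node.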
    for (unsigned i = 0; i < 8; i++) {
      if (OperandIdx[i] < 0)
        return Node;
      SDValue &Src = Ops[OperandIdx[i] - 1];
      SDValue &Neg = Ops[NegIdx[i] - 1];
      SDValue &Abs = Ops[AbsIdx[i] - 1];
      bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
      int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]);
      if (HasDst)
        SelIdx--;
      SDValue &Sel = (SelIdx > -1) ? Ops[SelIdx] : FakeOp;
      if (FoldOperand(Node, i, Src, Neg, Abs, Sel, FakeOp, DAG))
        return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
    }
  } else if (Opcode == AMDGPU::REG_SEQUENCE) {
    for (unsigned i = 1, e = Node->getNumOperands(); i < e; i += 2) {
      SDValue &Src = Ops[i];
      if (FoldOperand(Node, i, Src, FakeOp, FakeOp, FakeOp, FakeOp, DAG))
        return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
    }
  } else if (Opcode == AMDGPU::CLAMP_R600) {
    SDValue Src = Node->getOperand(0);
    if (!Src.isMachineOpcode() ||
        !TII->hasInstrModifiers(Src.getMachineOpcode()))
      return Node;
    int ClampIdx = TII->getOperandIdx(Src.getMachineOpcode(),
                                      AMDGPU::OpName::clamp);
    if (ClampIdx < 0)
      return Node;
    std::vector<SDValue> SrcOps;
    unsigned NumOp = Src.getNumOperands();
    for (unsigned i = 0; i < NumOp; ++i)
      SrcOps.push_back(Src.getOperand(i));
    SrcOps[ClampIdx - 1] = DAG.getTargetConstant(1, MVT::i32);
    return DAG.getMachineNode(Src.getMachineOpcode(), SDLoc(Node),
                              Node->getVTList(), SrcOps);
  } else {
    if (!TII->hasInstrModifiers(Opcode))
      return Node;
    int OperandIdx[] = {
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src2)
    };
    int NegIdx[] = {
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src2_neg)
    };
    int AbsIdx[] = {
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs),
      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs),
      -1
    };
    for (unsigned i = 0; i < 3; i++) {
      if (OperandIdx[i] < 0)
        return Node;
      SDValue &Src = Ops[OperandIdx[i] - 1];
      SDValue &Neg = Ops[NegIdx[i] - 1];
      SDValue FakeAbs;
      SDValue &Abs = (AbsIdx[i] > -1) ? Ops[AbsIdx[i] - 1] : FakeAbs;
      bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
      int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]);
      int ImmIdx = TII->getOperandIdx(Opcode, AMDGPU::OpName::literal);
      if (HasDst) {
        SelIdx--;
        ImmIdx--;
      }
      SDValue &Sel = (SelIdx > -1) ? Ops[SelIdx] : FakeOp;
      SDValue &Imm = Ops[ImmIdx];
      if (FoldOperand(Node, i, Src, Neg, Abs, Sel, Imm, DAG))
        return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
    }
  }

  return Node;
}