// R600ISelLowering.cpp — reconstructed from an annotated listing (revision 251662).
1809Sdarcy//===-- R600ISelLowering.cpp - R600 DAG Lowering Implementation -----------===// 29330Slana// 3809Sdarcy// The LLVM Compiler Infrastructure 4809Sdarcy// 5809Sdarcy// This file is distributed under the University of Illinois Open Source 6809Sdarcy// License. See LICENSE.TXT for details. 7809Sdarcy// 8809Sdarcy//===----------------------------------------------------------------------===// 9809Sdarcy// 10809Sdarcy/// \file 11809Sdarcy/// \brief Custom DAG lowering for R600 12809Sdarcy// 13809Sdarcy//===----------------------------------------------------------------------===// 14809Sdarcy 15809Sdarcy#include "R600ISelLowering.h" 16809Sdarcy#include "R600Defines.h" 17809Sdarcy#include "R600InstrInfo.h" 18809Sdarcy#include "R600MachineFunctionInfo.h" 192362Sohair#include "llvm/CodeGen/MachineFrameInfo.h" 202362Sohair#include "llvm/CodeGen/MachineInstrBuilder.h" 212362Sohair#include "llvm/CodeGen/MachineRegisterInfo.h" 22809Sdarcy#include "llvm/CodeGen/SelectionDAG.h" 23809Sdarcy#include "llvm/IR/Argument.h" 24809Sdarcy#include "llvm/IR/Function.h" 25809Sdarcy 26809Sdarcyusing namespace llvm; 27809Sdarcy 28809SdarcyR600TargetLowering::R600TargetLowering(TargetMachine &TM) : 29809Sdarcy AMDGPUTargetLowering(TM), 30809Sdarcy TII(static_cast<const R600InstrInfo*>(TM.getInstrInfo())) { 31809Sdarcy addRegisterClass(MVT::v4f32, &AMDGPU::R600_Reg128RegClass); 32809Sdarcy addRegisterClass(MVT::f32, &AMDGPU::R600_Reg32RegClass); 33809Sdarcy addRegisterClass(MVT::v4i32, &AMDGPU::R600_Reg128RegClass); 34809Sdarcy addRegisterClass(MVT::i32, &AMDGPU::R600_Reg32RegClass); 35809Sdarcy computeRegisterProperties(); 36809Sdarcy 37809Sdarcy setOperationAction(ISD::FADD, MVT::v4f32, Expand); 38809Sdarcy setOperationAction(ISD::FMUL, MVT::v4f32, Expand); 39809Sdarcy setOperationAction(ISD::FDIV, MVT::v4f32, Expand); 40809Sdarcy setOperationAction(ISD::FSUB, MVT::v4f32, Expand); 41809Sdarcy 42809Sdarcy setOperationAction(ISD::ADD, MVT::v4i32, Expand); 43809Sdarcy 
setOperationAction(ISD::AND, MVT::v4i32, Expand); 44809Sdarcy setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Expand); 45809Sdarcy setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Expand); 46809Sdarcy setOperationAction(ISD::MUL, MVT::v2i32, Expand); 47809Sdarcy setOperationAction(ISD::MUL, MVT::v4i32, Expand); 48809Sdarcy setOperationAction(ISD::OR, MVT::v4i32, Expand); 49809Sdarcy setOperationAction(ISD::OR, MVT::v2i32, Expand); 50809Sdarcy setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Expand); 51809Sdarcy setOperationAction(ISD::SHL, MVT::v4i32, Expand); 52809Sdarcy setOperationAction(ISD::SHL, MVT::v2i32, Expand); 53809Sdarcy setOperationAction(ISD::SRL, MVT::v4i32, Expand); 54809Sdarcy setOperationAction(ISD::SRL, MVT::v2i32, Expand); 55809Sdarcy setOperationAction(ISD::SRA, MVT::v4i32, Expand); 56809Sdarcy setOperationAction(ISD::SRA, MVT::v2i32, Expand); 57809Sdarcy setOperationAction(ISD::SUB, MVT::v4i32, Expand); 58809Sdarcy setOperationAction(ISD::SUB, MVT::v2i32, Expand); 59809Sdarcy setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Expand); 60809Sdarcy setOperationAction(ISD::UDIV, MVT::v4i32, Expand); 61809Sdarcy setOperationAction(ISD::UREM, MVT::v4i32, Expand); 62809Sdarcy setOperationAction(ISD::SETCC, MVT::v4i32, Expand); 63809Sdarcy setOperationAction(ISD::XOR, MVT::v4i32, Expand); 64809Sdarcy setOperationAction(ISD::XOR, MVT::v2i32, Expand); 65809Sdarcy 66809Sdarcy setOperationAction(ISD::BR_CC, MVT::i32, Expand); 67809Sdarcy setOperationAction(ISD::BR_CC, MVT::f32, Expand); 68809Sdarcy 69809Sdarcy setOperationAction(ISD::FSUB, MVT::f32, Expand); 70809Sdarcy 71809Sdarcy setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); 72809Sdarcy setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); 73809Sdarcy setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i1, Custom); 74809Sdarcy 75809Sdarcy setOperationAction(ISD::ROTL, MVT::i32, Custom); 76809Sdarcy 77809Sdarcy setOperationAction(ISD::SELECT_CC, MVT::f32, Custom); 78809Sdarcy 
setOperationAction(ISD::SELECT_CC, MVT::i32, Custom); 79809Sdarcy 80809Sdarcy setOperationAction(ISD::SETCC, MVT::i32, Expand); 81809Sdarcy setOperationAction(ISD::SETCC, MVT::f32, Expand); 824591Sdarcy setOperationAction(ISD::FP_TO_UINT, MVT::i1, Custom); 83809Sdarcy 84809Sdarcy setOperationAction(ISD::SELECT, MVT::i32, Custom); 85809Sdarcy setOperationAction(ISD::SELECT, MVT::f32, Custom); 86809Sdarcy 87809Sdarcy setOperationAction(ISD::VSELECT, MVT::v4i32, Expand); 88809Sdarcy setOperationAction(ISD::VSELECT, MVT::v2i32, Expand); 89809Sdarcy 90809Sdarcy // Legalize loads and stores to the private address space. 91809Sdarcy setOperationAction(ISD::LOAD, MVT::i32, Custom); 92809Sdarcy setOperationAction(ISD::LOAD, MVT::v2i32, Custom); 93809Sdarcy setOperationAction(ISD::LOAD, MVT::v4i32, Custom); 94809Sdarcy setLoadExtAction(ISD::EXTLOAD, MVT::v4i8, Custom); 95809Sdarcy setLoadExtAction(ISD::EXTLOAD, MVT::i8, Custom); 96809Sdarcy setLoadExtAction(ISD::ZEXTLOAD, MVT::i8, Custom); 97809Sdarcy setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i8, Custom); 98809Sdarcy setOperationAction(ISD::STORE, MVT::i8, Custom); 99809Sdarcy setOperationAction(ISD::STORE, MVT::i32, Custom); 10010532Smchung setOperationAction(ISD::STORE, MVT::v2i32, Custom); 1014591Sdarcy setOperationAction(ISD::STORE, MVT::v4i32, Custom); 102809Sdarcy 103809Sdarcy setOperationAction(ISD::LOAD, MVT::i32, Custom); 104809Sdarcy setOperationAction(ISD::LOAD, MVT::v4i32, Custom); 105809Sdarcy setOperationAction(ISD::FrameIndex, MVT::i32, Custom); 106809Sdarcy 107809Sdarcy setTargetDAGCombine(ISD::FP_ROUND); 108809Sdarcy setTargetDAGCombine(ISD::FP_TO_SINT); 109809Sdarcy setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT); 110809Sdarcy setTargetDAGCombine(ISD::SELECT_CC); 111809Sdarcy 112809Sdarcy setBooleanContents(ZeroOrNegativeOneBooleanContent); 113809Sdarcy setBooleanVectorContents(ZeroOrNegativeOneBooleanContent); 114809Sdarcy setSchedulingPreference(Sched::VLIW); 115809Sdarcy} 116809Sdarcy 
11710532SmchungMachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter( 1184591Sdarcy MachineInstr * MI, MachineBasicBlock * BB) const { 119809Sdarcy MachineFunction * MF = BB->getParent(); 120809Sdarcy MachineRegisterInfo &MRI = MF->getRegInfo(); 121809Sdarcy MachineBasicBlock::iterator I = *MI; 122809Sdarcy 123809Sdarcy switch (MI->getOpcode()) { 124809Sdarcy default: return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB); 125809Sdarcy case AMDGPU::CLAMP_R600: { 126809Sdarcy MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I, 127809Sdarcy AMDGPU::MOV, 128809Sdarcy MI->getOperand(0).getReg(), 129809Sdarcy MI->getOperand(1).getReg()); 130809Sdarcy TII->addFlag(NewMI, 0, MO_FLAG_CLAMP); 131809Sdarcy break; 132809Sdarcy } 133809Sdarcy 134809Sdarcy case AMDGPU::FABS_R600: { 135809Sdarcy MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I, 136809Sdarcy AMDGPU::MOV, 137809Sdarcy MI->getOperand(0).getReg(), 138809Sdarcy MI->getOperand(1).getReg()); 139809Sdarcy TII->addFlag(NewMI, 0, MO_FLAG_ABS); 140809Sdarcy break; 141809Sdarcy } 142809Sdarcy 1434601Sdarcy case AMDGPU::FNEG_R600: { 1444601Sdarcy MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I, 1454591Sdarcy AMDGPU::MOV, 1464591Sdarcy MI->getOperand(0).getReg(), 147809Sdarcy MI->getOperand(1).getReg()); 148809Sdarcy TII->addFlag(NewMI, 0, MO_FLAG_NEG); 149809Sdarcy break; 150809Sdarcy } 151809Sdarcy 152809Sdarcy case AMDGPU::MASK_WRITE: { 153809Sdarcy unsigned maskedRegister = MI->getOperand(0).getReg(); 154809Sdarcy assert(TargetRegisterInfo::isVirtualRegister(maskedRegister)); 155809Sdarcy MachineInstr * defInstr = MRI.getVRegDef(maskedRegister); 156809Sdarcy TII->addFlag(defInstr, 0, MO_FLAG_MASK); 157809Sdarcy break; 158809Sdarcy } 159809Sdarcy 160809Sdarcy case AMDGPU::MOV_IMM_F32: 161809Sdarcy TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(), 162809Sdarcy MI->getOperand(1).getFPImm()->getValueAPF() 163809Sdarcy .bitcastToAPInt().getZExtValue()); 164809Sdarcy break; 
165809Sdarcy case AMDGPU::MOV_IMM_I32: 166809Sdarcy TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(), 167809Sdarcy MI->getOperand(1).getImm()); 168809Sdarcy break; 169809Sdarcy case AMDGPU::CONST_COPY: { 170809Sdarcy MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, MI, AMDGPU::MOV, 171809Sdarcy MI->getOperand(0).getReg(), AMDGPU::ALU_CONST); 172809Sdarcy TII->setImmOperand(NewMI, R600Operands::SRC0_SEL, 173809Sdarcy MI->getOperand(1).getImm()); 174809Sdarcy break; 175809Sdarcy } 176809Sdarcy 177809Sdarcy case AMDGPU::RAT_WRITE_CACHELESS_32_eg: 178809Sdarcy case AMDGPU::RAT_WRITE_CACHELESS_128_eg: { 179809Sdarcy unsigned EOP = (llvm::next(I)->getOpcode() == AMDGPU::RETURN) ? 1 : 0; 180809Sdarcy 181809Sdarcy BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode())) 182809Sdarcy .addOperand(MI->getOperand(0)) 183809Sdarcy .addOperand(MI->getOperand(1)) 184809Sdarcy .addImm(EOP); // Set End of program bit 185809Sdarcy break; 186809Sdarcy } 187809Sdarcy 188809Sdarcy case AMDGPU::TXD: { 189809Sdarcy unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass); 190809Sdarcy unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass); 191809Sdarcy 192809Sdarcy BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0) 193809Sdarcy .addOperand(MI->getOperand(3)) 194809Sdarcy .addOperand(MI->getOperand(4)) 195809Sdarcy .addOperand(MI->getOperand(5)) 196809Sdarcy .addOperand(MI->getOperand(6)); 197809Sdarcy BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1) 198809Sdarcy .addOperand(MI->getOperand(2)) 199809Sdarcy .addOperand(MI->getOperand(4)) 200809Sdarcy .addOperand(MI->getOperand(5)) 201809Sdarcy .addOperand(MI->getOperand(6)); 202809Sdarcy BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_G)) 203809Sdarcy .addOperand(MI->getOperand(0)) 204809Sdarcy .addOperand(MI->getOperand(1)) 205809Sdarcy .addOperand(MI->getOperand(4)) 206809Sdarcy .addOperand(MI->getOperand(5)) 
207809Sdarcy .addOperand(MI->getOperand(6)) 208809Sdarcy .addReg(T0, RegState::Implicit) 209809Sdarcy .addReg(T1, RegState::Implicit); 210809Sdarcy break; 211809Sdarcy } 212809Sdarcy 213809Sdarcy case AMDGPU::TXD_SHADOW: { 214809Sdarcy unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass); 215809Sdarcy unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass); 216809Sdarcy 217809Sdarcy BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0) 218809Sdarcy .addOperand(MI->getOperand(3)) 219809Sdarcy .addOperand(MI->getOperand(4)) 220809Sdarcy .addOperand(MI->getOperand(5)) 221809Sdarcy .addOperand(MI->getOperand(6)); 222809Sdarcy BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1) 223809Sdarcy .addOperand(MI->getOperand(2)) 224809Sdarcy .addOperand(MI->getOperand(4)) 225809Sdarcy .addOperand(MI->getOperand(5)) 226809Sdarcy .addOperand(MI->getOperand(6)); 227809Sdarcy BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_C_G)) 228809Sdarcy .addOperand(MI->getOperand(0)) 229 .addOperand(MI->getOperand(1)) 230 .addOperand(MI->getOperand(4)) 231 .addOperand(MI->getOperand(5)) 232 .addOperand(MI->getOperand(6)) 233 .addReg(T0, RegState::Implicit) 234 .addReg(T1, RegState::Implicit); 235 break; 236 } 237 238 case AMDGPU::BRANCH: 239 BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP)) 240 .addOperand(MI->getOperand(0)); 241 break; 242 243 case AMDGPU::BRANCH_COND_f32: { 244 MachineInstr *NewMI = 245 BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X), 246 AMDGPU::PREDICATE_BIT) 247 .addOperand(MI->getOperand(1)) 248 .addImm(OPCODE_IS_NOT_ZERO) 249 .addImm(0); // Flags 250 TII->addFlag(NewMI, 0, MO_FLAG_PUSH); 251 BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND)) 252 .addOperand(MI->getOperand(0)) 253 .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill); 254 break; 255 } 256 257 case AMDGPU::BRANCH_COND_i32: { 258 MachineInstr *NewMI = 259 BuildMI(*BB, I, 
BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X), 260 AMDGPU::PREDICATE_BIT) 261 .addOperand(MI->getOperand(1)) 262 .addImm(OPCODE_IS_NOT_ZERO_INT) 263 .addImm(0); // Flags 264 TII->addFlag(NewMI, 0, MO_FLAG_PUSH); 265 BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND)) 266 .addOperand(MI->getOperand(0)) 267 .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill); 268 break; 269 } 270 271 case AMDGPU::EG_ExportSwz: 272 case AMDGPU::R600_ExportSwz: { 273 // Instruction is left unmodified if its not the last one of its type 274 bool isLastInstructionOfItsType = true; 275 unsigned InstExportType = MI->getOperand(1).getImm(); 276 for (MachineBasicBlock::iterator NextExportInst = llvm::next(I), 277 EndBlock = BB->end(); NextExportInst != EndBlock; 278 NextExportInst = llvm::next(NextExportInst)) { 279 if (NextExportInst->getOpcode() == AMDGPU::EG_ExportSwz || 280 NextExportInst->getOpcode() == AMDGPU::R600_ExportSwz) { 281 unsigned CurrentInstExportType = NextExportInst->getOperand(1) 282 .getImm(); 283 if (CurrentInstExportType == InstExportType) { 284 isLastInstructionOfItsType = false; 285 break; 286 } 287 } 288 } 289 bool EOP = (llvm::next(I)->getOpcode() == AMDGPU::RETURN)? 1 : 0; 290 if (!EOP && !isLastInstructionOfItsType) 291 return BB; 292 unsigned CfInst = (MI->getOpcode() == AMDGPU::EG_ExportSwz)? 84 : 40; 293 BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode())) 294 .addOperand(MI->getOperand(0)) 295 .addOperand(MI->getOperand(1)) 296 .addOperand(MI->getOperand(2)) 297 .addOperand(MI->getOperand(3)) 298 .addOperand(MI->getOperand(4)) 299 .addOperand(MI->getOperand(5)) 300 .addOperand(MI->getOperand(6)) 301 .addImm(CfInst) 302 .addImm(EOP); 303 break; 304 } 305 case AMDGPU::RETURN: { 306 // RETURN instructions must have the live-out registers as implicit uses, 307 // otherwise they appear dead. 
308 R600MachineFunctionInfo *MFI = MF->getInfo<R600MachineFunctionInfo>(); 309 MachineInstrBuilder MIB(*MF, MI); 310 for (unsigned i = 0, e = MFI->LiveOuts.size(); i != e; ++i) 311 MIB.addReg(MFI->LiveOuts[i], RegState::Implicit); 312 return BB; 313 } 314 } 315 316 MI->eraseFromParent(); 317 return BB; 318} 319 320//===----------------------------------------------------------------------===// 321// Custom DAG Lowering Operations 322//===----------------------------------------------------------------------===// 323 324using namespace llvm::Intrinsic; 325using namespace llvm::AMDGPUIntrinsic; 326 327SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { 328 switch (Op.getOpcode()) { 329 default: return AMDGPUTargetLowering::LowerOperation(Op, DAG); 330 case ISD::ROTL: return LowerROTL(Op, DAG); 331 case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG); 332 case ISD::SELECT: return LowerSELECT(Op, DAG); 333 case ISD::STORE: return LowerSTORE(Op, DAG); 334 case ISD::LOAD: return LowerLOAD(Op, DAG); 335 case ISD::FrameIndex: return LowerFrameIndex(Op, DAG); 336 case ISD::INTRINSIC_VOID: { 337 SDValue Chain = Op.getOperand(0); 338 unsigned IntrinsicID = 339 cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 340 switch (IntrinsicID) { 341 case AMDGPUIntrinsic::AMDGPU_store_output: { 342 MachineFunction &MF = DAG.getMachineFunction(); 343 R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>(); 344 int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue(); 345 unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex); 346 MFI->LiveOuts.push_back(Reg); 347 return DAG.getCopyToReg(Chain, Op.getDebugLoc(), Reg, Op.getOperand(2)); 348 } 349 case AMDGPUIntrinsic::R600_store_swizzle: { 350 const SDValue Args[8] = { 351 Chain, 352 Op.getOperand(2), // Export Value 353 Op.getOperand(3), // ArrayBase 354 Op.getOperand(4), // Type 355 DAG.getConstant(0, MVT::i32), // SWZ_X 356 DAG.getConstant(1, MVT::i32), // SWZ_Y 
357 DAG.getConstant(2, MVT::i32), // SWZ_Z 358 DAG.getConstant(3, MVT::i32) // SWZ_W 359 }; 360 return DAG.getNode(AMDGPUISD::EXPORT, Op.getDebugLoc(), Op.getValueType(), 361 Args, 8); 362 } 363 364 // default for switch(IntrinsicID) 365 default: break; 366 } 367 // break out of case ISD::INTRINSIC_VOID in switch(Op.getOpcode()) 368 break; 369 } 370 case ISD::INTRINSIC_WO_CHAIN: { 371 unsigned IntrinsicID = 372 cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 373 EVT VT = Op.getValueType(); 374 DebugLoc DL = Op.getDebugLoc(); 375 switch(IntrinsicID) { 376 default: return AMDGPUTargetLowering::LowerOperation(Op, DAG); 377 case AMDGPUIntrinsic::R600_load_input: { 378 int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 379 unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex); 380 return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, Reg, VT); 381 } 382 383 case AMDGPUIntrinsic::R600_interp_input: { 384 int slot = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 385 int ijb = cast<ConstantSDNode>(Op.getOperand(2))->getSExtValue(); 386 MachineSDNode *interp; 387 if (ijb < 0) { 388 interp = DAG.getMachineNode(AMDGPU::INTERP_VEC_LOAD, DL, 389 MVT::v4f32, DAG.getTargetConstant(slot / 4 , MVT::i32)); 390 return DAG.getTargetExtractSubreg( 391 TII->getRegisterInfo().getSubRegFromChannel(slot % 4), 392 DL, MVT::f32, SDValue(interp, 0)); 393 } 394 395 if (slot % 4 < 2) 396 interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_XY, DL, 397 MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4 , MVT::i32), 398 CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, 399 AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb + 1), MVT::f32), 400 CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, 401 AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb), MVT::f32)); 402 else 403 interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_ZW, DL, 404 MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4 , MVT::i32), 405 CreateLiveInRegister(DAG, 
&AMDGPU::R600_TReg32RegClass, 406 AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb + 1), MVT::f32), 407 CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, 408 AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb), MVT::f32)); 409 410 return SDValue(interp, slot % 2); 411 } 412 413 case r600_read_ngroups_x: 414 return LowerImplicitParameter(DAG, VT, DL, 0); 415 case r600_read_ngroups_y: 416 return LowerImplicitParameter(DAG, VT, DL, 1); 417 case r600_read_ngroups_z: 418 return LowerImplicitParameter(DAG, VT, DL, 2); 419 case r600_read_global_size_x: 420 return LowerImplicitParameter(DAG, VT, DL, 3); 421 case r600_read_global_size_y: 422 return LowerImplicitParameter(DAG, VT, DL, 4); 423 case r600_read_global_size_z: 424 return LowerImplicitParameter(DAG, VT, DL, 5); 425 case r600_read_local_size_x: 426 return LowerImplicitParameter(DAG, VT, DL, 6); 427 case r600_read_local_size_y: 428 return LowerImplicitParameter(DAG, VT, DL, 7); 429 case r600_read_local_size_z: 430 return LowerImplicitParameter(DAG, VT, DL, 8); 431 432 case r600_read_tgid_x: 433 return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, 434 AMDGPU::T1_X, VT); 435 case r600_read_tgid_y: 436 return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, 437 AMDGPU::T1_Y, VT); 438 case r600_read_tgid_z: 439 return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, 440 AMDGPU::T1_Z, VT); 441 case r600_read_tidig_x: 442 return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, 443 AMDGPU::T0_X, VT); 444 case r600_read_tidig_y: 445 return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, 446 AMDGPU::T0_Y, VT); 447 case r600_read_tidig_z: 448 return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, 449 AMDGPU::T0_Z, VT); 450 } 451 // break out of case ISD::INTRINSIC_WO_CHAIN in switch(Op.getOpcode()) 452 break; 453 } 454 } // end switch(Op.getOpcode()) 455 return SDValue(); 456} 457 458void R600TargetLowering::ReplaceNodeResults(SDNode *N, 459 SmallVectorImpl<SDValue> &Results, 
460 SelectionDAG &DAG) const { 461 switch (N->getOpcode()) { 462 default: return; 463 case ISD::FP_TO_UINT: Results.push_back(LowerFPTOUINT(N->getOperand(0), DAG)); 464 return; 465 case ISD::LOAD: { 466 SDNode *Node = LowerLOAD(SDValue(N, 0), DAG).getNode(); 467 Results.push_back(SDValue(Node, 0)); 468 Results.push_back(SDValue(Node, 1)); 469 // XXX: LLVM seems not to replace Chain Value inside CustomWidenLowerNode 470 // function 471 DAG.ReplaceAllUsesOfValueWith(SDValue(N,1), SDValue(Node, 1)); 472 return; 473 } 474 case ISD::STORE: 475 SDNode *Node = LowerSTORE(SDValue(N, 0), DAG).getNode(); 476 Results.push_back(SDValue(Node, 0)); 477 return; 478 } 479} 480 481SDValue R600TargetLowering::LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const { 482 return DAG.getNode( 483 ISD::SETCC, 484 Op.getDebugLoc(), 485 MVT::i1, 486 Op, DAG.getConstantFP(0.0f, MVT::f32), 487 DAG.getCondCode(ISD::SETNE) 488 ); 489} 490 491SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT, 492 DebugLoc DL, 493 unsigned DwordOffset) const { 494 unsigned ByteOffset = DwordOffset * 4; 495 PointerType * PtrType = PointerType::get(VT.getTypeForEVT(*DAG.getContext()), 496 AMDGPUAS::PARAM_I_ADDRESS); 497 498 // We shouldn't be using an offset wider than 16-bits for implicit parameters. 
499 assert(isInt<16>(ByteOffset)); 500 501 return DAG.getLoad(VT, DL, DAG.getEntryNode(), 502 DAG.getConstant(ByteOffset, MVT::i32), // PTR 503 MachinePointerInfo(ConstantPointerNull::get(PtrType)), 504 false, false, false, 0); 505} 506 507SDValue R600TargetLowering::LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const { 508 509 MachineFunction &MF = DAG.getMachineFunction(); 510 const AMDGPUFrameLowering *TFL = 511 static_cast<const AMDGPUFrameLowering*>(getTargetMachine().getFrameLowering()); 512 513 FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Op); 514 assert(FIN); 515 516 unsigned FrameIndex = FIN->getIndex(); 517 unsigned Offset = TFL->getFrameIndexOffset(MF, FrameIndex); 518 return DAG.getConstant(Offset * 4 * TFL->getStackWidth(MF), MVT::i32); 519} 520 521SDValue R600TargetLowering::LowerROTL(SDValue Op, SelectionDAG &DAG) const { 522 DebugLoc DL = Op.getDebugLoc(); 523 EVT VT = Op.getValueType(); 524 525 return DAG.getNode(AMDGPUISD::BITALIGN, DL, VT, 526 Op.getOperand(0), 527 Op.getOperand(0), 528 DAG.getNode(ISD::SUB, DL, VT, 529 DAG.getConstant(32, MVT::i32), 530 Op.getOperand(1))); 531} 532 533bool R600TargetLowering::isZero(SDValue Op) const { 534 if(ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) { 535 return Cst->isNullValue(); 536 } else if(ConstantFPSDNode *CstFP = dyn_cast<ConstantFPSDNode>(Op)){ 537 return CstFP->isZero(); 538 } else { 539 return false; 540 } 541} 542 543SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { 544 DebugLoc DL = Op.getDebugLoc(); 545 EVT VT = Op.getValueType(); 546 547 SDValue LHS = Op.getOperand(0); 548 SDValue RHS = Op.getOperand(1); 549 SDValue True = Op.getOperand(2); 550 SDValue False = Op.getOperand(3); 551 SDValue CC = Op.getOperand(4); 552 SDValue Temp; 553 554 // LHS and RHS are guaranteed to be the same value type 555 EVT CompareVT = LHS.getValueType(); 556 557 // Check if we can lower this to a native operation. 
558 559 // Try to lower to a SET* instruction: 560 // 561 // SET* can match the following patterns: 562 // 563 // select_cc f32, f32, -1, 0, cc_any 564 // select_cc f32, f32, 1.0f, 0.0f, cc_any 565 // select_cc i32, i32, -1, 0, cc_any 566 // 567 568 // Move hardware True/False values to the correct operand. 569 if (isHWTrueValue(False) && isHWFalseValue(True)) { 570 ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get(); 571 std::swap(False, True); 572 CC = DAG.getCondCode(ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32)); 573 } 574 575 if (isHWTrueValue(True) && isHWFalseValue(False) && 576 (CompareVT == VT || VT == MVT::i32)) { 577 // This can be matched by a SET* instruction. 578 return DAG.getNode(ISD::SELECT_CC, DL, VT, LHS, RHS, True, False, CC); 579 } 580 581 // Try to lower to a CND* instruction: 582 // 583 // CND* can match the following patterns: 584 // 585 // select_cc f32, 0.0, f32, f32, cc_any 586 // select_cc f32, 0.0, i32, i32, cc_any 587 // select_cc i32, 0, f32, f32, cc_any 588 // select_cc i32, 0, i32, i32, cc_any 589 // 590 if (isZero(LHS) || isZero(RHS)) { 591 SDValue Cond = (isZero(LHS) ? RHS : LHS); 592 SDValue Zero = (isZero(LHS) ? LHS : RHS); 593 ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get(); 594 if (CompareVT != VT) { 595 // Bitcast True / False to the correct types. 
This will end up being 596 // a nop, but it allows us to define only a single pattern in the 597 // .TD files for each CND* instruction rather than having to have 598 // one pattern for integer True/False and one for fp True/False 599 True = DAG.getNode(ISD::BITCAST, DL, CompareVT, True); 600 False = DAG.getNode(ISD::BITCAST, DL, CompareVT, False); 601 } 602 if (isZero(LHS)) { 603 CCOpcode = ISD::getSetCCSwappedOperands(CCOpcode); 604 } 605 606 switch (CCOpcode) { 607 case ISD::SETONE: 608 case ISD::SETUNE: 609 case ISD::SETNE: 610 case ISD::SETULE: 611 case ISD::SETULT: 612 case ISD::SETOLE: 613 case ISD::SETOLT: 614 case ISD::SETLE: 615 case ISD::SETLT: 616 CCOpcode = ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32); 617 Temp = True; 618 True = False; 619 False = Temp; 620 break; 621 default: 622 break; 623 } 624 SDValue SelectNode = DAG.getNode(ISD::SELECT_CC, DL, CompareVT, 625 Cond, Zero, 626 True, False, 627 DAG.getCondCode(CCOpcode)); 628 return DAG.getNode(ISD::BITCAST, DL, VT, SelectNode); 629 } 630 631 632 // Possible Min/Max pattern 633 SDValue MinMax = LowerMinMax(Op, DAG); 634 if (MinMax.getNode()) { 635 return MinMax; 636 } 637 638 // If we make it this for it means we have no native instructions to handle 639 // this SELECT_CC, so we must lower it. 640 SDValue HWTrue, HWFalse; 641 642 if (CompareVT == MVT::f32) { 643 HWTrue = DAG.getConstantFP(1.0f, CompareVT); 644 HWFalse = DAG.getConstantFP(0.0f, CompareVT); 645 } else if (CompareVT == MVT::i32) { 646 HWTrue = DAG.getConstant(-1, CompareVT); 647 HWFalse = DAG.getConstant(0, CompareVT); 648 } 649 else { 650 assert(!"Unhandled value type in LowerSELECT_CC"); 651 } 652 653 // Lower this unsupported SELECT_CC into a combination of two supported 654 // SELECT_CC operations. 
655 SDValue Cond = DAG.getNode(ISD::SELECT_CC, DL, CompareVT, LHS, RHS, HWTrue, HWFalse, CC); 656 657 return DAG.getNode(ISD::SELECT_CC, DL, VT, 658 Cond, HWFalse, 659 True, False, 660 DAG.getCondCode(ISD::SETNE)); 661} 662 663SDValue R600TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { 664 return DAG.getNode(ISD::SELECT_CC, 665 Op.getDebugLoc(), 666 Op.getValueType(), 667 Op.getOperand(0), 668 DAG.getConstant(0, MVT::i32), 669 Op.getOperand(1), 670 Op.getOperand(2), 671 DAG.getCondCode(ISD::SETNE)); 672} 673 674/// LLVM generates byte-addresed pointers. For indirect addressing, we need to 675/// convert these pointers to a register index. Each register holds 676/// 16 bytes, (4 x 32bit sub-register), but we need to take into account the 677/// \p StackWidth, which tells us how many of the 4 sub-registrers will be used 678/// for indirect addressing. 679SDValue R600TargetLowering::stackPtrToRegIndex(SDValue Ptr, 680 unsigned StackWidth, 681 SelectionDAG &DAG) const { 682 unsigned SRLPad; 683 switch(StackWidth) { 684 case 1: 685 SRLPad = 2; 686 break; 687 case 2: 688 SRLPad = 3; 689 break; 690 case 4: 691 SRLPad = 4; 692 break; 693 default: llvm_unreachable("Invalid stack width"); 694 } 695 696 return DAG.getNode(ISD::SRL, Ptr.getDebugLoc(), Ptr.getValueType(), Ptr, 697 DAG.getConstant(SRLPad, MVT::i32)); 698} 699 700void R600TargetLowering::getStackAddress(unsigned StackWidth, 701 unsigned ElemIdx, 702 unsigned &Channel, 703 unsigned &PtrIncr) const { 704 switch (StackWidth) { 705 default: 706 case 1: 707 Channel = 0; 708 if (ElemIdx > 0) { 709 PtrIncr = 1; 710 } else { 711 PtrIncr = 0; 712 } 713 break; 714 case 2: 715 Channel = ElemIdx % 2; 716 if (ElemIdx == 2) { 717 PtrIncr = 1; 718 } else { 719 PtrIncr = 0; 720 } 721 break; 722 case 4: 723 Channel = ElemIdx; 724 PtrIncr = 0; 725 break; 726 } 727} 728 729SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { 730 DebugLoc DL = Op.getDebugLoc(); 731 StoreSDNode *StoreNode = 
cast<StoreSDNode>(Op); 732 SDValue Chain = Op.getOperand(0); 733 SDValue Value = Op.getOperand(1); 734 SDValue Ptr = Op.getOperand(2); 735 736 if (StoreNode->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS && 737 Ptr->getOpcode() != AMDGPUISD::DWORDADDR) { 738 // Convert pointer from byte address to dword address. 739 Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, Ptr.getValueType(), 740 DAG.getNode(ISD::SRL, DL, Ptr.getValueType(), 741 Ptr, DAG.getConstant(2, MVT::i32))); 742 743 if (StoreNode->isTruncatingStore() || StoreNode->isIndexed()) { 744 assert(!"Truncated and indexed stores not supported yet"); 745 } else { 746 Chain = DAG.getStore(Chain, DL, Value, Ptr, StoreNode->getMemOperand()); 747 } 748 return Chain; 749 } 750 751 EVT ValueVT = Value.getValueType(); 752 753 if (StoreNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) { 754 return SDValue(); 755 } 756 757 // Lowering for indirect addressing 758 759 const MachineFunction &MF = DAG.getMachineFunction(); 760 const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering*>( 761 getTargetMachine().getFrameLowering()); 762 unsigned StackWidth = TFL->getStackWidth(MF); 763 764 Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG); 765 766 if (ValueVT.isVector()) { 767 unsigned NumElemVT = ValueVT.getVectorNumElements(); 768 EVT ElemVT = ValueVT.getVectorElementType(); 769 SDValue Stores[4]; 770 771 assert(NumElemVT >= StackWidth && "Stack width cannot be greater than " 772 "vector width in load"); 773 774 for (unsigned i = 0; i < NumElemVT; ++i) { 775 unsigned Channel, PtrIncr; 776 getStackAddress(StackWidth, i, Channel, PtrIncr); 777 Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr, 778 DAG.getConstant(PtrIncr, MVT::i32)); 779 SDValue Elem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT, 780 Value, DAG.getConstant(i, MVT::i32)); 781 782 Stores[i] = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other, 783 Chain, Elem, Ptr, 784 DAG.getTargetConstant(Channel, MVT::i32)); 785 } 786 Chain = 
DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores, NumElemVT); 787 } else { 788 if (ValueVT == MVT::i8) { 789 Value = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Value); 790 } 791 Chain = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other, Chain, Value, Ptr, 792 DAG.getTargetConstant(0, MVT::i32)); // Channel 793 } 794 795 return Chain; 796} 797 798// return (512 + (kc_bank << 12) 799static int 800ConstantAddressBlock(unsigned AddressSpace) { 801 switch (AddressSpace) { 802 case AMDGPUAS::CONSTANT_BUFFER_0: 803 return 512; 804 case AMDGPUAS::CONSTANT_BUFFER_1: 805 return 512 + 4096; 806 case AMDGPUAS::CONSTANT_BUFFER_2: 807 return 512 + 4096 * 2; 808 case AMDGPUAS::CONSTANT_BUFFER_3: 809 return 512 + 4096 * 3; 810 case AMDGPUAS::CONSTANT_BUFFER_4: 811 return 512 + 4096 * 4; 812 case AMDGPUAS::CONSTANT_BUFFER_5: 813 return 512 + 4096 * 5; 814 case AMDGPUAS::CONSTANT_BUFFER_6: 815 return 512 + 4096 * 6; 816 case AMDGPUAS::CONSTANT_BUFFER_7: 817 return 512 + 4096 * 7; 818 case AMDGPUAS::CONSTANT_BUFFER_8: 819 return 512 + 4096 * 8; 820 case AMDGPUAS::CONSTANT_BUFFER_9: 821 return 512 + 4096 * 9; 822 case AMDGPUAS::CONSTANT_BUFFER_10: 823 return 512 + 4096 * 10; 824 case AMDGPUAS::CONSTANT_BUFFER_11: 825 return 512 + 4096 * 11; 826 case AMDGPUAS::CONSTANT_BUFFER_12: 827 return 512 + 4096 * 12; 828 case AMDGPUAS::CONSTANT_BUFFER_13: 829 return 512 + 4096 * 13; 830 case AMDGPUAS::CONSTANT_BUFFER_14: 831 return 512 + 4096 * 14; 832 case AMDGPUAS::CONSTANT_BUFFER_15: 833 return 512 + 4096 * 15; 834 default: 835 return -1; 836 } 837} 838 839SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const 840{ 841 EVT VT = Op.getValueType(); 842 DebugLoc DL = Op.getDebugLoc(); 843 LoadSDNode *LoadNode = cast<LoadSDNode>(Op); 844 SDValue Chain = Op.getOperand(0); 845 SDValue Ptr = Op.getOperand(1); 846 SDValue LoweredLoad; 847 848 int ConstantBlock = ConstantAddressBlock(LoadNode->getAddressSpace()); 849 if (ConstantBlock > -1) { 850 SDValue Result; 851 if 
(dyn_cast<ConstantExpr>(LoadNode->getSrcValue()) ||
        dyn_cast<Constant>(LoadNode->getSrcValue()) ||
        dyn_cast<ConstantSDNode>(Ptr)) {
      // The pointer (or the IR value it loads from) is a compile-time
      // constant, so each of the four 32-bit channels can be folded into a
      // direct CONST_ADDRESS access.
      SDValue Slots[4];
      for (unsigned i = 0; i < 4; i++) {
        // We want Const position encoded with the following formula :
        // (((512 + (kc_bank << 12) + const_index) << 2) + chan)
        // const_index is Ptr computed by llvm using an alignment of 16.
        // Thus we add (((512 + (kc_bank << 12)) + chan ) * 4 here and
        // then div by 4 at the ISel step
        SDValue NewPtr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
            DAG.getConstant(4 * i + ConstantBlock * 16, MVT::i32));
        Slots[i] = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::i32, NewPtr);
      }
      Result = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i32, Slots, 4);
    } else {
      // non constant ptr can't be folded, keeps it as a v4f32 load
      // The byte pointer is converted to a 16-byte slot index (SRL by 4) and
      // the kc_bank is passed as a second operand.
      Result = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::v4i32,
          DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr, DAG.getConstant(4, MVT::i32)),
          DAG.getConstant(LoadNode->getAddressSpace() -
                          AMDGPUAS::CONSTANT_BUFFER_0, MVT::i32)
          );
    }

    // Scalar loads only need channel 0 of the 4-dword constant slot.
    if (!VT.isVector()) {
      Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Result,
          DAG.getConstant(0, MVT::i32));
    }

    // LOAD produces two results: the value and the (unchanged) chain.
    SDValue MergedValues[2] = {
      Result,
      Chain
    };
    return DAG.getMergeValues(MergedValues, 2, DL);
  }

  // Only private-address (stack) loads are custom-lowered below; everything
  // else falls back to the default lowering.
  if (LoadNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
    return SDValue();
  }

  // Lowering for indirect addressing
  const MachineFunction &MF = DAG.getMachineFunction();
  const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering*>(
                                         getTargetMachine().getFrameLowering());
  unsigned StackWidth = TFL->getStackWidth(MF);

  // Convert the byte stack pointer into a register index for REGISTER_LOAD.
  Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);

  if (VT.isVector()) {
    unsigned NumElemVT = VT.getVectorNumElements();
    EVT ElemVT = VT.getVectorElementType();
    SDValue Loads[4];

    assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
                                      "vector width in load");

    // Emit one REGISTER_LOAD per element; getStackAddress maps the element
    // index onto a (channel, pointer increment) pair for the chosen width.
    for (unsigned i = 0; i < NumElemVT; ++i) {
      unsigned Channel, PtrIncr;
      getStackAddress(StackWidth, i, Channel, PtrIncr);
      Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
                        DAG.getConstant(PtrIncr, MVT::i32));
      Loads[i] = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, ElemVT,
                             Chain, Ptr,
                             DAG.getTargetConstant(Channel, MVT::i32),
                             Op.getOperand(2));
    }
    // Pad the result out to a full 4-element vector with undefs.
    for (unsigned i = NumElemVT; i < 4; ++i) {
      Loads[i] = DAG.getUNDEF(ElemVT);
    }
    EVT TargetVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, 4);
    LoweredLoad = DAG.getNode(ISD::BUILD_VECTOR, DL, TargetVT, Loads, 4);
  } else {
    LoweredLoad = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, VT,
                              Chain, Ptr,
                              DAG.getTargetConstant(0, MVT::i32), // Channel
                              Op.getOperand(2));
  }

  // Return value plus the original chain, matching LOAD's two results.
  SDValue Ops[2];
  Ops[0] = LoweredLoad;
  Ops[1] = Chain;

  return DAG.getMergeValues(Ops, 2, DL);
}

/// XXX Only kernel functions are supported, so we can assume for now that
/// every function is a kernel function, but in the future we should use
/// separate calling conventions for kernel and non-kernel functions.
SDValue R600TargetLowering::LowerFormalArguments(
                                      SDValue Chain,
                                      CallingConv::ID CallConv,
                                      bool isVarArg,
                                      const SmallVectorImpl<ISD::InputArg> &Ins,
                                      DebugLoc DL, SelectionDAG &DAG,
                                      SmallVectorImpl<SDValue> &InVals) const {
  // Kernel arguments start 36 bytes into the parameter buffer — presumably
  // space reserved for implicit/dispatch parameters; confirm against the
  // runtime ABI.
  unsigned ParamOffsetBytes = 36;
  Function::const_arg_iterator FuncArg =
      DAG.getMachineFunction().getFunction()->arg_begin();
  for (unsigned i = 0, e = Ins.size(); i < e; ++i, ++FuncArg) {
    EVT VT = Ins[i].VT;
    Type *ArgType = FuncArg->getType();
    // Pointers are 32-bit on this target; everything else uses its IR size.
    unsigned ArgSizeInBits = ArgType->isPointerTy() ?
32 : ArgType->getPrimitiveSizeInBits();
    unsigned ArgBytes = ArgSizeInBits >> 3;
    EVT ArgVT;
    // If the IR argument is narrower than the lowered VT, load only the
    // argument's true width and zero-extend to VT (integers only).
    if (ArgSizeInBits < VT.getSizeInBits()) {
      assert(!ArgType->isFloatTy() &&
             "Extending floating point arguments not supported yet");
      ArgVT = MVT::getIntegerVT(ArgSizeInBits);
    } else {
      ArgVT = VT;
    }
    // Arguments are read from the PARAM_I constant address space; the pointer
    // info only carries an undef value of the right pointer type.
    PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
                                          AMDGPUAS::PARAM_I_ADDRESS);
    SDValue Arg = DAG.getExtLoad(ISD::ZEXTLOAD, DL, VT, DAG.getRoot(),
                                 DAG.getConstant(ParamOffsetBytes, MVT::i32),
                                 MachinePointerInfo(UndefValue::get(PtrTy)),
                                 ArgVT, false, false, ArgBytes);
    InVals.push_back(Arg);
    // Arguments are packed back-to-back; offset advances by the arg's size.
    ParamOffsetBytes += ArgBytes;
  }
  return Chain;
}

/// SETCC on scalars produces an i32; on vectors it produces a vector whose
/// elements are the integer type of the same width as the input elements.
EVT R600TargetLowering::getSetCCResultType(EVT VT) const {
   if (!VT.isVector()) return MVT::i32;
   return VT.changeVectorElementTypeToInteger();
}

//===----------------------------------------------------------------------===//
// Custom DAG Optimizations
//===----------------------------------------------------------------------===//

SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
                                              DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;

  switch (N->getOpcode()) {
  // (f32 fp_round (f64 uint_to_fp a)) -> (f32 uint_to_fp a)
  case ISD::FP_ROUND: {
      SDValue Arg = N->getOperand(0);
      if (Arg.getOpcode() == ISD::UINT_TO_FP && Arg.getValueType() == MVT::f64) {
        return DAG.getNode(ISD::UINT_TO_FP, N->getDebugLoc(), N->getValueType(0),
                           Arg.getOperand(0));
      }
      break;
    }

  // (i32 fp_to_sint (fneg (select_cc f32, f32, 1.0, 0.0 cc))) ->
  // (i32 select_cc f32, f32, -1, 0 cc)
  //
  // Mesa's GLSL frontend generates the above pattern a lot and we can lower
  // this to one of the SET*_DX10 instructions.
1004 case ISD::FP_TO_SINT: { 1005 SDValue FNeg = N->getOperand(0); 1006 if (FNeg.getOpcode() != ISD::FNEG) { 1007 return SDValue(); 1008 } 1009 SDValue SelectCC = FNeg.getOperand(0); 1010 if (SelectCC.getOpcode() != ISD::SELECT_CC || 1011 SelectCC.getOperand(0).getValueType() != MVT::f32 || // LHS 1012 SelectCC.getOperand(2).getValueType() != MVT::f32 || // True 1013 !isHWTrueValue(SelectCC.getOperand(2)) || 1014 !isHWFalseValue(SelectCC.getOperand(3))) { 1015 return SDValue(); 1016 } 1017 1018 return DAG.getNode(ISD::SELECT_CC, N->getDebugLoc(), N->getValueType(0), 1019 SelectCC.getOperand(0), // LHS 1020 SelectCC.getOperand(1), // RHS 1021 DAG.getConstant(-1, MVT::i32), // True 1022 DAG.getConstant(0, MVT::i32), // Flase 1023 SelectCC.getOperand(4)); // CC 1024 1025 break; 1026 } 1027 // Extract_vec (Build_vector) generated by custom lowering 1028 // also needs to be customly combined 1029 case ISD::EXTRACT_VECTOR_ELT: { 1030 SDValue Arg = N->getOperand(0); 1031 if (Arg.getOpcode() == ISD::BUILD_VECTOR) { 1032 if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) { 1033 unsigned Element = Const->getZExtValue(); 1034 return Arg->getOperand(Element); 1035 } 1036 } 1037 if (Arg.getOpcode() == ISD::BITCAST && 1038 Arg.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) { 1039 if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) { 1040 unsigned Element = Const->getZExtValue(); 1041 return DAG.getNode(ISD::BITCAST, N->getDebugLoc(), N->getVTList(), 1042 Arg->getOperand(0).getOperand(Element)); 1043 } 1044 } 1045 } 1046 1047 case ISD::SELECT_CC: { 1048 // fold selectcc (selectcc x, y, a, b, cc), b, a, b, seteq -> 1049 // selectcc x, y, a, b, inv(cc) 1050 // 1051 // fold selectcc (selectcc x, y, a, b, cc), b, a, b, setne -> 1052 // selectcc x, y, a, b, cc 1053 SDValue LHS = N->getOperand(0); 1054 if (LHS.getOpcode() != ISD::SELECT_CC) { 1055 return SDValue(); 1056 } 1057 1058 SDValue RHS = N->getOperand(1); 1059 SDValue True = 
N->getOperand(2); 1060 SDValue False = N->getOperand(3); 1061 ISD::CondCode NCC = cast<CondCodeSDNode>(N->getOperand(4))->get(); 1062 1063 if (LHS.getOperand(2).getNode() != True.getNode() || 1064 LHS.getOperand(3).getNode() != False.getNode() || 1065 RHS.getNode() != False.getNode()) { 1066 return SDValue(); 1067 } 1068 1069 switch (NCC) { 1070 default: return SDValue(); 1071 case ISD::SETNE: return LHS; 1072 case ISD::SETEQ: { 1073 ISD::CondCode LHSCC = cast<CondCodeSDNode>(LHS.getOperand(4))->get(); 1074 LHSCC = ISD::getSetCCInverse(LHSCC, 1075 LHS.getOperand(0).getValueType().isInteger()); 1076 return DAG.getSelectCC(N->getDebugLoc(), 1077 LHS.getOperand(0), 1078 LHS.getOperand(1), 1079 LHS.getOperand(2), 1080 LHS.getOperand(3), 1081 LHSCC); 1082 } 1083 } 1084 } 1085 case AMDGPUISD::EXPORT: { 1086 SDValue Arg = N->getOperand(1); 1087 if (Arg.getOpcode() != ISD::BUILD_VECTOR) 1088 break; 1089 SDValue NewBldVec[4] = { 1090 DAG.getUNDEF(MVT::f32), 1091 DAG.getUNDEF(MVT::f32), 1092 DAG.getUNDEF(MVT::f32), 1093 DAG.getUNDEF(MVT::f32) 1094 }; 1095 SDValue NewArgs[8] = { 1096 N->getOperand(0), // Chain 1097 SDValue(), 1098 N->getOperand(2), // ArrayBase 1099 N->getOperand(3), // Type 1100 N->getOperand(4), // SWZ_X 1101 N->getOperand(5), // SWZ_Y 1102 N->getOperand(6), // SWZ_Z 1103 N->getOperand(7) // SWZ_W 1104 }; 1105 for (unsigned i = 0; i < Arg.getNumOperands(); i++) { 1106 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Arg.getOperand(i))) { 1107 if (C->isZero()) { 1108 NewArgs[4 + i] = DAG.getConstant(4, MVT::i32); // SEL_0 1109 } else if (C->isExactlyValue(1.0)) { 1110 NewArgs[4 + i] = DAG.getConstant(5, MVT::i32); // SEL_0 1111 } else { 1112 NewBldVec[i] = Arg.getOperand(i); 1113 } 1114 } else { 1115 NewBldVec[i] = Arg.getOperand(i); 1116 } 1117 } 1118 DebugLoc DL = N->getDebugLoc(); 1119 NewArgs[1] = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4f32, NewBldVec, 4); 1120 return DAG.getNode(AMDGPUISD::EXPORT, DL, N->getVTList(), NewArgs, 8); 1121 } 1122 } 
1123 return SDValue(); 1124} 1125