//===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief Custom DAG lowering for SI
//
//===----------------------------------------------------------------------===//

#include "SIISelLowering.h"
#include "AMDGPU.h"
#include "AMDILIntrinsicInfo.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/IR/Function.h"

// High-half bits of the 128-bit buffer resource descriptor assembled by the
// SI_ADDR64_RSRC custom inserter below (the value is shifted right by 32
// before being written into the descriptor's top sub-register).
// NOTE(review): this has external linkage at global scope; consider 'static'
// or an anonymous namespace to avoid symbol clashes — confirm against the
// rest of the target before changing.
const uint64_t RSRC_DATA_FORMAT = 0xf00000000000LL;

using namespace llvm;

SITargetLowering::SITargetLowering(TargetMachine &TM) :
    AMDGPUTargetLowering(TM) {

  // Associate each legal value type with the register class that holds it.
  // VSrc_* classes accept either scalar or vector registers as operands;
  // SReg_* / VReg_* are scalar-only / vector-only.
  addRegisterClass(MVT::i1, &AMDGPU::SReg_64RegClass);
  addRegisterClass(MVT::i64, &AMDGPU::VSrc_64RegClass);

  addRegisterClass(MVT::v32i8, &AMDGPU::SReg_256RegClass);
  addRegisterClass(MVT::v64i8, &AMDGPU::SReg_512RegClass);

  addRegisterClass(MVT::i32, &AMDGPU::VSrc_32RegClass);
  addRegisterClass(MVT::f32, &AMDGPU::VSrc_32RegClass);

  addRegisterClass(MVT::f64, &AMDGPU::VSrc_64RegClass);
  addRegisterClass(MVT::v2i32, &AMDGPU::VSrc_64RegClass);
  addRegisterClass(MVT::v2f32, &AMDGPU::VSrc_64RegClass);

  addRegisterClass(MVT::v4i32, &AMDGPU::VReg_128RegClass);
  addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass);
  addRegisterClass(MVT::i128, &AMDGPU::SReg_128RegClass);

  addRegisterClass(MVT::v8i32, &AMDGPU::VReg_256RegClass);
  addRegisterClass(MVT::v8f32, &AMDGPU::VReg_256RegClass);

  addRegisterClass(MVT::v16i32, &AMDGPU::VReg_512RegClass);
  addRegisterClass(MVT::v16f32, &AMDGPU::VReg_512RegClass);

  computeRegisterProperties();

  // Condition Codes
  // Expand the condition codes the hardware cannot encode directly into
  // combinations of the supported ones.
  setCondCodeAction(ISD::SETONE, MVT::f32, Expand);
  setCondCodeAction(ISD::SETUEQ, MVT::f32, Expand);
  setCondCodeAction(ISD::SETUGE, MVT::f32, Expand);
  setCondCodeAction(ISD::SETUGT, MVT::f32, Expand);
  setCondCodeAction(ISD::SETULE, MVT::f32, Expand);
  setCondCodeAction(ISD::SETULT, MVT::f32, Expand);

  setCondCodeAction(ISD::SETONE, MVT::f64, Expand);
  setCondCodeAction(ISD::SETUEQ, MVT::f64, Expand);
  setCondCodeAction(ISD::SETUGE, MVT::f64, Expand);
  setCondCodeAction(ISD::SETUGT, MVT::f64, Expand);
  setCondCodeAction(ISD::SETULE, MVT::f64, Expand);
  setCondCodeAction(ISD::SETULT, MVT::f64, Expand);

  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i32, Expand);
  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8f32, Expand);
  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i32, Expand);
  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16f32, Expand);

  // 64-bit ADD is legal; LowerADD (see below) still expands it into
  // ADDC/ADDE pairs when custom lowering is reached via LowerOperation.
  setOperationAction(ISD::ADD, MVT::i64, Legal);
  setOperationAction(ISD::ADD, MVT::i32, Legal);
  setOperationAction(ISD::ADDC, MVT::i32, Legal);
  setOperationAction(ISD::ADDE, MVT::i32, Legal);

  setOperationAction(ISD::BITCAST, MVT::i128, Legal);

  // We need to custom lower vector stores from local memory
  setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
  setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
  setOperationAction(ISD::LOAD, MVT::v8i32, Custom);
  setOperationAction(ISD::LOAD, MVT::v16i32, Custom);

  setOperationAction(ISD::STORE, MVT::v8i32, Custom);
  setOperationAction(ISD::STORE, MVT::v16i32, Custom);

  // We need to custom lower loads/stores from private memory
  setOperationAction(ISD::LOAD, MVT::i32, Custom);
  setOperationAction(ISD::LOAD, MVT::i64, Custom);
  setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
  setOperationAction(ISD::LOAD, MVT::v4i32, Custom);

  setOperationAction(ISD::STORE, MVT::i32, Custom);
  setOperationAction(ISD::STORE, MVT::i64, Custom);
  setOperationAction(ISD::STORE, MVT::i128, Custom);
  setOperationAction(ISD::STORE, MVT::v2i32, Custom);
  setOperationAction(ISD::STORE, MVT::v4i32, Custom);


  setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);

  setOperationAction(ISD::SELECT_CC, MVT::Other, Expand);

  setOperationAction(ISD::SETCC, MVT::v2i1, Expand);
  setOperationAction(ISD::SETCC, MVT::v4i1, Expand);

  // 64-bit extensions are split into 32-bit halves (see LowerSIGN_EXTEND /
  // LowerZERO_EXTEND in LowerOperation).
  setOperationAction(ISD::ANY_EXTEND, MVT::i64, Custom);
  setOperationAction(ISD::SIGN_EXTEND, MVT::i64, Custom);
  setOperationAction(ISD::ZERO_EXTEND, MVT::i64, Custom);

  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f32, Custom);
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v16i8, Custom);
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom);

  setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);

  setLoadExtAction(ISD::SEXTLOAD, MVT::i32, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::i32, Expand);
  setLoadExtAction(ISD::SEXTLOAD, MVT::v8i16, Expand);
  setLoadExtAction(ISD::SEXTLOAD, MVT::v16i16, Expand);

  setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand);
  setTruncStoreAction(MVT::f64, MVT::f32, Expand);
  setTruncStoreAction(MVT::i64, MVT::i32, Expand);
  setTruncStoreAction(MVT::i128, MVT::i64, Expand);
  setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand);
  setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand);

  setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
  setOperationAction(ISD::FrameIndex, MVT::i64, Custom);

  setTargetDAGCombine(ISD::SELECT_CC);

  setTargetDAGCombine(ISD::SETCC);

  setSchedulingPreference(Sched::RegPressure);
}

//===----------------------------------------------------------------------===//
// TargetLowering queries
//===----------------------------------------------------------------------===//

bool SITargetLowering::allowsUnalignedMemoryAccesses(EVT VT,
                                                     bool *IsFast) const {
  // XXX: This depends on the address space and also we may want to revisit
  // the alignment values we specify in the DataLayout.
  if (!VT.isSimple() || VT == MVT::Other)
    return false;
  // Only types wider than 32 bits report unaligned access as allowed.
  // NOTE(review): IsFast is never written here — callers that read it get an
  // uninitialized value; confirm against callers.
  return VT.bitsGT(MVT::i32);
}

bool SITargetLowering::shouldSplitVectorElementType(EVT VT) const {
  // Split vectors whose element type is i16 or narrower.
  return VT.bitsLE(MVT::i16);
}

/// Load a single kernel argument from the input buffer.
///
/// The base pointer of the argument buffer is the SGPR0_SGPR1 live-in
/// registered in LowerFormalArguments. \p Offset is the byte offset of the
/// argument; the value is sign-extend-loaded from constant address space as
/// \p MemVT and produced as \p VT. The alignment passed to getExtLoad is the
/// byte size of MemVT (SizeInBits >> 3).
SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT,
                                         SDLoc DL, SDValue Chain,
                                         unsigned Offset) const {
  MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
  PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
                                        AMDGPUAS::CONSTANT_ADDRESS);
  SDValue BasePtr = DAG.getCopyFromReg(Chain, DL,
                           MRI.getLiveInVirtReg(AMDGPU::SGPR0_SGPR1), MVT::i64);
  SDValue Ptr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
                            DAG.getConstant(Offset, MVT::i64));
  return DAG.getExtLoad(ISD::SEXTLOAD, DL, VT, Chain, Ptr,
                        MachinePointerInfo(UndefValue::get(PtrTy)), MemVT,
                        false, false, MemVT.getSizeInBits() >> 3);

}

/// Lower the incoming (formal) arguments for the current function.
///
/// Pixel shaders: unused PS inputs are skipped (recorded in \p Skipped and
/// replaced by UNDEF below) and PSInputAddr is updated. Non-compute shaders:
/// vector arguments are split into one InputArg per element. Compute
/// shaders: arguments are read from the input buffer via LowerParameter.
SDValue SITargetLowering::LowerFormalArguments(
                                      SDValue Chain,
                                      CallingConv::ID CallConv,
                                      bool isVarArg,
                                      const SmallVectorImpl<ISD::InputArg> &Ins,
                                      SDLoc DL, SelectionDAG &DAG,
                                      SmallVectorImpl<SDValue> &InVals) const {

  const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();

  MachineFunction &MF = DAG.getMachineFunction();
  FunctionType *FType = MF.getFunction()->getFunctionType();
  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();

  assert(CallConv == CallingConv::C);

  SmallVector<ISD::InputArg, 16> Splits;
  // Bitmask over Ins: bit i set means argument i is an unused PS input and
  // gets an UNDEF instead of a location.
  // NOTE(review): a 32-bit mask silently overflows for >32 arguments.
  uint32_t Skipped = 0;

  for (unsigned i = 0, e = Ins.size(), PSInputNum = 0; i != e; ++i) {
    const ISD::InputArg &Arg = Ins[i];

    // First check if it's a PS input addr
    if (Info->ShaderType == ShaderType::PIXEL && !Arg.Flags.isInReg() &&
        !Arg.Flags.isByVal()) {

      assert((PSInputNum <= 15) && "Too many PS inputs!");

      if (!Arg.Used) {
        // We can safely skip PS inputs
        Skipped |= 1 << i;
        ++PSInputNum;
        continue;
      }

      Info->PSInputAddr |= 1 << PSInputNum++;
    }

    // Second split vertices into their elements
    if (Info->ShaderType != ShaderType::COMPUTE && Arg.VT.isVector()) {
      ISD::InputArg NewArg = Arg;
      NewArg.Flags.setSplit();
      NewArg.VT = Arg.VT.getVectorElementType();

      // We REALLY want the ORIGINAL number of vertex elements here, e.g. a
      // three or five element vertex only needs three or five registers,
      // NOT four or eight.
      Type *ParamType = FType->getParamType(Arg.OrigArgIndex);
      unsigned NumElements = ParamType->getVectorNumElements();

      for (unsigned j = 0; j != NumElements; ++j) {
        Splits.push_back(NewArg);
        NewArg.PartOffset += NewArg.VT.getStoreSize();
      }

    } else if (Info->ShaderType != ShaderType::COMPUTE) {
      Splits.push_back(Arg);
    }
  }

  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
                 getTargetMachine(), ArgLocs, *DAG.getContext());

  // At least one interpolation mode must be enabled or else the GPU will hang.
  if (Info->ShaderType == ShaderType::PIXEL && (Info->PSInputAddr & 0x7F) == 0) {
    Info->PSInputAddr |= 1;
    CCInfo.AllocateReg(AMDGPU::VGPR0);
    CCInfo.AllocateReg(AMDGPU::VGPR1);
  }

  // The pointer to the list of arguments is stored in SGPR0, SGPR1
  if (Info->ShaderType == ShaderType::COMPUTE) {
    CCInfo.AllocateReg(AMDGPU::SGPR0);
    CCInfo.AllocateReg(AMDGPU::SGPR1);
    MF.addLiveIn(AMDGPU::SGPR0_SGPR1, &AMDGPU::SReg_64RegClass);
  }

  if (Info->ShaderType == ShaderType::COMPUTE) {
    getOriginalFunctionArgs(DAG, DAG.getMachineFunction().getFunction(), Ins,
                            Splits);
  }

  AnalyzeFormalArguments(CCInfo, Splits);

  for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {

    const ISD::InputArg &Arg = Ins[i];
    if (Skipped & (1 << i)) {
      // Unused PS input: no location was allocated, hand back UNDEF.
      InVals.push_back(DAG.getUNDEF(Arg.VT));
      continue;
    }

    CCValAssign &VA = ArgLocs[ArgIdx++];
    EVT VT = VA.getLocVT();

    if (VA.isMemLoc()) {
      VT = Ins[i].VT;
      EVT MemVT = Splits[i].VT;
      // The first 36 bytes of the input buffer contains information about
      // thread group and global sizes.
      SDValue Arg = LowerParameter(DAG, VT, MemVT, DL, DAG.getRoot(),
                                   36 + VA.getLocMemOffset());
      InVals.push_back(Arg);
      continue;
    }
    assert(VA.isRegLoc() && "Parameter must be in a register!");

    unsigned Reg = VA.getLocReg();

    if (VT == MVT::i64) {
      // For now assume it is a pointer
      Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0,
                                     &AMDGPU::SReg_64RegClass);
      Reg = MF.addLiveIn(Reg, &AMDGPU::SReg_64RegClass);
      InVals.push_back(DAG.getCopyFromReg(Chain, DL, Reg, VT));
      continue;
    }

    const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT);

    Reg = MF.addLiveIn(Reg, RC);
    SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);

    if (Arg.VT.isVector()) {

      // Build a vector from the registers
      Type *ParamType = FType->getParamType(Arg.OrigArgIndex);
      unsigned NumElements = ParamType->getVectorNumElements();

      SmallVector<SDValue, 4> Regs;
      Regs.push_back(Val);
      for (unsigned j = 1; j != NumElements; ++j) {
        Reg = ArgLocs[ArgIdx++].getLocReg();
        Reg = MF.addLiveIn(Reg, RC);
        Regs.push_back(DAG.getCopyFromReg(Chain, DL, Reg, VT));
      }

      // Fill up the missing vector elements
      NumElements = Arg.VT.getVectorNumElements() - NumElements;
      for (unsigned j = 0; j != NumElements; ++j)
        Regs.push_back(DAG.getUNDEF(VT));

      InVals.push_back(DAG.getNode(ISD::BUILD_VECTOR, DL, Arg.VT,
                                   Regs.data(), Regs.size()));
      continue;
    }

    InVals.push_back(Val);
  }
  return Chain;
}

/// Expand pseudo-instructions that were marked with usesCustomInserter.
MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter(
    MachineInstr * MI, MachineBasicBlock * BB) const {

  MachineBasicBlock::iterator I = *MI;

  switch (MI->getOpcode()) {
  default:
    return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
  case AMDGPU::BRANCH: return BB;
  case AMDGPU::SI_ADDR64_RSRC: {
    // Build a 128-bit buffer resource descriptor: the low 64 bits are the
    // pointer (operand 1), the high 64 bits are {0, RSRC_DATA_FORMAT >> 32}.
    const SIInstrInfo *TII =
      static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo());
    MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
    unsigned SuperReg = MI->getOperand(0).getReg();
    unsigned SubRegLo = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
    unsigned SubRegHi = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
    unsigned SubRegHiHi = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
    unsigned SubRegHiLo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
    BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::S_MOV_B64), SubRegLo)
            .addOperand(MI->getOperand(1));
    BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::S_MOV_B32), SubRegHiLo)
            .addImm(0);
    BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::S_MOV_B32), SubRegHiHi)
            .addImm(RSRC_DATA_FORMAT >> 32);
    BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::REG_SEQUENCE), SubRegHi)
            .addReg(SubRegHiLo)
            .addImm(AMDGPU::sub0)
            .addReg(SubRegHiHi)
            .addImm(AMDGPU::sub1);
    BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::REG_SEQUENCE), SuperReg)
            .addReg(SubRegLo)
            .addImm(AMDGPU::sub0_sub1)
            .addReg(SubRegHi)
            .addImm(AMDGPU::sub2_sub3);
    MI->eraseFromParent();
    break;
  }
  case AMDGPU::V_SUB_F64: {
    // a - b is emitted as V_ADD_F64 a, -b (NEG bit 2 negates src1).
    const SIInstrInfo *TII =
      static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo());
    BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::V_ADD_F64),
            MI->getOperand(0).getReg())
            .addReg(MI->getOperand(1).getReg())
            .addReg(MI->getOperand(2).getReg())
            .addImm(0)  /* src2 */
            .addImm(0)  /* ABS */
            .addImm(0)  /* CLAMP */
            .addImm(0)  /* OMOD */
            .addImm(2); /* NEG */
    MI->eraseFromParent();
    break;
  }
  case AMDGPU::SI_RegisterStorePseudo: {
    // Re-emit as SI_RegisterStore with a fresh SReg_64 def, forwarding all
    // of the pseudo's operands.  (Last case: falling off the switch is OK.)
    MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
    const SIInstrInfo *TII =
      static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo());
    unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
    MachineInstrBuilder MIB =
        BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::SI_RegisterStore),
                Reg);
    for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i)
      MIB.addOperand(MI->getOperand(i));

    MI->eraseFromParent();
  }
  }
  return BB;
}

/// SETCC produces i1, or a vector of i1 for vector compares.
EVT SITargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
  if (!VT.isVector()) {
    return MVT::i1;
  }
  return MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
}

MVT SITargetLowering::getScalarShiftAmountTy(EVT VT) const {
  return MVT::i32;
}

bool SITargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
  VT = VT.getScalarType();

  if
 (!VT.isSimple())
    return false;

  switch (VT.getSimpleVT().SimpleTy) {
  case MVT::f32:
    return false; /* There is V_MAD_F32 for f32 */
  case MVT::f64:
    return true;
  default:
    break;
  }

  return false;
}

//===----------------------------------------------------------------------===//
// Custom DAG Lowering Operations
//===----------------------------------------------------------------------===//

SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  switch (Op.getOpcode()) {
  default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
  case ISD::ADD: return LowerADD(Op, DAG);
  case ISD::BRCOND: return LowerBRCOND(Op, DAG);
  case ISD::LOAD: {
    // NOTE(review): dyn_cast result is dereferenced without a null check;
    // since the opcode is known to be ISD::LOAD, cast<> would state the
    // intent and assert in debug builds.
    LoadSDNode *Load = dyn_cast<LoadSDNode>(Op);
    // Vector loads from local/private memory are split into element loads;
    // everything else goes through LowerLOAD.
    if ((Load->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
         Load->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) &&
        Op.getValueType().isVector()) {
      SDValue MergedValues[2] = {
        SplitVectorLoad(Op, DAG),
        Load->getChain()
      };
      return DAG.getMergeValues(MergedValues, 2, SDLoc(Op));
    } else {
      return LowerLOAD(Op, DAG);
    }
  }

  case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
  case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, DAG);
  case ISD::STORE: return LowerSTORE(Op, DAG);
  case ISD::ANY_EXTEND: // Fall-through
  case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, DAG);
  case ISD::GlobalAddress: return LowerGlobalAddress(MFI, Op, DAG);
  case ISD::INTRINSIC_WO_CHAIN: {
    unsigned IntrinsicID =
                         cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
    EVT VT = Op.getValueType();
    SDLoc DL(Op);
    //XXX: Hardcoded we only use two to store the pointer to the parameters.
    unsigned NumUserSGPRs = 2;
    switch (IntrinsicID) {
    default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
    // Dispatch information lives in the first 36 bytes of the input buffer
    // (matching the offset used in LowerFormalArguments): ngroups at 0/4/8,
    // global size at 12/16/20, local size at 24/28/32.
    case Intrinsic::r600_read_ngroups_x:
      return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 0);
    case Intrinsic::r600_read_ngroups_y:
      return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 4);
    case Intrinsic::r600_read_ngroups_z:
      return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 8);
    case Intrinsic::r600_read_global_size_x:
      return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 12);
    case Intrinsic::r600_read_global_size_y:
      return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 16);
    case Intrinsic::r600_read_global_size_z:
      return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 20);
    case Intrinsic::r600_read_local_size_x:
      return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 24);
    case Intrinsic::r600_read_local_size_y:
      return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 28);
    case Intrinsic::r600_read_local_size_z:
      return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 32);
    // Workgroup ids come in the SGPRs directly after the user SGPRs;
    // workitem ids come in VGPR0-2.
    case Intrinsic::r600_read_tgid_x:
      return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
        AMDGPU::SReg_32RegClass.getRegister(NumUserSGPRs + 0), VT);
    case Intrinsic::r600_read_tgid_y:
      return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
        AMDGPU::SReg_32RegClass.getRegister(NumUserSGPRs + 1), VT);
    case Intrinsic::r600_read_tgid_z:
      return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
        AMDGPU::SReg_32RegClass.getRegister(NumUserSGPRs + 2), VT);
    case Intrinsic::r600_read_tidig_x:
      return CreateLiveInRegister(DAG, &AMDGPU::VReg_32RegClass,
                                  AMDGPU::VGPR0, VT);
    case Intrinsic::r600_read_tidig_y:
      return CreateLiveInRegister(DAG, &AMDGPU::VReg_32RegClass,
                                  AMDGPU::VGPR1, VT);
    case Intrinsic::r600_read_tidig_z:
      return CreateLiveInRegister(DAG, &AMDGPU::VReg_32RegClass,
                                  AMDGPU::VGPR2, VT);
    case AMDGPUIntrinsic::SI_load_const: {
      SDValue Ops [] = {
        ResourceDescriptorToi128(Op.getOperand(1), DAG),
        Op.getOperand(2)
      };

      // Invariant constant-buffer load, 4-byte aligned.
      MachineMemOperand *MMO = MF.getMachineMemOperand(
          MachinePointerInfo(),
          MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant,
          VT.getSizeInBits() / 8, 4);
      return DAG.getMemIntrinsicNode(AMDGPUISD::LOAD_CONSTANT, DL,
                                     Op->getVTList(), Ops, 2, VT, MMO);
    }
    case AMDGPUIntrinsic::SI_sample:
      return LowerSampleIntrinsic(AMDGPUISD::SAMPLE, Op, DAG);
    case AMDGPUIntrinsic::SI_sampleb:
      return LowerSampleIntrinsic(AMDGPUISD::SAMPLEB, Op, DAG);
    case AMDGPUIntrinsic::SI_sampled:
      return LowerSampleIntrinsic(AMDGPUISD::SAMPLED, Op, DAG);
    case AMDGPUIntrinsic::SI_samplel:
      return LowerSampleIntrinsic(AMDGPUISD::SAMPLEL, Op, DAG);
    case AMDGPUIntrinsic::SI_vs_load_input:
      return DAG.getNode(AMDGPUISD::LOAD_INPUT, DL, VT,
                         ResourceDescriptorToi128(Op.getOperand(1), DAG),
                         Op.getOperand(2),
                         Op.getOperand(3));
    }
  }

  case ISD::INTRINSIC_VOID:
    SDValue Chain = Op.getOperand(0);
    unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();

    switch (IntrinsicID) {
    case AMDGPUIntrinsic::SI_tbuffer_store: {
      SDLoc DL(Op);
      SDValue Ops [] = {
        Chain,
        ResourceDescriptorToi128(Op.getOperand(2), DAG),
        Op.getOperand(3),
        Op.getOperand(4),
        Op.getOperand(5),
        Op.getOperand(6),
        Op.getOperand(7),
        Op.getOperand(8),
        Op.getOperand(9),
        Op.getOperand(10),
        Op.getOperand(11),
        Op.getOperand(12),
        Op.getOperand(13),
        Op.getOperand(14)
      };
      EVT VT = Op.getOperand(3).getValueType();

      // Formatted buffer store, 4-byte aligned.
      MachineMemOperand *MMO = MF.getMachineMemOperand(
          MachinePointerInfo(),
          MachineMemOperand::MOStore,
          VT.getSizeInBits() / 8, 4);
      return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_STORE_FORMAT, DL,
                                     Op->getVTList(), Ops,
                                     sizeof(Ops)/sizeof(Ops[0]), VT, MMO);
    }
    default:
      break;
    }
  }
  return SDValue();
}

/// Expand a 64-bit integer add into a 32-bit ADDC/ADDE pair glued through
/// the carry flag.  Returns an empty SDValue for non-i64 adds (handled as
/// Legal by the default path).
SDValue SITargetLowering::LowerADD(SDValue Op,
                                   SelectionDAG &DAG) const {
  if (Op.getValueType() != MVT::i64)
    return SDValue();

  SDLoc DL(Op);
  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);

  SDValue Zero = DAG.getConstant(0, MVT::i32);
  SDValue One =
DAG.getConstant(1, MVT::i32);

  SDValue Lo0 = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, LHS, Zero);
  SDValue Hi0 = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, LHS, One);

  SDValue Lo1 = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, RHS, Zero);
  SDValue Hi1 = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, RHS, One);

  SDVTList VTList = DAG.getVTList(MVT::i32, MVT::Glue);

  // Low halves add with carry-out; high halves consume the carry.
  SDValue AddLo = DAG.getNode(ISD::ADDC, DL, VTList, Lo0, Lo1);
  SDValue Carry = AddLo.getValue(1);
  SDValue AddHi = DAG.getNode(ISD::ADDE, DL, VTList, Hi0, Hi1, Carry);

  return DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, AddLo, AddHi.getValue(0));
}

/// \brief Helper function for LowerBRCOND
///
/// Returns the first user of \p Value with the given opcode, or null if
/// there is none.
static SDNode *findUser(SDValue Value, unsigned Opcode) {

  SDNode *Parent = Value.getNode();
  for (SDNode::use_iterator I = Parent->use_begin(), E = Parent->use_end();
       I != E; ++I) {

    // Only consider uses of this specific result value.
    if (I.getUse().get() != Value)
      continue;

    if (I->getOpcode() == Opcode)
      return *I;
  }
  return 0;
}

/// This transforms the control flow intrinsics to get the branch destination as
/// last parameter, also switches branch target with BR if the need arise
SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
                                      SelectionDAG &DAG) const {

  SDLoc DL(BRCOND);

  SDNode *Intr = BRCOND.getOperand(1).getNode();
  SDValue Target = BRCOND.getOperand(2);
  SDNode *BR = 0;

  if (Intr->getOpcode() == ISD::SETCC) {
    // As long as we negate the condition everything is fine
    SDNode *SetCC = Intr;
    assert(SetCC->getConstantOperandVal(1) == 1);
    assert(cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() ==
           ISD::SETNE);
    Intr = SetCC->getOperand(0).getNode();

  } else {
    // Get the target from BR if we don't negate the condition
    // NOTE(review): findUser can return null (see above); BR is dereferenced
    // here without a check — confirm a BR user always exists on this path.
    BR = findUser(BRCOND, ISD::BR);
    Target = BR->getOperand(1);
  }

  assert(Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN);

  // Build the result and
  SmallVector<EVT, 4> Res;
  for (unsigned i = 1, e = Intr->getNumValues(); i != e; ++i)
    Res.push_back(Intr->getValueType(i));

  // operands of the new intrinsic call
  SmallVector<SDValue, 4> Ops;
  Ops.push_back(BRCOND.getOperand(0));
  for (unsigned i = 1, e = Intr->getNumOperands(); i != e; ++i)
    Ops.push_back(Intr->getOperand(i));
  Ops.push_back(Target);

  // build the new intrinsic call
  SDNode *Result = DAG.getNode(
    Res.size() > 1 ? ISD::INTRINSIC_W_CHAIN : ISD::INTRINSIC_VOID, DL,
    DAG.getVTList(Res.data(), Res.size()), Ops.data(), Ops.size()).getNode();

  if (BR) {
    // Give the branch instruction our target
    SDValue Ops[] = {
      BR->getOperand(0),
      BRCOND.getOperand(2)
    };
    DAG.MorphNodeTo(BR, ISD::BR, BR->getVTList(), Ops, 2);
  }

  SDValue Chain = SDValue(Result, Result->getNumValues() - 1);

  // Copy the intrinsic results to registers
  for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) {
    SDNode *CopyToReg = findUser(SDValue(Intr, i), ISD::CopyToReg);
    if (!CopyToReg)
      continue;

    Chain = DAG.getCopyToReg(
      Chain, DL,
      CopyToReg->getOperand(1),
      SDValue(Result, i - 1),
      SDValue());

    DAG.ReplaceAllUsesWith(SDValue(CopyToReg, 0), CopyToReg->getOperand(0));
  }

  // Remove the old intrinsic from the chain
  DAG.ReplaceAllUsesOfValueWith(
    SDValue(Intr, Intr->getNumValues() - 1),
    Intr->getOperand(0));

  return Chain;
}

/// Lower a scalar load from private memory to a REGISTER_LOAD node.
/// The byte address is truncated to 32 bits and divided by 4 (SRL by 2) to
/// form a dword register index.  Non-private loads are left for the default
/// lowering (empty SDValue).
SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  LoadSDNode *Load = cast<LoadSDNode>(Op);

  if (Load->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS)
    return SDValue();

  SDValue TruncPtr = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32,
                                 Load->getBasePtr(),
                                 DAG.getConstant(0, MVT::i32));
  SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, TruncPtr,
                            DAG.getConstant(2, MVT::i32));

  SDValue Ret =
DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op.getValueType(), 713263508Sdim Load->getChain(), Ptr, 714263508Sdim DAG.getTargetConstant(0, MVT::i32), 715263508Sdim Op.getOperand(2)); 716263508Sdim SDValue MergedValues[2] = { 717263508Sdim Ret, 718263508Sdim Load->getChain() 719263508Sdim }; 720263508Sdim return DAG.getMergeValues(MergedValues, 2, DL); 721251662Sdim 722263508Sdim} 723251662Sdim 724263508SdimSDValue SITargetLowering::ResourceDescriptorToi128(SDValue Op, 725263508Sdim SelectionDAG &DAG) const { 726263508Sdim 727263508Sdim if (Op.getValueType() == MVT::i128) { 728263508Sdim return Op; 729263508Sdim } 730263508Sdim 731263508Sdim assert(Op.getOpcode() == ISD::UNDEF); 732263508Sdim 733263508Sdim return DAG.getNode(ISD::BUILD_PAIR, SDLoc(Op), MVT::i128, 734263508Sdim DAG.getConstant(0, MVT::i64), 735263508Sdim DAG.getConstant(0, MVT::i64)); 736251662Sdim} 737251662Sdim 738263508SdimSDValue SITargetLowering::LowerSampleIntrinsic(unsigned Opcode, 739263508Sdim const SDValue &Op, 740263508Sdim SelectionDAG &DAG) const { 741263508Sdim return DAG.getNode(Opcode, SDLoc(Op), Op.getValueType(), Op.getOperand(1), 742263508Sdim Op.getOperand(2), 743263508Sdim ResourceDescriptorToi128(Op.getOperand(3), DAG), 744263508Sdim Op.getOperand(4)); 745263508Sdim} 746263508Sdim 747249259SdimSDValue SITargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { 748249259Sdim SDValue LHS = Op.getOperand(0); 749249259Sdim SDValue RHS = Op.getOperand(1); 750249259Sdim SDValue True = Op.getOperand(2); 751249259Sdim SDValue False = Op.getOperand(3); 752249259Sdim SDValue CC = Op.getOperand(4); 753249259Sdim EVT VT = Op.getValueType(); 754263508Sdim SDLoc DL(Op); 755249259Sdim 756249259Sdim // Possible Min/Max pattern 757249259Sdim SDValue MinMax = LowerMinMax(Op, DAG); 758249259Sdim if (MinMax.getNode()) { 759249259Sdim return MinMax; 760249259Sdim } 761249259Sdim 762249259Sdim SDValue Cond = DAG.getNode(ISD::SETCC, DL, MVT::i1, LHS, RHS, CC); 763249259Sdim return 
DAG.getNode(ISD::SELECT, DL, VT, Cond, True, False); 764249259Sdim} 765249259Sdim 766263508SdimSDValue SITargetLowering::LowerSIGN_EXTEND(SDValue Op, 767263508Sdim SelectionDAG &DAG) const { 768263508Sdim EVT VT = Op.getValueType(); 769263508Sdim SDLoc DL(Op); 770263508Sdim 771263508Sdim if (VT != MVT::i64) { 772263508Sdim return SDValue(); 773263508Sdim } 774263508Sdim 775263508Sdim SDValue Hi = DAG.getNode(ISD::SRA, DL, MVT::i32, Op.getOperand(0), 776263508Sdim DAG.getConstant(31, MVT::i32)); 777263508Sdim 778263508Sdim return DAG.getNode(ISD::BUILD_PAIR, DL, VT, Op.getOperand(0), Hi); 779263508Sdim} 780263508Sdim 781263508SdimSDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { 782263508Sdim SDLoc DL(Op); 783263508Sdim StoreSDNode *Store = cast<StoreSDNode>(Op); 784263508Sdim EVT VT = Store->getMemoryVT(); 785263508Sdim 786263508Sdim SDValue Ret = AMDGPUTargetLowering::LowerSTORE(Op, DAG); 787263508Sdim if (Ret.getNode()) 788263508Sdim return Ret; 789263508Sdim 790263508Sdim if (VT.isVector() && VT.getVectorNumElements() >= 8) 791263508Sdim return SplitVectorStore(Op, DAG); 792263508Sdim 793263508Sdim if (Store->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) 794263508Sdim return SDValue(); 795263508Sdim 796263508Sdim SDValue TruncPtr = DAG.getZExtOrTrunc(Store->getBasePtr(), DL, MVT::i32); 797263508Sdim SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, TruncPtr, 798263508Sdim DAG.getConstant(2, MVT::i32)); 799263508Sdim SDValue Chain = Store->getChain(); 800263508Sdim SmallVector<SDValue, 8> Values; 801263508Sdim 802263508Sdim if (VT == MVT::i64) { 803263508Sdim for (unsigned i = 0; i < 2; ++i) { 804263508Sdim Values.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, 805263508Sdim Store->getValue(), DAG.getConstant(i, MVT::i32))); 806263508Sdim } 807263508Sdim } else if (VT == MVT::i128) { 808263508Sdim for (unsigned i = 0; i < 2; ++i) { 809263508Sdim for (unsigned j = 0; j < 2; ++j) { 810263508Sdim 
Values.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, 811263508Sdim DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i64, 812263508Sdim Store->getValue(), DAG.getConstant(i, MVT::i32)), 813263508Sdim DAG.getConstant(j, MVT::i32))); 814263508Sdim } 815263508Sdim } 816263508Sdim } else { 817263508Sdim Values.push_back(Store->getValue()); 818263508Sdim } 819263508Sdim 820263508Sdim for (unsigned i = 0; i < Values.size(); ++i) { 821263508Sdim SDValue PartPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, 822263508Sdim Ptr, DAG.getConstant(i, MVT::i32)); 823263508Sdim Chain = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other, 824263508Sdim Chain, Values[i], PartPtr, 825263508Sdim DAG.getTargetConstant(0, MVT::i32)); 826263508Sdim } 827263508Sdim return Chain; 828263508Sdim} 829263508Sdim 830263508Sdim 831263508SdimSDValue SITargetLowering::LowerZERO_EXTEND(SDValue Op, 832263508Sdim SelectionDAG &DAG) const { 833263508Sdim EVT VT = Op.getValueType(); 834263508Sdim SDLoc DL(Op); 835263508Sdim 836263508Sdim if (VT != MVT::i64) { 837263508Sdim return SDValue(); 838263508Sdim } 839263508Sdim 840263508Sdim return DAG.getNode(ISD::BUILD_PAIR, DL, VT, Op.getOperand(0), 841263508Sdim DAG.getConstant(0, MVT::i32)); 842263508Sdim} 843263508Sdim 844249259Sdim//===----------------------------------------------------------------------===// 845249259Sdim// Custom DAG optimizations 846249259Sdim//===----------------------------------------------------------------------===// 847249259Sdim 848249259SdimSDValue SITargetLowering::PerformDAGCombine(SDNode *N, 849249259Sdim DAGCombinerInfo &DCI) const { 850249259Sdim SelectionDAG &DAG = DCI.DAG; 851263508Sdim SDLoc DL(N); 852249259Sdim EVT VT = N->getValueType(0); 853249259Sdim 854249259Sdim switch (N->getOpcode()) { 855249259Sdim default: break; 856249259Sdim case ISD::SELECT_CC: { 857249259Sdim ConstantSDNode *True, *False; 858249259Sdim // i1 selectcc(l, r, -1, 0, cc) -> i1 setcc(l, r, cc) 859249259Sdim if ((True = 
dyn_cast<ConstantSDNode>(N->getOperand(2))) 860249259Sdim && (False = dyn_cast<ConstantSDNode>(N->getOperand(3))) 861249259Sdim && True->isAllOnesValue() 862249259Sdim && False->isNullValue() 863249259Sdim && VT == MVT::i1) { 864249259Sdim return DAG.getNode(ISD::SETCC, DL, VT, N->getOperand(0), 865249259Sdim N->getOperand(1), N->getOperand(4)); 866249259Sdim 867249259Sdim } 868249259Sdim break; 869249259Sdim } 870249259Sdim case ISD::SETCC: { 871249259Sdim SDValue Arg0 = N->getOperand(0); 872249259Sdim SDValue Arg1 = N->getOperand(1); 873249259Sdim SDValue CC = N->getOperand(2); 874249259Sdim ConstantSDNode * C = NULL; 875249259Sdim ISD::CondCode CCOp = dyn_cast<CondCodeSDNode>(CC)->get(); 876249259Sdim 877249259Sdim // i1 setcc (sext(i1), 0, setne) -> i1 setcc(i1, 0, setne) 878249259Sdim if (VT == MVT::i1 879249259Sdim && Arg0.getOpcode() == ISD::SIGN_EXTEND 880249259Sdim && Arg0.getOperand(0).getValueType() == MVT::i1 881249259Sdim && (C = dyn_cast<ConstantSDNode>(Arg1)) 882249259Sdim && C->isNullValue() 883249259Sdim && CCOp == ISD::SETNE) { 884249259Sdim return SimplifySetCC(VT, Arg0.getOperand(0), 885249259Sdim DAG.getConstant(0, MVT::i1), CCOp, true, DCI, DL); 886249259Sdim } 887249259Sdim break; 888249259Sdim } 889249259Sdim } 890249259Sdim return SDValue(); 891249259Sdim} 892249259Sdim 893263508Sdim/// \brief Test if RegClass is one of the VSrc classes 894249259Sdimstatic bool isVSrc(unsigned RegClass) { 895249259Sdim return AMDGPU::VSrc_32RegClassID == RegClass || 896249259Sdim AMDGPU::VSrc_64RegClassID == RegClass; 897249259Sdim} 898249259Sdim 899263508Sdim/// \brief Test if RegClass is one of the SSrc classes 900249259Sdimstatic bool isSSrc(unsigned RegClass) { 901249259Sdim return AMDGPU::SSrc_32RegClassID == RegClass || 902249259Sdim AMDGPU::SSrc_64RegClassID == RegClass; 903249259Sdim} 904249259Sdim 905249259Sdim/// \brief Analyze the possible immediate value Op 906249259Sdim/// 907249259Sdim/// Returns -1 if it isn't an immediate, 0 if it's and 
inline immediate 908249259Sdim/// and the immediate value if it's a literal immediate 909249259Sdimint32_t SITargetLowering::analyzeImmediate(const SDNode *N) const { 910249259Sdim 911249259Sdim union { 912249259Sdim int32_t I; 913249259Sdim float F; 914249259Sdim } Imm; 915249259Sdim 916251662Sdim if (const ConstantSDNode *Node = dyn_cast<ConstantSDNode>(N)) { 917251662Sdim if (Node->getZExtValue() >> 32) { 918251662Sdim return -1; 919251662Sdim } 920249259Sdim Imm.I = Node->getSExtValue(); 921251662Sdim } else if (const ConstantFPSDNode *Node = dyn_cast<ConstantFPSDNode>(N)) 922249259Sdim Imm.F = Node->getValueAPF().convertToFloat(); 923249259Sdim else 924249259Sdim return -1; // It isn't an immediate 925249259Sdim 926249259Sdim if ((Imm.I >= -16 && Imm.I <= 64) || 927249259Sdim Imm.F == 0.5f || Imm.F == -0.5f || 928249259Sdim Imm.F == 1.0f || Imm.F == -1.0f || 929249259Sdim Imm.F == 2.0f || Imm.F == -2.0f || 930249259Sdim Imm.F == 4.0f || Imm.F == -4.0f) 931249259Sdim return 0; // It's an inline immediate 932249259Sdim 933249259Sdim return Imm.I; // It's a literal immediate 934249259Sdim} 935249259Sdim 936249259Sdim/// \brief Try to fold an immediate directly into an instruction 937249259Sdimbool SITargetLowering::foldImm(SDValue &Operand, int32_t &Immediate, 938249259Sdim bool &ScalarSlotUsed) const { 939249259Sdim 940249259Sdim MachineSDNode *Mov = dyn_cast<MachineSDNode>(Operand); 941263508Sdim const SIInstrInfo *TII = 942263508Sdim static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo()); 943249259Sdim if (Mov == 0 || !TII->isMov(Mov->getMachineOpcode())) 944249259Sdim return false; 945249259Sdim 946249259Sdim const SDValue &Op = Mov->getOperand(0); 947249259Sdim int32_t Value = analyzeImmediate(Op.getNode()); 948249259Sdim if (Value == -1) { 949249259Sdim // Not an immediate at all 950249259Sdim return false; 951249259Sdim 952249259Sdim } else if (Value == 0) { 953249259Sdim // Inline immediates can always be fold 954249259Sdim Operand = Op; 
955249259Sdim return true; 956249259Sdim 957249259Sdim } else if (Value == Immediate) { 958249259Sdim // Already fold literal immediate 959249259Sdim Operand = Op; 960249259Sdim return true; 961249259Sdim 962249259Sdim } else if (!ScalarSlotUsed && !Immediate) { 963249259Sdim // Fold this literal immediate 964249259Sdim ScalarSlotUsed = true; 965249259Sdim Immediate = Value; 966249259Sdim Operand = Op; 967249259Sdim return true; 968249259Sdim 969249259Sdim } 970249259Sdim 971249259Sdim return false; 972249259Sdim} 973249259Sdim 974263508Sdimconst TargetRegisterClass *SITargetLowering::getRegClassForNode( 975263508Sdim SelectionDAG &DAG, const SDValue &Op) const { 976263508Sdim const SIInstrInfo *TII = 977263508Sdim static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo()); 978263508Sdim const SIRegisterInfo &TRI = TII->getRegisterInfo(); 979249259Sdim 980263508Sdim if (!Op->isMachineOpcode()) { 981263508Sdim switch(Op->getOpcode()) { 982263508Sdim case ISD::CopyFromReg: { 983263508Sdim MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo(); 984263508Sdim unsigned Reg = cast<RegisterSDNode>(Op->getOperand(1))->getReg(); 985263508Sdim if (TargetRegisterInfo::isVirtualRegister(Reg)) { 986263508Sdim return MRI.getRegClass(Reg); 987263508Sdim } 988263508Sdim return TRI.getPhysRegClass(Reg); 989263508Sdim } 990263508Sdim default: return NULL; 991263508Sdim } 992263508Sdim } 993263508Sdim const MCInstrDesc &Desc = TII->get(Op->getMachineOpcode()); 994263508Sdim int OpClassID = Desc.OpInfo[Op.getResNo()].RegClass; 995263508Sdim if (OpClassID != -1) { 996263508Sdim return TRI.getRegClass(OpClassID); 997263508Sdim } 998263508Sdim switch(Op.getMachineOpcode()) { 999263508Sdim case AMDGPU::COPY_TO_REGCLASS: 1000263508Sdim // Operand 1 is the register class id for COPY_TO_REGCLASS instructions. 
1001263508Sdim OpClassID = cast<ConstantSDNode>(Op->getOperand(1))->getZExtValue(); 1002249259Sdim 1003263508Sdim // If the COPY_TO_REGCLASS instruction is copying to a VSrc register 1004263508Sdim // class, then the register class for the value could be either a 1005263508Sdim // VReg or and SReg. In order to get a more accurate 1006263508Sdim if (OpClassID == AMDGPU::VSrc_32RegClassID || 1007263508Sdim OpClassID == AMDGPU::VSrc_64RegClassID) { 1008263508Sdim return getRegClassForNode(DAG, Op.getOperand(0)); 1009263508Sdim } 1010263508Sdim return TRI.getRegClass(OpClassID); 1011263508Sdim case AMDGPU::EXTRACT_SUBREG: { 1012263508Sdim int SubIdx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 1013263508Sdim const TargetRegisterClass *SuperClass = 1014263508Sdim getRegClassForNode(DAG, Op.getOperand(0)); 1015263508Sdim return TRI.getSubClassWithSubReg(SuperClass, SubIdx); 1016263508Sdim } 1017263508Sdim case AMDGPU::REG_SEQUENCE: 1018263508Sdim // Operand 0 is the register class id for REG_SEQUENCE instructions. 1019263508Sdim return TRI.getRegClass( 1020263508Sdim cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue()); 1021263508Sdim default: 1022263508Sdim return getRegClassFor(Op.getSimpleValueType()); 1023263508Sdim } 1024263508Sdim} 1025249259Sdim 1026263508Sdim/// \brief Does "Op" fit into register class "RegClass" ? 
1027263508Sdimbool SITargetLowering::fitsRegClass(SelectionDAG &DAG, const SDValue &Op, 1028263508Sdim unsigned RegClass) const { 1029263508Sdim const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo(); 1030263508Sdim const TargetRegisterClass *RC = getRegClassForNode(DAG, Op); 1031263508Sdim if (!RC) { 1032249259Sdim return false; 1033263508Sdim } 1034263508Sdim return TRI->getRegClass(RegClass)->hasSubClassEq(RC); 1035249259Sdim} 1036249259Sdim 1037249259Sdim/// \brief Make sure that we don't exeed the number of allowed scalars 1038249259Sdimvoid SITargetLowering::ensureSRegLimit(SelectionDAG &DAG, SDValue &Operand, 1039249259Sdim unsigned RegClass, 1040249259Sdim bool &ScalarSlotUsed) const { 1041249259Sdim 1042249259Sdim // First map the operands register class to a destination class 1043249259Sdim if (RegClass == AMDGPU::VSrc_32RegClassID) 1044249259Sdim RegClass = AMDGPU::VReg_32RegClassID; 1045249259Sdim else if (RegClass == AMDGPU::VSrc_64RegClassID) 1046249259Sdim RegClass = AMDGPU::VReg_64RegClassID; 1047249259Sdim else 1048249259Sdim return; 1049249259Sdim 1050249259Sdim // Nothing todo if they fit naturaly 1051249259Sdim if (fitsRegClass(DAG, Operand, RegClass)) 1052249259Sdim return; 1053249259Sdim 1054249259Sdim // If the scalar slot isn't used yet use it now 1055249259Sdim if (!ScalarSlotUsed) { 1056249259Sdim ScalarSlotUsed = true; 1057249259Sdim return; 1058249259Sdim } 1059249259Sdim 1060263508Sdim // This is a conservative aproach. It is possible that we can't determine the 1061263508Sdim // correct register class and copy too often, but better safe than sorry. 
1062249259Sdim SDValue RC = DAG.getTargetConstant(RegClass, MVT::i32); 1063263508Sdim SDNode *Node = DAG.getMachineNode(TargetOpcode::COPY_TO_REGCLASS, SDLoc(), 1064249259Sdim Operand.getValueType(), Operand, RC); 1065249259Sdim Operand = SDValue(Node, 0); 1066249259Sdim} 1067249259Sdim 1068263508Sdim/// \returns true if \p Node's operands are different from the SDValue list 1069263508Sdim/// \p Ops 1070263508Sdimstatic bool isNodeChanged(const SDNode *Node, const std::vector<SDValue> &Ops) { 1071263508Sdim for (unsigned i = 0, e = Node->getNumOperands(); i < e; ++i) { 1072263508Sdim if (Ops[i].getNode() != Node->getOperand(i).getNode()) { 1073263508Sdim return true; 1074263508Sdim } 1075263508Sdim } 1076263508Sdim return false; 1077263508Sdim} 1078263508Sdim 1079251662Sdim/// \brief Try to fold the Nodes operands into the Node 1080251662SdimSDNode *SITargetLowering::foldOperands(MachineSDNode *Node, 1081251662Sdim SelectionDAG &DAG) const { 1082249259Sdim 1083249259Sdim // Original encoding (either e32 or e64) 1084249259Sdim int Opcode = Node->getMachineOpcode(); 1085263508Sdim const SIInstrInfo *TII = 1086263508Sdim static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo()); 1087249259Sdim const MCInstrDesc *Desc = &TII->get(Opcode); 1088249259Sdim 1089249259Sdim unsigned NumDefs = Desc->getNumDefs(); 1090249259Sdim unsigned NumOps = Desc->getNumOperands(); 1091249259Sdim 1092249259Sdim // Commuted opcode if available 1093249259Sdim int OpcodeRev = Desc->isCommutable() ? TII->commuteOpcode(Opcode) : -1; 1094249259Sdim const MCInstrDesc *DescRev = OpcodeRev == -1 ? 0 : &TII->get(OpcodeRev); 1095249259Sdim 1096249259Sdim assert(!DescRev || DescRev->getNumDefs() == NumDefs); 1097249259Sdim assert(!DescRev || DescRev->getNumOperands() == NumOps); 1098249259Sdim 1099249259Sdim // e64 version if available, -1 otherwise 1100249259Sdim int OpcodeE64 = AMDGPU::getVOPe64(Opcode); 1101249259Sdim const MCInstrDesc *DescE64 = OpcodeE64 == -1 ? 
0 : &TII->get(OpcodeE64); 1102249259Sdim 1103249259Sdim assert(!DescE64 || DescE64->getNumDefs() == NumDefs); 1104249259Sdim assert(!DescE64 || DescE64->getNumOperands() == (NumOps + 4)); 1105249259Sdim 1106249259Sdim int32_t Immediate = Desc->getSize() == 4 ? 0 : -1; 1107249259Sdim bool HaveVSrc = false, HaveSSrc = false; 1108249259Sdim 1109249259Sdim // First figure out what we alread have in this instruction 1110249259Sdim for (unsigned i = 0, e = Node->getNumOperands(), Op = NumDefs; 1111249259Sdim i != e && Op < NumOps; ++i, ++Op) { 1112249259Sdim 1113249259Sdim unsigned RegClass = Desc->OpInfo[Op].RegClass; 1114249259Sdim if (isVSrc(RegClass)) 1115249259Sdim HaveVSrc = true; 1116249259Sdim else if (isSSrc(RegClass)) 1117249259Sdim HaveSSrc = true; 1118249259Sdim else 1119249259Sdim continue; 1120249259Sdim 1121249259Sdim int32_t Imm = analyzeImmediate(Node->getOperand(i).getNode()); 1122249259Sdim if (Imm != -1 && Imm != 0) { 1123249259Sdim // Literal immediate 1124249259Sdim Immediate = Imm; 1125249259Sdim } 1126249259Sdim } 1127249259Sdim 1128249259Sdim // If we neither have VSrc nor SSrc it makes no sense to continue 1129249259Sdim if (!HaveVSrc && !HaveSSrc) 1130249259Sdim return Node; 1131249259Sdim 1132249259Sdim // No scalar allowed when we have both VSrc and SSrc 1133249259Sdim bool ScalarSlotUsed = HaveVSrc && HaveSSrc; 1134249259Sdim 1135249259Sdim // Second go over the operands and try to fold them 1136249259Sdim std::vector<SDValue> Ops; 1137249259Sdim bool Promote2e64 = false; 1138249259Sdim for (unsigned i = 0, e = Node->getNumOperands(), Op = NumDefs; 1139249259Sdim i != e && Op < NumOps; ++i, ++Op) { 1140249259Sdim 1141249259Sdim const SDValue &Operand = Node->getOperand(i); 1142249259Sdim Ops.push_back(Operand); 1143249259Sdim 1144249259Sdim // Already folded immediate ? 
1145249259Sdim if (isa<ConstantSDNode>(Operand.getNode()) || 1146249259Sdim isa<ConstantFPSDNode>(Operand.getNode())) 1147249259Sdim continue; 1148249259Sdim 1149249259Sdim // Is this a VSrc or SSrc operand ? 1150249259Sdim unsigned RegClass = Desc->OpInfo[Op].RegClass; 1151249259Sdim if (isVSrc(RegClass) || isSSrc(RegClass)) { 1152249259Sdim // Try to fold the immediates 1153249259Sdim if (!foldImm(Ops[i], Immediate, ScalarSlotUsed)) { 1154249259Sdim // Folding didn't worked, make sure we don't hit the SReg limit 1155249259Sdim ensureSRegLimit(DAG, Ops[i], RegClass, ScalarSlotUsed); 1156249259Sdim } 1157249259Sdim continue; 1158249259Sdim } 1159249259Sdim 1160249259Sdim if (i == 1 && DescRev && fitsRegClass(DAG, Ops[0], RegClass)) { 1161249259Sdim 1162249259Sdim unsigned OtherRegClass = Desc->OpInfo[NumDefs].RegClass; 1163249259Sdim assert(isVSrc(OtherRegClass) || isSSrc(OtherRegClass)); 1164249259Sdim 1165249259Sdim // Test if it makes sense to swap operands 1166249259Sdim if (foldImm(Ops[1], Immediate, ScalarSlotUsed) || 1167249259Sdim (!fitsRegClass(DAG, Ops[1], RegClass) && 1168249259Sdim fitsRegClass(DAG, Ops[1], OtherRegClass))) { 1169249259Sdim 1170249259Sdim // Swap commutable operands 1171249259Sdim SDValue Tmp = Ops[1]; 1172249259Sdim Ops[1] = Ops[0]; 1173249259Sdim Ops[0] = Tmp; 1174249259Sdim 1175249259Sdim Desc = DescRev; 1176249259Sdim DescRev = 0; 1177249259Sdim continue; 1178249259Sdim } 1179249259Sdim } 1180249259Sdim 1181249259Sdim if (DescE64 && !Immediate) { 1182249259Sdim 1183249259Sdim // Test if it makes sense to switch to e64 encoding 1184249259Sdim unsigned OtherRegClass = DescE64->OpInfo[Op].RegClass; 1185249259Sdim if (!isVSrc(OtherRegClass) && !isSSrc(OtherRegClass)) 1186249259Sdim continue; 1187249259Sdim 1188249259Sdim int32_t TmpImm = -1; 1189249259Sdim if (foldImm(Ops[i], TmpImm, ScalarSlotUsed) || 1190249259Sdim (!fitsRegClass(DAG, Ops[i], RegClass) && 1191249259Sdim fitsRegClass(DAG, Ops[1], OtherRegClass))) { 1192249259Sdim 
1193249259Sdim // Switch to e64 encoding 1194249259Sdim Immediate = -1; 1195249259Sdim Promote2e64 = true; 1196249259Sdim Desc = DescE64; 1197249259Sdim DescE64 = 0; 1198249259Sdim } 1199249259Sdim } 1200249259Sdim } 1201249259Sdim 1202249259Sdim if (Promote2e64) { 1203249259Sdim // Add the modifier flags while promoting 1204249259Sdim for (unsigned i = 0; i < 4; ++i) 1205249259Sdim Ops.push_back(DAG.getTargetConstant(0, MVT::i32)); 1206249259Sdim } 1207249259Sdim 1208249259Sdim // Add optional chain and glue 1209249259Sdim for (unsigned i = NumOps - NumDefs, e = Node->getNumOperands(); i < e; ++i) 1210249259Sdim Ops.push_back(Node->getOperand(i)); 1211249259Sdim 1212263508Sdim // Nodes that have a glue result are not CSE'd by getMachineNode(), so in 1213263508Sdim // this case a brand new node is always be created, even if the operands 1214263508Sdim // are the same as before. So, manually check if anything has been changed. 1215263508Sdim if (Desc->Opcode == Opcode && !isNodeChanged(Node, Ops)) { 1216263508Sdim return Node; 1217263508Sdim } 1218263508Sdim 1219249259Sdim // Create a complete new instruction 1220263508Sdim return DAG.getMachineNode(Desc->Opcode, SDLoc(Node), Node->getVTList(), Ops); 1221249259Sdim} 1222251662Sdim 1223251662Sdim/// \brief Helper function for adjustWritemask 1224263508Sdimstatic unsigned SubIdx2Lane(unsigned Idx) { 1225251662Sdim switch (Idx) { 1226251662Sdim default: return 0; 1227251662Sdim case AMDGPU::sub0: return 0; 1228251662Sdim case AMDGPU::sub1: return 1; 1229251662Sdim case AMDGPU::sub2: return 2; 1230251662Sdim case AMDGPU::sub3: return 3; 1231251662Sdim } 1232251662Sdim} 1233251662Sdim 1234251662Sdim/// \brief Adjust the writemask of MIMG instructions 1235251662Sdimvoid SITargetLowering::adjustWritemask(MachineSDNode *&Node, 1236251662Sdim SelectionDAG &DAG) const { 1237251662Sdim SDNode *Users[4] = { }; 1238263508Sdim unsigned Lane = 0; 1239263508Sdim unsigned OldDmask = Node->getConstantOperandVal(0); 1240263508Sdim 
unsigned NewDmask = 0; 1241251662Sdim 1242251662Sdim // Try to figure out the used register components 1243251662Sdim for (SDNode::use_iterator I = Node->use_begin(), E = Node->use_end(); 1244251662Sdim I != E; ++I) { 1245251662Sdim 1246251662Sdim // Abort if we can't understand the usage 1247251662Sdim if (!I->isMachineOpcode() || 1248251662Sdim I->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG) 1249251662Sdim return; 1250251662Sdim 1251263508Sdim // Lane means which subreg of %VGPRa_VGPRb_VGPRc_VGPRd is used. 1252263508Sdim // Note that subregs are packed, i.e. Lane==0 is the first bit set 1253263508Sdim // in OldDmask, so it can be any of X,Y,Z,W; Lane==1 is the second bit 1254263508Sdim // set, etc. 1255251662Sdim Lane = SubIdx2Lane(I->getConstantOperandVal(1)); 1256251662Sdim 1257263508Sdim // Set which texture component corresponds to the lane. 1258263508Sdim unsigned Comp; 1259263508Sdim for (unsigned i = 0, Dmask = OldDmask; i <= Lane; i++) { 1260263508Sdim assert(Dmask); 1261263508Sdim Comp = countTrailingZeros(Dmask); 1262263508Sdim Dmask &= ~(1 << Comp); 1263263508Sdim } 1264263508Sdim 1265251662Sdim // Abort if we have more than one user per component 1266251662Sdim if (Users[Lane]) 1267251662Sdim return; 1268251662Sdim 1269251662Sdim Users[Lane] = *I; 1270263508Sdim NewDmask |= 1 << Comp; 1271251662Sdim } 1272251662Sdim 1273263508Sdim // Abort if there's no change 1274263508Sdim if (NewDmask == OldDmask) 1275251662Sdim return; 1276251662Sdim 1277251662Sdim // Adjust the writemask in the node 1278251662Sdim std::vector<SDValue> Ops; 1279263508Sdim Ops.push_back(DAG.getTargetConstant(NewDmask, MVT::i32)); 1280251662Sdim for (unsigned i = 1, e = Node->getNumOperands(); i != e; ++i) 1281251662Sdim Ops.push_back(Node->getOperand(i)); 1282251662Sdim Node = (MachineSDNode*)DAG.UpdateNodeOperands(Node, Ops.data(), Ops.size()); 1283251662Sdim 1284251662Sdim // If we only got one lane, replace it with a copy 1285263508Sdim // (if NewDmask has only one bit 
set...) 1286263508Sdim if (NewDmask && (NewDmask & (NewDmask-1)) == 0) { 1287251662Sdim SDValue RC = DAG.getTargetConstant(AMDGPU::VReg_32RegClassID, MVT::i32); 1288251662Sdim SDNode *Copy = DAG.getMachineNode(TargetOpcode::COPY_TO_REGCLASS, 1289263508Sdim SDLoc(), Users[Lane]->getValueType(0), 1290251662Sdim SDValue(Node, 0), RC); 1291251662Sdim DAG.ReplaceAllUsesWith(Users[Lane], Copy); 1292251662Sdim return; 1293251662Sdim } 1294251662Sdim 1295251662Sdim // Update the users of the node with the new indices 1296251662Sdim for (unsigned i = 0, Idx = AMDGPU::sub0; i < 4; ++i) { 1297251662Sdim 1298251662Sdim SDNode *User = Users[i]; 1299251662Sdim if (!User) 1300251662Sdim continue; 1301251662Sdim 1302251662Sdim SDValue Op = DAG.getTargetConstant(Idx, MVT::i32); 1303251662Sdim DAG.UpdateNodeOperands(User, User->getOperand(0), Op); 1304251662Sdim 1305251662Sdim switch (Idx) { 1306251662Sdim default: break; 1307251662Sdim case AMDGPU::sub0: Idx = AMDGPU::sub1; break; 1308251662Sdim case AMDGPU::sub1: Idx = AMDGPU::sub2; break; 1309251662Sdim case AMDGPU::sub2: Idx = AMDGPU::sub3; break; 1310251662Sdim } 1311251662Sdim } 1312251662Sdim} 1313251662Sdim 1314251662Sdim/// \brief Fold the instructions after slecting them 1315251662SdimSDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node, 1316251662Sdim SelectionDAG &DAG) const { 1317263508Sdim const SIInstrInfo *TII = 1318263508Sdim static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo()); 1319263508Sdim Node = AdjustRegClass(Node, DAG); 1320251662Sdim 1321263508Sdim if (TII->isMIMG(Node->getMachineOpcode())) 1322251662Sdim adjustWritemask(Node, DAG); 1323251662Sdim 1324251662Sdim return foldOperands(Node, DAG); 1325251662Sdim} 1326251662Sdim 1327251662Sdim/// \brief Assign the register class depending on the number of 1328251662Sdim/// bits set in the writemask 1329251662Sdimvoid SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr *MI, 1330251662Sdim SDNode *Node) const { 1331263508Sdim const 
SIInstrInfo *TII = 1332263508Sdim static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo()); 1333263508Sdim if (!TII->isMIMG(MI->getOpcode())) 1334251662Sdim return; 1335251662Sdim 1336251662Sdim unsigned VReg = MI->getOperand(0).getReg(); 1337251662Sdim unsigned Writemask = MI->getOperand(1).getImm(); 1338251662Sdim unsigned BitsSet = 0; 1339251662Sdim for (unsigned i = 0; i < 4; ++i) 1340251662Sdim BitsSet += Writemask & (1 << i) ? 1 : 0; 1341251662Sdim 1342251662Sdim const TargetRegisterClass *RC; 1343251662Sdim switch (BitsSet) { 1344251662Sdim default: return; 1345251662Sdim case 1: RC = &AMDGPU::VReg_32RegClass; break; 1346251662Sdim case 2: RC = &AMDGPU::VReg_64RegClass; break; 1347251662Sdim case 3: RC = &AMDGPU::VReg_96RegClass; break; 1348251662Sdim } 1349251662Sdim 1350263508Sdim unsigned NewOpcode = TII->getMaskedMIMGOp(MI->getOpcode(), BitsSet); 1351263508Sdim MI->setDesc(TII->get(NewOpcode)); 1352251662Sdim MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); 1353251662Sdim MRI.setRegClass(VReg, RC); 1354251662Sdim} 1355263508Sdim 1356263508SdimMachineSDNode *SITargetLowering::AdjustRegClass(MachineSDNode *N, 1357263508Sdim SelectionDAG &DAG) const { 1358263508Sdim 1359263508Sdim SDLoc DL(N); 1360263508Sdim unsigned NewOpcode = N->getMachineOpcode(); 1361263508Sdim 1362263508Sdim switch (N->getMachineOpcode()) { 1363263508Sdim default: return N; 1364263508Sdim case AMDGPU::S_LOAD_DWORD_IMM: 1365263508Sdim NewOpcode = AMDGPU::BUFFER_LOAD_DWORD_ADDR64; 1366263508Sdim // Fall-through 1367263508Sdim case AMDGPU::S_LOAD_DWORDX2_SGPR: 1368263508Sdim if (NewOpcode == N->getMachineOpcode()) { 1369263508Sdim NewOpcode = AMDGPU::BUFFER_LOAD_DWORDX2_ADDR64; 1370263508Sdim } 1371263508Sdim // Fall-through 1372263508Sdim case AMDGPU::S_LOAD_DWORDX4_IMM: 1373263508Sdim case AMDGPU::S_LOAD_DWORDX4_SGPR: { 1374263508Sdim if (NewOpcode == N->getMachineOpcode()) { 1375263508Sdim NewOpcode = AMDGPU::BUFFER_LOAD_DWORDX4_ADDR64; 1376263508Sdim } 
1377263508Sdim if (fitsRegClass(DAG, N->getOperand(0), AMDGPU::SReg_64RegClassID)) { 1378263508Sdim return N; 1379263508Sdim } 1380263508Sdim ConstantSDNode *Offset = cast<ConstantSDNode>(N->getOperand(1)); 1381263508Sdim SDValue Ops[] = { 1382263508Sdim SDValue(DAG.getMachineNode(AMDGPU::SI_ADDR64_RSRC, DL, MVT::i128, 1383263508Sdim DAG.getConstant(0, MVT::i64)), 0), 1384263508Sdim N->getOperand(0), 1385263508Sdim DAG.getConstant(Offset->getSExtValue() << 2, MVT::i32) 1386263508Sdim }; 1387263508Sdim return DAG.getMachineNode(NewOpcode, DL, N->getVTList(), Ops); 1388263508Sdim } 1389263508Sdim } 1390263508Sdim} 1391263508Sdim 1392263508SdimSDValue SITargetLowering::CreateLiveInRegister(SelectionDAG &DAG, 1393263508Sdim const TargetRegisterClass *RC, 1394263508Sdim unsigned Reg, EVT VT) const { 1395263508Sdim SDValue VReg = AMDGPUTargetLowering::CreateLiveInRegister(DAG, RC, Reg, VT); 1396263508Sdim 1397263508Sdim return DAG.getCopyFromReg(DAG.getEntryNode(), SDLoc(DAG.getEntryNode()), 1398263508Sdim cast<RegisterSDNode>(VReg)->getReg(), VT); 1399263508Sdim} 1400