1249259Sdim//===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===// 2249259Sdim// 3249259Sdim// The LLVM Compiler Infrastructure 4249259Sdim// 5249259Sdim// This file is distributed under the University of Illinois Open Source 6249259Sdim// License. See LICENSE.TXT for details. 7249259Sdim// 8249259Sdim//===----------------------------------------------------------------------===// 9249259Sdim// 10249259Sdim/// \file 11249259Sdim/// \brief Custom DAG lowering for SI 12249259Sdim// 13249259Sdim//===----------------------------------------------------------------------===// 14249259Sdim 15249259Sdim#include "SIISelLowering.h" 16249259Sdim#include "AMDGPU.h" 17249259Sdim#include "AMDILIntrinsicInfo.h" 18249259Sdim#include "SIInstrInfo.h" 19249259Sdim#include "SIMachineFunctionInfo.h" 20249259Sdim#include "SIRegisterInfo.h" 21249259Sdim#include "llvm/CodeGen/CallingConvLower.h" 22249259Sdim#include "llvm/CodeGen/MachineInstrBuilder.h" 23249259Sdim#include "llvm/CodeGen/MachineRegisterInfo.h" 24249259Sdim#include "llvm/CodeGen/SelectionDAG.h" 25263509Sdim#include "llvm/IR/Function.h" 26249259Sdim 27263509Sdimconst uint64_t RSRC_DATA_FORMAT = 0xf00000000000LL; 28263509Sdim 29249259Sdimusing namespace llvm; 30249259Sdim 31249259SdimSITargetLowering::SITargetLowering(TargetMachine &TM) : 32263509Sdim AMDGPUTargetLowering(TM) { 33249259Sdim 34249259Sdim addRegisterClass(MVT::i1, &AMDGPU::SReg_64RegClass); 35263509Sdim addRegisterClass(MVT::i64, &AMDGPU::VSrc_64RegClass); 36249259Sdim 37249259Sdim addRegisterClass(MVT::v32i8, &AMDGPU::SReg_256RegClass); 38249259Sdim addRegisterClass(MVT::v64i8, &AMDGPU::SReg_512RegClass); 39249259Sdim 40263509Sdim addRegisterClass(MVT::i32, &AMDGPU::VSrc_32RegClass); 41263509Sdim addRegisterClass(MVT::f32, &AMDGPU::VSrc_32RegClass); 42249259Sdim 43263509Sdim addRegisterClass(MVT::f64, &AMDGPU::VSrc_64RegClass); 44263509Sdim addRegisterClass(MVT::v2i32, &AMDGPU::VSrc_64RegClass); 45263509Sdim addRegisterClass(MVT::v2f32, &AMDGPU::VSrc_64RegClass); 46249259Sdim 47249259Sdim addRegisterClass(MVT::v4i32, &AMDGPU::VReg_128RegClass); 48249259Sdim addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass); 49252723Sdim addRegisterClass(MVT::i128, &AMDGPU::SReg_128RegClass); 50249259Sdim 51249259Sdim addRegisterClass(MVT::v8i32, &AMDGPU::VReg_256RegClass); 52249259Sdim addRegisterClass(MVT::v8f32, &AMDGPU::VReg_256RegClass); 53249259Sdim 54249259Sdim addRegisterClass(MVT::v16i32, &AMDGPU::VReg_512RegClass); 55249259Sdim addRegisterClass(MVT::v16f32, &AMDGPU::VReg_512RegClass); 56249259Sdim 57249259Sdim computeRegisterProperties(); 58249259Sdim 59263509Sdim // Condition Codes 60263509Sdim setCondCodeAction(ISD::SETONE, MVT::f32, Expand); 61263509Sdim setCondCodeAction(ISD::SETUEQ, MVT::f32, Expand); 62263509Sdim setCondCodeAction(ISD::SETUGE, MVT::f32, Expand); 63263509Sdim setCondCodeAction(ISD::SETUGT, MVT::f32, Expand); 64263509Sdim setCondCodeAction(ISD::SETULE, MVT::f32, Expand); 65263509Sdim setCondCodeAction(ISD::SETULT, MVT::f32, Expand); 66263509Sdim 67263509Sdim setCondCodeAction(ISD::SETONE, MVT::f64, Expand); 68263509Sdim setCondCodeAction(ISD::SETUEQ, MVT::f64, Expand); 69263509Sdim setCondCodeAction(ISD::SETUGE, MVT::f64, Expand); 70263509Sdim setCondCodeAction(ISD::SETUGT, MVT::f64, Expand); 71263509Sdim setCondCodeAction(ISD::SETULE, MVT::f64, Expand); 72263509Sdim setCondCodeAction(ISD::SETULT, MVT::f64, Expand); 73263509Sdim 74249259Sdim setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i32, Expand); 75249259Sdim setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8f32, Expand); 76249259Sdim setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i32, Expand); 77249259Sdim setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16f32, Expand); 78249259Sdim 79249259Sdim setOperationAction(ISD::ADD, MVT::i64, Legal); 80249259Sdim setOperationAction(ISD::ADD, MVT::i32, Legal); 81263509Sdim setOperationAction(ISD::ADDC, MVT::i32, Legal); 82263509Sdim setOperationAction(ISD::ADDE, MVT::i32, Legal); 83249259Sdim 84263509Sdim setOperationAction(ISD::BITCAST, MVT::i128, Legal); 85263509Sdim 86263509Sdim // We need to custom lower vector stores from local memory 87263509Sdim setOperationAction(ISD::LOAD, MVT::v2i32, Custom); 88263509Sdim setOperationAction(ISD::LOAD, MVT::v4i32, Custom); 89263509Sdim setOperationAction(ISD::LOAD, MVT::v8i32, Custom); 90263509Sdim setOperationAction(ISD::LOAD, MVT::v16i32, Custom); 91263509Sdim 92263509Sdim setOperationAction(ISD::STORE, MVT::v8i32, Custom); 93263509Sdim setOperationAction(ISD::STORE, MVT::v16i32, Custom); 94263509Sdim 95263509Sdim // We need to custom lower loads/stores from private memory 96263509Sdim setOperationAction(ISD::LOAD, MVT::i32, Custom); 97263509Sdim setOperationAction(ISD::LOAD, MVT::i64, Custom); 98263509Sdim setOperationAction(ISD::LOAD, MVT::v2i32, Custom); 99263509Sdim setOperationAction(ISD::LOAD, MVT::v4i32, Custom); 100263509Sdim 101263509Sdim setOperationAction(ISD::STORE, MVT::i32, Custom); 102263509Sdim setOperationAction(ISD::STORE, MVT::i64, Custom); 103263509Sdim setOperationAction(ISD::STORE, MVT::i128, Custom); 104263509Sdim setOperationAction(ISD::STORE, MVT::v2i32, Custom); 105263509Sdim setOperationAction(ISD::STORE, MVT::v4i32, Custom); 106263509Sdim 107263509Sdim 108249259Sdim setOperationAction(ISD::SELECT_CC, MVT::f32, Custom); 109249259Sdim setOperationAction(ISD::SELECT_CC, MVT::i32, Custom); 110249259Sdim 111249259Sdim setOperationAction(ISD::SELECT_CC, MVT::Other, Expand); 112252723Sdim 113263509Sdim setOperationAction(ISD::SETCC, MVT::v2i1, Expand); 114263509Sdim setOperationAction(ISD::SETCC, MVT::v4i1, Expand); 115252723Sdim 116263509Sdim setOperationAction(ISD::ANY_EXTEND, MVT::i64, Custom); 117263509Sdim setOperationAction(ISD::SIGN_EXTEND, MVT::i64, Custom); 118263509Sdim setOperationAction(ISD::ZERO_EXTEND, MVT::i64, Custom); 119263509Sdim 120263509Sdim setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); 121263509Sdim setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f32, Custom); 122263509Sdim setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v16i8, Custom); 123263509Sdim setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom); 124263509Sdim 125263509Sdim setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); 126263509Sdim 127263509Sdim setLoadExtAction(ISD::SEXTLOAD, MVT::i32, Expand); 128263509Sdim setLoadExtAction(ISD::EXTLOAD, MVT::i32, Expand); 129263509Sdim setLoadExtAction(ISD::SEXTLOAD, MVT::v8i16, Expand); 130263509Sdim setLoadExtAction(ISD::SEXTLOAD, MVT::v16i16, Expand); 131263509Sdim 132263509Sdim setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand); 133263509Sdim setTruncStoreAction(MVT::f64, MVT::f32, Expand); 134263509Sdim setTruncStoreAction(MVT::i64, MVT::i32, Expand); 135263509Sdim setTruncStoreAction(MVT::i128, MVT::i64, Expand); 136263509Sdim setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand); 137263509Sdim setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand); 138263509Sdim 139263509Sdim setOperationAction(ISD::GlobalAddress, MVT::i32, Custom); 140263509Sdim setOperationAction(ISD::FrameIndex, MVT::i64, Custom); 141263509Sdim 142249259Sdim setTargetDAGCombine(ISD::SELECT_CC); 143249259Sdim 144249259Sdim setTargetDAGCombine(ISD::SETCC); 145249259Sdim 146249259Sdim setSchedulingPreference(Sched::RegPressure); 147249259Sdim} 148249259Sdim 149263509Sdim//===----------------------------------------------------------------------===// 150263509Sdim// TargetLowering queries 151263509Sdim//===----------------------------------------------------------------------===// 152263509Sdim 153263509Sdimbool SITargetLowering::allowsUnalignedMemoryAccesses(EVT VT, 154263509Sdim bool *IsFast) const { 155263509Sdim // XXX: This depends on the address space and also we may want to revist 156263509Sdim // the alignment values we specify in the DataLayout. 157263509Sdim if (!VT.isSimple() || VT == MVT::Other) 158263509Sdim return false; 159263509Sdim return VT.bitsGT(MVT::i32); 160263509Sdim} 161263509Sdim 162263509Sdimbool SITargetLowering::shouldSplitVectorElementType(EVT VT) const { 163263509Sdim return VT.bitsLE(MVT::i16); 164263509Sdim} 165263509Sdim 166263509SdimSDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT, 167263509Sdim SDLoc DL, SDValue Chain, 168263509Sdim unsigned Offset) const { 169263509Sdim MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo(); 170263509Sdim PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()), 171263509Sdim AMDGPUAS::CONSTANT_ADDRESS); 172263509Sdim SDValue BasePtr = DAG.getCopyFromReg(Chain, DL, 173263509Sdim MRI.getLiveInVirtReg(AMDGPU::SGPR0_SGPR1), MVT::i64); 174263509Sdim SDValue Ptr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, 175263509Sdim DAG.getConstant(Offset, MVT::i64)); 176263509Sdim return DAG.getExtLoad(ISD::SEXTLOAD, DL, VT, Chain, Ptr, 177263509Sdim MachinePointerInfo(UndefValue::get(PtrTy)), MemVT, 178263509Sdim false, false, MemVT.getSizeInBits() >> 3); 179263509Sdim 180263509Sdim} 181263509Sdim 182249259SdimSDValue SITargetLowering::LowerFormalArguments( 183249259Sdim SDValue Chain, 184249259Sdim CallingConv::ID CallConv, 185249259Sdim bool isVarArg, 186249259Sdim const SmallVectorImpl<ISD::InputArg> &Ins, 187263509Sdim SDLoc DL, SelectionDAG &DAG, 188249259Sdim SmallVectorImpl<SDValue> &InVals) const { 189249259Sdim 190249259Sdim const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo(); 191249259Sdim 192249259Sdim MachineFunction &MF = DAG.getMachineFunction(); 193249259Sdim FunctionType *FType = MF.getFunction()->getFunctionType(); 194249259Sdim SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 195249259Sdim 196249259Sdim assert(CallConv == CallingConv::C); 197249259Sdim 198249259Sdim SmallVector<ISD::InputArg, 16> Splits; 199249259Sdim uint32_t Skipped = 0; 200249259Sdim 201249259Sdim for (unsigned i = 0, e = Ins.size(), PSInputNum = 0; i != e; ++i) { 202249259Sdim const ISD::InputArg &Arg = Ins[i]; 203249259Sdim 204263509Sdim // First check if it's a PS input addr 205263509Sdim if (Info->ShaderType == ShaderType::PIXEL && !Arg.Flags.isInReg() && 206263509Sdim !Arg.Flags.isByVal()) { 207263509Sdim 208249259Sdim assert((PSInputNum <= 15) && "Too many PS inputs!"); 209249259Sdim 210249259Sdim if (!Arg.Used) { 211249259Sdim // We can savely skip PS inputs 212249259Sdim Skipped |= 1 << i; 213249259Sdim ++PSInputNum; 214249259Sdim continue; 215249259Sdim } 216249259Sdim 217249259Sdim Info->PSInputAddr |= 1 << PSInputNum++; 218249259Sdim } 219249259Sdim 220249259Sdim // Second split vertices into their elements 221263509Sdim if (Info->ShaderType != ShaderType::COMPUTE && Arg.VT.isVector()) { 222249259Sdim ISD::InputArg NewArg = Arg; 223249259Sdim NewArg.Flags.setSplit(); 224249259Sdim NewArg.VT = Arg.VT.getVectorElementType(); 225249259Sdim 226249259Sdim // We REALLY want the ORIGINAL number of vertex elements here, e.g. a 227249259Sdim // three or five element vertex only needs three or five registers, 228249259Sdim // NOT four or eigth. 229249259Sdim Type *ParamType = FType->getParamType(Arg.OrigArgIndex); 230249259Sdim unsigned NumElements = ParamType->getVectorNumElements(); 231249259Sdim 232249259Sdim for (unsigned j = 0; j != NumElements; ++j) { 233249259Sdim Splits.push_back(NewArg); 234249259Sdim NewArg.PartOffset += NewArg.VT.getStoreSize(); 235249259Sdim } 236249259Sdim 237263509Sdim } else if (Info->ShaderType != ShaderType::COMPUTE) { 238249259Sdim Splits.push_back(Arg); 239249259Sdim } 240249259Sdim } 241249259Sdim 242249259Sdim SmallVector<CCValAssign, 16> ArgLocs; 243249259Sdim CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), 244249259Sdim getTargetMachine(), ArgLocs, *DAG.getContext()); 245249259Sdim 246249259Sdim // At least one interpolation mode must be enabled or else the GPU will hang. 247249259Sdim if (Info->ShaderType == ShaderType::PIXEL && (Info->PSInputAddr & 0x7F) == 0) { 248249259Sdim Info->PSInputAddr |= 1; 249249259Sdim CCInfo.AllocateReg(AMDGPU::VGPR0); 250249259Sdim CCInfo.AllocateReg(AMDGPU::VGPR1); 251249259Sdim } 252249259Sdim 253263509Sdim // The pointer to the list of arguments is stored in SGPR0, SGPR1 254263509Sdim if (Info->ShaderType == ShaderType::COMPUTE) { 255263509Sdim CCInfo.AllocateReg(AMDGPU::SGPR0); 256263509Sdim CCInfo.AllocateReg(AMDGPU::SGPR1); 257263509Sdim MF.addLiveIn(AMDGPU::SGPR0_SGPR1, &AMDGPU::SReg_64RegClass); 258263509Sdim } 259263509Sdim 260263509Sdim if (Info->ShaderType == ShaderType::COMPUTE) { 261263509Sdim getOriginalFunctionArgs(DAG, DAG.getMachineFunction().getFunction(), Ins, 262263509Sdim Splits); 263263509Sdim } 264263509Sdim 265249259Sdim AnalyzeFormalArguments(CCInfo, Splits); 266249259Sdim 267249259Sdim for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) { 268249259Sdim 269263509Sdim const ISD::InputArg &Arg = Ins[i]; 270249259Sdim if (Skipped & (1 << i)) { 271263509Sdim InVals.push_back(DAG.getUNDEF(Arg.VT)); 272249259Sdim continue; 273249259Sdim } 274249259Sdim 275249259Sdim CCValAssign &VA = ArgLocs[ArgIdx++]; 276263509Sdim EVT VT = VA.getLocVT(); 277263509Sdim 278263509Sdim if (VA.isMemLoc()) { 279263509Sdim VT = Ins[i].VT; 280263509Sdim EVT MemVT = Splits[i].VT; 281263509Sdim // The first 36 bytes of the input buffer contains information about 282263509Sdim // thread group and global sizes. 283263509Sdim SDValue Arg = LowerParameter(DAG, VT, MemVT, DL, DAG.getRoot(), 284263509Sdim 36 + VA.getLocMemOffset()); 285263509Sdim InVals.push_back(Arg); 286263509Sdim continue; 287263509Sdim } 288249259Sdim assert(VA.isRegLoc() && "Parameter must be in a register!"); 289249259Sdim 290249259Sdim unsigned Reg = VA.getLocReg(); 291249259Sdim 292249259Sdim if (VT == MVT::i64) { 293249259Sdim // For now assume it is a pointer 294249259Sdim Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0, 295249259Sdim &AMDGPU::SReg_64RegClass); 296249259Sdim Reg = MF.addLiveIn(Reg, &AMDGPU::SReg_64RegClass); 297249259Sdim InVals.push_back(DAG.getCopyFromReg(Chain, DL, Reg, VT)); 298249259Sdim continue; 299249259Sdim } 300249259Sdim 301249259Sdim const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT); 302249259Sdim 303249259Sdim Reg = MF.addLiveIn(Reg, RC); 304249259Sdim SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT); 305249259Sdim 306249259Sdim if (Arg.VT.isVector()) { 307249259Sdim 308249259Sdim // Build a vector from the registers 309249259Sdim Type *ParamType = FType->getParamType(Arg.OrigArgIndex); 310249259Sdim unsigned NumElements = ParamType->getVectorNumElements(); 311249259Sdim 312249259Sdim SmallVector<SDValue, 4> Regs; 313249259Sdim Regs.push_back(Val); 314249259Sdim for (unsigned j = 1; j != NumElements; ++j) { 315249259Sdim Reg = ArgLocs[ArgIdx++].getLocReg(); 316249259Sdim Reg = MF.addLiveIn(Reg, RC); 317249259Sdim Regs.push_back(DAG.getCopyFromReg(Chain, DL, Reg, VT)); 318249259Sdim } 319249259Sdim 320249259Sdim // Fill up the missing vector elements 321249259Sdim NumElements = Arg.VT.getVectorNumElements() - NumElements; 322249259Sdim for (unsigned j = 0; j != NumElements; ++j) 323249259Sdim Regs.push_back(DAG.getUNDEF(VT)); 324263509Sdim 325249259Sdim InVals.push_back(DAG.getNode(ISD::BUILD_VECTOR, DL, Arg.VT, 326249259Sdim Regs.data(), Regs.size())); 327249259Sdim continue; 328249259Sdim } 329249259Sdim 330249259Sdim InVals.push_back(Val); 331249259Sdim } 332249259Sdim return Chain; 333249259Sdim} 334249259Sdim 335249259SdimMachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter( 336249259Sdim MachineInstr * MI, MachineBasicBlock * BB) const { 337249259Sdim 338263509Sdim MachineBasicBlock::iterator I = *MI; 339263509Sdim 340249259Sdim switch (MI->getOpcode()) { 341249259Sdim default: 342249259Sdim return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB); 343249259Sdim case AMDGPU::BRANCH: return BB; 344263509Sdim case AMDGPU::SI_ADDR64_RSRC: { 345263509Sdim const SIInstrInfo *TII = 346263509Sdim static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo()); 347263509Sdim MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); 348263509Sdim unsigned SuperReg = MI->getOperand(0).getReg(); 349263509Sdim unsigned SubRegLo = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); 350263509Sdim unsigned SubRegHi = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); 351263509Sdim unsigned SubRegHiHi = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 352263509Sdim unsigned SubRegHiLo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 353263509Sdim BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::S_MOV_B64), SubRegLo) 354263509Sdim .addOperand(MI->getOperand(1)); 355263509Sdim BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::S_MOV_B32), SubRegHiLo) 356263509Sdim .addImm(0); 357263509Sdim BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::S_MOV_B32), SubRegHiHi) 358263509Sdim .addImm(RSRC_DATA_FORMAT >> 32); 359263509Sdim BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::REG_SEQUENCE), SubRegHi) 360263509Sdim .addReg(SubRegHiLo) 361263509Sdim .addImm(AMDGPU::sub0) 362263509Sdim .addReg(SubRegHiHi) 363263509Sdim .addImm(AMDGPU::sub1); 364263509Sdim BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::REG_SEQUENCE), SuperReg) 365263509Sdim .addReg(SubRegLo) 366263509Sdim .addImm(AMDGPU::sub0_sub1) 367263509Sdim .addReg(SubRegHi) 368263509Sdim .addImm(AMDGPU::sub2_sub3); 369263509Sdim MI->eraseFromParent(); 370263509Sdim break; 371249259Sdim } 372263509Sdim case AMDGPU::V_SUB_F64: { 373263509Sdim const SIInstrInfo *TII = 374263509Sdim static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo()); 375263509Sdim BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::V_ADD_F64), 376263509Sdim MI->getOperand(0).getReg()) 377263509Sdim .addReg(MI->getOperand(1).getReg()) 378263509Sdim .addReg(MI->getOperand(2).getReg()) 379263509Sdim .addImm(0) /* src2 */ 380263509Sdim .addImm(0) /* ABS */ 381263509Sdim .addImm(0) /* CLAMP */ 382263509Sdim .addImm(0) /* OMOD */ 383263509Sdim .addImm(2); /* NEG */ 384263509Sdim MI->eraseFromParent(); 385263509Sdim break; 386263509Sdim } 387263509Sdim case AMDGPU::SI_RegisterStorePseudo: { 388263509Sdim MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); 389263509Sdim const SIInstrInfo *TII = 390263509Sdim static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo()); 391263509Sdim unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); 392263509Sdim MachineInstrBuilder MIB = 393263509Sdim BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::SI_RegisterStore), 394263509Sdim Reg); 395263509Sdim for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) 396263509Sdim MIB.addOperand(MI->getOperand(i)); 397263509Sdim 398263509Sdim MI->eraseFromParent(); 399263509Sdim } 400263509Sdim } 401249259Sdim return BB; 402249259Sdim} 403249259Sdim 404263509SdimEVT SITargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const { 405263509Sdim if (!VT.isVector()) { 406263509Sdim return MVT::i1; 407263509Sdim } 408263509Sdim return MVT::getVectorVT(MVT::i1, VT.getVectorNumElements()); 409249259Sdim} 410249259Sdim 411249259SdimMVT SITargetLowering::getScalarShiftAmountTy(EVT VT) const { 412249259Sdim return MVT::i32; 413249259Sdim} 414249259Sdim 415263509Sdimbool SITargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const { 416263509Sdim VT = VT.getScalarType(); 417263509Sdim 418263509Sdim if (!VT.isSimple()) 419263509Sdim return false; 420263509Sdim 421263509Sdim switch (VT.getSimpleVT().SimpleTy) { 422263509Sdim case MVT::f32: 423263509Sdim return false; /* There is V_MAD_F32 for f32 */ 424263509Sdim case MVT::f64: 425263509Sdim return true; 426263509Sdim default: 427263509Sdim break; 428263509Sdim } 429263509Sdim 430263509Sdim return false; 431263509Sdim} 432263509Sdim 433249259Sdim//===----------------------------------------------------------------------===// 434249259Sdim// Custom DAG Lowering Operations 435249259Sdim//===----------------------------------------------------------------------===// 436249259Sdim 437249259SdimSDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { 438263509Sdim MachineFunction &MF = DAG.getMachineFunction(); 439263509Sdim SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 440249259Sdim switch (Op.getOpcode()) { 441249259Sdim default: return AMDGPUTargetLowering::LowerOperation(Op, DAG); 442263509Sdim case ISD::ADD: return LowerADD(Op, DAG); 443249259Sdim case ISD::BRCOND: return LowerBRCOND(Op, DAG); 444263509Sdim case ISD::LOAD: { 445263509Sdim LoadSDNode *Load = dyn_cast<LoadSDNode>(Op); 446263509Sdim if ((Load->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS || 447263509Sdim Load->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) && 448263509Sdim Op.getValueType().isVector()) { 449263509Sdim SDValue MergedValues[2] = { 450263509Sdim SplitVectorLoad(Op, DAG), 451263509Sdim Load->getChain() 452263509Sdim }; 453263509Sdim return DAG.getMergeValues(MergedValues, 2, SDLoc(Op)); 454263509Sdim } else { 455263509Sdim return LowerLOAD(Op, DAG); 456263509Sdim } 457263509Sdim } 458263509Sdim 459249259Sdim case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG); 460263509Sdim case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, DAG); 461252723Sdim case ISD::STORE: return LowerSTORE(Op, DAG); 462263509Sdim case ISD::ANY_EXTEND: // Fall-through 463263509Sdim case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, DAG); 464263509Sdim case ISD::GlobalAddress: return LowerGlobalAddress(MFI, Op, DAG); 465263509Sdim case ISD::INTRINSIC_WO_CHAIN: { 466263509Sdim unsigned IntrinsicID = 467263509Sdim cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 468263509Sdim EVT VT = Op.getValueType(); 469263509Sdim SDLoc DL(Op); 470263509Sdim //XXX: Hardcoded we only use two to store the pointer to the parameters. 471263509Sdim unsigned NumUserSGPRs = 2; 472263509Sdim switch (IntrinsicID) { 473263509Sdim default: return AMDGPUTargetLowering::LowerOperation(Op, DAG); 474263509Sdim case Intrinsic::r600_read_ngroups_x: 475263509Sdim return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 0); 476263509Sdim case Intrinsic::r600_read_ngroups_y: 477263509Sdim return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 4); 478263509Sdim case Intrinsic::r600_read_ngroups_z: 479263509Sdim return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 8); 480263509Sdim case Intrinsic::r600_read_global_size_x: 481263509Sdim return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 12); 482263509Sdim case Intrinsic::r600_read_global_size_y: 483263509Sdim return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 16); 484263509Sdim case Intrinsic::r600_read_global_size_z: 485263509Sdim return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 20); 486263509Sdim case Intrinsic::r600_read_local_size_x: 487263509Sdim return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 24); 488263509Sdim case Intrinsic::r600_read_local_size_y: 489263509Sdim return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 28); 490263509Sdim case Intrinsic::r600_read_local_size_z: 491263509Sdim return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 32); 492263509Sdim case Intrinsic::r600_read_tgid_x: 493263509Sdim return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass, 494263509Sdim AMDGPU::SReg_32RegClass.getRegister(NumUserSGPRs + 0), VT); 495263509Sdim case Intrinsic::r600_read_tgid_y: 496263509Sdim return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass, 497263509Sdim AMDGPU::SReg_32RegClass.getRegister(NumUserSGPRs + 1), VT); 498263509Sdim case Intrinsic::r600_read_tgid_z: 499263509Sdim return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass, 500263509Sdim AMDGPU::SReg_32RegClass.getRegister(NumUserSGPRs + 2), VT); 501263509Sdim case Intrinsic::r600_read_tidig_x: 502263509Sdim return CreateLiveInRegister(DAG, &AMDGPU::VReg_32RegClass, 503263509Sdim AMDGPU::VGPR0, VT); 504263509Sdim case Intrinsic::r600_read_tidig_y: 505263509Sdim return CreateLiveInRegister(DAG, &AMDGPU::VReg_32RegClass, 506263509Sdim AMDGPU::VGPR1, VT); 507263509Sdim case Intrinsic::r600_read_tidig_z: 508263509Sdim return CreateLiveInRegister(DAG, &AMDGPU::VReg_32RegClass, 509263509Sdim AMDGPU::VGPR2, VT); 510263509Sdim case AMDGPUIntrinsic::SI_load_const: { 511263509Sdim SDValue Ops [] = { 512263509Sdim ResourceDescriptorToi128(Op.getOperand(1), DAG), 513263509Sdim Op.getOperand(2) 514263509Sdim }; 515263509Sdim 516263509Sdim MachineMemOperand *MMO = MF.getMachineMemOperand( 517263509Sdim MachinePointerInfo(), 518263509Sdim MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant, 519263509Sdim VT.getSizeInBits() / 8, 4); 520263509Sdim return DAG.getMemIntrinsicNode(AMDGPUISD::LOAD_CONSTANT, DL, 521263509Sdim Op->getVTList(), Ops, 2, VT, MMO); 522263509Sdim } 523263509Sdim case AMDGPUIntrinsic::SI_sample: 524263509Sdim return LowerSampleIntrinsic(AMDGPUISD::SAMPLE, Op, DAG); 525263509Sdim case AMDGPUIntrinsic::SI_sampleb: 526263509Sdim return LowerSampleIntrinsic(AMDGPUISD::SAMPLEB, Op, DAG); 527263509Sdim case AMDGPUIntrinsic::SI_sampled: 528263509Sdim return LowerSampleIntrinsic(AMDGPUISD::SAMPLED, Op, DAG); 529263509Sdim case AMDGPUIntrinsic::SI_samplel: 530263509Sdim return LowerSampleIntrinsic(AMDGPUISD::SAMPLEL, Op, DAG); 531263509Sdim case AMDGPUIntrinsic::SI_vs_load_input: 532263509Sdim return DAG.getNode(AMDGPUISD::LOAD_INPUT, DL, VT, 533263509Sdim ResourceDescriptorToi128(Op.getOperand(1), DAG), 534263509Sdim Op.getOperand(2), 535263509Sdim Op.getOperand(3)); 536263509Sdim } 537249259Sdim } 538263509Sdim 539263509Sdim case ISD::INTRINSIC_VOID: 540263509Sdim SDValue Chain = Op.getOperand(0); 541263509Sdim unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 542263509Sdim 543263509Sdim switch (IntrinsicID) { 544263509Sdim case AMDGPUIntrinsic::SI_tbuffer_store: { 545263509Sdim SDLoc DL(Op); 546263509Sdim SDValue Ops [] = { 547263509Sdim Chain, 548263509Sdim ResourceDescriptorToi128(Op.getOperand(2), DAG), 549263509Sdim Op.getOperand(3), 550263509Sdim Op.getOperand(4), 551263509Sdim Op.getOperand(5), 552263509Sdim Op.getOperand(6), 553263509Sdim Op.getOperand(7), 554263509Sdim Op.getOperand(8), 555263509Sdim Op.getOperand(9), 556263509Sdim Op.getOperand(10), 557263509Sdim Op.getOperand(11), 558263509Sdim Op.getOperand(12), 559263509Sdim Op.getOperand(13), 560263509Sdim Op.getOperand(14) 561263509Sdim }; 562263509Sdim EVT VT = Op.getOperand(3).getValueType(); 563263509Sdim 564263509Sdim MachineMemOperand *MMO = MF.getMachineMemOperand( 565263509Sdim MachinePointerInfo(), 566263509Sdim MachineMemOperand::MOStore, 567263509Sdim VT.getSizeInBits() / 8, 4); 568263509Sdim return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_STORE_FORMAT, DL, 569263509Sdim Op->getVTList(), Ops, 570263509Sdim sizeof(Ops)/sizeof(Ops[0]), VT, MMO); 571263509Sdim } 572263509Sdim default: 573263509Sdim break; 574263509Sdim } 575263509Sdim } 576249259Sdim return SDValue(); 577249259Sdim} 578249259Sdim 579263509SdimSDValue SITargetLowering::LowerADD(SDValue Op, 580263509Sdim SelectionDAG &DAG) const { 581263509Sdim if (Op.getValueType() != MVT::i64) 582263509Sdim return SDValue(); 583263509Sdim 584263509Sdim SDLoc DL(Op); 585263509Sdim SDValue LHS = Op.getOperand(0); 586263509Sdim SDValue RHS = Op.getOperand(1); 587263509Sdim 588263509Sdim SDValue Zero = DAG.getConstant(0, MVT::i32); 589263509Sdim SDValue One = DAG.getConstant(1, MVT::i32); 590263509Sdim 591263509Sdim SDValue Lo0 = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, LHS, Zero); 592263509Sdim SDValue Hi0 = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, LHS, One); 593263509Sdim 594263509Sdim SDValue Lo1 = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, RHS, Zero); 595263509Sdim SDValue Hi1 = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, RHS, One); 596263509Sdim 597263509Sdim SDVTList VTList = DAG.getVTList(MVT::i32, MVT::Glue); 598263509Sdim 599263509Sdim SDValue AddLo = DAG.getNode(ISD::ADDC, DL, VTList, Lo0, Lo1); 600263509Sdim SDValue Carry = AddLo.getValue(1); 601263509Sdim SDValue AddHi = DAG.getNode(ISD::ADDE, DL, VTList, Hi0, Hi1, Carry); 602263509Sdim 603263509Sdim return DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, AddLo, AddHi.getValue(0)); 604263509Sdim} 605263509Sdim 606249259Sdim/// \brief Helper function for LowerBRCOND 607249259Sdimstatic SDNode *findUser(SDValue Value, unsigned Opcode) { 608249259Sdim 609249259Sdim SDNode *Parent = Value.getNode(); 610249259Sdim for (SDNode::use_iterator I = Parent->use_begin(), E = Parent->use_end(); 611249259Sdim I != E; ++I) { 612249259Sdim 613249259Sdim if (I.getUse().get() != Value) 614249259Sdim continue; 615249259Sdim 616249259Sdim if (I->getOpcode() == Opcode) 617249259Sdim return *I; 618249259Sdim } 619249259Sdim return 0; 620249259Sdim} 621249259Sdim 622249259Sdim/// This transforms the control flow intrinsics to get the branch destination as 623249259Sdim/// last parameter, also switches branch target with BR if the need arise 624249259SdimSDValue SITargetLowering::LowerBRCOND(SDValue BRCOND, 625249259Sdim SelectionDAG &DAG) const { 626249259Sdim 627263509Sdim SDLoc DL(BRCOND); 628249259Sdim 629249259Sdim SDNode *Intr = BRCOND.getOperand(1).getNode(); 630249259Sdim SDValue Target = BRCOND.getOperand(2); 631249259Sdim SDNode *BR = 0; 632249259Sdim 633249259Sdim if (Intr->getOpcode() == ISD::SETCC) { 634249259Sdim // As long as we negate the condition everything is fine 635249259Sdim SDNode *SetCC = Intr; 636249259Sdim assert(SetCC->getConstantOperandVal(1) == 1); 637249259Sdim assert(cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() == 638249259Sdim ISD::SETNE); 639249259Sdim Intr = SetCC->getOperand(0).getNode(); 640249259Sdim 641249259Sdim } else { 642249259Sdim // Get the target from BR if we don't negate the condition 643249259Sdim BR = findUser(BRCOND, ISD::BR); 644249259Sdim Target = BR->getOperand(1); 645249259Sdim } 646249259Sdim 647249259Sdim assert(Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN); 648249259Sdim 649249259Sdim // Build the result and 650249259Sdim SmallVector<EVT, 4> Res; 651249259Sdim for (unsigned i = 1, e = Intr->getNumValues(); i != e; ++i) 652249259Sdim Res.push_back(Intr->getValueType(i)); 653249259Sdim 654249259Sdim // operands of the new intrinsic call 655249259Sdim SmallVector<SDValue, 4> Ops; 656249259Sdim Ops.push_back(BRCOND.getOperand(0)); 657249259Sdim for (unsigned i = 1, e = Intr->getNumOperands(); i != e; ++i) 658249259Sdim Ops.push_back(Intr->getOperand(i)); 659249259Sdim Ops.push_back(Target); 660249259Sdim 661249259Sdim // build the new intrinsic call 662249259Sdim SDNode *Result = DAG.getNode( 663249259Sdim Res.size() > 1 ? ISD::INTRINSIC_W_CHAIN : ISD::INTRINSIC_VOID, DL, 664249259Sdim DAG.getVTList(Res.data(), Res.size()), Ops.data(), Ops.size()).getNode(); 665249259Sdim 666249259Sdim if (BR) { 667249259Sdim // Give the branch instruction our target 668249259Sdim SDValue Ops[] = { 669249259Sdim BR->getOperand(0), 670249259Sdim BRCOND.getOperand(2) 671249259Sdim }; 672249259Sdim DAG.MorphNodeTo(BR, ISD::BR, BR->getVTList(), Ops, 2); 673249259Sdim } 674249259Sdim 675249259Sdim SDValue Chain = SDValue(Result, Result->getNumValues() - 1); 676249259Sdim 677249259Sdim // Copy the intrinsic results to registers 678249259Sdim for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) { 679249259Sdim SDNode *CopyToReg = findUser(SDValue(Intr, i), ISD::CopyToReg); 680249259Sdim if (!CopyToReg) 681249259Sdim continue; 682249259Sdim 683249259Sdim Chain = DAG.getCopyToReg( 684249259Sdim Chain, DL, 685249259Sdim CopyToReg->getOperand(1), 686249259Sdim SDValue(Result, i - 1), 687249259Sdim SDValue()); 688249259Sdim 689249259Sdim DAG.ReplaceAllUsesWith(SDValue(CopyToReg, 0), CopyToReg->getOperand(0)); 690249259Sdim } 691249259Sdim 692249259Sdim // Remove the old intrinsic from the chain 693249259Sdim DAG.ReplaceAllUsesOfValueWith( 694249259Sdim SDValue(Intr, Intr->getNumValues() - 1), 695249259Sdim Intr->getOperand(0)); 696249259Sdim 697249259Sdim return Chain; 698249259Sdim} 699249259Sdim 700263509SdimSDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { 701263509Sdim SDLoc DL(Op); 702263509Sdim LoadSDNode *Load = cast<LoadSDNode>(Op); 703252723Sdim 704263509Sdim if (Load->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) 705252723Sdim return SDValue(); 706252723Sdim 707263509Sdim SDValue TruncPtr = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, 708263509Sdim Load->getBasePtr(), DAG.getConstant(0, MVT::i32)); 709263509Sdim SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, TruncPtr, 710263509Sdim DAG.getConstant(2, MVT::i32)); 711252723Sdim 712263509Sdim SDValue Ret = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op.getValueType(), 713263509Sdim Load->getChain(), Ptr, 714263509Sdim DAG.getTargetConstant(0, MVT::i32), 715263509Sdim Op.getOperand(2)); 716263509Sdim SDValue MergedValues[2] = { 717263509Sdim Ret, 718263509Sdim Load->getChain() 719263509Sdim }; 720263509Sdim return DAG.getMergeValues(MergedValues, 2, DL); 721252723Sdim 722263509Sdim} 723252723Sdim 724263509SdimSDValue SITargetLowering::ResourceDescriptorToi128(SDValue Op, 725263509Sdim SelectionDAG &DAG) const { 726263509Sdim 727263509Sdim if (Op.getValueType() == MVT::i128) { 728263509Sdim return Op; 729263509Sdim } 730263509Sdim 731263509Sdim assert(Op.getOpcode() == ISD::UNDEF); 732263509Sdim 733263509Sdim return DAG.getNode(ISD::BUILD_PAIR, SDLoc(Op), MVT::i128, 734263509Sdim DAG.getConstant(0, MVT::i64), 735263509Sdim DAG.getConstant(0, MVT::i64)); 736252723Sdim} 737252723Sdim 738263509SdimSDValue SITargetLowering::LowerSampleIntrinsic(unsigned Opcode, 739263509Sdim const SDValue &Op, 740263509Sdim SelectionDAG &DAG) const { 741263509Sdim return DAG.getNode(Opcode, SDLoc(Op), Op.getValueType(), Op.getOperand(1), 742263509Sdim Op.getOperand(2), 743263509Sdim ResourceDescriptorToi128(Op.getOperand(3), DAG), 744263509Sdim Op.getOperand(4)); 745263509Sdim} 746263509Sdim 747249259SdimSDValue SITargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { 748249259Sdim SDValue LHS = Op.getOperand(0); 749249259Sdim SDValue RHS = Op.getOperand(1); 750249259Sdim SDValue True = Op.getOperand(2); 751249259Sdim SDValue False = Op.getOperand(3); 752249259Sdim SDValue CC = Op.getOperand(4); 753249259Sdim EVT VT = Op.getValueType(); 754263509Sdim SDLoc DL(Op); 755249259Sdim 756249259Sdim // Possible Min/Max pattern 757249259Sdim SDValue MinMax = LowerMinMax(Op, DAG); 758249259Sdim if (MinMax.getNode()) { 759249259Sdim return MinMax; 760249259Sdim } 761249259Sdim 762249259Sdim SDValue Cond = DAG.getNode(ISD::SETCC, DL, MVT::i1, LHS, RHS, CC); 763249259Sdim return DAG.getNode(ISD::SELECT, DL, VT, Cond, True, False); 764249259Sdim} 765249259Sdim 766263509SdimSDValue SITargetLowering::LowerSIGN_EXTEND(SDValue Op, 767263509Sdim SelectionDAG &DAG) const { 768263509Sdim EVT VT = Op.getValueType(); 769263509Sdim SDLoc DL(Op); 770263509Sdim 771263509Sdim if (VT != MVT::i64) { 772263509Sdim return SDValue(); 773263509Sdim } 774263509Sdim 775263509Sdim SDValue Hi = DAG.getNode(ISD::SRA, DL, MVT::i32, Op.getOperand(0), 776263509Sdim DAG.getConstant(31, MVT::i32)); 777263509Sdim 778263509Sdim return DAG.getNode(ISD::BUILD_PAIR, DL, VT, Op.getOperand(0), Hi); 779263509Sdim} 780263509Sdim 781263509SdimSDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { 782263509Sdim SDLoc DL(Op); 783263509Sdim StoreSDNode *Store = cast<StoreSDNode>(Op); 784263509Sdim EVT VT = Store->getMemoryVT(); 785263509Sdim 786263509Sdim SDValue Ret = AMDGPUTargetLowering::LowerSTORE(Op, DAG); 787263509Sdim if (Ret.getNode()) 788263509Sdim return Ret; 789263509Sdim 790263509Sdim if (VT.isVector() && VT.getVectorNumElements() >= 8) 791263509Sdim return SplitVectorStore(Op, DAG); 792263509Sdim 793263509Sdim if (Store->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) 794263509Sdim return SDValue(); 795263509Sdim 796263509Sdim SDValue TruncPtr = DAG.getZExtOrTrunc(Store->getBasePtr(), DL, MVT::i32); 797263509Sdim SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, TruncPtr, 798263509Sdim DAG.getConstant(2, MVT::i32)); 799263509Sdim SDValue Chain = Store->getChain(); 800263509Sdim SmallVector<SDValue, 8> Values; 801263509Sdim 802263509Sdim if (VT == MVT::i64) { 803263509Sdim for (unsigned i = 0; i < 2; ++i) { 804263509Sdim Values.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, 805263509Sdim Store->getValue(), DAG.getConstant(i, MVT::i32))); 806263509Sdim } 807263509Sdim } else if (VT == MVT::i128) { 808263509Sdim for (unsigned i = 0; i < 2; ++i) { 809263509Sdim for (unsigned j = 0; j < 2; ++j) { 810263509Sdim Values.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, 811263509Sdim DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i64, 812263509Sdim Store->getValue(), DAG.getConstant(i, MVT::i32)), 813263509Sdim DAG.getConstant(j, MVT::i32))); 814263509Sdim } 815263509Sdim } 816263509Sdim } else { 817263509Sdim Values.push_back(Store->getValue()); 818263509Sdim } 819263509Sdim 820263509Sdim for (unsigned i = 0; i < Values.size(); ++i) { 821263509Sdim SDValue PartPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, 822263509Sdim Ptr, DAG.getConstant(i, MVT::i32)); 823263509Sdim Chain = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other, 824263509Sdim Chain, Values[i], PartPtr, 825263509Sdim DAG.getTargetConstant(0, MVT::i32)); 826263509Sdim } 827263509Sdim return Chain; 828263509Sdim} 829263509Sdim 830263509Sdim 831263509SdimSDValue SITargetLowering::LowerZERO_EXTEND(SDValue Op, 832263509Sdim SelectionDAG &DAG) const { 833263509Sdim EVT VT = Op.getValueType(); 834263509Sdim SDLoc DL(Op); 835263509Sdim 836263509Sdim if (VT != MVT::i64) { 837263509Sdim return SDValue(); 838263509Sdim } 839263509Sdim 840263509Sdim return DAG.getNode(ISD::BUILD_PAIR, DL, VT, Op.getOperand(0), 841263509Sdim DAG.getConstant(0, MVT::i32)); 842263509Sdim} 843263509Sdim 844249259Sdim//===----------------------------------------------------------------------===// 845249259Sdim// Custom DAG optimizations 846249259Sdim//===----------------------------------------------------------------------===// 847249259Sdim 848249259SdimSDValue SITargetLowering::PerformDAGCombine(SDNode *N, 849249259Sdim DAGCombinerInfo &DCI) const { 850249259Sdim SelectionDAG &DAG = DCI.DAG; 851263509Sdim SDLoc DL(N); 852249259Sdim EVT VT = N->getValueType(0); 853249259Sdim 854249259Sdim switch (N->getOpcode()) { 855249259Sdim default: break; 856249259Sdim case ISD::SELECT_CC: { 857249259Sdim ConstantSDNode *True, *False; 858249259Sdim // i1 selectcc(l, r, -1, 0, cc) -> i1 setcc(l, r, cc) 859249259Sdim if ((True = dyn_cast<ConstantSDNode>(N->getOperand(2))) 860249259Sdim && (False = dyn_cast<ConstantSDNode>(N->getOperand(3))) 861249259Sdim && True->isAllOnesValue() 862249259Sdim && False->isNullValue() 863249259Sdim && VT == MVT::i1) { 864249259Sdim return DAG.getNode(ISD::SETCC, DL, VT, N->getOperand(0), 865249259Sdim N->getOperand(1), N->getOperand(4)); 866249259Sdim 867249259Sdim } 868249259Sdim break; 869249259Sdim } 870249259Sdim case ISD::SETCC: { 871249259Sdim SDValue Arg0 = N->getOperand(0); 872249259Sdim SDValue Arg1 = N->getOperand(1); 873249259Sdim SDValue CC = N->getOperand(2); 874249259Sdim ConstantSDNode * C = NULL; 875249259Sdim ISD::CondCode CCOp = dyn_cast<CondCodeSDNode>(CC)->get(); 876249259Sdim 877249259Sdim // i1 setcc (sext(i1), 0, setne) -> i1 setcc(i1, 0, setne) 878249259Sdim if (VT == MVT::i1 879249259Sdim && Arg0.getOpcode() == ISD::SIGN_EXTEND 880249259Sdim && Arg0.getOperand(0).getValueType() == MVT::i1 881249259Sdim && (C = dyn_cast<ConstantSDNode>(Arg1)) 882249259Sdim && C->isNullValue() 883249259Sdim && CCOp == ISD::SETNE) { 884249259Sdim return SimplifySetCC(VT, Arg0.getOperand(0), 885249259Sdim DAG.getConstant(0, MVT::i1), CCOp, true, DCI, DL); 886249259Sdim } 887249259Sdim break; 888249259Sdim } 889249259Sdim } 890249259Sdim return SDValue(); 891249259Sdim} 892249259Sdim 893263509Sdim/// \brief Test if RegClass is one of the VSrc classes 894249259Sdimstatic bool isVSrc(unsigned RegClass) { 895249259Sdim return AMDGPU::VSrc_32RegClassID == RegClass || 896249259Sdim AMDGPU::VSrc_64RegClassID == RegClass; 897249259Sdim} 898249259Sdim 899263509Sdim/// \brief Test if RegClass is one of the SSrc classes 900249259Sdimstatic bool isSSrc(unsigned RegClass) { 901249259Sdim return AMDGPU::SSrc_32RegClassID == RegClass || 902249259Sdim AMDGPU::SSrc_64RegClassID == RegClass; 903249259Sdim} 904249259Sdim 905249259Sdim/// \brief Analyze the possible immediate value Op 906249259Sdim/// 907249259Sdim/// Returns -1 if it isn't an immediate, 0 if it's and inline immediate 908249259Sdim/// and the immediate value if it's a literal immediate 909249259Sdimint32_t SITargetLowering::analyzeImmediate(const SDNode *N) const { 910249259Sdim 911249259Sdim union { 912249259Sdim int32_t I; 913249259Sdim float F; 914249259Sdim } Imm; 915249259Sdim 916252723Sdim if (const ConstantSDNode *Node = dyn_cast<ConstantSDNode>(N)) { 917252723Sdim if (Node->getZExtValue() >> 32) { 918252723Sdim return -1; 919252723Sdim } 920249259Sdim Imm.I = Node->getSExtValue(); 921252723Sdim } else if (const ConstantFPSDNode *Node = dyn_cast<ConstantFPSDNode>(N)) 922249259Sdim Imm.F = Node->getValueAPF().convertToFloat(); 923249259Sdim else 924249259Sdim return -1; // It isn't an immediate 925249259Sdim 926249259Sdim if ((Imm.I >= -16 && Imm.I <= 64) || 927249259Sdim Imm.F == 0.5f || Imm.F == -0.5f || 928249259Sdim Imm.F == 1.0f || Imm.F == -1.0f || 929249259Sdim Imm.F == 2.0f || Imm.F == -2.0f || 930249259Sdim Imm.F == 4.0f || Imm.F == -4.0f) 931249259Sdim return 0; // It's an inline immediate 932249259Sdim 933249259Sdim return Imm.I; // It's a literal immediate 934249259Sdim} 935249259Sdim 936249259Sdim/// \brief Try to fold an immediate directly into an instruction 937249259Sdimbool SITargetLowering::foldImm(SDValue &Operand, int32_t &Immediate, 938249259Sdim bool &ScalarSlotUsed) const { 939249259Sdim 940249259Sdim MachineSDNode *Mov = dyn_cast<MachineSDNode>(Operand); 941263509Sdim const SIInstrInfo *TII = 942263509Sdim static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo()); 943249259Sdim if (Mov == 0 || !TII->isMov(Mov->getMachineOpcode())) 944249259Sdim return false; 945249259Sdim 946249259Sdim const SDValue &Op = Mov->getOperand(0); 947249259Sdim int32_t Value = analyzeImmediate(Op.getNode()); 948249259Sdim if (Value == -1) { 949249259Sdim // Not an immediate at all 950249259Sdim return false; 951249259Sdim 952249259Sdim } else if (Value == 0) { 953249259Sdim // Inline immediates can always be fold 954249259Sdim Operand = Op; 955249259Sdim return true; 956249259Sdim 957249259Sdim } else if (Value == Immediate) { 958249259Sdim // Already fold literal immediate 959249259Sdim Operand = Op; 960249259Sdim return true; 961249259Sdim 962249259Sdim } else if (!ScalarSlotUsed && !Immediate) { 963249259Sdim // Fold this literal immediate 964249259Sdim ScalarSlotUsed = true; 965249259Sdim Immediate = Value; 966249259Sdim Operand = Op; 967249259Sdim return true; 968249259Sdim 969249259Sdim } 970249259Sdim 971249259Sdim return false; 972249259Sdim} 973249259Sdim 974263509Sdimconst TargetRegisterClass *SITargetLowering::getRegClassForNode( 975263509Sdim SelectionDAG &DAG, const SDValue &Op) const { 976263509Sdim const SIInstrInfo *TII = 977263509Sdim static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo()); 978263509Sdim const SIRegisterInfo &TRI = TII->getRegisterInfo(); 979249259Sdim 980263509Sdim if (!Op->isMachineOpcode()) { 981263509Sdim switch(Op->getOpcode()) { 982263509Sdim case ISD::CopyFromReg: { 983263509Sdim MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo(); 984263509Sdim unsigned Reg = cast<RegisterSDNode>(Op->getOperand(1))->getReg(); 985263509Sdim if (TargetRegisterInfo::isVirtualRegister(Reg)) { 986263509Sdim return MRI.getRegClass(Reg); 987263509Sdim } 988263509Sdim return TRI.getPhysRegClass(Reg); 989263509Sdim } 990263509Sdim default: return NULL; 991263509Sdim } 992263509Sdim } 993263509Sdim const MCInstrDesc &Desc = TII->get(Op->getMachineOpcode()); 994263509Sdim int OpClassID = Desc.OpInfo[Op.getResNo()].RegClass; 995263509Sdim if (OpClassID != -1) { 996263509Sdim return TRI.getRegClass(OpClassID); 997263509Sdim } 998263509Sdim switch(Op.getMachineOpcode()) { 999263509Sdim case AMDGPU::COPY_TO_REGCLASS: 1000263509Sdim // Operand 1 is the register class id for COPY_TO_REGCLASS instructions. 1001263509Sdim OpClassID = cast<ConstantSDNode>(Op->getOperand(1))->getZExtValue(); 1002249259Sdim 1003263509Sdim // If the COPY_TO_REGCLASS instruction is copying to a VSrc register 1004263509Sdim // class, then the register class for the value could be either a 1005263509Sdim // VReg or and SReg. In order to get a more accurate 1006263509Sdim if (OpClassID == AMDGPU::VSrc_32RegClassID || 1007263509Sdim OpClassID == AMDGPU::VSrc_64RegClassID) { 1008263509Sdim return getRegClassForNode(DAG, Op.getOperand(0)); 1009263509Sdim } 1010263509Sdim return TRI.getRegClass(OpClassID); 1011263509Sdim case AMDGPU::EXTRACT_SUBREG: { 1012263509Sdim int SubIdx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 1013263509Sdim const TargetRegisterClass *SuperClass = 1014263509Sdim getRegClassForNode(DAG, Op.getOperand(0)); 1015263509Sdim return TRI.getSubClassWithSubReg(SuperClass, SubIdx); 1016263509Sdim } 1017263509Sdim case AMDGPU::REG_SEQUENCE: 1018263509Sdim // Operand 0 is the register class id for REG_SEQUENCE instructions. 1019263509Sdim return TRI.getRegClass( 1020263509Sdim cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue()); 1021263509Sdim default: 1022263509Sdim return getRegClassFor(Op.getSimpleValueType()); 1023263509Sdim } 1024263509Sdim} 1025249259Sdim 1026263509Sdim/// \brief Does "Op" fit into register class "RegClass" ? 1027263509Sdimbool SITargetLowering::fitsRegClass(SelectionDAG &DAG, const SDValue &Op, 1028263509Sdim unsigned RegClass) const { 1029263509Sdim const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo(); 1030263509Sdim const TargetRegisterClass *RC = getRegClassForNode(DAG, Op); 1031263509Sdim if (!RC) { 1032249259Sdim return false; 1033263509Sdim } 1034263509Sdim return TRI->getRegClass(RegClass)->hasSubClassEq(RC); 1035249259Sdim} 1036249259Sdim 1037249259Sdim/// \brief Make sure that we don't exeed the number of allowed scalars 1038249259Sdimvoid SITargetLowering::ensureSRegLimit(SelectionDAG &DAG, SDValue &Operand, 1039249259Sdim unsigned RegClass, 1040249259Sdim bool &ScalarSlotUsed) const { 1041249259Sdim 1042249259Sdim // First map the operands register class to a destination class 1043249259Sdim if (RegClass == AMDGPU::VSrc_32RegClassID) 1044249259Sdim RegClass = AMDGPU::VReg_32RegClassID; 1045249259Sdim else if (RegClass == AMDGPU::VSrc_64RegClassID) 1046249259Sdim RegClass = AMDGPU::VReg_64RegClassID; 1047249259Sdim else 1048249259Sdim return; 1049249259Sdim 1050249259Sdim // Nothing todo if they fit naturaly 1051249259Sdim if (fitsRegClass(DAG, Operand, RegClass)) 1052249259Sdim return; 1053249259Sdim 1054249259Sdim // If the scalar slot isn't used yet use it now 1055249259Sdim if (!ScalarSlotUsed) { 1056249259Sdim ScalarSlotUsed = true; 1057249259Sdim return; 1058249259Sdim } 1059249259Sdim 1060263509Sdim // This is a conservative aproach. It is possible that we can't determine the 1061263509Sdim // correct register class and copy too often, but better safe than sorry. 1062249259Sdim SDValue RC = DAG.getTargetConstant(RegClass, MVT::i32); 1063263509Sdim SDNode *Node = DAG.getMachineNode(TargetOpcode::COPY_TO_REGCLASS, SDLoc(), 1064249259Sdim Operand.getValueType(), Operand, RC); 1065249259Sdim Operand = SDValue(Node, 0); 1066249259Sdim} 1067249259Sdim 1068263509Sdim/// \returns true if \p Node's operands are different from the SDValue list 1069263509Sdim/// \p Ops 1070263509Sdimstatic bool isNodeChanged(const SDNode *Node, const std::vector<SDValue> &Ops) { 1071263509Sdim for (unsigned i = 0, e = Node->getNumOperands(); i < e; ++i) { 1072263509Sdim if (Ops[i].getNode() != Node->getOperand(i).getNode()) { 1073263509Sdim return true; 1074263509Sdim } 1075263509Sdim } 1076263509Sdim return false; 1077263509Sdim} 1078263509Sdim 1079252723Sdim/// \brief Try to fold the Nodes operands into the Node 1080252723SdimSDNode *SITargetLowering::foldOperands(MachineSDNode *Node, 1081252723Sdim SelectionDAG &DAG) const { 1082249259Sdim 1083249259Sdim // Original encoding (either e32 or e64) 1084249259Sdim int Opcode = Node->getMachineOpcode(); 1085263509Sdim const SIInstrInfo *TII = 1086263509Sdim static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo()); 1087249259Sdim const MCInstrDesc *Desc = &TII->get(Opcode); 1088249259Sdim 1089249259Sdim unsigned NumDefs = Desc->getNumDefs(); 1090249259Sdim unsigned NumOps = Desc->getNumOperands(); 1091249259Sdim 1092249259Sdim // Commuted opcode if available 1093249259Sdim int OpcodeRev = Desc->isCommutable() ? TII->commuteOpcode(Opcode) : -1; 1094249259Sdim const MCInstrDesc *DescRev = OpcodeRev == -1 ? 0 : &TII->get(OpcodeRev); 1095249259Sdim 1096249259Sdim assert(!DescRev || DescRev->getNumDefs() == NumDefs); 1097249259Sdim assert(!DescRev || DescRev->getNumOperands() == NumOps); 1098249259Sdim 1099249259Sdim // e64 version if available, -1 otherwise 1100249259Sdim int OpcodeE64 = AMDGPU::getVOPe64(Opcode); 1101249259Sdim const MCInstrDesc *DescE64 = OpcodeE64 == -1 ? 0 : &TII->get(OpcodeE64); 1102249259Sdim 1103249259Sdim assert(!DescE64 || DescE64->getNumDefs() == NumDefs); 1104249259Sdim assert(!DescE64 || DescE64->getNumOperands() == (NumOps + 4)); 1105249259Sdim 1106249259Sdim int32_t Immediate = Desc->getSize() == 4 ? 0 : -1; 1107249259Sdim bool HaveVSrc = false, HaveSSrc = false; 1108249259Sdim 1109249259Sdim // First figure out what we alread have in this instruction 1110249259Sdim for (unsigned i = 0, e = Node->getNumOperands(), Op = NumDefs; 1111249259Sdim i != e && Op < NumOps; ++i, ++Op) { 1112249259Sdim 1113249259Sdim unsigned RegClass = Desc->OpInfo[Op].RegClass; 1114249259Sdim if (isVSrc(RegClass)) 1115249259Sdim HaveVSrc = true; 1116249259Sdim else if (isSSrc(RegClass)) 1117249259Sdim HaveSSrc = true; 1118249259Sdim else 1119249259Sdim continue; 1120249259Sdim 1121249259Sdim int32_t Imm = analyzeImmediate(Node->getOperand(i).getNode()); 1122249259Sdim if (Imm != -1 && Imm != 0) { 1123249259Sdim // Literal immediate 1124249259Sdim Immediate = Imm; 1125249259Sdim } 1126249259Sdim } 1127249259Sdim 1128249259Sdim // If we neither have VSrc nor SSrc it makes no sense to continue 1129249259Sdim if (!HaveVSrc && !HaveSSrc) 1130249259Sdim return Node; 1131249259Sdim 1132249259Sdim // No scalar allowed when we have both VSrc and SSrc 1133249259Sdim bool ScalarSlotUsed = HaveVSrc && HaveSSrc; 1134249259Sdim 1135249259Sdim // Second go over the operands and try to fold them 1136249259Sdim std::vector<SDValue> Ops; 1137249259Sdim bool Promote2e64 = false; 1138249259Sdim for (unsigned i = 0, e = Node->getNumOperands(), Op = NumDefs; 1139249259Sdim i != e && Op < NumOps; ++i, ++Op) { 1140249259Sdim 1141249259Sdim const SDValue &Operand = Node->getOperand(i); 1142249259Sdim Ops.push_back(Operand); 1143249259Sdim 1144249259Sdim // Already folded immediate ? 1145249259Sdim if (isa<ConstantSDNode>(Operand.getNode()) || 1146249259Sdim isa<ConstantFPSDNode>(Operand.getNode())) 1147249259Sdim continue; 1148249259Sdim 1149249259Sdim // Is this a VSrc or SSrc operand ? 1150249259Sdim unsigned RegClass = Desc->OpInfo[Op].RegClass; 1151249259Sdim if (isVSrc(RegClass) || isSSrc(RegClass)) { 1152249259Sdim // Try to fold the immediates 1153249259Sdim if (!foldImm(Ops[i], Immediate, ScalarSlotUsed)) { 1154249259Sdim // Folding didn't worked, make sure we don't hit the SReg limit 1155249259Sdim ensureSRegLimit(DAG, Ops[i], RegClass, ScalarSlotUsed); 1156249259Sdim } 1157249259Sdim continue; 1158249259Sdim } 1159249259Sdim 1160249259Sdim if (i == 1 && DescRev && fitsRegClass(DAG, Ops[0], RegClass)) { 1161249259Sdim 1162249259Sdim unsigned OtherRegClass = Desc->OpInfo[NumDefs].RegClass; 1163249259Sdim assert(isVSrc(OtherRegClass) || isSSrc(OtherRegClass)); 1164249259Sdim 1165249259Sdim // Test if it makes sense to swap operands 1166249259Sdim if (foldImm(Ops[1], Immediate, ScalarSlotUsed) || 1167249259Sdim (!fitsRegClass(DAG, Ops[1], RegClass) && 1168249259Sdim fitsRegClass(DAG, Ops[1], OtherRegClass))) { 1169249259Sdim 1170249259Sdim // Swap commutable operands 1171249259Sdim SDValue Tmp = Ops[1]; 1172249259Sdim Ops[1] = Ops[0]; 1173249259Sdim Ops[0] = Tmp; 1174249259Sdim 1175249259Sdim Desc = DescRev; 1176249259Sdim DescRev = 0; 1177249259Sdim continue; 1178249259Sdim } 1179249259Sdim } 1180249259Sdim 1181249259Sdim if (DescE64 && !Immediate) { 1182249259Sdim 1183249259Sdim // Test if it makes sense to switch to e64 encoding 1184249259Sdim unsigned OtherRegClass = DescE64->OpInfo[Op].RegClass; 1185249259Sdim if (!isVSrc(OtherRegClass) && !isSSrc(OtherRegClass)) 1186249259Sdim continue; 1187249259Sdim 1188249259Sdim int32_t TmpImm = -1; 1189249259Sdim if (foldImm(Ops[i], TmpImm, ScalarSlotUsed) || 1190249259Sdim (!fitsRegClass(DAG, Ops[i], RegClass) && 1191249259Sdim fitsRegClass(DAG, Ops[1], OtherRegClass))) { 1192249259Sdim 1193249259Sdim // Switch to e64 encoding 1194249259Sdim Immediate = -1; 1195249259Sdim Promote2e64 = true; 1196249259Sdim Desc = DescE64; 1197249259Sdim DescE64 = 0; 1198249259Sdim } 1199249259Sdim } 1200249259Sdim } 1201249259Sdim 1202249259Sdim if (Promote2e64) { 1203249259Sdim // Add the modifier flags while promoting 1204249259Sdim for (unsigned i = 0; i < 4; ++i) 1205249259Sdim Ops.push_back(DAG.getTargetConstant(0, MVT::i32)); 1206249259Sdim } 1207249259Sdim 1208249259Sdim // Add optional chain and glue 1209249259Sdim for (unsigned i = NumOps - NumDefs, e = Node->getNumOperands(); i < e; ++i) 1210249259Sdim Ops.push_back(Node->getOperand(i)); 1211249259Sdim 1212263509Sdim // Nodes that have a glue result are not CSE'd by getMachineNode(), so in 1213263509Sdim // this case a brand new node is always be created, even if the operands 1214263509Sdim // are the same as before. So, manually check if anything has been changed. 1215263509Sdim if (Desc->Opcode == Opcode && !isNodeChanged(Node, Ops)) { 1216263509Sdim return Node; 1217263509Sdim } 1218263509Sdim 1219249259Sdim // Create a complete new instruction 1220263509Sdim return DAG.getMachineNode(Desc->Opcode, SDLoc(Node), Node->getVTList(), Ops); 1221249259Sdim} 1222252723Sdim 1223252723Sdim/// \brief Helper function for adjustWritemask 1224263509Sdimstatic unsigned SubIdx2Lane(unsigned Idx) { 1225252723Sdim switch (Idx) { 1226252723Sdim default: return 0; 1227252723Sdim case AMDGPU::sub0: return 0; 1228252723Sdim case AMDGPU::sub1: return 1; 1229252723Sdim case AMDGPU::sub2: return 2; 1230252723Sdim case AMDGPU::sub3: return 3; 1231252723Sdim } 1232252723Sdim} 1233252723Sdim 1234252723Sdim/// \brief Adjust the writemask of MIMG instructions 1235252723Sdimvoid SITargetLowering::adjustWritemask(MachineSDNode *&Node, 1236252723Sdim SelectionDAG &DAG) const { 1237252723Sdim SDNode *Users[4] = { }; 1238263509Sdim unsigned Lane = 0; 1239263509Sdim unsigned OldDmask = Node->getConstantOperandVal(0); 1240263509Sdim unsigned NewDmask = 0; 1241252723Sdim 1242252723Sdim // Try to figure out the used register components 1243252723Sdim for (SDNode::use_iterator I = Node->use_begin(), E = Node->use_end(); 1244252723Sdim I != E; ++I) { 1245252723Sdim 1246252723Sdim // Abort if we can't understand the usage 1247252723Sdim if (!I->isMachineOpcode() || 1248252723Sdim I->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG) 1249252723Sdim return; 1250252723Sdim 1251263509Sdim // Lane means which subreg of %VGPRa_VGPRb_VGPRc_VGPRd is used. 1252263509Sdim // Note that subregs are packed, i.e. Lane==0 is the first bit set 1253263509Sdim // in OldDmask, so it can be any of X,Y,Z,W; Lane==1 is the second bit 1254263509Sdim // set, etc. 1255252723Sdim Lane = SubIdx2Lane(I->getConstantOperandVal(1)); 1256252723Sdim 1257263509Sdim // Set which texture component corresponds to the lane. 1258263509Sdim unsigned Comp; 1259263509Sdim for (unsigned i = 0, Dmask = OldDmask; i <= Lane; i++) { 1260263509Sdim assert(Dmask); 1261263509Sdim Comp = countTrailingZeros(Dmask); 1262263509Sdim Dmask &= ~(1 << Comp); 1263263509Sdim } 1264263509Sdim 1265252723Sdim // Abort if we have more than one user per component 1266252723Sdim if (Users[Lane]) 1267252723Sdim return; 1268252723Sdim 1269252723Sdim Users[Lane] = *I; 1270263509Sdim NewDmask |= 1 << Comp; 1271252723Sdim } 1272252723Sdim 1273263509Sdim // Abort if there's no change 1274263509Sdim if (NewDmask == OldDmask) 1275252723Sdim return; 1276252723Sdim 1277252723Sdim // Adjust the writemask in the node 1278252723Sdim std::vector<SDValue> Ops; 1279263509Sdim Ops.push_back(DAG.getTargetConstant(NewDmask, MVT::i32)); 1280252723Sdim for (unsigned i = 1, e = Node->getNumOperands(); i != e; ++i) 1281252723Sdim Ops.push_back(Node->getOperand(i)); 1282252723Sdim Node = (MachineSDNode*)DAG.UpdateNodeOperands(Node, Ops.data(), Ops.size()); 1283252723Sdim 1284252723Sdim // If we only got one lane, replace it with a copy 1285263509Sdim // (if NewDmask has only one bit set...) 1286263509Sdim if (NewDmask && (NewDmask & (NewDmask-1)) == 0) { 1287252723Sdim SDValue RC = DAG.getTargetConstant(AMDGPU::VReg_32RegClassID, MVT::i32); 1288252723Sdim SDNode *Copy = DAG.getMachineNode(TargetOpcode::COPY_TO_REGCLASS, 1289263509Sdim SDLoc(), Users[Lane]->getValueType(0), 1290252723Sdim SDValue(Node, 0), RC); 1291252723Sdim DAG.ReplaceAllUsesWith(Users[Lane], Copy); 1292252723Sdim return; 1293252723Sdim } 1294252723Sdim 1295252723Sdim // Update the users of the node with the new indices 1296252723Sdim for (unsigned i = 0, Idx = AMDGPU::sub0; i < 4; ++i) { 1297252723Sdim 1298252723Sdim SDNode *User = Users[i]; 1299252723Sdim if (!User) 1300252723Sdim continue; 1301252723Sdim 1302252723Sdim SDValue Op = DAG.getTargetConstant(Idx, MVT::i32); 1303252723Sdim DAG.UpdateNodeOperands(User, User->getOperand(0), Op); 1304252723Sdim 1305252723Sdim switch (Idx) { 1306252723Sdim default: break; 1307252723Sdim case AMDGPU::sub0: Idx = AMDGPU::sub1; break; 1308252723Sdim case AMDGPU::sub1: Idx = AMDGPU::sub2; break; 1309252723Sdim case AMDGPU::sub2: Idx = AMDGPU::sub3; break; 1310252723Sdim } 1311252723Sdim } 1312252723Sdim} 1313252723Sdim 1314252723Sdim/// \brief Fold the instructions after slecting them 1315252723SdimSDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node, 1316252723Sdim SelectionDAG &DAG) const { 1317263509Sdim const SIInstrInfo *TII = 1318263509Sdim static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo()); 1319263509Sdim Node = AdjustRegClass(Node, DAG); 1320252723Sdim 1321263509Sdim if (TII->isMIMG(Node->getMachineOpcode())) 1322252723Sdim adjustWritemask(Node, DAG); 1323252723Sdim 1324252723Sdim return foldOperands(Node, DAG); 1325252723Sdim} 1326252723Sdim 1327252723Sdim/// \brief Assign the register class depending on the number of 1328252723Sdim/// bits set in the writemask 1329252723Sdimvoid SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr *MI, 1330252723Sdim SDNode *Node) const { 1331263509Sdim const SIInstrInfo *TII = 1332263509Sdim static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo()); 1333263509Sdim if (!TII->isMIMG(MI->getOpcode())) 1334252723Sdim return; 1335252723Sdim 1336252723Sdim unsigned VReg = MI->getOperand(0).getReg(); 1337252723Sdim unsigned Writemask = MI->getOperand(1).getImm(); 1338252723Sdim unsigned BitsSet = 0; 1339252723Sdim for (unsigned i = 0; i < 4; ++i) 1340252723Sdim BitsSet += Writemask & (1 << i) ? 1 : 0; 1341252723Sdim 1342252723Sdim const TargetRegisterClass *RC; 1343252723Sdim switch (BitsSet) { 1344252723Sdim default: return; 1345252723Sdim case 1: RC = &AMDGPU::VReg_32RegClass; break; 1346252723Sdim case 2: RC = &AMDGPU::VReg_64RegClass; break; 1347252723Sdim case 3: RC = &AMDGPU::VReg_96RegClass; break; 1348252723Sdim } 1349252723Sdim 1350263509Sdim unsigned NewOpcode = TII->getMaskedMIMGOp(MI->getOpcode(), BitsSet); 1351263509Sdim MI->setDesc(TII->get(NewOpcode)); 1352252723Sdim MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); 1353252723Sdim MRI.setRegClass(VReg, RC); 1354252723Sdim} 1355263509Sdim 1356263509SdimMachineSDNode *SITargetLowering::AdjustRegClass(MachineSDNode *N, 1357263509Sdim SelectionDAG &DAG) const { 1358263509Sdim 1359263509Sdim SDLoc DL(N); 1360263509Sdim unsigned NewOpcode = N->getMachineOpcode(); 1361263509Sdim 1362263509Sdim switch (N->getMachineOpcode()) { 1363263509Sdim default: return N; 1364263509Sdim case AMDGPU::S_LOAD_DWORD_IMM: 1365263509Sdim NewOpcode = AMDGPU::BUFFER_LOAD_DWORD_ADDR64; 1366263509Sdim // Fall-through 1367263509Sdim case AMDGPU::S_LOAD_DWORDX2_SGPR: 1368263509Sdim if (NewOpcode == N->getMachineOpcode()) { 1369263509Sdim NewOpcode = AMDGPU::BUFFER_LOAD_DWORDX2_ADDR64; 1370263509Sdim } 1371263509Sdim // Fall-through 1372263509Sdim case AMDGPU::S_LOAD_DWORDX4_IMM: 1373263509Sdim case AMDGPU::S_LOAD_DWORDX4_SGPR: { 1374263509Sdim if (NewOpcode == N->getMachineOpcode()) { 1375263509Sdim NewOpcode = AMDGPU::BUFFER_LOAD_DWORDX4_ADDR64; 1376263509Sdim } 1377263509Sdim if (fitsRegClass(DAG, N->getOperand(0), AMDGPU::SReg_64RegClassID)) { 1378263509Sdim return N; 1379263509Sdim } 1380263509Sdim ConstantSDNode *Offset = cast<ConstantSDNode>(N->getOperand(1)); 1381263509Sdim SDValue Ops[] = { 1382263509Sdim SDValue(DAG.getMachineNode(AMDGPU::SI_ADDR64_RSRC, DL, MVT::i128, 1383263509Sdim DAG.getConstant(0, MVT::i64)), 0), 1384263509Sdim N->getOperand(0), 1385263509Sdim DAG.getConstant(Offset->getSExtValue() << 2, MVT::i32) 1386263509Sdim }; 1387263509Sdim return DAG.getMachineNode(NewOpcode, DL, N->getVTList(), Ops); 1388263509Sdim } 1389263509Sdim } 1390263509Sdim} 1391263509Sdim 1392263509SdimSDValue SITargetLowering::CreateLiveInRegister(SelectionDAG &DAG, 1393263509Sdim const TargetRegisterClass *RC, 1394263509Sdim unsigned Reg, EVT VT) const { 1395263509Sdim SDValue VReg = AMDGPUTargetLowering::CreateLiveInRegister(DAG, RC, Reg, VT); 1396263509Sdim 1397263509Sdim return DAG.getCopyFromReg(DAG.getEntryNode(), SDLoc(DAG.getEntryNode()), 1398263509Sdim cast<RegisterSDNode>(VReg)->getReg(), VT); 1399263509Sdim} 1400