1249259Sdim//===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===// 2249259Sdim// 3249259Sdim// The LLVM Compiler Infrastructure 4249259Sdim// 5249259Sdim// This file is distributed under the University of Illinois Open Source 6249259Sdim// License. See LICENSE.TXT for details. 7249259Sdim// 8249259Sdim//===----------------------------------------------------------------------===// 9249259Sdim// 10249259Sdim/// \file 11249259Sdim/// \brief Custom DAG lowering for SI 12249259Sdim// 13249259Sdim//===----------------------------------------------------------------------===// 14249259Sdim 15249259Sdim#include "SIISelLowering.h" 16249259Sdim#include "AMDIL.h" 17249259Sdim#include "AMDGPU.h" 18249259Sdim#include "AMDILIntrinsicInfo.h" 19249259Sdim#include "SIInstrInfo.h" 20249259Sdim#include "SIMachineFunctionInfo.h" 21249259Sdim#include "SIRegisterInfo.h" 22249259Sdim#include "llvm/IR/Function.h" 23249259Sdim#include "llvm/CodeGen/CallingConvLower.h" 24249259Sdim#include "llvm/CodeGen/MachineInstrBuilder.h" 25249259Sdim#include "llvm/CodeGen/MachineRegisterInfo.h" 26249259Sdim#include "llvm/CodeGen/SelectionDAG.h" 27249259Sdim 28249259Sdimusing namespace llvm; 29249259Sdim 30249259SdimSITargetLowering::SITargetLowering(TargetMachine &TM) : 31249259Sdim AMDGPUTargetLowering(TM), 32249259Sdim TII(static_cast<const SIInstrInfo*>(TM.getInstrInfo())), 33249259Sdim TRI(TM.getRegisterInfo()) { 34249259Sdim 35249259Sdim addRegisterClass(MVT::i1, &AMDGPU::SReg_64RegClass); 36249259Sdim addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass); 37249259Sdim 38249259Sdim addRegisterClass(MVT::v16i8, &AMDGPU::SReg_128RegClass); 39249259Sdim addRegisterClass(MVT::v32i8, &AMDGPU::SReg_256RegClass); 40249259Sdim addRegisterClass(MVT::v64i8, &AMDGPU::SReg_512RegClass); 41249259Sdim 42249259Sdim addRegisterClass(MVT::i32, &AMDGPU::VReg_32RegClass); 43249259Sdim addRegisterClass(MVT::f32, &AMDGPU::VReg_32RegClass); 44249259Sdim 45249259Sdim addRegisterClass(MVT::v1i32, &AMDGPU::VReg_32RegClass); 46249259Sdim 47249259Sdim addRegisterClass(MVT::v2i32, &AMDGPU::VReg_64RegClass); 48249259Sdim addRegisterClass(MVT::v2f32, &AMDGPU::VReg_64RegClass); 49249259Sdim 50249259Sdim addRegisterClass(MVT::v4i32, &AMDGPU::VReg_128RegClass); 51249259Sdim addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass); 52251662Sdim addRegisterClass(MVT::i128, &AMDGPU::SReg_128RegClass); 53249259Sdim 54249259Sdim addRegisterClass(MVT::v8i32, &AMDGPU::VReg_256RegClass); 55249259Sdim addRegisterClass(MVT::v8f32, &AMDGPU::VReg_256RegClass); 56249259Sdim 57249259Sdim addRegisterClass(MVT::v16i32, &AMDGPU::VReg_512RegClass); 58249259Sdim addRegisterClass(MVT::v16f32, &AMDGPU::VReg_512RegClass); 59249259Sdim 60249259Sdim computeRegisterProperties(); 61249259Sdim 62249259Sdim setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i32, Expand); 63249259Sdim setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8f32, Expand); 64249259Sdim setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i32, Expand); 65249259Sdim setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16f32, Expand); 66249259Sdim 67249259Sdim setOperationAction(ISD::ADD, MVT::i64, Legal); 68249259Sdim setOperationAction(ISD::ADD, MVT::i32, Legal); 69249259Sdim 70249259Sdim setOperationAction(ISD::SELECT_CC, MVT::f32, Custom); 71249259Sdim setOperationAction(ISD::SELECT_CC, MVT::i32, Custom); 72249259Sdim 73249259Sdim setOperationAction(ISD::SELECT_CC, MVT::Other, Expand); 74251662Sdim 75251662Sdim setOperationAction(ISD::STORE, MVT::i32, Custom); 76251662Sdim setOperationAction(ISD::STORE, MVT::i64, Custom); 77251662Sdim 78249259Sdim setTargetDAGCombine(ISD::SELECT_CC); 79249259Sdim 80249259Sdim setTargetDAGCombine(ISD::SETCC); 81249259Sdim 82249259Sdim setSchedulingPreference(Sched::RegPressure); 83249259Sdim} 84249259Sdim 85249259SdimSDValue SITargetLowering::LowerFormalArguments( 86249259Sdim SDValue Chain, 87249259Sdim CallingConv::ID CallConv, 88249259Sdim bool isVarArg, 89249259Sdim const SmallVectorImpl<ISD::InputArg> &Ins, 90249259Sdim DebugLoc DL, SelectionDAG &DAG, 91249259Sdim SmallVectorImpl<SDValue> &InVals) const { 92249259Sdim 93249259Sdim const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo(); 94249259Sdim 95249259Sdim MachineFunction &MF = DAG.getMachineFunction(); 96249259Sdim FunctionType *FType = MF.getFunction()->getFunctionType(); 97249259Sdim SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 98249259Sdim 99249259Sdim assert(CallConv == CallingConv::C); 100249259Sdim 101249259Sdim SmallVector<ISD::InputArg, 16> Splits; 102249259Sdim uint32_t Skipped = 0; 103249259Sdim 104249259Sdim for (unsigned i = 0, e = Ins.size(), PSInputNum = 0; i != e; ++i) { 105249259Sdim const ISD::InputArg &Arg = Ins[i]; 106249259Sdim 107249259Sdim // First check if it's a PS input addr 108249259Sdim if (Info->ShaderType == ShaderType::PIXEL && !Arg.Flags.isInReg()) { 109249259Sdim 110249259Sdim assert((PSInputNum <= 15) && "Too many PS inputs!"); 111249259Sdim 112249259Sdim if (!Arg.Used) { 113249259Sdim // We can savely skip PS inputs 114249259Sdim Skipped |= 1 << i; 115249259Sdim ++PSInputNum; 116249259Sdim continue; 117249259Sdim } 118249259Sdim 119249259Sdim Info->PSInputAddr |= 1 << PSInputNum++; 120249259Sdim } 121249259Sdim 122249259Sdim // Second split vertices into their elements 123249259Sdim if (Arg.VT.isVector()) { 124249259Sdim ISD::InputArg NewArg = Arg; 125249259Sdim NewArg.Flags.setSplit(); 126249259Sdim NewArg.VT = Arg.VT.getVectorElementType(); 127249259Sdim 128249259Sdim // We REALLY want the ORIGINAL number of vertex elements here, e.g. a 129249259Sdim // three or five element vertex only needs three or five registers, 130249259Sdim // NOT four or eigth. 131249259Sdim Type *ParamType = FType->getParamType(Arg.OrigArgIndex); 132249259Sdim unsigned NumElements = ParamType->getVectorNumElements(); 133249259Sdim 134249259Sdim for (unsigned j = 0; j != NumElements; ++j) { 135249259Sdim Splits.push_back(NewArg); 136249259Sdim NewArg.PartOffset += NewArg.VT.getStoreSize(); 137249259Sdim } 138249259Sdim 139249259Sdim } else { 140249259Sdim Splits.push_back(Arg); 141249259Sdim } 142249259Sdim } 143249259Sdim 144249259Sdim SmallVector<CCValAssign, 16> ArgLocs; 145249259Sdim CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), 146249259Sdim getTargetMachine(), ArgLocs, *DAG.getContext()); 147249259Sdim 148249259Sdim // At least one interpolation mode must be enabled or else the GPU will hang. 149249259Sdim if (Info->ShaderType == ShaderType::PIXEL && (Info->PSInputAddr & 0x7F) == 0) { 150249259Sdim Info->PSInputAddr |= 1; 151249259Sdim CCInfo.AllocateReg(AMDGPU::VGPR0); 152249259Sdim CCInfo.AllocateReg(AMDGPU::VGPR1); 153249259Sdim } 154249259Sdim 155249259Sdim AnalyzeFormalArguments(CCInfo, Splits); 156249259Sdim 157249259Sdim for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) { 158249259Sdim 159249259Sdim if (Skipped & (1 << i)) { 160249259Sdim InVals.push_back(SDValue()); 161249259Sdim continue; 162249259Sdim } 163249259Sdim 164249259Sdim CCValAssign &VA = ArgLocs[ArgIdx++]; 165249259Sdim assert(VA.isRegLoc() && "Parameter must be in a register!"); 166249259Sdim 167249259Sdim unsigned Reg = VA.getLocReg(); 168249259Sdim MVT VT = VA.getLocVT(); 169249259Sdim 170249259Sdim if (VT == MVT::i64) { 171249259Sdim // For now assume it is a pointer 172249259Sdim Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0, 173249259Sdim &AMDGPU::SReg_64RegClass); 174249259Sdim Reg = MF.addLiveIn(Reg, &AMDGPU::SReg_64RegClass); 175249259Sdim InVals.push_back(DAG.getCopyFromReg(Chain, DL, Reg, VT)); 176249259Sdim continue; 177249259Sdim } 178249259Sdim 179249259Sdim const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT); 180249259Sdim 181249259Sdim Reg = MF.addLiveIn(Reg, RC); 182249259Sdim SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT); 183249259Sdim 184249259Sdim const ISD::InputArg &Arg = Ins[i]; 185249259Sdim if (Arg.VT.isVector()) { 186249259Sdim 187249259Sdim // Build a vector from the registers 188249259Sdim Type *ParamType = FType->getParamType(Arg.OrigArgIndex); 189249259Sdim unsigned NumElements = ParamType->getVectorNumElements(); 190249259Sdim 191249259Sdim SmallVector<SDValue, 4> Regs; 192249259Sdim Regs.push_back(Val); 193249259Sdim for (unsigned j = 1; j != NumElements; ++j) { 194249259Sdim Reg = ArgLocs[ArgIdx++].getLocReg(); 195249259Sdim Reg = MF.addLiveIn(Reg, RC); 196249259Sdim Regs.push_back(DAG.getCopyFromReg(Chain, DL, Reg, VT)); 197249259Sdim } 198249259Sdim 199249259Sdim // Fill up the missing vector elements 200249259Sdim NumElements = Arg.VT.getVectorNumElements() - NumElements; 201249259Sdim for (unsigned j = 0; j != NumElements; ++j) 202249259Sdim Regs.push_back(DAG.getUNDEF(VT)); 203249259Sdim 204249259Sdim InVals.push_back(DAG.getNode(ISD::BUILD_VECTOR, DL, Arg.VT, 205249259Sdim Regs.data(), Regs.size())); 206249259Sdim continue; 207249259Sdim } 208249259Sdim 209249259Sdim InVals.push_back(Val); 210249259Sdim } 211249259Sdim return Chain; 212249259Sdim} 213249259Sdim 214249259SdimMachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter( 215249259Sdim MachineInstr * MI, MachineBasicBlock * BB) const { 216249259Sdim 217249259Sdim switch (MI->getOpcode()) { 218249259Sdim default: 219249259Sdim return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB); 220249259Sdim case AMDGPU::BRANCH: return BB; 221249259Sdim } 222249259Sdim return BB; 223249259Sdim} 224249259Sdim 225249259SdimEVT SITargetLowering::getSetCCResultType(EVT VT) const { 226249259Sdim return MVT::i1; 227249259Sdim} 228249259Sdim 229249259SdimMVT SITargetLowering::getScalarShiftAmountTy(EVT VT) const { 230249259Sdim return MVT::i32; 231249259Sdim} 232249259Sdim 233249259Sdim//===----------------------------------------------------------------------===// 234249259Sdim// Custom DAG Lowering Operations 235249259Sdim//===----------------------------------------------------------------------===// 236249259Sdim 237249259SdimSDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { 238249259Sdim switch (Op.getOpcode()) { 239249259Sdim default: return AMDGPUTargetLowering::LowerOperation(Op, DAG); 240249259Sdim case ISD::BRCOND: return LowerBRCOND(Op, DAG); 241249259Sdim case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG); 242251662Sdim case ISD::STORE: return LowerSTORE(Op, DAG); 243249259Sdim } 244249259Sdim return SDValue(); 245249259Sdim} 246249259Sdim 247249259Sdim/// \brief Helper function for LowerBRCOND 248249259Sdimstatic SDNode *findUser(SDValue Value, unsigned Opcode) { 249249259Sdim 250249259Sdim SDNode *Parent = Value.getNode(); 251249259Sdim for (SDNode::use_iterator I = Parent->use_begin(), E = Parent->use_end(); 252249259Sdim I != E; ++I) { 253249259Sdim 254249259Sdim if (I.getUse().get() != Value) 255249259Sdim continue; 256249259Sdim 257249259Sdim if (I->getOpcode() == Opcode) 258249259Sdim return *I; 259249259Sdim } 260249259Sdim return 0; 261249259Sdim} 262249259Sdim 263249259Sdim/// This transforms the control flow intrinsics to get the branch destination as 264249259Sdim/// last parameter, also switches branch target with BR if the need arise 265249259SdimSDValue SITargetLowering::LowerBRCOND(SDValue BRCOND, 266249259Sdim SelectionDAG &DAG) const { 267249259Sdim 268249259Sdim DebugLoc DL = BRCOND.getDebugLoc(); 269249259Sdim 270249259Sdim SDNode *Intr = BRCOND.getOperand(1).getNode(); 271249259Sdim SDValue Target = BRCOND.getOperand(2); 272249259Sdim SDNode *BR = 0; 273249259Sdim 274249259Sdim if (Intr->getOpcode() == ISD::SETCC) { 275249259Sdim // As long as we negate the condition everything is fine 276249259Sdim SDNode *SetCC = Intr; 277249259Sdim assert(SetCC->getConstantOperandVal(1) == 1); 278249259Sdim assert(cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() == 279249259Sdim ISD::SETNE); 280249259Sdim Intr = SetCC->getOperand(0).getNode(); 281249259Sdim 282249259Sdim } else { 283249259Sdim // Get the target from BR if we don't negate the condition 284249259Sdim BR = findUser(BRCOND, ISD::BR); 285249259Sdim Target = BR->getOperand(1); 286249259Sdim } 287249259Sdim 288249259Sdim assert(Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN); 289249259Sdim 290249259Sdim // Build the result and 291249259Sdim SmallVector<EVT, 4> Res; 292249259Sdim for (unsigned i = 1, e = Intr->getNumValues(); i != e; ++i) 293249259Sdim Res.push_back(Intr->getValueType(i)); 294249259Sdim 295249259Sdim // operands of the new intrinsic call 296249259Sdim SmallVector<SDValue, 4> Ops; 297249259Sdim Ops.push_back(BRCOND.getOperand(0)); 298249259Sdim for (unsigned i = 1, e = Intr->getNumOperands(); i != e; ++i) 299249259Sdim Ops.push_back(Intr->getOperand(i)); 300249259Sdim Ops.push_back(Target); 301249259Sdim 302249259Sdim // build the new intrinsic call 303249259Sdim SDNode *Result = DAG.getNode( 304249259Sdim Res.size() > 1 ? ISD::INTRINSIC_W_CHAIN : ISD::INTRINSIC_VOID, DL, 305249259Sdim DAG.getVTList(Res.data(), Res.size()), Ops.data(), Ops.size()).getNode(); 306249259Sdim 307249259Sdim if (BR) { 308249259Sdim // Give the branch instruction our target 309249259Sdim SDValue Ops[] = { 310249259Sdim BR->getOperand(0), 311249259Sdim BRCOND.getOperand(2) 312249259Sdim }; 313249259Sdim DAG.MorphNodeTo(BR, ISD::BR, BR->getVTList(), Ops, 2); 314249259Sdim } 315249259Sdim 316249259Sdim SDValue Chain = SDValue(Result, Result->getNumValues() - 1); 317249259Sdim 318249259Sdim // Copy the intrinsic results to registers 319249259Sdim for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) { 320249259Sdim SDNode *CopyToReg = findUser(SDValue(Intr, i), ISD::CopyToReg); 321249259Sdim if (!CopyToReg) 322249259Sdim continue; 323249259Sdim 324249259Sdim Chain = DAG.getCopyToReg( 325249259Sdim Chain, DL, 326249259Sdim CopyToReg->getOperand(1), 327249259Sdim SDValue(Result, i - 1), 328249259Sdim SDValue()); 329249259Sdim 330249259Sdim DAG.ReplaceAllUsesWith(SDValue(CopyToReg, 0), CopyToReg->getOperand(0)); 331249259Sdim } 332249259Sdim 333249259Sdim // Remove the old intrinsic from the chain 334249259Sdim DAG.ReplaceAllUsesOfValueWith( 335249259Sdim SDValue(Intr, Intr->getNumValues() - 1), 336249259Sdim Intr->getOperand(0)); 337249259Sdim 338249259Sdim return Chain; 339249259Sdim} 340249259Sdim 341251662Sdim#define RSRC_DATA_FORMAT 0xf00000000000 342251662Sdim 343251662SdimSDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { 344251662Sdim StoreSDNode *StoreNode = cast<StoreSDNode>(Op); 345251662Sdim SDValue Chain = Op.getOperand(0); 346251662Sdim SDValue Value = Op.getOperand(1); 347251662Sdim SDValue VirtualAddress = Op.getOperand(2); 348251662Sdim DebugLoc DL = Op.getDebugLoc(); 349251662Sdim 350251662Sdim if (StoreNode->getAddressSpace() != AMDGPUAS::GLOBAL_ADDRESS) { 351251662Sdim return SDValue(); 352251662Sdim } 353251662Sdim 354251662Sdim SDValue SrcSrc = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i128, 355251662Sdim DAG.getConstant(0, MVT::i64), 356251662Sdim DAG.getConstant(RSRC_DATA_FORMAT, MVT::i64)); 357251662Sdim 358251662Sdim SDValue Ops[2]; 359251662Sdim Ops[0] = DAG.getNode(AMDGPUISD::BUFFER_STORE, DL, MVT::Other, Chain, 360251662Sdim Value, SrcSrc, VirtualAddress); 361251662Sdim Ops[1] = Chain; 362251662Sdim 363251662Sdim return DAG.getMergeValues(Ops, 2, DL); 364251662Sdim 365251662Sdim} 366251662Sdim 367249259SdimSDValue SITargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { 368249259Sdim SDValue LHS = Op.getOperand(0); 369249259Sdim SDValue RHS = Op.getOperand(1); 370249259Sdim SDValue True = Op.getOperand(2); 371249259Sdim SDValue False = Op.getOperand(3); 372249259Sdim SDValue CC = Op.getOperand(4); 373249259Sdim EVT VT = Op.getValueType(); 374249259Sdim DebugLoc DL = Op.getDebugLoc(); 375249259Sdim 376249259Sdim // Possible Min/Max pattern 377249259Sdim SDValue MinMax = LowerMinMax(Op, DAG); 378249259Sdim if (MinMax.getNode()) { 379249259Sdim return MinMax; 380249259Sdim } 381249259Sdim 382249259Sdim SDValue Cond = DAG.getNode(ISD::SETCC, DL, MVT::i1, LHS, RHS, CC); 383249259Sdim return DAG.getNode(ISD::SELECT, DL, VT, Cond, True, False); 384249259Sdim} 385249259Sdim 386249259Sdim//===----------------------------------------------------------------------===// 387249259Sdim// Custom DAG optimizations 388249259Sdim//===----------------------------------------------------------------------===// 389249259Sdim 390249259SdimSDValue SITargetLowering::PerformDAGCombine(SDNode *N, 391249259Sdim DAGCombinerInfo &DCI) const { 392249259Sdim SelectionDAG &DAG = DCI.DAG; 393249259Sdim DebugLoc DL = N->getDebugLoc(); 394249259Sdim EVT VT = N->getValueType(0); 395249259Sdim 396249259Sdim switch (N->getOpcode()) { 397249259Sdim default: break; 398249259Sdim case ISD::SELECT_CC: { 399249259Sdim N->dump(); 400249259Sdim ConstantSDNode *True, *False; 401249259Sdim // i1 selectcc(l, r, -1, 0, cc) -> i1 setcc(l, r, cc) 402249259Sdim if ((True = dyn_cast<ConstantSDNode>(N->getOperand(2))) 403249259Sdim && (False = dyn_cast<ConstantSDNode>(N->getOperand(3))) 404249259Sdim && True->isAllOnesValue() 405249259Sdim && False->isNullValue() 406249259Sdim && VT == MVT::i1) { 407249259Sdim return DAG.getNode(ISD::SETCC, DL, VT, N->getOperand(0), 408249259Sdim N->getOperand(1), N->getOperand(4)); 409249259Sdim 410249259Sdim } 411249259Sdim break; 412249259Sdim } 413249259Sdim case ISD::SETCC: { 414249259Sdim SDValue Arg0 = N->getOperand(0); 415249259Sdim SDValue Arg1 = N->getOperand(1); 416249259Sdim SDValue CC = N->getOperand(2); 417249259Sdim ConstantSDNode * C = NULL; 418249259Sdim ISD::CondCode CCOp = dyn_cast<CondCodeSDNode>(CC)->get(); 419249259Sdim 420249259Sdim // i1 setcc (sext(i1), 0, setne) -> i1 setcc(i1, 0, setne) 421249259Sdim if (VT == MVT::i1 422249259Sdim && Arg0.getOpcode() == ISD::SIGN_EXTEND 423249259Sdim && Arg0.getOperand(0).getValueType() == MVT::i1 424249259Sdim && (C = dyn_cast<ConstantSDNode>(Arg1)) 425249259Sdim && C->isNullValue() 426249259Sdim && CCOp == ISD::SETNE) { 427249259Sdim return SimplifySetCC(VT, Arg0.getOperand(0), 428249259Sdim DAG.getConstant(0, MVT::i1), CCOp, true, DCI, DL); 429249259Sdim } 430249259Sdim break; 431249259Sdim } 432249259Sdim } 433249259Sdim return SDValue(); 434249259Sdim} 435249259Sdim 436249259Sdim/// \brief Test if RegClass is one of the VSrc classes 437249259Sdimstatic bool isVSrc(unsigned RegClass) { 438249259Sdim return AMDGPU::VSrc_32RegClassID == RegClass || 439249259Sdim AMDGPU::VSrc_64RegClassID == RegClass; 440249259Sdim} 441249259Sdim 442249259Sdim/// \brief Test if RegClass is one of the SSrc classes 443249259Sdimstatic bool isSSrc(unsigned RegClass) { 444249259Sdim return AMDGPU::SSrc_32RegClassID == RegClass || 445249259Sdim AMDGPU::SSrc_64RegClassID == RegClass; 446249259Sdim} 447249259Sdim 448249259Sdim/// \brief Analyze the possible immediate value Op 449249259Sdim/// 450249259Sdim/// Returns -1 if it isn't an immediate, 0 if it's and inline immediate 451249259Sdim/// and the immediate value if it's a literal immediate 452249259Sdimint32_t SITargetLowering::analyzeImmediate(const SDNode *N) const { 453249259Sdim 454249259Sdim union { 455249259Sdim int32_t I; 456249259Sdim float F; 457249259Sdim } Imm; 458249259Sdim 459251662Sdim if (const ConstantSDNode *Node = dyn_cast<ConstantSDNode>(N)) { 460251662Sdim if (Node->getZExtValue() >> 32) { 461251662Sdim return -1; 462251662Sdim } 463249259Sdim Imm.I = Node->getSExtValue(); 464251662Sdim } else if (const ConstantFPSDNode *Node = dyn_cast<ConstantFPSDNode>(N)) 465249259Sdim Imm.F = Node->getValueAPF().convertToFloat(); 466249259Sdim else 467249259Sdim return -1; // It isn't an immediate 468249259Sdim 469249259Sdim if ((Imm.I >= -16 && Imm.I <= 64) || 470249259Sdim Imm.F == 0.5f || Imm.F == -0.5f || 471249259Sdim Imm.F == 1.0f || Imm.F == -1.0f || 472249259Sdim Imm.F == 2.0f || Imm.F == -2.0f || 473249259Sdim Imm.F == 4.0f || Imm.F == -4.0f) 474249259Sdim return 0; // It's an inline immediate 475249259Sdim 476249259Sdim return Imm.I; // It's a literal immediate 477249259Sdim} 478249259Sdim 479249259Sdim/// \brief Try to fold an immediate directly into an instruction 480249259Sdimbool SITargetLowering::foldImm(SDValue &Operand, int32_t &Immediate, 481249259Sdim bool &ScalarSlotUsed) const { 482249259Sdim 483249259Sdim MachineSDNode *Mov = dyn_cast<MachineSDNode>(Operand); 484249259Sdim if (Mov == 0 || !TII->isMov(Mov->getMachineOpcode())) 485249259Sdim return false; 486249259Sdim 487249259Sdim const SDValue &Op = Mov->getOperand(0); 488249259Sdim int32_t Value = analyzeImmediate(Op.getNode()); 489249259Sdim if (Value == -1) { 490249259Sdim // Not an immediate at all 491249259Sdim return false; 492249259Sdim 493249259Sdim } else if (Value == 0) { 494249259Sdim // Inline immediates can always be fold 495249259Sdim Operand = Op; 496249259Sdim return true; 497249259Sdim 498249259Sdim } else if (Value == Immediate) { 499249259Sdim // Already fold literal immediate 500249259Sdim Operand = Op; 501249259Sdim return true; 502249259Sdim 503249259Sdim } else if (!ScalarSlotUsed && !Immediate) { 504249259Sdim // Fold this literal immediate 505249259Sdim ScalarSlotUsed = true; 506249259Sdim Immediate = Value; 507249259Sdim Operand = Op; 508249259Sdim return true; 509249259Sdim 510249259Sdim } 511249259Sdim 512249259Sdim return false; 513249259Sdim} 514249259Sdim 515249259Sdim/// \brief Does "Op" fit into register class "RegClass" ? 516249259Sdimbool SITargetLowering::fitsRegClass(SelectionDAG &DAG, SDValue &Op, 517249259Sdim unsigned RegClass) const { 518249259Sdim 519249259Sdim MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo(); 520249259Sdim SDNode *Node = Op.getNode(); 521249259Sdim 522249259Sdim const TargetRegisterClass *OpClass; 523249259Sdim if (MachineSDNode *MN = dyn_cast<MachineSDNode>(Node)) { 524249259Sdim const MCInstrDesc &Desc = TII->get(MN->getMachineOpcode()); 525249259Sdim int OpClassID = Desc.OpInfo[Op.getResNo()].RegClass; 526249259Sdim if (OpClassID == -1) 527249259Sdim OpClass = getRegClassFor(Op.getSimpleValueType()); 528249259Sdim else 529249259Sdim OpClass = TRI->getRegClass(OpClassID); 530249259Sdim 531249259Sdim } else if (Node->getOpcode() == ISD::CopyFromReg) { 532249259Sdim RegisterSDNode *Reg = cast<RegisterSDNode>(Node->getOperand(1).getNode()); 533249259Sdim OpClass = MRI.getRegClass(Reg->getReg()); 534249259Sdim 535249259Sdim } else 536249259Sdim return false; 537249259Sdim 538249259Sdim return TRI->getRegClass(RegClass)->hasSubClassEq(OpClass); 539249259Sdim} 540249259Sdim 541249259Sdim/// \brief Make sure that we don't exeed the number of allowed scalars 542249259Sdimvoid SITargetLowering::ensureSRegLimit(SelectionDAG &DAG, SDValue &Operand, 543249259Sdim unsigned RegClass, 544249259Sdim bool &ScalarSlotUsed) const { 545249259Sdim 546249259Sdim // First map the operands register class to a destination class 547249259Sdim if (RegClass == AMDGPU::VSrc_32RegClassID) 548249259Sdim RegClass = AMDGPU::VReg_32RegClassID; 549249259Sdim else if (RegClass == AMDGPU::VSrc_64RegClassID) 550249259Sdim RegClass = AMDGPU::VReg_64RegClassID; 551249259Sdim else 552249259Sdim return; 553249259Sdim 554249259Sdim // Nothing todo if they fit naturaly 555249259Sdim if (fitsRegClass(DAG, Operand, RegClass)) 556249259Sdim return; 557249259Sdim 558249259Sdim // If the scalar slot isn't used yet use it now 559249259Sdim if (!ScalarSlotUsed) { 560249259Sdim ScalarSlotUsed = true; 561249259Sdim return; 562249259Sdim } 563249259Sdim 564249259Sdim // This is a conservative aproach, it is possible that we can't determine 565249259Sdim // the correct register class and copy too often, but better save than sorry. 566249259Sdim SDValue RC = DAG.getTargetConstant(RegClass, MVT::i32); 567249259Sdim SDNode *Node = DAG.getMachineNode(TargetOpcode::COPY_TO_REGCLASS, DebugLoc(), 568249259Sdim Operand.getValueType(), Operand, RC); 569249259Sdim Operand = SDValue(Node, 0); 570249259Sdim} 571249259Sdim 572251662Sdim/// \brief Try to fold the Nodes operands into the Node 573251662SdimSDNode *SITargetLowering::foldOperands(MachineSDNode *Node, 574251662Sdim SelectionDAG &DAG) const { 575249259Sdim 576249259Sdim // Original encoding (either e32 or e64) 577249259Sdim int Opcode = Node->getMachineOpcode(); 578249259Sdim const MCInstrDesc *Desc = &TII->get(Opcode); 579249259Sdim 580249259Sdim unsigned NumDefs = Desc->getNumDefs(); 581249259Sdim unsigned NumOps = Desc->getNumOperands(); 582249259Sdim 583249259Sdim // Commuted opcode if available 584249259Sdim int OpcodeRev = Desc->isCommutable() ? TII->commuteOpcode(Opcode) : -1; 585249259Sdim const MCInstrDesc *DescRev = OpcodeRev == -1 ? 0 : &TII->get(OpcodeRev); 586249259Sdim 587249259Sdim assert(!DescRev || DescRev->getNumDefs() == NumDefs); 588249259Sdim assert(!DescRev || DescRev->getNumOperands() == NumOps); 589249259Sdim 590249259Sdim // e64 version if available, -1 otherwise 591249259Sdim int OpcodeE64 = AMDGPU::getVOPe64(Opcode); 592249259Sdim const MCInstrDesc *DescE64 = OpcodeE64 == -1 ? 0 : &TII->get(OpcodeE64); 593249259Sdim 594249259Sdim assert(!DescE64 || DescE64->getNumDefs() == NumDefs); 595249259Sdim assert(!DescE64 || DescE64->getNumOperands() == (NumOps + 4)); 596249259Sdim 597249259Sdim int32_t Immediate = Desc->getSize() == 4 ? 0 : -1; 598249259Sdim bool HaveVSrc = false, HaveSSrc = false; 599249259Sdim 600249259Sdim // First figure out what we alread have in this instruction 601249259Sdim for (unsigned i = 0, e = Node->getNumOperands(), Op = NumDefs; 602249259Sdim i != e && Op < NumOps; ++i, ++Op) { 603249259Sdim 604249259Sdim unsigned RegClass = Desc->OpInfo[Op].RegClass; 605249259Sdim if (isVSrc(RegClass)) 606249259Sdim HaveVSrc = true; 607249259Sdim else if (isSSrc(RegClass)) 608249259Sdim HaveSSrc = true; 609249259Sdim else 610249259Sdim continue; 611249259Sdim 612249259Sdim int32_t Imm = analyzeImmediate(Node->getOperand(i).getNode()); 613249259Sdim if (Imm != -1 && Imm != 0) { 614249259Sdim // Literal immediate 615249259Sdim Immediate = Imm; 616249259Sdim } 617249259Sdim } 618249259Sdim 619249259Sdim // If we neither have VSrc nor SSrc it makes no sense to continue 620249259Sdim if (!HaveVSrc && !HaveSSrc) 621249259Sdim return Node; 622249259Sdim 623249259Sdim // No scalar allowed when we have both VSrc and SSrc 624249259Sdim bool ScalarSlotUsed = HaveVSrc && HaveSSrc; 625249259Sdim 626249259Sdim // Second go over the operands and try to fold them 627249259Sdim std::vector<SDValue> Ops; 628249259Sdim bool Promote2e64 = false; 629249259Sdim for (unsigned i = 0, e = Node->getNumOperands(), Op = NumDefs; 630249259Sdim i != e && Op < NumOps; ++i, ++Op) { 631249259Sdim 632249259Sdim const SDValue &Operand = Node->getOperand(i); 633249259Sdim Ops.push_back(Operand); 634249259Sdim 635249259Sdim // Already folded immediate ? 636249259Sdim if (isa<ConstantSDNode>(Operand.getNode()) || 637249259Sdim isa<ConstantFPSDNode>(Operand.getNode())) 638249259Sdim continue; 639249259Sdim 640249259Sdim // Is this a VSrc or SSrc operand ? 641249259Sdim unsigned RegClass = Desc->OpInfo[Op].RegClass; 642249259Sdim if (isVSrc(RegClass) || isSSrc(RegClass)) { 643249259Sdim // Try to fold the immediates 644249259Sdim if (!foldImm(Ops[i], Immediate, ScalarSlotUsed)) { 645249259Sdim // Folding didn't worked, make sure we don't hit the SReg limit 646249259Sdim ensureSRegLimit(DAG, Ops[i], RegClass, ScalarSlotUsed); 647249259Sdim } 648249259Sdim continue; 649249259Sdim } 650249259Sdim 651249259Sdim if (i == 1 && DescRev && fitsRegClass(DAG, Ops[0], RegClass)) { 652249259Sdim 653249259Sdim unsigned OtherRegClass = Desc->OpInfo[NumDefs].RegClass; 654249259Sdim assert(isVSrc(OtherRegClass) || isSSrc(OtherRegClass)); 655249259Sdim 656249259Sdim // Test if it makes sense to swap operands 657249259Sdim if (foldImm(Ops[1], Immediate, ScalarSlotUsed) || 658249259Sdim (!fitsRegClass(DAG, Ops[1], RegClass) && 659249259Sdim fitsRegClass(DAG, Ops[1], OtherRegClass))) { 660249259Sdim 661249259Sdim // Swap commutable operands 662249259Sdim SDValue Tmp = Ops[1]; 663249259Sdim Ops[1] = Ops[0]; 664249259Sdim Ops[0] = Tmp; 665249259Sdim 666249259Sdim Desc = DescRev; 667249259Sdim DescRev = 0; 668249259Sdim continue; 669249259Sdim } 670249259Sdim } 671249259Sdim 672249259Sdim if (DescE64 && !Immediate) { 673249259Sdim 674249259Sdim // Test if it makes sense to switch to e64 encoding 675249259Sdim unsigned OtherRegClass = DescE64->OpInfo[Op].RegClass; 676249259Sdim if (!isVSrc(OtherRegClass) && !isSSrc(OtherRegClass)) 677249259Sdim continue; 678249259Sdim 679249259Sdim int32_t TmpImm = -1; 680249259Sdim if (foldImm(Ops[i], TmpImm, ScalarSlotUsed) || 681249259Sdim (!fitsRegClass(DAG, Ops[i], RegClass) && 682249259Sdim fitsRegClass(DAG, Ops[1], OtherRegClass))) { 683249259Sdim 684249259Sdim // Switch to e64 encoding 685249259Sdim Immediate = -1; 686249259Sdim Promote2e64 = true; 687249259Sdim Desc = DescE64; 688249259Sdim DescE64 = 0; 689249259Sdim } 690249259Sdim } 691249259Sdim } 692249259Sdim 693249259Sdim if (Promote2e64) { 694249259Sdim // Add the modifier flags while promoting 695249259Sdim for (unsigned i = 0; i < 4; ++i) 696249259Sdim Ops.push_back(DAG.getTargetConstant(0, MVT::i32)); 697249259Sdim } 698249259Sdim 699249259Sdim // Add optional chain and glue 700249259Sdim for (unsigned i = NumOps - NumDefs, e = Node->getNumOperands(); i < e; ++i) 701249259Sdim Ops.push_back(Node->getOperand(i)); 702249259Sdim 703249259Sdim // Create a complete new instruction 704249259Sdim return DAG.getMachineNode(Desc->Opcode, Node->getDebugLoc(), 705251662Sdim Node->getVTList(), Ops); 706249259Sdim} 707251662Sdim 708251662Sdim/// \brief Helper function for adjustWritemask 709251662Sdimunsigned SubIdx2Lane(unsigned Idx) { 710251662Sdim switch (Idx) { 711251662Sdim default: return 0; 712251662Sdim case AMDGPU::sub0: return 0; 713251662Sdim case AMDGPU::sub1: return 1; 714251662Sdim case AMDGPU::sub2: return 2; 715251662Sdim case AMDGPU::sub3: return 3; 716251662Sdim } 717251662Sdim} 718251662Sdim 719251662Sdim/// \brief Adjust the writemask of MIMG instructions 720251662Sdimvoid SITargetLowering::adjustWritemask(MachineSDNode *&Node, 721251662Sdim SelectionDAG &DAG) const { 722251662Sdim SDNode *Users[4] = { }; 723251662Sdim unsigned Writemask = 0, Lane = 0; 724251662Sdim 725251662Sdim // Try to figure out the used register components 726251662Sdim for (SDNode::use_iterator I = Node->use_begin(), E = Node->use_end(); 727251662Sdim I != E; ++I) { 728251662Sdim 729251662Sdim // Abort if we can't understand the usage 730251662Sdim if (!I->isMachineOpcode() || 731251662Sdim I->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG) 732251662Sdim return; 733251662Sdim 734251662Sdim Lane = SubIdx2Lane(I->getConstantOperandVal(1)); 735251662Sdim 736251662Sdim // Abort if we have more than one user per component 737251662Sdim if (Users[Lane]) 738251662Sdim return; 739251662Sdim 740251662Sdim Users[Lane] = *I; 741251662Sdim Writemask |= 1 << Lane; 742251662Sdim } 743251662Sdim 744251662Sdim // Abort if all components are used 745251662Sdim if (Writemask == 0xf) 746251662Sdim return; 747251662Sdim 748251662Sdim // Adjust the writemask in the node 749251662Sdim std::vector<SDValue> Ops; 750251662Sdim Ops.push_back(DAG.getTargetConstant(Writemask, MVT::i32)); 751251662Sdim for (unsigned i = 1, e = Node->getNumOperands(); i != e; ++i) 752251662Sdim Ops.push_back(Node->getOperand(i)); 753251662Sdim Node = (MachineSDNode*)DAG.UpdateNodeOperands(Node, Ops.data(), Ops.size()); 754251662Sdim 755251662Sdim // If we only got one lane, replace it with a copy 756251662Sdim if (Writemask == (1U << Lane)) { 757251662Sdim SDValue RC = DAG.getTargetConstant(AMDGPU::VReg_32RegClassID, MVT::i32); 758251662Sdim SDNode *Copy = DAG.getMachineNode(TargetOpcode::COPY_TO_REGCLASS, 759251662Sdim DebugLoc(), Users[Lane]->getValueType(0), 760251662Sdim SDValue(Node, 0), RC); 761251662Sdim DAG.ReplaceAllUsesWith(Users[Lane], Copy); 762251662Sdim return; 763251662Sdim } 764251662Sdim 765251662Sdim // Update the users of the node with the new indices 766251662Sdim for (unsigned i = 0, Idx = AMDGPU::sub0; i < 4; ++i) { 767251662Sdim 768251662Sdim SDNode *User = Users[i]; 769251662Sdim if (!User) 770251662Sdim continue; 771251662Sdim 772251662Sdim SDValue Op = DAG.getTargetConstant(Idx, MVT::i32); 773251662Sdim DAG.UpdateNodeOperands(User, User->getOperand(0), Op); 774251662Sdim 775251662Sdim switch (Idx) { 776251662Sdim default: break; 777251662Sdim case AMDGPU::sub0: Idx = AMDGPU::sub1; break; 778251662Sdim case AMDGPU::sub1: Idx = AMDGPU::sub2; break; 779251662Sdim case AMDGPU::sub2: Idx = AMDGPU::sub3; break; 780251662Sdim } 781251662Sdim } 782251662Sdim} 783251662Sdim 784251662Sdim/// \brief Fold the instructions after slecting them 785251662SdimSDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node, 786251662Sdim SelectionDAG &DAG) const { 787251662Sdim 788251662Sdim if (AMDGPU::isMIMG(Node->getMachineOpcode()) != -1) 789251662Sdim adjustWritemask(Node, DAG); 790251662Sdim 791251662Sdim return foldOperands(Node, DAG); 792251662Sdim} 793251662Sdim 794251662Sdim/// \brief Assign the register class depending on the number of 795251662Sdim/// bits set in the writemask 796251662Sdimvoid SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr *MI, 797251662Sdim SDNode *Node) const { 798251662Sdim if (AMDGPU::isMIMG(MI->getOpcode()) == -1) 799251662Sdim return; 800251662Sdim 801251662Sdim unsigned VReg = MI->getOperand(0).getReg(); 802251662Sdim unsigned Writemask = MI->getOperand(1).getImm(); 803251662Sdim unsigned BitsSet = 0; 804251662Sdim for (unsigned i = 0; i < 4; ++i) 805251662Sdim BitsSet += Writemask & (1 << i) ? 1 : 0; 806251662Sdim 807251662Sdim const TargetRegisterClass *RC; 808251662Sdim switch (BitsSet) { 809251662Sdim default: return; 810251662Sdim case 1: RC = &AMDGPU::VReg_32RegClass; break; 811251662Sdim case 2: RC = &AMDGPU::VReg_64RegClass; break; 812251662Sdim case 3: RC = &AMDGPU::VReg_96RegClass; break; 813251662Sdim } 814251662Sdim 815251662Sdim MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); 816251662Sdim MRI.setRegClass(VReg, RC); 817251662Sdim} 818