1249259Sdim//===-- R600ISelLowering.cpp - R600 DAG Lowering Implementation -----------===//
2249259Sdim//
3249259Sdim//                     The LLVM Compiler Infrastructure
4249259Sdim//
5249259Sdim// This file is distributed under the University of Illinois Open Source
6249259Sdim// License. See LICENSE.TXT for details.
7249259Sdim//
8249259Sdim//===----------------------------------------------------------------------===//
9249259Sdim//
10249259Sdim/// \file
11249259Sdim/// \brief Custom DAG lowering for R600
12249259Sdim//
13249259Sdim//===----------------------------------------------------------------------===//
14249259Sdim
15249259Sdim#include "R600ISelLowering.h"
16249259Sdim#include "R600Defines.h"
17249259Sdim#include "R600InstrInfo.h"
18249259Sdim#include "R600MachineFunctionInfo.h"
19249259Sdim#include "llvm/CodeGen/MachineFrameInfo.h"
20249259Sdim#include "llvm/CodeGen/MachineInstrBuilder.h"
21249259Sdim#include "llvm/CodeGen/MachineRegisterInfo.h"
22249259Sdim#include "llvm/CodeGen/SelectionDAG.h"
23249259Sdim#include "llvm/IR/Argument.h"
24249259Sdim#include "llvm/IR/Function.h"
25249259Sdim
26249259Sdimusing namespace llvm;
27249259Sdim
28249259SdimR600TargetLowering::R600TargetLowering(TargetMachine &TM) :
29249259Sdim    AMDGPUTargetLowering(TM),
30249259Sdim    TII(static_cast<const R600InstrInfo*>(TM.getInstrInfo())) {
31249259Sdim  addRegisterClass(MVT::v4f32, &AMDGPU::R600_Reg128RegClass);
32249259Sdim  addRegisterClass(MVT::f32, &AMDGPU::R600_Reg32RegClass);
33249259Sdim  addRegisterClass(MVT::v4i32, &AMDGPU::R600_Reg128RegClass);
34249259Sdim  addRegisterClass(MVT::i32, &AMDGPU::R600_Reg32RegClass);
35249259Sdim  computeRegisterProperties();
36249259Sdim
37249259Sdim  setOperationAction(ISD::FADD, MVT::v4f32, Expand);
38249259Sdim  setOperationAction(ISD::FMUL, MVT::v4f32, Expand);
39249259Sdim  setOperationAction(ISD::FDIV, MVT::v4f32, Expand);
40249259Sdim  setOperationAction(ISD::FSUB, MVT::v4f32, Expand);
41249259Sdim
42249259Sdim  setOperationAction(ISD::ADD,  MVT::v4i32, Expand);
43249259Sdim  setOperationAction(ISD::AND,  MVT::v4i32, Expand);
44249259Sdim  setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Expand);
45249259Sdim  setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Expand);
46251662Sdim  setOperationAction(ISD::MUL,  MVT::v2i32, Expand);
47251662Sdim  setOperationAction(ISD::MUL,  MVT::v4i32, Expand);
48251662Sdim  setOperationAction(ISD::OR, MVT::v4i32, Expand);
49251662Sdim  setOperationAction(ISD::OR, MVT::v2i32, Expand);
50249259Sdim  setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Expand);
51251662Sdim  setOperationAction(ISD::SHL, MVT::v4i32, Expand);
52251662Sdim  setOperationAction(ISD::SHL, MVT::v2i32, Expand);
53251662Sdim  setOperationAction(ISD::SRL, MVT::v4i32, Expand);
54251662Sdim  setOperationAction(ISD::SRL, MVT::v2i32, Expand);
55251662Sdim  setOperationAction(ISD::SRA, MVT::v4i32, Expand);
56251662Sdim  setOperationAction(ISD::SRA, MVT::v2i32, Expand);
57251662Sdim  setOperationAction(ISD::SUB, MVT::v4i32, Expand);
58251662Sdim  setOperationAction(ISD::SUB, MVT::v2i32, Expand);
59249259Sdim  setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Expand);
60249259Sdim  setOperationAction(ISD::UDIV, MVT::v4i32, Expand);
61249259Sdim  setOperationAction(ISD::UREM, MVT::v4i32, Expand);
62249259Sdim  setOperationAction(ISD::SETCC, MVT::v4i32, Expand);
63251662Sdim  setOperationAction(ISD::XOR, MVT::v4i32, Expand);
64251662Sdim  setOperationAction(ISD::XOR, MVT::v2i32, Expand);
65249259Sdim
66249259Sdim  setOperationAction(ISD::BR_CC, MVT::i32, Expand);
67249259Sdim  setOperationAction(ISD::BR_CC, MVT::f32, Expand);
68249259Sdim
69249259Sdim  setOperationAction(ISD::FSUB, MVT::f32, Expand);
70249259Sdim
71249259Sdim  setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
72249259Sdim  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
73249259Sdim  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i1, Custom);
74249259Sdim
75249259Sdim  setOperationAction(ISD::ROTL, MVT::i32, Custom);
76249259Sdim
77249259Sdim  setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
78249259Sdim  setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
79249259Sdim
80249259Sdim  setOperationAction(ISD::SETCC, MVT::i32, Expand);
81249259Sdim  setOperationAction(ISD::SETCC, MVT::f32, Expand);
82249259Sdim  setOperationAction(ISD::FP_TO_UINT, MVT::i1, Custom);
83249259Sdim
84249259Sdim  setOperationAction(ISD::SELECT, MVT::i32, Custom);
85249259Sdim  setOperationAction(ISD::SELECT, MVT::f32, Custom);
86249259Sdim
87251662Sdim  setOperationAction(ISD::VSELECT, MVT::v4i32, Expand);
88251662Sdim  setOperationAction(ISD::VSELECT, MVT::v2i32, Expand);
89251662Sdim
90249259Sdim  // Legalize loads and stores to the private address space.
91249259Sdim  setOperationAction(ISD::LOAD, MVT::i32, Custom);
92249259Sdim  setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
93249259Sdim  setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
94249259Sdim  setLoadExtAction(ISD::EXTLOAD, MVT::v4i8, Custom);
95249259Sdim  setLoadExtAction(ISD::EXTLOAD, MVT::i8, Custom);
96249259Sdim  setLoadExtAction(ISD::ZEXTLOAD, MVT::i8, Custom);
97249259Sdim  setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i8, Custom);
98249259Sdim  setOperationAction(ISD::STORE, MVT::i8, Custom);
99249259Sdim  setOperationAction(ISD::STORE, MVT::i32, Custom);
100249259Sdim  setOperationAction(ISD::STORE, MVT::v2i32, Custom);
101249259Sdim  setOperationAction(ISD::STORE, MVT::v4i32, Custom);
102249259Sdim
103249259Sdim  setOperationAction(ISD::LOAD, MVT::i32, Custom);
104249259Sdim  setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
105249259Sdim  setOperationAction(ISD::FrameIndex, MVT::i32, Custom);
106249259Sdim
107249259Sdim  setTargetDAGCombine(ISD::FP_ROUND);
108249259Sdim  setTargetDAGCombine(ISD::FP_TO_SINT);
109249259Sdim  setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
110249259Sdim  setTargetDAGCombine(ISD::SELECT_CC);
111249259Sdim
112249259Sdim  setBooleanContents(ZeroOrNegativeOneBooleanContent);
113251662Sdim  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
114249259Sdim  setSchedulingPreference(Sched::VLIW);
115249259Sdim}
116249259Sdim
/// Expand R600 pseudo instructions into real machine instructions.
/// Unless a case returns early, the pseudo \p MI is erased after the switch
/// and the basic block is returned unchanged in structure.
MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
    MachineInstr * MI, MachineBasicBlock * BB) const {
  MachineFunction * MF = BB->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  MachineBasicBlock::iterator I = *MI;

  switch (MI->getOpcode()) {
  // Anything not listed here is handled by the common AMDGPU inserter.
  default: return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
  // CLAMP_R600 / FABS_R600 / FNEG_R600 each become a plain MOV carrying the
  // corresponding source/result modifier flag.
  case AMDGPU::CLAMP_R600: {
    MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
                                                   AMDGPU::MOV,
                                                   MI->getOperand(0).getReg(),
                                                   MI->getOperand(1).getReg());
    TII->addFlag(NewMI, 0, MO_FLAG_CLAMP);
    break;
  }

  case AMDGPU::FABS_R600: {
    MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
                                                    AMDGPU::MOV,
                                                    MI->getOperand(0).getReg(),
                                                    MI->getOperand(1).getReg());
    TII->addFlag(NewMI, 0, MO_FLAG_ABS);
    break;
  }

  case AMDGPU::FNEG_R600: {
    MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
                                                    AMDGPU::MOV,
                                                    MI->getOperand(0).getReg(),
                                                    MI->getOperand(1).getReg());
    TII->addFlag(NewMI, 0, MO_FLAG_NEG);
    break;
  }

  // MASK_WRITE does not emit anything itself; it marks the instruction that
  // defines its operand with the mask flag.
  case AMDGPU::MASK_WRITE: {
    unsigned maskedRegister = MI->getOperand(0).getReg();
    assert(TargetRegisterInfo::isVirtualRegister(maskedRegister));
    MachineInstr * defInstr = MRI.getVRegDef(maskedRegister);
    TII->addFlag(defInstr, 0, MO_FLAG_MASK);
    break;
  }

  // Immediate moves: the FP payload is reinterpreted as its raw bit pattern.
  case AMDGPU::MOV_IMM_F32:
    TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
                     MI->getOperand(1).getFPImm()->getValueAPF()
                         .bitcastToAPInt().getZExtValue());
    break;
  case AMDGPU::MOV_IMM_I32:
    TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
                     MI->getOperand(1).getImm());
    break;
  // Copy from the constant buffer: a MOV from ALU_CONST with the buffer
  // selector encoded in the SRC0_SEL immediate operand.
  case AMDGPU::CONST_COPY: {
    MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, MI, AMDGPU::MOV,
        MI->getOperand(0).getReg(), AMDGPU::ALU_CONST);
    TII->setImmOperand(NewMI, R600Operands::SRC0_SEL,
        MI->getOperand(1).getImm());
    break;
  }

  case AMDGPU::RAT_WRITE_CACHELESS_32_eg:
  case AMDGPU::RAT_WRITE_CACHELESS_128_eg: {
    // The EOP bit is set when the write is immediately followed by RETURN.
    unsigned EOP = (llvm::next(I)->getOpcode() == AMDGPU::RETURN) ? 1 : 0;

    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
            .addOperand(MI->getOperand(0))
            .addOperand(MI->getOperand(1))
            .addImm(EOP); // Set End of program bit
    break;
  }

  // Texture sample with derivatives: set the H and V gradients first, then
  // issue the sample, which implicitly reads both gradient registers.
  case AMDGPU::TXD: {
    unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
    unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);

    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0)
            .addOperand(MI->getOperand(3))
            .addOperand(MI->getOperand(4))
            .addOperand(MI->getOperand(5))
            .addOperand(MI->getOperand(6));
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1)
            .addOperand(MI->getOperand(2))
            .addOperand(MI->getOperand(4))
            .addOperand(MI->getOperand(5))
            .addOperand(MI->getOperand(6));
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_G))
            .addOperand(MI->getOperand(0))
            .addOperand(MI->getOperand(1))
            .addOperand(MI->getOperand(4))
            .addOperand(MI->getOperand(5))
            .addOperand(MI->getOperand(6))
            .addReg(T0, RegState::Implicit)
            .addReg(T1, RegState::Implicit);
    break;
  }

  // Same expansion as TXD, but using the shadow-comparison sample opcode.
  case AMDGPU::TXD_SHADOW: {
    unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
    unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);

    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0)
            .addOperand(MI->getOperand(3))
            .addOperand(MI->getOperand(4))
            .addOperand(MI->getOperand(5))
            .addOperand(MI->getOperand(6));
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1)
            .addOperand(MI->getOperand(2))
            .addOperand(MI->getOperand(4))
            .addOperand(MI->getOperand(5))
            .addOperand(MI->getOperand(6));
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_C_G))
            .addOperand(MI->getOperand(0))
            .addOperand(MI->getOperand(1))
            .addOperand(MI->getOperand(4))
            .addOperand(MI->getOperand(5))
            .addOperand(MI->getOperand(6))
            .addReg(T0, RegState::Implicit)
            .addReg(T1, RegState::Implicit);
    break;
  }

  // Unconditional branch lowers directly to JUMP.
  case AMDGPU::BRANCH:
      BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP))
              .addOperand(MI->getOperand(0));
      break;

  // Conditional branches: evaluate the predicate with PRED_X (pushing the
  // stack via MO_FLAG_PUSH), then branch on PREDICATE_BIT.
  case AMDGPU::BRANCH_COND_f32: {
    MachineInstr *NewMI =
      BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
              AMDGPU::PREDICATE_BIT)
              .addOperand(MI->getOperand(1))
              .addImm(OPCODE_IS_NOT_ZERO)
              .addImm(0); // Flags
    TII->addFlag(NewMI, 0, MO_FLAG_PUSH);
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
            .addOperand(MI->getOperand(0))
            .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
    break;
  }

  case AMDGPU::BRANCH_COND_i32: {
    MachineInstr *NewMI =
      BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
            AMDGPU::PREDICATE_BIT)
            .addOperand(MI->getOperand(1))
            .addImm(OPCODE_IS_NOT_ZERO_INT)
            .addImm(0); // Flags
    TII->addFlag(NewMI, 0, MO_FLAG_PUSH);
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
           .addOperand(MI->getOperand(0))
            .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
    break;
  }

  case AMDGPU::EG_ExportSwz:
  case AMDGPU::R600_ExportSwz: {
    // Instruction is left unmodified if its not the last one of its type
    bool isLastInstructionOfItsType = true;
    unsigned InstExportType = MI->getOperand(1).getImm();
    for (MachineBasicBlock::iterator NextExportInst = llvm::next(I),
         EndBlock = BB->end(); NextExportInst != EndBlock;
         NextExportInst = llvm::next(NextExportInst)) {
      if (NextExportInst->getOpcode() == AMDGPU::EG_ExportSwz ||
          NextExportInst->getOpcode() == AMDGPU::R600_ExportSwz) {
        unsigned CurrentInstExportType = NextExportInst->getOperand(1)
            .getImm();
        if (CurrentInstExportType == InstExportType) {
          isLastInstructionOfItsType = false;
          break;
        }
      }
    }
    bool EOP = (llvm::next(I)->getOpcode() == AMDGPU::RETURN)? 1 : 0;
    // Early return keeps the original pseudo in place (it is not erased).
    if (!EOP && !isLastInstructionOfItsType)
      return BB;
    // CF instruction encoding differs between Evergreen (84) and R600 (40).
    unsigned CfInst = (MI->getOpcode() == AMDGPU::EG_ExportSwz)? 84 : 40;
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
            .addOperand(MI->getOperand(0))
            .addOperand(MI->getOperand(1))
            .addOperand(MI->getOperand(2))
            .addOperand(MI->getOperand(3))
            .addOperand(MI->getOperand(4))
            .addOperand(MI->getOperand(5))
            .addOperand(MI->getOperand(6))
            .addImm(CfInst)
            .addImm(EOP);
    break;
  }
  case AMDGPU::RETURN: {
    // RETURN instructions must have the live-out registers as implicit uses,
    // otherwise they appear dead.
    R600MachineFunctionInfo *MFI = MF->getInfo<R600MachineFunctionInfo>();
    MachineInstrBuilder MIB(*MF, MI);
    for (unsigned i = 0, e = MFI->LiveOuts.size(); i != e; ++i)
      MIB.addReg(MFI->LiveOuts[i], RegState::Implicit);
    // RETURN itself is kept; do not fall through to the erase below.
    return BB;
  }
  }

  // The pseudo has been replaced; remove it from the block.
  MI->eraseFromParent();
  return BB;
}
319249259Sdim
320249259Sdim//===----------------------------------------------------------------------===//
321249259Sdim// Custom DAG Lowering Operations
322249259Sdim//===----------------------------------------------------------------------===//
323249259Sdim
324249259Sdimusing namespace llvm::Intrinsic;
325249259Sdimusing namespace llvm::AMDGPUIntrinsic;
326249259Sdim
/// Custom-lower the operations registered as Custom in the constructor.
/// Anything not handled here is delegated to the AMDGPU base class; a
/// default-constructed SDValue tells the legalizer nothing was produced.
SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
  switch (Op.getOpcode()) {
  default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
  case ISD::ROTL: return LowerROTL(Op, DAG);
  case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
  case ISD::SELECT: return LowerSELECT(Op, DAG);
  case ISD::STORE: return LowerSTORE(Op, DAG);
  case ISD::LOAD: return LowerLOAD(Op, DAG);
  case ISD::FrameIndex: return LowerFrameIndex(Op, DAG);
  case ISD::INTRINSIC_VOID: {
    SDValue Chain = Op.getOperand(0);
    // Operand 1 of INTRINSIC_VOID is the intrinsic ID constant.
    unsigned IntrinsicID =
                         cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
    switch (IntrinsicID) {
    // Store-output: copy the value into the T-register selected by the
    // index operand and record it as a live-out for the RETURN inserter.
    case AMDGPUIntrinsic::AMDGPU_store_output: {
      MachineFunction &MF = DAG.getMachineFunction();
      R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
      int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
      unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
      MFI->LiveOuts.push_back(Reg);
      return DAG.getCopyToReg(Chain, Op.getDebugLoc(), Reg, Op.getOperand(2));
    }
    // Swizzled export: build an EXPORT node with the identity swizzle
    // (X, Y, Z, W).
    case AMDGPUIntrinsic::R600_store_swizzle: {
      const SDValue Args[8] = {
        Chain,
        Op.getOperand(2), // Export Value
        Op.getOperand(3), // ArrayBase
        Op.getOperand(4), // Type
        DAG.getConstant(0, MVT::i32), // SWZ_X
        DAG.getConstant(1, MVT::i32), // SWZ_Y
        DAG.getConstant(2, MVT::i32), // SWZ_Z
        DAG.getConstant(3, MVT::i32) // SWZ_W
      };
      return DAG.getNode(AMDGPUISD::EXPORT, Op.getDebugLoc(), Op.getValueType(),
          Args, 8);
    }

    // default for switch(IntrinsicID)
    default: break;
    }
    // break out of case ISD::INTRINSIC_VOID in switch(Op.getOpcode())
    break;
  }
  case ISD::INTRINSIC_WO_CHAIN: {
    // Operand 0 of INTRINSIC_WO_CHAIN is the intrinsic ID constant.
    unsigned IntrinsicID =
                         cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
    EVT VT = Op.getValueType();
    DebugLoc DL = Op.getDebugLoc();
    switch(IntrinsicID) {
    default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
    // Shader input: the value is pre-loaded in a T-register, expose it as a
    // function live-in.
    case AMDGPUIntrinsic::R600_load_input: {
      int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
      unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, Reg, VT);
    }

    case AMDGPUIntrinsic::R600_interp_input: {
      int slot = cast<ConstantSDNode>(Op.getOperand(1))->getSExtValue();
      int ijb = cast<ConstantSDNode>(Op.getOperand(2))->getSExtValue();
      MachineSDNode *interp;
      // A negative i/j buffer index means constant (flat) interpolation:
      // load the whole vector and extract the channel for this slot.
      if (ijb < 0) {
        interp = DAG.getMachineNode(AMDGPU::INTERP_VEC_LOAD, DL,
            MVT::v4f32, DAG.getTargetConstant(slot / 4 , MVT::i32));
        return DAG.getTargetExtractSubreg(
            TII->getRegisterInfo().getSubRegFromChannel(slot % 4),
            DL, MVT::f32, SDValue(interp, 0));
      }

      // Otherwise pick the XY or ZW interpolation pair; both take the
      // live-in i/j registers (2*ijb+1 and 2*ijb).
      if (slot % 4 < 2)
        interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_XY, DL,
            MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4 , MVT::i32),
            CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb + 1), MVT::f32),
            CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb), MVT::f32));
      else
        interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_ZW, DL,
            MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4 , MVT::i32),
            CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb + 1), MVT::f32),
            CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb), MVT::f32));

      // Each pair node produces two results; select the one for this slot.
      return SDValue(interp, slot % 2);
    }

    // Compute-shader dispatch parameters live at fixed dword offsets in the
    // implicit-parameter constant buffer.
    case r600_read_ngroups_x:
      return LowerImplicitParameter(DAG, VT, DL, 0);
    case r600_read_ngroups_y:
      return LowerImplicitParameter(DAG, VT, DL, 1);
    case r600_read_ngroups_z:
      return LowerImplicitParameter(DAG, VT, DL, 2);
    case r600_read_global_size_x:
      return LowerImplicitParameter(DAG, VT, DL, 3);
    case r600_read_global_size_y:
      return LowerImplicitParameter(DAG, VT, DL, 4);
    case r600_read_global_size_z:
      return LowerImplicitParameter(DAG, VT, DL, 5);
    case r600_read_local_size_x:
      return LowerImplicitParameter(DAG, VT, DL, 6);
    case r600_read_local_size_y:
      return LowerImplicitParameter(DAG, VT, DL, 7);
    case r600_read_local_size_z:
      return LowerImplicitParameter(DAG, VT, DL, 8);

    // Group IDs are pre-loaded in T1, thread IDs in T0.
    case r600_read_tgid_x:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T1_X, VT);
    case r600_read_tgid_y:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T1_Y, VT);
    case r600_read_tgid_z:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T1_Z, VT);
    case r600_read_tidig_x:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T0_X, VT);
    case r600_read_tidig_y:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T0_Y, VT);
    case r600_read_tidig_z:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T0_Z, VT);
    }
    // break out of case ISD::INTRINSIC_WO_CHAIN in switch(Op.getOpcode())
    break;
  }
  } // end switch(Op.getOpcode())
  return SDValue();
}
457249259Sdim
458249259Sdimvoid R600TargetLowering::ReplaceNodeResults(SDNode *N,
459249259Sdim                                            SmallVectorImpl<SDValue> &Results,
460249259Sdim                                            SelectionDAG &DAG) const {
461249259Sdim  switch (N->getOpcode()) {
462249259Sdim  default: return;
463249259Sdim  case ISD::FP_TO_UINT: Results.push_back(LowerFPTOUINT(N->getOperand(0), DAG));
464249259Sdim    return;
465249259Sdim  case ISD::LOAD: {
466249259Sdim    SDNode *Node = LowerLOAD(SDValue(N, 0), DAG).getNode();
467249259Sdim    Results.push_back(SDValue(Node, 0));
468249259Sdim    Results.push_back(SDValue(Node, 1));
469249259Sdim    // XXX: LLVM seems not to replace Chain Value inside CustomWidenLowerNode
470249259Sdim    // function
471249259Sdim    DAG.ReplaceAllUsesOfValueWith(SDValue(N,1), SDValue(Node, 1));
472249259Sdim    return;
473249259Sdim  }
474249259Sdim  case ISD::STORE:
475249259Sdim    SDNode *Node = LowerSTORE(SDValue(N, 0), DAG).getNode();
476249259Sdim    Results.push_back(SDValue(Node, 0));
477249259Sdim    return;
478249259Sdim  }
479249259Sdim}
480249259Sdim
481249259SdimSDValue R600TargetLowering::LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const {
482249259Sdim  return DAG.getNode(
483249259Sdim      ISD::SETCC,
484249259Sdim      Op.getDebugLoc(),
485249259Sdim      MVT::i1,
486249259Sdim      Op, DAG.getConstantFP(0.0f, MVT::f32),
487249259Sdim      DAG.getCondCode(ISD::SETNE)
488249259Sdim      );
489249259Sdim}
490249259Sdim
491249259SdimSDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT,
492249259Sdim                                                   DebugLoc DL,
493249259Sdim                                                   unsigned DwordOffset) const {
494249259Sdim  unsigned ByteOffset = DwordOffset * 4;
495249259Sdim  PointerType * PtrType = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
496249259Sdim                                      AMDGPUAS::PARAM_I_ADDRESS);
497249259Sdim
498249259Sdim  // We shouldn't be using an offset wider than 16-bits for implicit parameters.
499249259Sdim  assert(isInt<16>(ByteOffset));
500249259Sdim
501249259Sdim  return DAG.getLoad(VT, DL, DAG.getEntryNode(),
502249259Sdim                     DAG.getConstant(ByteOffset, MVT::i32), // PTR
503249259Sdim                     MachinePointerInfo(ConstantPointerNull::get(PtrType)),
504249259Sdim                     false, false, false, 0);
505249259Sdim}
506249259Sdim
507249259SdimSDValue R600TargetLowering::LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const {
508249259Sdim
509249259Sdim  MachineFunction &MF = DAG.getMachineFunction();
510249259Sdim  const AMDGPUFrameLowering *TFL =
511249259Sdim   static_cast<const AMDGPUFrameLowering*>(getTargetMachine().getFrameLowering());
512249259Sdim
513249259Sdim  FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Op);
514249259Sdim  assert(FIN);
515249259Sdim
516249259Sdim  unsigned FrameIndex = FIN->getIndex();
517249259Sdim  unsigned Offset = TFL->getFrameIndexOffset(MF, FrameIndex);
518249259Sdim  return DAG.getConstant(Offset * 4 * TFL->getStackWidth(MF), MVT::i32);
519249259Sdim}
520249259Sdim
521249259SdimSDValue R600TargetLowering::LowerROTL(SDValue Op, SelectionDAG &DAG) const {
522249259Sdim  DebugLoc DL = Op.getDebugLoc();
523249259Sdim  EVT VT = Op.getValueType();
524249259Sdim
525249259Sdim  return DAG.getNode(AMDGPUISD::BITALIGN, DL, VT,
526249259Sdim                     Op.getOperand(0),
527249259Sdim                     Op.getOperand(0),
528249259Sdim                     DAG.getNode(ISD::SUB, DL, VT,
529249259Sdim                                 DAG.getConstant(32, MVT::i32),
530249259Sdim                                 Op.getOperand(1)));
531249259Sdim}
532249259Sdim
533249259Sdimbool R600TargetLowering::isZero(SDValue Op) const {
534249259Sdim  if(ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
535249259Sdim    return Cst->isNullValue();
536249259Sdim  } else if(ConstantFPSDNode *CstFP = dyn_cast<ConstantFPSDNode>(Op)){
537249259Sdim    return CstFP->isZero();
538249259Sdim  } else {
539249259Sdim    return false;
540249259Sdim  }
541249259Sdim}
542249259Sdim
543249259SdimSDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
544249259Sdim  DebugLoc DL = Op.getDebugLoc();
545249259Sdim  EVT VT = Op.getValueType();
546249259Sdim
547249259Sdim  SDValue LHS = Op.getOperand(0);
548249259Sdim  SDValue RHS = Op.getOperand(1);
549249259Sdim  SDValue True = Op.getOperand(2);
550249259Sdim  SDValue False = Op.getOperand(3);
551249259Sdim  SDValue CC = Op.getOperand(4);
552249259Sdim  SDValue Temp;
553249259Sdim
554249259Sdim  // LHS and RHS are guaranteed to be the same value type
555249259Sdim  EVT CompareVT = LHS.getValueType();
556249259Sdim
557249259Sdim  // Check if we can lower this to a native operation.
558249259Sdim
559249259Sdim  // Try to lower to a SET* instruction:
560249259Sdim  //
561249259Sdim  // SET* can match the following patterns:
562249259Sdim  //
563249259Sdim  // select_cc f32, f32, -1,  0, cc_any
564249259Sdim  // select_cc f32, f32, 1.0f, 0.0f, cc_any
565249259Sdim  // select_cc i32, i32, -1,  0, cc_any
566249259Sdim  //
567249259Sdim
568249259Sdim  // Move hardware True/False values to the correct operand.
569249259Sdim  if (isHWTrueValue(False) && isHWFalseValue(True)) {
570249259Sdim    ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
571249259Sdim    std::swap(False, True);
572249259Sdim    CC = DAG.getCondCode(ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32));
573249259Sdim  }
574249259Sdim
575249259Sdim  if (isHWTrueValue(True) && isHWFalseValue(False) &&
576249259Sdim      (CompareVT == VT || VT == MVT::i32)) {
577249259Sdim    // This can be matched by a SET* instruction.
578249259Sdim    return DAG.getNode(ISD::SELECT_CC, DL, VT, LHS, RHS, True, False, CC);
579249259Sdim  }
580249259Sdim
581249259Sdim  // Try to lower to a CND* instruction:
582249259Sdim  //
583249259Sdim  // CND* can match the following patterns:
584249259Sdim  //
585249259Sdim  // select_cc f32, 0.0, f32, f32, cc_any
586249259Sdim  // select_cc f32, 0.0, i32, i32, cc_any
587249259Sdim  // select_cc i32, 0,   f32, f32, cc_any
588249259Sdim  // select_cc i32, 0,   i32, i32, cc_any
589249259Sdim  //
590249259Sdim  if (isZero(LHS) || isZero(RHS)) {
591249259Sdim    SDValue Cond = (isZero(LHS) ? RHS : LHS);
592249259Sdim    SDValue Zero = (isZero(LHS) ? LHS : RHS);
593249259Sdim    ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
594249259Sdim    if (CompareVT != VT) {
595249259Sdim      // Bitcast True / False to the correct types.  This will end up being
596249259Sdim      // a nop, but it allows us to define only a single pattern in the
597249259Sdim      // .TD files for each CND* instruction rather than having to have
598249259Sdim      // one pattern for integer True/False and one for fp True/False
599249259Sdim      True = DAG.getNode(ISD::BITCAST, DL, CompareVT, True);
600249259Sdim      False = DAG.getNode(ISD::BITCAST, DL, CompareVT, False);
601249259Sdim    }
602249259Sdim    if (isZero(LHS)) {
603249259Sdim      CCOpcode = ISD::getSetCCSwappedOperands(CCOpcode);
604249259Sdim    }
605249259Sdim
606249259Sdim    switch (CCOpcode) {
607249259Sdim    case ISD::SETONE:
608249259Sdim    case ISD::SETUNE:
609249259Sdim    case ISD::SETNE:
610249259Sdim    case ISD::SETULE:
611249259Sdim    case ISD::SETULT:
612249259Sdim    case ISD::SETOLE:
613249259Sdim    case ISD::SETOLT:
614249259Sdim    case ISD::SETLE:
615249259Sdim    case ISD::SETLT:
616249259Sdim      CCOpcode = ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
617249259Sdim      Temp = True;
618249259Sdim      True = False;
619249259Sdim      False = Temp;
620249259Sdim      break;
621249259Sdim    default:
622249259Sdim      break;
623249259Sdim    }
624249259Sdim    SDValue SelectNode = DAG.getNode(ISD::SELECT_CC, DL, CompareVT,
625249259Sdim        Cond, Zero,
626249259Sdim        True, False,
627249259Sdim        DAG.getCondCode(CCOpcode));
628249259Sdim    return DAG.getNode(ISD::BITCAST, DL, VT, SelectNode);
629249259Sdim  }
630249259Sdim
631249259Sdim
632249259Sdim  // Possible Min/Max pattern
633249259Sdim  SDValue MinMax = LowerMinMax(Op, DAG);
634249259Sdim  if (MinMax.getNode()) {
635249259Sdim    return MinMax;
636249259Sdim  }
637249259Sdim
638249259Sdim  // If we make it this for it means we have no native instructions to handle
639249259Sdim  // this SELECT_CC, so we must lower it.
640249259Sdim  SDValue HWTrue, HWFalse;
641249259Sdim
642249259Sdim  if (CompareVT == MVT::f32) {
643249259Sdim    HWTrue = DAG.getConstantFP(1.0f, CompareVT);
644249259Sdim    HWFalse = DAG.getConstantFP(0.0f, CompareVT);
645249259Sdim  } else if (CompareVT == MVT::i32) {
646249259Sdim    HWTrue = DAG.getConstant(-1, CompareVT);
647249259Sdim    HWFalse = DAG.getConstant(0, CompareVT);
648249259Sdim  }
649249259Sdim  else {
650249259Sdim    assert(!"Unhandled value type in LowerSELECT_CC");
651249259Sdim  }
652249259Sdim
653249259Sdim  // Lower this unsupported SELECT_CC into a combination of two supported
654249259Sdim  // SELECT_CC operations.
655249259Sdim  SDValue Cond = DAG.getNode(ISD::SELECT_CC, DL, CompareVT, LHS, RHS, HWTrue, HWFalse, CC);
656249259Sdim
657249259Sdim  return DAG.getNode(ISD::SELECT_CC, DL, VT,
658249259Sdim      Cond, HWFalse,
659249259Sdim      True, False,
660249259Sdim      DAG.getCondCode(ISD::SETNE));
661249259Sdim}
662249259Sdim
663249259SdimSDValue R600TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
664249259Sdim  return DAG.getNode(ISD::SELECT_CC,
665249259Sdim      Op.getDebugLoc(),
666249259Sdim      Op.getValueType(),
667249259Sdim      Op.getOperand(0),
668249259Sdim      DAG.getConstant(0, MVT::i32),
669249259Sdim      Op.getOperand(1),
670249259Sdim      Op.getOperand(2),
671249259Sdim      DAG.getCondCode(ISD::SETNE));
672249259Sdim}
673249259Sdim
674249259Sdim/// LLVM generates byte-addresed pointers.  For indirect addressing, we need to
675249259Sdim/// convert these pointers to a register index.  Each register holds
676249259Sdim/// 16 bytes, (4 x 32bit sub-register), but we need to take into account the
677249259Sdim/// \p StackWidth, which tells us how many of the 4 sub-registrers will be used
678249259Sdim/// for indirect addressing.
679249259SdimSDValue R600TargetLowering::stackPtrToRegIndex(SDValue Ptr,
680249259Sdim                                               unsigned StackWidth,
681249259Sdim                                               SelectionDAG &DAG) const {
682249259Sdim  unsigned SRLPad;
683249259Sdim  switch(StackWidth) {
684249259Sdim  case 1:
685249259Sdim    SRLPad = 2;
686249259Sdim    break;
687249259Sdim  case 2:
688249259Sdim    SRLPad = 3;
689249259Sdim    break;
690249259Sdim  case 4:
691249259Sdim    SRLPad = 4;
692249259Sdim    break;
693249259Sdim  default: llvm_unreachable("Invalid stack width");
694249259Sdim  }
695249259Sdim
696249259Sdim  return DAG.getNode(ISD::SRL, Ptr.getDebugLoc(), Ptr.getValueType(), Ptr,
697249259Sdim                     DAG.getConstant(SRLPad, MVT::i32));
698249259Sdim}
699249259Sdim
700249259Sdimvoid R600TargetLowering::getStackAddress(unsigned StackWidth,
701249259Sdim                                         unsigned ElemIdx,
702249259Sdim                                         unsigned &Channel,
703249259Sdim                                         unsigned &PtrIncr) const {
704249259Sdim  switch (StackWidth) {
705249259Sdim  default:
706249259Sdim  case 1:
707249259Sdim    Channel = 0;
708249259Sdim    if (ElemIdx > 0) {
709249259Sdim      PtrIncr = 1;
710249259Sdim    } else {
711249259Sdim      PtrIncr = 0;
712249259Sdim    }
713249259Sdim    break;
714249259Sdim  case 2:
715249259Sdim    Channel = ElemIdx % 2;
716249259Sdim    if (ElemIdx == 2) {
717249259Sdim      PtrIncr = 1;
718249259Sdim    } else {
719249259Sdim      PtrIncr = 0;
720249259Sdim    }
721249259Sdim    break;
722249259Sdim  case 4:
723249259Sdim    Channel = ElemIdx;
724249259Sdim    PtrIncr = 0;
725249259Sdim    break;
726249259Sdim  }
727249259Sdim}
728249259Sdim
729249259SdimSDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
730249259Sdim  DebugLoc DL = Op.getDebugLoc();
731249259Sdim  StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
732249259Sdim  SDValue Chain = Op.getOperand(0);
733249259Sdim  SDValue Value = Op.getOperand(1);
734249259Sdim  SDValue Ptr = Op.getOperand(2);
735249259Sdim
736249259Sdim  if (StoreNode->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS &&
737249259Sdim      Ptr->getOpcode() != AMDGPUISD::DWORDADDR) {
738249259Sdim    // Convert pointer from byte address to dword address.
739249259Sdim    Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, Ptr.getValueType(),
740249259Sdim                      DAG.getNode(ISD::SRL, DL, Ptr.getValueType(),
741249259Sdim                                  Ptr, DAG.getConstant(2, MVT::i32)));
742249259Sdim
743249259Sdim    if (StoreNode->isTruncatingStore() || StoreNode->isIndexed()) {
744249259Sdim      assert(!"Truncated and indexed stores not supported yet");
745249259Sdim    } else {
746249259Sdim      Chain = DAG.getStore(Chain, DL, Value, Ptr, StoreNode->getMemOperand());
747249259Sdim    }
748249259Sdim    return Chain;
749249259Sdim  }
750249259Sdim
751249259Sdim  EVT ValueVT = Value.getValueType();
752249259Sdim
753249259Sdim  if (StoreNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
754249259Sdim    return SDValue();
755249259Sdim  }
756249259Sdim
757249259Sdim  // Lowering for indirect addressing
758249259Sdim
759249259Sdim  const MachineFunction &MF = DAG.getMachineFunction();
760249259Sdim  const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering*>(
761249259Sdim                                         getTargetMachine().getFrameLowering());
762249259Sdim  unsigned StackWidth = TFL->getStackWidth(MF);
763249259Sdim
764249259Sdim  Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);
765249259Sdim
766249259Sdim  if (ValueVT.isVector()) {
767249259Sdim    unsigned NumElemVT = ValueVT.getVectorNumElements();
768249259Sdim    EVT ElemVT = ValueVT.getVectorElementType();
769249259Sdim    SDValue Stores[4];
770249259Sdim
771249259Sdim    assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
772249259Sdim                                      "vector width in load");
773249259Sdim
774249259Sdim    for (unsigned i = 0; i < NumElemVT; ++i) {
775249259Sdim      unsigned Channel, PtrIncr;
776249259Sdim      getStackAddress(StackWidth, i, Channel, PtrIncr);
777249259Sdim      Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
778249259Sdim                        DAG.getConstant(PtrIncr, MVT::i32));
779249259Sdim      SDValue Elem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT,
780249259Sdim                                 Value, DAG.getConstant(i, MVT::i32));
781249259Sdim
782249259Sdim      Stores[i] = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other,
783249259Sdim                              Chain, Elem, Ptr,
784249259Sdim                              DAG.getTargetConstant(Channel, MVT::i32));
785249259Sdim    }
786249259Sdim     Chain =  DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores, NumElemVT);
787249259Sdim   } else {
788249259Sdim    if (ValueVT == MVT::i8) {
789249259Sdim      Value = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Value);
790249259Sdim    }
791249259Sdim    Chain = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other, Chain, Value, Ptr,
792249259Sdim    DAG.getTargetConstant(0, MVT::i32)); // Channel
793249259Sdim  }
794249259Sdim
795249259Sdim  return Chain;
796249259Sdim}
797249259Sdim
798249259Sdim// return (512 + (kc_bank << 12)
799249259Sdimstatic int
800249259SdimConstantAddressBlock(unsigned AddressSpace) {
801249259Sdim  switch (AddressSpace) {
802249259Sdim  case AMDGPUAS::CONSTANT_BUFFER_0:
803249259Sdim    return 512;
804249259Sdim  case AMDGPUAS::CONSTANT_BUFFER_1:
805249259Sdim    return 512 + 4096;
806249259Sdim  case AMDGPUAS::CONSTANT_BUFFER_2:
807249259Sdim    return 512 + 4096 * 2;
808249259Sdim  case AMDGPUAS::CONSTANT_BUFFER_3:
809249259Sdim    return 512 + 4096 * 3;
810249259Sdim  case AMDGPUAS::CONSTANT_BUFFER_4:
811249259Sdim    return 512 + 4096 * 4;
812249259Sdim  case AMDGPUAS::CONSTANT_BUFFER_5:
813249259Sdim    return 512 + 4096 * 5;
814249259Sdim  case AMDGPUAS::CONSTANT_BUFFER_6:
815249259Sdim    return 512 + 4096 * 6;
816249259Sdim  case AMDGPUAS::CONSTANT_BUFFER_7:
817249259Sdim    return 512 + 4096 * 7;
818249259Sdim  case AMDGPUAS::CONSTANT_BUFFER_8:
819249259Sdim    return 512 + 4096 * 8;
820249259Sdim  case AMDGPUAS::CONSTANT_BUFFER_9:
821249259Sdim    return 512 + 4096 * 9;
822249259Sdim  case AMDGPUAS::CONSTANT_BUFFER_10:
823249259Sdim    return 512 + 4096 * 10;
824249259Sdim  case AMDGPUAS::CONSTANT_BUFFER_11:
825249259Sdim    return 512 + 4096 * 11;
826249259Sdim  case AMDGPUAS::CONSTANT_BUFFER_12:
827249259Sdim    return 512 + 4096 * 12;
828249259Sdim  case AMDGPUAS::CONSTANT_BUFFER_13:
829249259Sdim    return 512 + 4096 * 13;
830249259Sdim  case AMDGPUAS::CONSTANT_BUFFER_14:
831249259Sdim    return 512 + 4096 * 14;
832249259Sdim  case AMDGPUAS::CONSTANT_BUFFER_15:
833249259Sdim    return 512 + 4096 * 15;
834249259Sdim  default:
835249259Sdim    return -1;
836249259Sdim  }
837249259Sdim}
838249259Sdim
839249259SdimSDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const
840249259Sdim{
841249259Sdim  EVT VT = Op.getValueType();
842249259Sdim  DebugLoc DL = Op.getDebugLoc();
843249259Sdim  LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
844249259Sdim  SDValue Chain = Op.getOperand(0);
845249259Sdim  SDValue Ptr = Op.getOperand(1);
846249259Sdim  SDValue LoweredLoad;
847249259Sdim
848249259Sdim  int ConstantBlock = ConstantAddressBlock(LoadNode->getAddressSpace());
849249259Sdim  if (ConstantBlock > -1) {
850249259Sdim    SDValue Result;
851249259Sdim    if (dyn_cast<ConstantExpr>(LoadNode->getSrcValue()) ||
852249259Sdim        dyn_cast<Constant>(LoadNode->getSrcValue()) ||
853249259Sdim        dyn_cast<ConstantSDNode>(Ptr)) {
854249259Sdim      SDValue Slots[4];
855249259Sdim      for (unsigned i = 0; i < 4; i++) {
856249259Sdim        // We want Const position encoded with the following formula :
857249259Sdim        // (((512 + (kc_bank << 12) + const_index) << 2) + chan)
858249259Sdim        // const_index is Ptr computed by llvm using an alignment of 16.
859249259Sdim        // Thus we add (((512 + (kc_bank << 12)) + chan ) * 4 here and
860249259Sdim        // then div by 4 at the ISel step
861249259Sdim        SDValue NewPtr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
862249259Sdim            DAG.getConstant(4 * i + ConstantBlock * 16, MVT::i32));
863249259Sdim        Slots[i] = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::i32, NewPtr);
864249259Sdim      }
865249259Sdim      Result = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i32, Slots, 4);
866249259Sdim    } else {
867249259Sdim      // non constant ptr cant be folded, keeps it as a v4f32 load
868249259Sdim      Result = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::v4i32,
869249259Sdim          DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr, DAG.getConstant(4, MVT::i32)),
870249259Sdim          DAG.getConstant(LoadNode->getAddressSpace() -
871249259Sdim	                  AMDGPUAS::CONSTANT_BUFFER_0, MVT::i32)
872249259Sdim          );
873249259Sdim    }
874249259Sdim
875249259Sdim    if (!VT.isVector()) {
876249259Sdim      Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Result,
877249259Sdim          DAG.getConstant(0, MVT::i32));
878249259Sdim    }
879249259Sdim
880249259Sdim    SDValue MergedValues[2] = {
881249259Sdim        Result,
882249259Sdim        Chain
883249259Sdim    };
884249259Sdim    return DAG.getMergeValues(MergedValues, 2, DL);
885249259Sdim  }
886249259Sdim
887249259Sdim  if (LoadNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
888249259Sdim    return SDValue();
889249259Sdim  }
890249259Sdim
891249259Sdim  // Lowering for indirect addressing
892249259Sdim  const MachineFunction &MF = DAG.getMachineFunction();
893249259Sdim  const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering*>(
894249259Sdim                                         getTargetMachine().getFrameLowering());
895249259Sdim  unsigned StackWidth = TFL->getStackWidth(MF);
896249259Sdim
897249259Sdim  Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);
898249259Sdim
899249259Sdim  if (VT.isVector()) {
900249259Sdim    unsigned NumElemVT = VT.getVectorNumElements();
901249259Sdim    EVT ElemVT = VT.getVectorElementType();
902249259Sdim    SDValue Loads[4];
903249259Sdim
904249259Sdim    assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
905249259Sdim                                      "vector width in load");
906249259Sdim
907249259Sdim    for (unsigned i = 0; i < NumElemVT; ++i) {
908249259Sdim      unsigned Channel, PtrIncr;
909249259Sdim      getStackAddress(StackWidth, i, Channel, PtrIncr);
910249259Sdim      Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
911249259Sdim                        DAG.getConstant(PtrIncr, MVT::i32));
912249259Sdim      Loads[i] = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, ElemVT,
913249259Sdim                             Chain, Ptr,
914249259Sdim                             DAG.getTargetConstant(Channel, MVT::i32),
915249259Sdim                             Op.getOperand(2));
916249259Sdim    }
917249259Sdim    for (unsigned i = NumElemVT; i < 4; ++i) {
918249259Sdim      Loads[i] = DAG.getUNDEF(ElemVT);
919249259Sdim    }
920249259Sdim    EVT TargetVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, 4);
921249259Sdim    LoweredLoad = DAG.getNode(ISD::BUILD_VECTOR, DL, TargetVT, Loads, 4);
922249259Sdim  } else {
923249259Sdim    LoweredLoad = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, VT,
924249259Sdim                              Chain, Ptr,
925249259Sdim                              DAG.getTargetConstant(0, MVT::i32), // Channel
926249259Sdim                              Op.getOperand(2));
927249259Sdim  }
928249259Sdim
929249259Sdim  SDValue Ops[2];
930249259Sdim  Ops[0] = LoweredLoad;
931249259Sdim  Ops[1] = Chain;
932249259Sdim
933249259Sdim  return DAG.getMergeValues(Ops, 2, DL);
934249259Sdim}
935249259Sdim
936249259Sdim/// XXX Only kernel functions are supported, so we can assume for now that
937249259Sdim/// every function is a kernel function, but in the future we should use
938249259Sdim/// separate calling conventions for kernel and non-kernel functions.
939249259SdimSDValue R600TargetLowering::LowerFormalArguments(
940249259Sdim                                      SDValue Chain,
941249259Sdim                                      CallingConv::ID CallConv,
942249259Sdim                                      bool isVarArg,
943249259Sdim                                      const SmallVectorImpl<ISD::InputArg> &Ins,
944249259Sdim                                      DebugLoc DL, SelectionDAG &DAG,
945249259Sdim                                      SmallVectorImpl<SDValue> &InVals) const {
946249259Sdim  unsigned ParamOffsetBytes = 36;
947249259Sdim  Function::const_arg_iterator FuncArg =
948249259Sdim                            DAG.getMachineFunction().getFunction()->arg_begin();
949249259Sdim  for (unsigned i = 0, e = Ins.size(); i < e; ++i, ++FuncArg) {
950249259Sdim    EVT VT = Ins[i].VT;
951249259Sdim    Type *ArgType = FuncArg->getType();
952249259Sdim    unsigned ArgSizeInBits = ArgType->isPointerTy() ?
953249259Sdim                             32 : ArgType->getPrimitiveSizeInBits();
954249259Sdim    unsigned ArgBytes = ArgSizeInBits >> 3;
955249259Sdim    EVT ArgVT;
956249259Sdim    if (ArgSizeInBits < VT.getSizeInBits()) {
957249259Sdim      assert(!ArgType->isFloatTy() &&
958249259Sdim             "Extending floating point arguments not supported yet");
959249259Sdim      ArgVT = MVT::getIntegerVT(ArgSizeInBits);
960249259Sdim    } else {
961249259Sdim      ArgVT = VT;
962249259Sdim    }
963249259Sdim    PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
964249259Sdim                                                    AMDGPUAS::PARAM_I_ADDRESS);
965249259Sdim    SDValue Arg = DAG.getExtLoad(ISD::ZEXTLOAD, DL, VT, DAG.getRoot(),
966249259Sdim                                DAG.getConstant(ParamOffsetBytes, MVT::i32),
967249259Sdim                                       MachinePointerInfo(UndefValue::get(PtrTy)),
968249259Sdim                                       ArgVT, false, false, ArgBytes);
969249259Sdim    InVals.push_back(Arg);
970249259Sdim    ParamOffsetBytes += ArgBytes;
971249259Sdim  }
972249259Sdim  return Chain;
973249259Sdim}
974249259Sdim
975249259SdimEVT R600TargetLowering::getSetCCResultType(EVT VT) const {
976249259Sdim   if (!VT.isVector()) return MVT::i32;
977249259Sdim   return VT.changeVectorElementTypeToInteger();
978249259Sdim}
979249259Sdim
980249259Sdim//===----------------------------------------------------------------------===//
981249259Sdim// Custom DAG Optimizations
982249259Sdim//===----------------------------------------------------------------------===//
983249259Sdim
984249259SdimSDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
985249259Sdim                                              DAGCombinerInfo &DCI) const {
986249259Sdim  SelectionDAG &DAG = DCI.DAG;
987249259Sdim
988249259Sdim  switch (N->getOpcode()) {
989249259Sdim  // (f32 fp_round (f64 uint_to_fp a)) -> (f32 uint_to_fp a)
990249259Sdim  case ISD::FP_ROUND: {
991249259Sdim      SDValue Arg = N->getOperand(0);
992249259Sdim      if (Arg.getOpcode() == ISD::UINT_TO_FP && Arg.getValueType() == MVT::f64) {
993249259Sdim        return DAG.getNode(ISD::UINT_TO_FP, N->getDebugLoc(), N->getValueType(0),
994249259Sdim                           Arg.getOperand(0));
995249259Sdim      }
996249259Sdim      break;
997249259Sdim    }
998249259Sdim
999249259Sdim  // (i32 fp_to_sint (fneg (select_cc f32, f32, 1.0, 0.0 cc))) ->
1000249259Sdim  // (i32 select_cc f32, f32, -1, 0 cc)
1001249259Sdim  //
1002249259Sdim  // Mesa's GLSL frontend generates the above pattern a lot and we can lower
1003249259Sdim  // this to one of the SET*_DX10 instructions.
1004249259Sdim  case ISD::FP_TO_SINT: {
1005249259Sdim    SDValue FNeg = N->getOperand(0);
1006249259Sdim    if (FNeg.getOpcode() != ISD::FNEG) {
1007249259Sdim      return SDValue();
1008249259Sdim    }
1009249259Sdim    SDValue SelectCC = FNeg.getOperand(0);
1010249259Sdim    if (SelectCC.getOpcode() != ISD::SELECT_CC ||
1011249259Sdim        SelectCC.getOperand(0).getValueType() != MVT::f32 || // LHS
1012249259Sdim        SelectCC.getOperand(2).getValueType() != MVT::f32 || // True
1013249259Sdim        !isHWTrueValue(SelectCC.getOperand(2)) ||
1014249259Sdim        !isHWFalseValue(SelectCC.getOperand(3))) {
1015249259Sdim      return SDValue();
1016249259Sdim    }
1017249259Sdim
1018249259Sdim    return DAG.getNode(ISD::SELECT_CC, N->getDebugLoc(), N->getValueType(0),
1019249259Sdim                           SelectCC.getOperand(0), // LHS
1020249259Sdim                           SelectCC.getOperand(1), // RHS
1021249259Sdim                           DAG.getConstant(-1, MVT::i32), // True
1022249259Sdim                           DAG.getConstant(0, MVT::i32),  // Flase
1023249259Sdim                           SelectCC.getOperand(4)); // CC
1024249259Sdim
1025249259Sdim    break;
1026249259Sdim  }
1027249259Sdim  // Extract_vec (Build_vector) generated by custom lowering
1028249259Sdim  // also needs to be customly combined
1029249259Sdim  case ISD::EXTRACT_VECTOR_ELT: {
1030249259Sdim    SDValue Arg = N->getOperand(0);
1031249259Sdim    if (Arg.getOpcode() == ISD::BUILD_VECTOR) {
1032249259Sdim      if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
1033249259Sdim        unsigned Element = Const->getZExtValue();
1034249259Sdim        return Arg->getOperand(Element);
1035249259Sdim      }
1036249259Sdim    }
1037249259Sdim    if (Arg.getOpcode() == ISD::BITCAST &&
1038249259Sdim        Arg.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
1039249259Sdim      if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
1040249259Sdim        unsigned Element = Const->getZExtValue();
1041249259Sdim        return DAG.getNode(ISD::BITCAST, N->getDebugLoc(), N->getVTList(),
1042249259Sdim            Arg->getOperand(0).getOperand(Element));
1043249259Sdim      }
1044249259Sdim    }
1045249259Sdim  }
1046249259Sdim
1047249259Sdim  case ISD::SELECT_CC: {
1048249259Sdim    // fold selectcc (selectcc x, y, a, b, cc), b, a, b, seteq ->
1049249259Sdim    //      selectcc x, y, a, b, inv(cc)
1050249259Sdim    //
1051249259Sdim    // fold selectcc (selectcc x, y, a, b, cc), b, a, b, setne ->
1052249259Sdim    //      selectcc x, y, a, b, cc
1053249259Sdim    SDValue LHS = N->getOperand(0);
1054249259Sdim    if (LHS.getOpcode() != ISD::SELECT_CC) {
1055249259Sdim      return SDValue();
1056249259Sdim    }
1057249259Sdim
1058249259Sdim    SDValue RHS = N->getOperand(1);
1059249259Sdim    SDValue True = N->getOperand(2);
1060249259Sdim    SDValue False = N->getOperand(3);
1061249259Sdim    ISD::CondCode NCC = cast<CondCodeSDNode>(N->getOperand(4))->get();
1062249259Sdim
1063249259Sdim    if (LHS.getOperand(2).getNode() != True.getNode() ||
1064249259Sdim        LHS.getOperand(3).getNode() != False.getNode() ||
1065249259Sdim        RHS.getNode() != False.getNode()) {
1066249259Sdim      return SDValue();
1067249259Sdim    }
1068249259Sdim
1069249259Sdim    switch (NCC) {
1070249259Sdim    default: return SDValue();
1071249259Sdim    case ISD::SETNE: return LHS;
1072249259Sdim    case ISD::SETEQ: {
1073249259Sdim      ISD::CondCode LHSCC = cast<CondCodeSDNode>(LHS.getOperand(4))->get();
1074249259Sdim      LHSCC = ISD::getSetCCInverse(LHSCC,
1075249259Sdim                                  LHS.getOperand(0).getValueType().isInteger());
1076249259Sdim      return DAG.getSelectCC(N->getDebugLoc(),
1077249259Sdim                             LHS.getOperand(0),
1078249259Sdim                             LHS.getOperand(1),
1079249259Sdim                             LHS.getOperand(2),
1080249259Sdim                             LHS.getOperand(3),
1081249259Sdim                             LHSCC);
1082249259Sdim    }
1083249259Sdim    }
1084249259Sdim  }
1085249259Sdim  case AMDGPUISD::EXPORT: {
1086249259Sdim    SDValue Arg = N->getOperand(1);
1087249259Sdim    if (Arg.getOpcode() != ISD::BUILD_VECTOR)
1088249259Sdim      break;
1089249259Sdim    SDValue NewBldVec[4] = {
1090249259Sdim        DAG.getUNDEF(MVT::f32),
1091249259Sdim        DAG.getUNDEF(MVT::f32),
1092249259Sdim        DAG.getUNDEF(MVT::f32),
1093249259Sdim        DAG.getUNDEF(MVT::f32)
1094249259Sdim      };
1095249259Sdim    SDValue NewArgs[8] = {
1096249259Sdim      N->getOperand(0), // Chain
1097249259Sdim      SDValue(),
1098249259Sdim      N->getOperand(2), // ArrayBase
1099249259Sdim      N->getOperand(3), // Type
1100249259Sdim      N->getOperand(4), // SWZ_X
1101249259Sdim      N->getOperand(5), // SWZ_Y
1102249259Sdim      N->getOperand(6), // SWZ_Z
1103249259Sdim      N->getOperand(7) // SWZ_W
1104249259Sdim    };
1105249259Sdim    for (unsigned i = 0; i < Arg.getNumOperands(); i++) {
1106249259Sdim      if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Arg.getOperand(i))) {
1107249259Sdim        if (C->isZero()) {
1108249259Sdim          NewArgs[4 + i] = DAG.getConstant(4, MVT::i32); // SEL_0
1109249259Sdim        } else if (C->isExactlyValue(1.0)) {
1110249259Sdim          NewArgs[4 + i] = DAG.getConstant(5, MVT::i32); // SEL_0
1111249259Sdim        } else {
1112249259Sdim          NewBldVec[i] = Arg.getOperand(i);
1113249259Sdim        }
1114249259Sdim      } else {
1115249259Sdim        NewBldVec[i] = Arg.getOperand(i);
1116249259Sdim      }
1117249259Sdim    }
1118249259Sdim    DebugLoc DL = N->getDebugLoc();
1119249259Sdim    NewArgs[1] = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4f32, NewBldVec, 4);
1120249259Sdim    return DAG.getNode(AMDGPUISD::EXPORT, DL, N->getVTList(), NewArgs, 8);
1121249259Sdim  }
1122249259Sdim  }
1123249259Sdim  return SDValue();
1124249259Sdim}
1125