R600ISelLowering.cpp revision 296417
1//===-- R600ISelLowering.cpp - R600 DAG Lowering Implementation -----------===//
2//
3//                     The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9//
10/// \file
11/// \brief Custom DAG lowering for R600
12//
13//===----------------------------------------------------------------------===//
14
15#include "R600ISelLowering.h"
16#include "AMDGPUFrameLowering.h"
17#include "AMDGPUIntrinsicInfo.h"
18#include "AMDGPUSubtarget.h"
19#include "R600Defines.h"
20#include "R600InstrInfo.h"
21#include "R600MachineFunctionInfo.h"
22#include "llvm/Analysis/ValueTracking.h"
23#include "llvm/CodeGen/CallingConvLower.h"
24#include "llvm/CodeGen/MachineFrameInfo.h"
25#include "llvm/CodeGen/MachineInstrBuilder.h"
26#include "llvm/CodeGen/MachineRegisterInfo.h"
27#include "llvm/CodeGen/SelectionDAG.h"
28#include "llvm/IR/Argument.h"
29#include "llvm/IR/Function.h"
30
31using namespace llvm;
32
33R600TargetLowering::R600TargetLowering(TargetMachine &TM,
34                                       const AMDGPUSubtarget &STI)
35    : AMDGPUTargetLowering(TM, STI), Gen(STI.getGeneration()) {
36  addRegisterClass(MVT::v4f32, &AMDGPU::R600_Reg128RegClass);
37  addRegisterClass(MVT::f32, &AMDGPU::R600_Reg32RegClass);
38  addRegisterClass(MVT::v4i32, &AMDGPU::R600_Reg128RegClass);
39  addRegisterClass(MVT::i32, &AMDGPU::R600_Reg32RegClass);
40  addRegisterClass(MVT::v2f32, &AMDGPU::R600_Reg64RegClass);
41  addRegisterClass(MVT::v2i32, &AMDGPU::R600_Reg64RegClass);
42
43  computeRegisterProperties(STI.getRegisterInfo());
44
45  // Set condition code actions
46  setCondCodeAction(ISD::SETO,   MVT::f32, Expand);
47  setCondCodeAction(ISD::SETUO,  MVT::f32, Expand);
48  setCondCodeAction(ISD::SETLT,  MVT::f32, Expand);
49  setCondCodeAction(ISD::SETLE,  MVT::f32, Expand);
50  setCondCodeAction(ISD::SETOLT, MVT::f32, Expand);
51  setCondCodeAction(ISD::SETOLE, MVT::f32, Expand);
52  setCondCodeAction(ISD::SETONE, MVT::f32, Expand);
53  setCondCodeAction(ISD::SETUEQ, MVT::f32, Expand);
54  setCondCodeAction(ISD::SETUGE, MVT::f32, Expand);
55  setCondCodeAction(ISD::SETUGT, MVT::f32, Expand);
56  setCondCodeAction(ISD::SETULT, MVT::f32, Expand);
57  setCondCodeAction(ISD::SETULE, MVT::f32, Expand);
58
59  setCondCodeAction(ISD::SETLE, MVT::i32, Expand);
60  setCondCodeAction(ISD::SETLT, MVT::i32, Expand);
61  setCondCodeAction(ISD::SETULE, MVT::i32, Expand);
62  setCondCodeAction(ISD::SETULT, MVT::i32, Expand);
63
64  setOperationAction(ISD::FCOS, MVT::f32, Custom);
65  setOperationAction(ISD::FSIN, MVT::f32, Custom);
66
67  setOperationAction(ISD::SETCC, MVT::v4i32, Expand);
68  setOperationAction(ISD::SETCC, MVT::v2i32, Expand);
69
70  setOperationAction(ISD::BR_CC, MVT::i32, Expand);
71  setOperationAction(ISD::BR_CC, MVT::f32, Expand);
72  setOperationAction(ISD::BRCOND, MVT::Other, Custom);
73
74  setOperationAction(ISD::FSUB, MVT::f32, Expand);
75
76  setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
77  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
78  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i1, Custom);
79
80  setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
81  setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
82
83  setOperationAction(ISD::SETCC, MVT::i32, Expand);
84  setOperationAction(ISD::SETCC, MVT::f32, Expand);
85  setOperationAction(ISD::FP_TO_UINT, MVT::i1, Custom);
86  setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
87  setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
88
89  setOperationAction(ISD::SELECT, MVT::i32, Expand);
90  setOperationAction(ISD::SELECT, MVT::f32, Expand);
91  setOperationAction(ISD::SELECT, MVT::v2i32, Expand);
92  setOperationAction(ISD::SELECT, MVT::v4i32, Expand);
93
94  // ADD, SUB overflow.
95  // TODO: turn these into Legal?
96  if (Subtarget->hasCARRY())
97    setOperationAction(ISD::UADDO, MVT::i32, Custom);
98
99  if (Subtarget->hasBORROW())
100    setOperationAction(ISD::USUBO, MVT::i32, Custom);
101
102  // Expand sign extension of vectors
103  if (!Subtarget->hasBFE())
104    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
105
106  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Expand);
107  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Expand);
108
109  if (!Subtarget->hasBFE())
110    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Expand);
111  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Expand);
112  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Expand);
113
114  if (!Subtarget->hasBFE())
115    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand);
116  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Expand);
117  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Expand);
118
119  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
120  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i32, Expand);
121  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i32, Expand);
122
123  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Expand);
124
125
126  // Legalize loads and stores to the private address space.
127  setOperationAction(ISD::LOAD, MVT::i32, Custom);
128  setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
129  setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
130
131  // EXTLOAD should be the same as ZEXTLOAD. It is legal for some address
132  // spaces, so it is custom lowered to handle those where it isn't.
133  for (MVT VT : MVT::integer_valuetypes()) {
134    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
135    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Custom);
136    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i16, Custom);
137
138    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
139    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i8, Custom);
140    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i16, Custom);
141
142    setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote);
143    setLoadExtAction(ISD::EXTLOAD, VT, MVT::i8, Custom);
144    setLoadExtAction(ISD::EXTLOAD, VT, MVT::i16, Custom);
145  }
146
147  setOperationAction(ISD::STORE, MVT::i8, Custom);
148  setOperationAction(ISD::STORE, MVT::i32, Custom);
149  setOperationAction(ISD::STORE, MVT::v2i32, Custom);
150  setOperationAction(ISD::STORE, MVT::v4i32, Custom);
151  setTruncStoreAction(MVT::i32, MVT::i8, Custom);
152  setTruncStoreAction(MVT::i32, MVT::i16, Custom);
153
154  setOperationAction(ISD::LOAD, MVT::i32, Custom);
155  setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
156  setOperationAction(ISD::FrameIndex, MVT::i32, Custom);
157
158  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i32, Custom);
159  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f32, Custom);
160  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
161  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
162
163  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i32, Custom);
164  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f32, Custom);
165  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
166  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
167
168  setTargetDAGCombine(ISD::FP_ROUND);
169  setTargetDAGCombine(ISD::FP_TO_SINT);
170  setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
171  setTargetDAGCombine(ISD::SELECT_CC);
172  setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
173
174  // We don't have 64-bit shifts. Thus we need either SHX i64 or SHX_PARTS i32
175  //  to be Legal/Custom in order to avoid library calls.
176  setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
177  setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
178  setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);
179
180  setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
181
182  const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
183  for (MVT VT : ScalarIntVTs) {
184    setOperationAction(ISD::ADDC, VT, Expand);
185    setOperationAction(ISD::SUBC, VT, Expand);
186    setOperationAction(ISD::ADDE, VT, Expand);
187    setOperationAction(ISD::SUBE, VT, Expand);
188  }
189
190  setSchedulingPreference(Sched::Source);
191}
192
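// Returns true when the instruction following \p I is an AMDGPU::RETURN, i.e.
// when the lowered instruction is the last one before the end of the program.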
193static inline bool isEOP(MachineBasicBlock::iterator I) {
194  return std::next(I)->getOpcode() == AMDGPU::RETURN;
195}
196
197MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
198    MachineInstr * MI, MachineBasicBlock * BB) const {
199  MachineFunction * MF = BB->getParent();
200  MachineRegisterInfo &MRI = MF->getRegInfo();
201  MachineBasicBlock::iterator I = *MI;
202  const R600InstrInfo *TII =
203      static_cast<const R600InstrInfo *>(Subtarget->getInstrInfo());
204
205  switch (MI->getOpcode()) {
206  default:
207    // Replace LDS_*_RET instructions that don't have any uses with the
208    // equivalent LDS_*_NORET instructions.
209    if (TII->isLDSRetInstr(MI->getOpcode())) {
210      int DstIdx = TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::dst);
211      assert(DstIdx != -1);
212      MachineInstrBuilder NewMI;
213      // FIXME: getLDSNoRetOp method only handles LDS_1A1D LDS ops. Add
214      //        LDS_1A2D support and remove this special case.
215      if (!MRI.use_empty(MI->getOperand(DstIdx).getReg()) ||
216           MI->getOpcode() == AMDGPU::LDS_CMPST_RET)
217        return BB;
218
219      NewMI = BuildMI(*BB, I, BB->findDebugLoc(I),
220                      TII->get(AMDGPU::getLDSNoRetOp(MI->getOpcode())));
221      for (unsigned i = 1, e = MI->getNumOperands(); i < e; ++i) {
222        NewMI.addOperand(MI->getOperand(i));
223      }
224    } else {
225      return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
226    }
227    break;
228  case AMDGPU::CLAMP_R600: {
229    MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
230                                                   AMDGPU::MOV,
231                                                   MI->getOperand(0).getReg(),
232                                                   MI->getOperand(1).getReg());
233    TII->addFlag(NewMI, 0, MO_FLAG_CLAMP);
234    break;
235  }
236
237  case AMDGPU::FABS_R600: {
238    MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
239                                                    AMDGPU::MOV,
240                                                    MI->getOperand(0).getReg(),
241                                                    MI->getOperand(1).getReg());
242    TII->addFlag(NewMI, 0, MO_FLAG_ABS);
243    break;
244  }
245
246  case AMDGPU::FNEG_R600: {
247    MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
248                                                    AMDGPU::MOV,
249                                                    MI->getOperand(0).getReg(),
250                                                    MI->getOperand(1).getReg());
251    TII->addFlag(NewMI, 0, MO_FLAG_NEG);
252    break;
253  }
254
255  case AMDGPU::MASK_WRITE: {
256    unsigned maskedRegister = MI->getOperand(0).getReg();
257    assert(TargetRegisterInfo::isVirtualRegister(maskedRegister));
258    MachineInstr * defInstr = MRI.getVRegDef(maskedRegister);
259    TII->addFlag(defInstr, 0, MO_FLAG_MASK);
260    break;
261  }
262
263  case AMDGPU::MOV_IMM_F32:
264    TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
265                     MI->getOperand(1).getFPImm()->getValueAPF()
266                         .bitcastToAPInt().getZExtValue());
267    break;
268  case AMDGPU::MOV_IMM_I32:
269    TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
270                     MI->getOperand(1).getImm());
271    break;
272  case AMDGPU::CONST_COPY: {
273    MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, MI, AMDGPU::MOV,
274        MI->getOperand(0).getReg(), AMDGPU::ALU_CONST);
275    TII->setImmOperand(NewMI, AMDGPU::OpName::src0_sel,
276        MI->getOperand(1).getImm());
277    break;
278  }
279
280  case AMDGPU::RAT_WRITE_CACHELESS_32_eg:
281  case AMDGPU::RAT_WRITE_CACHELESS_64_eg:
282  case AMDGPU::RAT_WRITE_CACHELESS_128_eg: {
283    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
284            .addOperand(MI->getOperand(0))
285            .addOperand(MI->getOperand(1))
286            .addImm(isEOP(I)); // Set End of program bit
287    break;
288  }
289  case AMDGPU::RAT_STORE_TYPED_eg: {
290    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
291            .addOperand(MI->getOperand(0))
292            .addOperand(MI->getOperand(1))
293            .addOperand(MI->getOperand(2))
294            .addImm(isEOP(I)); // Set End of program bit
295    break;
296  }
297
298  case AMDGPU::TXD: {
299    unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
300    unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
301    MachineOperand &RID = MI->getOperand(4);
302    MachineOperand &SID = MI->getOperand(5);
303    unsigned TextureId = MI->getOperand(6).getImm();
304    unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3;
305    unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1;
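    // SrcX..SrcW select the coordinate swizzle used by the TEX instructions
    // built below; CTX..CTW appear to be per-coordinate type bits, which the
    // Rect and ShadowRect cases clear for X/Y in the switch that follows.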
306
307    switch (TextureId) {
308    case 5: // Rect
309      CTX = CTY = 0;
310      break;
311    case 6: // Shadow1D
312      SrcW = SrcZ;
313      break;
314    case 7: // Shadow2D
315      SrcW = SrcZ;
316      break;
317    case 8: // ShadowRect
318      CTX = CTY = 0;
319      SrcW = SrcZ;
320      break;
321    case 9: // 1DArray
322      SrcZ = SrcY;
323      CTZ = 0;
324      break;
325    case 10: // 2DArray
326      CTZ = 0;
327      break;
328    case 11: // Shadow1DArray
329      SrcZ = SrcY;
330      CTZ = 0;
331      break;
332    case 12: // Shadow2DArray
333      CTZ = 0;
334      break;
335    }
336    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0)
337            .addOperand(MI->getOperand(3))
338            .addImm(SrcX)
339            .addImm(SrcY)
340            .addImm(SrcZ)
341            .addImm(SrcW)
342            .addImm(0)
343            .addImm(0)
344            .addImm(0)
345            .addImm(0)
346            .addImm(1)
347            .addImm(2)
348            .addImm(3)
349            .addOperand(RID)
350            .addOperand(SID)
351            .addImm(CTX)
352            .addImm(CTY)
353            .addImm(CTZ)
354            .addImm(CTW);
355    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1)
356            .addOperand(MI->getOperand(2))
357            .addImm(SrcX)
358            .addImm(SrcY)
359            .addImm(SrcZ)
360            .addImm(SrcW)
361            .addImm(0)
362            .addImm(0)
363            .addImm(0)
364            .addImm(0)
365            .addImm(1)
366            .addImm(2)
367            .addImm(3)
368            .addOperand(RID)
369            .addOperand(SID)
370            .addImm(CTX)
371            .addImm(CTY)
372            .addImm(CTZ)
373            .addImm(CTW);
374    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_G))
375            .addOperand(MI->getOperand(0))
376            .addOperand(MI->getOperand(1))
377            .addImm(SrcX)
378            .addImm(SrcY)
379            .addImm(SrcZ)
380            .addImm(SrcW)
381            .addImm(0)
382            .addImm(0)
383            .addImm(0)
384            .addImm(0)
385            .addImm(1)
386            .addImm(2)
387            .addImm(3)
388            .addOperand(RID)
389            .addOperand(SID)
390            .addImm(CTX)
391            .addImm(CTY)
392            .addImm(CTZ)
393            .addImm(CTW)
394            .addReg(T0, RegState::Implicit)
395            .addReg(T1, RegState::Implicit);
396    break;
397  }
398
399  case AMDGPU::TXD_SHADOW: {
400    unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
401    unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
402    MachineOperand &RID = MI->getOperand(4);
403    MachineOperand &SID = MI->getOperand(5);
404    unsigned TextureId = MI->getOperand(6).getImm();
405    unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3;
406    unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1;
407
408    switch (TextureId) {
409    case 5: // Rect
410      CTX = CTY = 0;
411      break;
412    case 6: // Shadow1D
413      SrcW = SrcZ;
414      break;
415    case 7: // Shadow2D
416      SrcW = SrcZ;
417      break;
418    case 8: // ShadowRect
419      CTX = CTY = 0;
420      SrcW = SrcZ;
421      break;
422    case 9: // 1DArray
423      SrcZ = SrcY;
424      CTZ = 0;
425      break;
426    case 10: // 2DArray
427      CTZ = 0;
428      break;
429    case 11: // Shadow1DArray
430      SrcZ = SrcY;
431      CTZ = 0;
432      break;
433    case 12: // Shadow2DArray
434      CTZ = 0;
435      break;
436    }
437
438    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0)
439            .addOperand(MI->getOperand(3))
440            .addImm(SrcX)
441            .addImm(SrcY)
442            .addImm(SrcZ)
443            .addImm(SrcW)
444            .addImm(0)
445            .addImm(0)
446            .addImm(0)
447            .addImm(0)
448            .addImm(1)
449            .addImm(2)
450            .addImm(3)
451            .addOperand(RID)
452            .addOperand(SID)
453            .addImm(CTX)
454            .addImm(CTY)
455            .addImm(CTZ)
456            .addImm(CTW);
457    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1)
458            .addOperand(MI->getOperand(2))
459            .addImm(SrcX)
460            .addImm(SrcY)
461            .addImm(SrcZ)
462            .addImm(SrcW)
463            .addImm(0)
464            .addImm(0)
465            .addImm(0)
466            .addImm(0)
467            .addImm(1)
468            .addImm(2)
469            .addImm(3)
470            .addOperand(RID)
471            .addOperand(SID)
472            .addImm(CTX)
473            .addImm(CTY)
474            .addImm(CTZ)
475            .addImm(CTW);
476    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_C_G))
477            .addOperand(MI->getOperand(0))
478            .addOperand(MI->getOperand(1))
479            .addImm(SrcX)
480            .addImm(SrcY)
481            .addImm(SrcZ)
482            .addImm(SrcW)
483            .addImm(0)
484            .addImm(0)
485            .addImm(0)
486            .addImm(0)
487            .addImm(1)
488            .addImm(2)
489            .addImm(3)
490            .addOperand(RID)
491            .addOperand(SID)
492            .addImm(CTX)
493            .addImm(CTY)
494            .addImm(CTZ)
495            .addImm(CTW)
496            .addReg(T0, RegState::Implicit)
497            .addReg(T1, RegState::Implicit);
498    break;
499  }
500
501  case AMDGPU::BRANCH:
502      BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP))
503              .addOperand(MI->getOperand(0));
504      break;
505
506  case AMDGPU::BRANCH_COND_f32: {
507    MachineInstr *NewMI =
508      BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
509              AMDGPU::PREDICATE_BIT)
510              .addOperand(MI->getOperand(1))
511              .addImm(OPCODE_IS_NOT_ZERO)
512              .addImm(0); // Flags
513    TII->addFlag(NewMI, 0, MO_FLAG_PUSH);
514    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
515            .addOperand(MI->getOperand(0))
516            .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
517    break;
518  }
519
520  case AMDGPU::BRANCH_COND_i32: {
521    MachineInstr *NewMI =
522      BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
523            AMDGPU::PREDICATE_BIT)
524            .addOperand(MI->getOperand(1))
525            .addImm(OPCODE_IS_NOT_ZERO_INT)
526            .addImm(0); // Flags
527    TII->addFlag(NewMI, 0, MO_FLAG_PUSH);
528    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
529            .addOperand(MI->getOperand(0))
530            .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
531    break;
532  }
533
534  case AMDGPU::EG_ExportSwz:
535  case AMDGPU::R600_ExportSwz: {
536    // Instruction is left unmodified if it's not the last one of its type.
537    bool isLastInstructionOfItsType = true;
538    unsigned InstExportType = MI->getOperand(1).getImm();
539    for (MachineBasicBlock::iterator NextExportInst = std::next(I),
540         EndBlock = BB->end(); NextExportInst != EndBlock;
541         NextExportInst = std::next(NextExportInst)) {
542      if (NextExportInst->getOpcode() == AMDGPU::EG_ExportSwz ||
543          NextExportInst->getOpcode() == AMDGPU::R600_ExportSwz) {
544        unsigned CurrentInstExportType = NextExportInst->getOperand(1)
545            .getImm();
546        if (CurrentInstExportType == InstExportType) {
547          isLastInstructionOfItsType = false;
548          break;
549        }
550      }
551    }
552    bool EOP = isEOP(I);
553    if (!EOP && !isLastInstructionOfItsType)
554      return BB;
555    unsigned CfInst = (MI->getOpcode() == AMDGPU::EG_ExportSwz)? 84 : 40;
556    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
557            .addOperand(MI->getOperand(0))
558            .addOperand(MI->getOperand(1))
559            .addOperand(MI->getOperand(2))
560            .addOperand(MI->getOperand(3))
561            .addOperand(MI->getOperand(4))
562            .addOperand(MI->getOperand(5))
563            .addOperand(MI->getOperand(6))
564            .addImm(CfInst)
565            .addImm(EOP);
566    break;
567  }
568  case AMDGPU::RETURN: {
569    // RETURN instructions must have the live-out registers as implicit uses,
570    // otherwise they appear dead.
571    R600MachineFunctionInfo *MFI = MF->getInfo<R600MachineFunctionInfo>();
572    MachineInstrBuilder MIB(*MF, MI);
573    for (unsigned i = 0, e = MFI->LiveOuts.size(); i != e; ++i)
574      MIB.addReg(MFI->LiveOuts[i], RegState::Implicit);
575    return BB;
576  }
577  }
578
579  MI->eraseFromParent();
580  return BB;
581}
582
583//===----------------------------------------------------------------------===//
584// Custom DAG Lowering Operations
585//===----------------------------------------------------------------------===//
586
587SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
588  MachineFunction &MF = DAG.getMachineFunction();
589  R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
590  switch (Op.getOpcode()) {
591  default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
592  case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
593  case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
594  case ISD::SHL_PARTS: return LowerSHLParts(Op, DAG);
595  case ISD::SRA_PARTS:
596  case ISD::SRL_PARTS: return LowerSRXParts(Op, DAG);
597  case ISD::UADDO: return LowerUADDSUBO(Op, DAG, ISD::ADD, AMDGPUISD::CARRY);
598  case ISD::USUBO: return LowerUADDSUBO(Op, DAG, ISD::SUB, AMDGPUISD::BORROW);
599  case ISD::FCOS:
600  case ISD::FSIN: return LowerTrig(Op, DAG);
601  case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
602  case ISD::STORE: return LowerSTORE(Op, DAG);
603  case ISD::LOAD: {
604    SDValue Result = LowerLOAD(Op, DAG);
605    assert((!Result.getNode() ||
606            Result.getNode()->getNumValues() == 2) &&
607           "Load should return a value and a chain");
608    return Result;
609  }
610
611  case ISD::BRCOND: return LowerBRCOND(Op, DAG);
612  case ISD::GlobalAddress: return LowerGlobalAddress(MFI, Op, DAG);
613  case ISD::INTRINSIC_VOID: {
614    SDValue Chain = Op.getOperand(0);
615    unsigned IntrinsicID =
616                         cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
617    switch (IntrinsicID) {
618    case AMDGPUIntrinsic::AMDGPU_store_output: {
619      int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
620      unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
621      MFI->LiveOuts.push_back(Reg);
622      return DAG.getCopyToReg(Chain, SDLoc(Op), Reg, Op.getOperand(2));
623    }
624    case AMDGPUIntrinsic::R600_store_swizzle: {
625      SDLoc DL(Op);
626      const SDValue Args[8] = {
627        Chain,
628        Op.getOperand(2), // Export Value
629        Op.getOperand(3), // ArrayBase
630        Op.getOperand(4), // Type
631        DAG.getConstant(0, DL, MVT::i32), // SWZ_X
632        DAG.getConstant(1, DL, MVT::i32), // SWZ_Y
633        DAG.getConstant(2, DL, MVT::i32), // SWZ_Z
634        DAG.getConstant(3, DL, MVT::i32) // SWZ_W
635      };
636      return DAG.getNode(AMDGPUISD::EXPORT, DL, Op.getValueType(), Args);
637    }
638
639    // default for switch(IntrinsicID)
640    default: break;
641    }
642    // break out of case ISD::INTRINSIC_VOID in switch(Op.getOpcode())
643    break;
644  }
645  case ISD::INTRINSIC_WO_CHAIN: {
646    unsigned IntrinsicID =
647                         cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
648    EVT VT = Op.getValueType();
649    SDLoc DL(Op);
650    switch(IntrinsicID) {
651    default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
652    case AMDGPUIntrinsic::R600_load_input: {
653      int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
654      unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
655      MachineFunction &MF = DAG.getMachineFunction();
656      MachineRegisterInfo &MRI = MF.getRegInfo();
657      MRI.addLiveIn(Reg);
658      return DAG.getCopyFromReg(DAG.getEntryNode(),
659          SDLoc(DAG.getEntryNode()), Reg, VT);
660    }
661
662    case AMDGPUIntrinsic::R600_interp_input: {
663      int slot = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
664      int ijb = cast<ConstantSDNode>(Op.getOperand(2))->getSExtValue();
665      MachineSDNode *interp;
666      if (ijb < 0) {
667        const R600InstrInfo *TII =
668            static_cast<const R600InstrInfo *>(Subtarget->getInstrInfo());
669        interp = DAG.getMachineNode(AMDGPU::INTERP_VEC_LOAD, DL,
670            MVT::v4f32, DAG.getTargetConstant(slot / 4, DL, MVT::i32));
671        return DAG.getTargetExtractSubreg(
672            TII->getRegisterInfo().getSubRegFromChannel(slot % 4),
673            DL, MVT::f32, SDValue(interp, 0));
674      }
675      MachineFunction &MF = DAG.getMachineFunction();
676      MachineRegisterInfo &MRI = MF.getRegInfo();
677      unsigned RegisterI = AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb);
678      unsigned RegisterJ = AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb + 1);
679      MRI.addLiveIn(RegisterI);
680      MRI.addLiveIn(RegisterJ);
681      SDValue RegisterINode = DAG.getCopyFromReg(DAG.getEntryNode(),
682          SDLoc(DAG.getEntryNode()), RegisterI, MVT::f32);
683      SDValue RegisterJNode = DAG.getCopyFromReg(DAG.getEntryNode(),
684          SDLoc(DAG.getEntryNode()), RegisterJ, MVT::f32);
685
686      if (slot % 4 < 2)
687        interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_XY, DL,
688            MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4, DL, MVT::i32),
689            RegisterJNode, RegisterINode);
690      else
691        interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_ZW, DL,
692            MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4, DL, MVT::i32),
693            RegisterJNode, RegisterINode);
694      return SDValue(interp, slot % 2);
695    }
696    case AMDGPUIntrinsic::R600_interp_xy:
697    case AMDGPUIntrinsic::R600_interp_zw: {
698      int slot = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
699      MachineSDNode *interp;
700      SDValue RegisterINode = Op.getOperand(2);
701      SDValue RegisterJNode = Op.getOperand(3);
702
703      if (IntrinsicID == AMDGPUIntrinsic::R600_interp_xy)
704        interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_XY, DL,
705            MVT::f32, MVT::f32, DAG.getTargetConstant(slot, DL, MVT::i32),
706            RegisterJNode, RegisterINode);
707      else
708        interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_ZW, DL,
709            MVT::f32, MVT::f32, DAG.getTargetConstant(slot, DL, MVT::i32),
710            RegisterJNode, RegisterINode);
711      return DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32,
712          SDValue(interp, 0), SDValue(interp, 1));
713    }
714    case AMDGPUIntrinsic::R600_tex:
715    case AMDGPUIntrinsic::R600_texc:
716    case AMDGPUIntrinsic::R600_txl:
717    case AMDGPUIntrinsic::R600_txlc:
718    case AMDGPUIntrinsic::R600_txb:
719    case AMDGPUIntrinsic::R600_txbc:
720    case AMDGPUIntrinsic::R600_txf:
721    case AMDGPUIntrinsic::R600_txq:
722    case AMDGPUIntrinsic::R600_ddx:
723    case AMDGPUIntrinsic::R600_ddy:
724    case AMDGPUIntrinsic::R600_ldptr: {
725      unsigned TextureOp;
726      switch (IntrinsicID) {
727      case AMDGPUIntrinsic::R600_tex:
728        TextureOp = 0;
729        break;
730      case AMDGPUIntrinsic::R600_texc:
731        TextureOp = 1;
732        break;
733      case AMDGPUIntrinsic::R600_txl:
734        TextureOp = 2;
735        break;
736      case AMDGPUIntrinsic::R600_txlc:
737        TextureOp = 3;
738        break;
739      case AMDGPUIntrinsic::R600_txb:
740        TextureOp = 4;
741        break;
742      case AMDGPUIntrinsic::R600_txbc:
743        TextureOp = 5;
744        break;
745      case AMDGPUIntrinsic::R600_txf:
746        TextureOp = 6;
747        break;
748      case AMDGPUIntrinsic::R600_txq:
749        TextureOp = 7;
750        break;
751      case AMDGPUIntrinsic::R600_ddx:
752        TextureOp = 8;
753        break;
754      case AMDGPUIntrinsic::R600_ddy:
755        TextureOp = 9;
756        break;
757      case AMDGPUIntrinsic::R600_ldptr:
758        TextureOp = 10;
759        break;
760      default:
761        llvm_unreachable("Unknown Texture Operation");
762      }
763
764      SDValue TexArgs[19] = {
765        DAG.getConstant(TextureOp, DL, MVT::i32),
766        Op.getOperand(1),
767        DAG.getConstant(0, DL, MVT::i32),
768        DAG.getConstant(1, DL, MVT::i32),
769        DAG.getConstant(2, DL, MVT::i32),
770        DAG.getConstant(3, DL, MVT::i32),
771        Op.getOperand(2),
772        Op.getOperand(3),
773        Op.getOperand(4),
774        DAG.getConstant(0, DL, MVT::i32),
775        DAG.getConstant(1, DL, MVT::i32),
776        DAG.getConstant(2, DL, MVT::i32),
777        DAG.getConstant(3, DL, MVT::i32),
778        Op.getOperand(5),
779        Op.getOperand(6),
780        Op.getOperand(7),
781        Op.getOperand(8),
782        Op.getOperand(9),
783        Op.getOperand(10)
784      };
785      return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, MVT::v4f32, TexArgs);
786    }
787    case AMDGPUIntrinsic::AMDGPU_dp4: {
788      SDValue Args[8] = {
789      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
790          DAG.getConstant(0, DL, MVT::i32)),
791      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
792          DAG.getConstant(0, DL, MVT::i32)),
793      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
794          DAG.getConstant(1, DL, MVT::i32)),
795      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
796          DAG.getConstant(1, DL, MVT::i32)),
797      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
798          DAG.getConstant(2, DL, MVT::i32)),
799      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
800          DAG.getConstant(2, DL, MVT::i32)),
801      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
802          DAG.getConstant(3, DL, MVT::i32)),
803      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
804          DAG.getConstant(3, DL, MVT::i32))
805      };
806      return DAG.getNode(AMDGPUISD::DOT4, DL, MVT::f32, Args);
807    }
808
809    case Intrinsic::r600_read_ngroups_x:
810      return LowerImplicitParameter(DAG, VT, DL, 0);
811    case Intrinsic::r600_read_ngroups_y:
812      return LowerImplicitParameter(DAG, VT, DL, 1);
813    case Intrinsic::r600_read_ngroups_z:
814      return LowerImplicitParameter(DAG, VT, DL, 2);
815    case Intrinsic::r600_read_global_size_x:
816      return LowerImplicitParameter(DAG, VT, DL, 3);
817    case Intrinsic::r600_read_global_size_y:
818      return LowerImplicitParameter(DAG, VT, DL, 4);
819    case Intrinsic::r600_read_global_size_z:
820      return LowerImplicitParameter(DAG, VT, DL, 5);
821    case Intrinsic::r600_read_local_size_x:
822      return LowerImplicitParameter(DAG, VT, DL, 6);
823    case Intrinsic::r600_read_local_size_y:
824      return LowerImplicitParameter(DAG, VT, DL, 7);
825    case Intrinsic::r600_read_local_size_z:
826      return LowerImplicitParameter(DAG, VT, DL, 8);
827
828    case Intrinsic::AMDGPU_read_workdim: {
829      uint32_t ByteOffset = getImplicitParameterOffset(MFI, GRID_DIM);
830      return LowerImplicitParameter(DAG, VT, DL, ByteOffset / 4);
831    }
832
833    case Intrinsic::r600_read_tgid_x:
834      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
835                                  AMDGPU::T1_X, VT);
836    case Intrinsic::r600_read_tgid_y:
837      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
838                                  AMDGPU::T1_Y, VT);
839    case Intrinsic::r600_read_tgid_z:
840      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
841                                  AMDGPU::T1_Z, VT);
842    case Intrinsic::r600_read_tidig_x:
843      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
844                                  AMDGPU::T0_X, VT);
845    case Intrinsic::r600_read_tidig_y:
846      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
847                                  AMDGPU::T0_Y, VT);
848    case Intrinsic::r600_read_tidig_z:
849      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
850                                  AMDGPU::T0_Z, VT);
851    case Intrinsic::AMDGPU_rsq:
852      // XXX - I'm assuming SI's RSQ_LEGACY matches R600's behavior.
853      return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1));
854
855    case AMDGPUIntrinsic::AMDGPU_fract:
856    case AMDGPUIntrinsic::AMDIL_fraction: // Legacy name.
857      return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1));
858    }
859    // break out of case ISD::INTRINSIC_WO_CHAIN in switch(Op.getOpcode())
860    break;
861  }
862  } // end switch(Op.getOpcode())
863  return SDValue();
864}
865
866void R600TargetLowering::ReplaceNodeResults(SDNode *N,
867                                            SmallVectorImpl<SDValue> &Results,
868                                            SelectionDAG &DAG) const {
869  switch (N->getOpcode()) {
870  default:
871    AMDGPUTargetLowering::ReplaceNodeResults(N, Results, DAG);
872    return;
873  case ISD::FP_TO_UINT:
874    if (N->getValueType(0) == MVT::i1) {
875      Results.push_back(LowerFPTOUINT(N->getOperand(0), DAG));
876      return;
877    }
878    // Fall-through. Since we don't care about out of bounds values
879    // we can use FP_TO_SINT for uints too. The DAGLegalizer code for uint
880    // considers some extra cases which are not necessary here.
881  case ISD::FP_TO_SINT: {
882    SDValue Result;
883    if (expandFP_TO_SINT(N, Result, DAG))
884      Results.push_back(Result);
885    return;
886  }
887  case ISD::SDIVREM: {
888    SDValue Op = SDValue(N, 1);
889    SDValue RES = LowerSDIVREM(Op, DAG);
890    Results.push_back(RES);
891    Results.push_back(RES.getValue(1));
892    break;
893  }
894  case ISD::UDIVREM: {
895    SDValue Op = SDValue(N, 0);
896    LowerUDIVREM64(Op, DAG, Results);
897    break;
898  }
899  }
900}
901
902SDValue R600TargetLowering::vectorToVerticalVector(SelectionDAG &DAG,
903                                                   SDValue Vector) const {
904
905  SDLoc DL(Vector);
906  EVT VecVT = Vector.getValueType();
907  EVT EltVT = VecVT.getVectorElementType();
908  SmallVector<SDValue, 8> Args;
909
910  for (unsigned i = 0, e = VecVT.getVectorNumElements();
911       i != e; ++i) {
912    Args.push_back(DAG.getNode(
913        ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Vector,
914        DAG.getConstant(i, DL, getVectorIdxTy(DAG.getDataLayout()))));
915  }
916
917  return DAG.getNode(AMDGPUISD::BUILD_VERTICAL_VECTOR, DL, VecVT, Args);
918}
919
920SDValue R600TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
921                                                    SelectionDAG &DAG) const {
922
923  SDLoc DL(Op);
924  SDValue Vector = Op.getOperand(0);
925  SDValue Index = Op.getOperand(1);
926
927  if (isa<ConstantSDNode>(Index) ||
928      Vector.getOpcode() == AMDGPUISD::BUILD_VERTICAL_VECTOR)
929    return Op;
930
931  Vector = vectorToVerticalVector(DAG, Vector);
932  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getValueType(),
933                     Vector, Index);
934}
935
936SDValue R600TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
937                                                   SelectionDAG &DAG) const {
938  SDLoc DL(Op);
939  SDValue Vector = Op.getOperand(0);
940  SDValue Value = Op.getOperand(1);
941  SDValue Index = Op.getOperand(2);
942
943  if (isa<ConstantSDNode>(Index) ||
944      Vector.getOpcode() == AMDGPUISD::BUILD_VERTICAL_VECTOR)
945    return Op;
946
947  Vector = vectorToVerticalVector(DAG, Vector);
948  SDValue Insert = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, Op.getValueType(),
949                               Vector, Value, Index);
950  return vectorToVerticalVector(DAG, Insert);
951}
952
953SDValue R600TargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
954  // On hw >= R700, COS/SIN input must be between -1.0 and 1.0.
955  // Thus we lower them to TRIG(FRACT(x / 2Pi + 0.5) - 0.5).
956  EVT VT = Op.getValueType();
957  SDValue Arg = Op.getOperand(0);
958  SDLoc DL(Op);
959
960  // TODO: Should this propagate fast-math-flags?
961  SDValue FractPart = DAG.getNode(AMDGPUISD::FRACT, DL, VT,
962      DAG.getNode(ISD::FADD, DL, VT,
963        DAG.getNode(ISD::FMUL, DL, VT, Arg,
964          DAG.getConstantFP(0.15915494309, DL, MVT::f32)),
965        DAG.getConstantFP(0.5, DL, MVT::f32)));
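  // For example, 0.15915494309 is roughly 1 / (2 * Pi), so for an input of
  // x = 7.0 radians this computes FRACT(7.0 / (2 * Pi) + 0.5) ~= 0.614;
  // subtracting 0.5 below gives ~0.114, i.e. the same angle wrapped into
  // [-0.5, 0.5) as a fraction of a full revolution.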
966  unsigned TrigNode;
967  switch (Op.getOpcode()) {
968  case ISD::FCOS:
969    TrigNode = AMDGPUISD::COS_HW;
970    break;
971  case ISD::FSIN:
972    TrigNode = AMDGPUISD::SIN_HW;
973    break;
974  default:
975    llvm_unreachable("Wrong trig opcode");
976  }
977  SDValue TrigVal = DAG.getNode(TrigNode, DL, VT,
978      DAG.getNode(ISD::FADD, DL, VT, FractPart,
979        DAG.getConstantFP(-0.5, DL, MVT::f32)));
980  if (Gen >= AMDGPUSubtarget::R700)
981    return TrigVal;
982  // On R600 hw, COS/SIN input must be between -Pi and Pi.
983  return DAG.getNode(ISD::FMUL, DL, VT, TrigVal,
984      DAG.getConstantFP(3.14159265359, DL, MVT::f32));
985}
986
987SDValue R600TargetLowering::LowerSHLParts(SDValue Op, SelectionDAG &DAG) const {
988  SDLoc DL(Op);
989  EVT VT = Op.getValueType();
990
991  SDValue Lo = Op.getOperand(0);
992  SDValue Hi = Op.getOperand(1);
993  SDValue Shift = Op.getOperand(2);
994  SDValue Zero = DAG.getConstant(0, DL, VT);
995  SDValue One  = DAG.getConstant(1, DL, VT);
996
997  SDValue Width  = DAG.getConstant(VT.getSizeInBits(), DL, VT);
998  SDValue Width1 = DAG.getConstant(VT.getSizeInBits() - 1, DL, VT);
999  SDValue BigShift  = DAG.getNode(ISD::SUB, DL, VT, Shift, Width);
1000  SDValue CompShift = DAG.getNode(ISD::SUB, DL, VT, Width1, Shift);
1001
1002  // The dance around Width1 is necessary for the Shift == 0 special case.
1003  // Without it, CompShift could be 32, producing an incorrect result in
1004  // Overflow. So we do the shift in two steps; the alternative would be to
1005  // add a conditional to filter out the special case.
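  // For example, with Shift = 4: CompShift = 27, Overflow = (Lo >> 27) >> 1
  // == Lo >> 28, and HiSmall = (Hi << 4) | (Lo >> 28), which is the expected
  // upper half of the 64-bit shift. With Shift = 0 the two-step form yields
  // Overflow = (Lo >> 31) >> 1 == 0 rather than the out-of-range Lo >> 32.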
1006
1007  SDValue Overflow = DAG.getNode(ISD::SRL, DL, VT, Lo, CompShift);
1008  Overflow = DAG.getNode(ISD::SRL, DL, VT, Overflow, One);
1009
1010  SDValue HiSmall = DAG.getNode(ISD::SHL, DL, VT, Hi, Shift);
1011  HiSmall = DAG.getNode(ISD::OR, DL, VT, HiSmall, Overflow);
1012  SDValue LoSmall = DAG.getNode(ISD::SHL, DL, VT, Lo, Shift);
1013
1014  SDValue HiBig = DAG.getNode(ISD::SHL, DL, VT, Lo, BigShift);
1015  SDValue LoBig = Zero;
1016
1017  Hi = DAG.getSelectCC(DL, Shift, Width, HiSmall, HiBig, ISD::SETULT);
1018  Lo = DAG.getSelectCC(DL, Shift, Width, LoSmall, LoBig, ISD::SETULT);
1019
1020  return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT,VT), Lo, Hi);
1021}
1022
1023SDValue R600TargetLowering::LowerSRXParts(SDValue Op, SelectionDAG &DAG) const {
1024  SDLoc DL(Op);
1025  EVT VT = Op.getValueType();
1026
1027  SDValue Lo = Op.getOperand(0);
1028  SDValue Hi = Op.getOperand(1);
1029  SDValue Shift = Op.getOperand(2);
1030  SDValue Zero = DAG.getConstant(0, DL, VT);
1031  SDValue One  = DAG.getConstant(1, DL, VT);
1032
1033  const bool SRA = Op.getOpcode() == ISD::SRA_PARTS;
1034
1035  SDValue Width  = DAG.getConstant(VT.getSizeInBits(), DL, VT);
1036  SDValue Width1 = DAG.getConstant(VT.getSizeInBits() - 1, DL, VT);
1037  SDValue BigShift  = DAG.getNode(ISD::SUB, DL, VT, Shift, Width);
1038  SDValue CompShift = DAG.getNode(ISD::SUB, DL, VT, Width1, Shift);
1039
1040  // The dance around Width1 is necessary for the Shift == 0 special case.
1041  // Without it, CompShift could be 32, producing an incorrect result in
1042  // Overflow. So we do the shift in two steps; the alternative would be to
1043  // add a conditional to filter out the special case.
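  // For example, an SRA_PARTS by Shift = 40 takes the "big" path below:
  // Lo = Hi >> 8 (arithmetic) and Hi = Hi >> 31, i.e. the upper half is
  // filled with copies of the original sign bit.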
1044
1045  SDValue Overflow = DAG.getNode(ISD::SHL, DL, VT, Hi, CompShift);
1046  Overflow = DAG.getNode(ISD::SHL, DL, VT, Overflow, One);
1047
1048  SDValue HiSmall = DAG.getNode(SRA ? ISD::SRA : ISD::SRL, DL, VT, Hi, Shift);
1049  SDValue LoSmall = DAG.getNode(ISD::SRL, DL, VT, Lo, Shift);
1050  LoSmall = DAG.getNode(ISD::OR, DL, VT, LoSmall, Overflow);
1051
1052  SDValue LoBig = DAG.getNode(SRA ? ISD::SRA : ISD::SRL, DL, VT, Hi, BigShift);
1053  SDValue HiBig = SRA ? DAG.getNode(ISD::SRA, DL, VT, Hi, Width1) : Zero;
1054
1055  Hi = DAG.getSelectCC(DL, Shift, Width, HiSmall, HiBig, ISD::SETULT);
1056  Lo = DAG.getSelectCC(DL, Shift, Width, LoSmall, LoBig, ISD::SETULT);
1057
1058  return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT,VT), Lo, Hi);
1059}
1060
1061SDValue R600TargetLowering::LowerUADDSUBO(SDValue Op, SelectionDAG &DAG,
1062                                          unsigned mainop, unsigned ovf) const {
1063  SDLoc DL(Op);
1064  EVT VT = Op.getValueType();
1065
1066  SDValue Lo = Op.getOperand(0);
1067  SDValue Hi = Op.getOperand(1);
1068
1069  SDValue OVF = DAG.getNode(ovf, DL, VT, Lo, Hi);
1070  // Extend sign.
1071  OVF = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, OVF,
1072                    DAG.getValueType(MVT::i1));
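  // The SIGN_EXTEND_INREG from i1 replicates bit 0 of the CARRY/BORROW result
  // across the register, turning a low-bit flag into an all-zeros or all-ones
  // value.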
1073
1074  SDValue Res = DAG.getNode(mainop, DL, VT, Lo, Hi);
1075
1076  return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT, VT), Res, OVF);
1077}
1078
1079SDValue R600TargetLowering::LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const {
1080  SDLoc DL(Op);
1081  return DAG.getNode(
1082      ISD::SETCC,
1083      DL,
1084      MVT::i1,
1085      Op, DAG.getConstantFP(0.0f, DL, MVT::f32),
1086      DAG.getCondCode(ISD::SETNE)
1087      );
1088}
1089
1090SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT,
1091                                                   SDLoc DL,
1092                                                   unsigned DwordOffset) const {
1093  unsigned ByteOffset = DwordOffset * 4;
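  // For example, the r600_read_ngroups_x dispatch above passes DwordOffset 0
  // (byte 0 of CONSTANT_BUFFER_0), while r600_read_local_size_z passes
  // DwordOffset 8 (byte 32).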
1094  PointerType * PtrType = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
1095                                      AMDGPUAS::CONSTANT_BUFFER_0);
1096
1097  // We shouldn't be using an offset wider than 16 bits for implicit parameters.
1098  assert(isInt<16>(ByteOffset));
1099
1100  return DAG.getLoad(VT, DL, DAG.getEntryNode(),
1101                     DAG.getConstant(ByteOffset, DL, MVT::i32), // PTR
1102                     MachinePointerInfo(ConstantPointerNull::get(PtrType)),
1103                     false, false, false, 0);
1104}
1105
1106bool R600TargetLowering::isZero(SDValue Op) const {
1107  if(ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
1108    return Cst->isNullValue();
1109  } else if(ConstantFPSDNode *CstFP = dyn_cast<ConstantFPSDNode>(Op)){
1110    return CstFP->isZero();
1111  } else {
1112    return false;
1113  }
1114}
1115
1116SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
1117  SDLoc DL(Op);
1118  EVT VT = Op.getValueType();
1119
1120  SDValue LHS = Op.getOperand(0);
1121  SDValue RHS = Op.getOperand(1);
1122  SDValue True = Op.getOperand(2);
1123  SDValue False = Op.getOperand(3);
1124  SDValue CC = Op.getOperand(4);
1125  SDValue Temp;
1126
1127  if (VT == MVT::f32) {
1128    DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);
1129    SDValue MinMax = CombineFMinMaxLegacy(DL, VT, LHS, RHS, True, False, CC, DCI);
1130    if (MinMax)
1131      return MinMax;
1132  }
1133
1134  // LHS and RHS are guaranteed to be the same value type
1135  EVT CompareVT = LHS.getValueType();
1136
1137  // Check if we can lower this to a native operation.
1138
1139  // Try to lower to a SET* instruction:
1140  //
1141  // SET* can match the following patterns:
1142  //
1143  // select_cc f32, f32, -1,  0, cc_supported
1144  // select_cc f32, f32, 1.0f, 0.0f, cc_supported
1145  // select_cc i32, i32, -1,  0, cc_supported
1146  //
1147
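  // For example, "select_cc i32 %x, %y, 0, -1, seteq" has the hardware
  // true/false values on the wrong operands; the code below rewrites it as
  // "select_cc i32 %x, %y, -1, 0, setne" via the inverse condition code so a
  // SET* instruction can still match it.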
1148  // Move hardware True/False values to the correct operand.
1149  ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
1150  ISD::CondCode InverseCC =
1151     ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
1152  if (isHWTrueValue(False) && isHWFalseValue(True)) {
1153    if (isCondCodeLegal(InverseCC, CompareVT.getSimpleVT())) {
1154      std::swap(False, True);
1155      CC = DAG.getCondCode(InverseCC);
1156    } else {
1157      ISD::CondCode SwapInvCC = ISD::getSetCCSwappedOperands(InverseCC);
1158      if (isCondCodeLegal(SwapInvCC, CompareVT.getSimpleVT())) {
1159        std::swap(False, True);
1160        std::swap(LHS, RHS);
1161        CC = DAG.getCondCode(SwapInvCC);
1162      }
1163    }
1164  }
1165
1166  if (isHWTrueValue(True) && isHWFalseValue(False) &&
1167      (CompareVT == VT || VT == MVT::i32)) {
1168    // This can be matched by a SET* instruction.
1169    return DAG.getNode(ISD::SELECT_CC, DL, VT, LHS, RHS, True, False, CC);
1170  }
1171
1172  // Try to lower to a CND* instruction:
1173  //
1174  // CND* can match the following patterns:
1175  //
1176  // select_cc f32, 0.0, f32, f32, cc_supported
1177  // select_cc f32, 0.0, i32, i32, cc_supported
1178  // select_cc i32, 0,   f32, f32, cc_supported
1179  // select_cc i32, 0,   i32, i32, cc_supported
1180  //
1181
1182  // Try to move the zero value to the RHS
1183  if (isZero(LHS)) {
1184    ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
1185    // Try swapping the operands
1186    ISD::CondCode CCSwapped = ISD::getSetCCSwappedOperands(CCOpcode);
1187    if (isCondCodeLegal(CCSwapped, CompareVT.getSimpleVT())) {
1188      std::swap(LHS, RHS);
1189      CC = DAG.getCondCode(CCSwapped);
1190    } else {
1191      // Try inverting the condition and then swapping the operands.
1192      ISD::CondCode CCInv = ISD::getSetCCInverse(CCOpcode, CompareVT.isInteger());
1193      CCSwapped = ISD::getSetCCSwappedOperands(CCInv);
1194      if (isCondCodeLegal(CCSwapped, CompareVT.getSimpleVT())) {
1195        std::swap(True, False);
1196        std::swap(LHS, RHS);
1197        CC = DAG.getCondCode(CCSwapped);
1198      }
1199    }
1200  }
1201  if (isZero(RHS)) {
1202    SDValue Cond = LHS;
1203    SDValue Zero = RHS;
1204    ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
1205    if (CompareVT != VT) {
1206      // Bitcast True / False to the correct types.  This will end up being
1207      // a nop, but it allows us to define only a single pattern in the
1208      // .TD files for each CND* instruction rather than having to have
1209      // one pattern for integer True/False and one for fp True/False
1210      True = DAG.getNode(ISD::BITCAST, DL, CompareVT, True);
1211      False = DAG.getNode(ISD::BITCAST, DL, CompareVT, False);
1212    }
1213
1214    switch (CCOpcode) {
1215    case ISD::SETONE:
1216    case ISD::SETUNE:
1217    case ISD::SETNE:
1218      CCOpcode = ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
1219      Temp = True;
1220      True = False;
1221      False = Temp;
1222      break;
1223    default:
1224      break;
1225    }
1226    SDValue SelectNode = DAG.getNode(ISD::SELECT_CC, DL, CompareVT,
1227        Cond, Zero,
1228        True, False,
1229        DAG.getCondCode(CCOpcode));
1230    return DAG.getNode(ISD::BITCAST, DL, VT, SelectNode);
1231  }
1232
1233  // If we make it this far, it means we have no native instructions to handle
1234  // this SELECT_CC, so we must lower it.
1235  SDValue HWTrue, HWFalse;
1236
1237  if (CompareVT == MVT::f32) {
1238    HWTrue = DAG.getConstantFP(1.0f, DL, CompareVT);
1239    HWFalse = DAG.getConstantFP(0.0f, DL, CompareVT);
1240  } else if (CompareVT == MVT::i32) {
1241    HWTrue = DAG.getConstant(-1, DL, CompareVT);
1242    HWFalse = DAG.getConstant(0, DL, CompareVT);
1243  }
1244  else {
1245    llvm_unreachable("Unhandled value type in LowerSELECT_CC");
1246  }
1247
1248  // Lower this unsupported SELECT_CC into a combination of two supported
1249  // SELECT_CC operations.
1250  SDValue Cond = DAG.getNode(ISD::SELECT_CC, DL, CompareVT, LHS, RHS, HWTrue, HWFalse, CC);
1251
1252  return DAG.getNode(ISD::SELECT_CC, DL, VT,
1253      Cond, HWFalse,
1254      True, False,
1255      DAG.getCondCode(ISD::SETNE));
1256}
1257
1258/// LLVM generates byte-addressed pointers.  For indirect addressing, we need to
1259/// convert these pointers to a register index.  Each register holds
1260/// 16 bytes (4 x 32-bit sub-registers), but we need to take into account the
1261/// \p StackWidth, which tells us how many of the 4 sub-registers will be used
1262/// for indirect addressing.
1263SDValue R600TargetLowering::stackPtrToRegIndex(SDValue Ptr,
1264                                               unsigned StackWidth,
1265                                               SelectionDAG &DAG) const {
1266  unsigned SRLPad;
1267  switch(StackWidth) {
1268  case 1:
1269    SRLPad = 2;
1270    break;
1271  case 2:
1272    SRLPad = 3;
1273    break;
1274  case 4:
1275    SRLPad = 4;
1276    break;
1277  default: llvm_unreachable("Invalid stack width");
1278  }
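  // For example, with StackWidth 4 each index spans a full 16-byte register,
  // so byte offset 32 becomes register index 2; with StackWidth 1 only 4
  // bytes are used per index, so byte offset 32 becomes index 8.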
1279
1280  SDLoc DL(Ptr);
1281  return DAG.getNode(ISD::SRL, DL, Ptr.getValueType(), Ptr,
1282                     DAG.getConstant(SRLPad, DL, MVT::i32));
1283}
1284
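// Maps element \p ElemIdx of a stack value onto a register channel. \p PtrIncr
// is a delta the caller adds to the register index as it walks the elements,
// so e.g. with StackWidth 2, elements 0 and 1 land in channels 0 and 1 of one
// register and elements 2 and 3 in channels 0 and 1 of the next one.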
1285void R600TargetLowering::getStackAddress(unsigned StackWidth,
1286                                         unsigned ElemIdx,
1287                                         unsigned &Channel,
1288                                         unsigned &PtrIncr) const {
1289  switch (StackWidth) {
1290  default:
1291  case 1:
1292    Channel = 0;
1293    if (ElemIdx > 0) {
1294      PtrIncr = 1;
1295    } else {
1296      PtrIncr = 0;
1297    }
1298    break;
1299  case 2:
1300    Channel = ElemIdx % 2;
1301    if (ElemIdx == 2) {
1302      PtrIncr = 1;
1303    } else {
1304      PtrIncr = 0;
1305    }
1306    break;
1307  case 4:
1308    Channel = ElemIdx;
1309    PtrIncr = 0;
1310    break;
1311  }
1312}
1313
1314SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
1315  SDLoc DL(Op);
1316  StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
1317  SDValue Chain = Op.getOperand(0);
1318  SDValue Value = Op.getOperand(1);
1319  SDValue Ptr = Op.getOperand(2);
1320
1321  SDValue Result = AMDGPUTargetLowering::LowerSTORE(Op, DAG);
1322  if (Result.getNode()) {
1323    return Result;
1324  }
1325
1326  if (StoreNode->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS) {
1327    if (StoreNode->isTruncatingStore()) {
1328      EVT VT = Value.getValueType();
1329      assert(VT.bitsLE(MVT::i32));
1330      EVT MemVT = StoreNode->getMemoryVT();
1331      SDValue MaskConstant;
1332      if (MemVT == MVT::i8) {
1333        MaskConstant = DAG.getConstant(0xFF, DL, MVT::i32);
1334      } else {
1335        assert(MemVT == MVT::i16);
1336        MaskConstant = DAG.getConstant(0xFFFF, DL, MVT::i32);
1337      }
1338      SDValue DWordAddr = DAG.getNode(ISD::SRL, DL, VT, Ptr,
1339                                      DAG.getConstant(2, DL, MVT::i32));
1340      SDValue ByteIndex = DAG.getNode(ISD::AND, DL, Ptr.getValueType(), Ptr,
1341                                      DAG.getConstant(0x00000003, DL, VT));
1342      SDValue TruncValue = DAG.getNode(ISD::AND, DL, VT, Value, MaskConstant);
1343      SDValue Shift = DAG.getNode(ISD::SHL, DL, VT, ByteIndex,
1344                                   DAG.getConstant(3, DL, VT));
1345      SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, VT, TruncValue, Shift);
1346      SDValue Mask = DAG.getNode(ISD::SHL, DL, VT, MaskConstant, Shift);
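      // For example, a truncating i8 store of 0xAB at byte address 6 yields
      // DWordAddr = 1, ByteIndex = 2, Shift = 16, ShiftedValue = 0x00AB0000
      // and Mask = 0x00FF0000, which are packed into the STORE_MSKOR operands
      // below.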
1347      // XXX: If we add a 64-bit ZW register class, then we could use a 2 x i32
1348      // vector instead.
1349      SDValue Src[4] = {
1350        ShiftedValue,
1351        DAG.getConstant(0, DL, MVT::i32),
1352        DAG.getConstant(0, DL, MVT::i32),
1353        Mask
1354      };
1355      SDValue Input = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i32, Src);
1356      SDValue Args[3] = { Chain, Input, DWordAddr };
1357      return DAG.getMemIntrinsicNode(AMDGPUISD::STORE_MSKOR, DL,
1358                                     Op->getVTList(), Args, MemVT,
1359                                     StoreNode->getMemOperand());
1360    } else if (Ptr->getOpcode() != AMDGPUISD::DWORDADDR &&
1361               Value.getValueType().bitsGE(MVT::i32)) {
1362      // Convert pointer from byte address to dword address.
1363      Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, Ptr.getValueType(),
1364                        DAG.getNode(ISD::SRL, DL, Ptr.getValueType(),
1365                                    Ptr, DAG.getConstant(2, DL, MVT::i32)));
1366
1367      if (StoreNode->isTruncatingStore() || StoreNode->isIndexed()) {
1368        llvm_unreachable("Truncated and indexed stores not supported yet");
1369      } else {
1370        Chain = DAG.getStore(Chain, DL, Value, Ptr, StoreNode->getMemOperand());
1371      }
1372      return Chain;
1373    }
1374  }
1375
1376  EVT ValueVT = Value.getValueType();
1377
1378  if (StoreNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
1379    return SDValue();
1380  }
1381
1382  SDValue Ret = AMDGPUTargetLowering::LowerSTORE(Op, DAG);
1383  if (Ret.getNode()) {
1384    return Ret;
1385  }
1386  // Lowering for indirect addressing
1387
1388  const MachineFunction &MF = DAG.getMachineFunction();
1389  const AMDGPUFrameLowering *TFL =
1390      static_cast<const AMDGPUFrameLowering *>(Subtarget->getFrameLowering());
1391  unsigned StackWidth = TFL->getStackWidth(MF);
1392
1393  Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);
1394
1395  if (ValueVT.isVector()) {
1396    unsigned NumElemVT = ValueVT.getVectorNumElements();
1397    EVT ElemVT = ValueVT.getVectorElementType();
1398    SmallVector<SDValue, 4> Stores(NumElemVT);
1399
1400    assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
1401                                      "vector width in load");
1402
1403    for (unsigned i = 0; i < NumElemVT; ++i) {
1404      unsigned Channel, PtrIncr;
1405      getStackAddress(StackWidth, i, Channel, PtrIncr);
1406      Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
1407                        DAG.getConstant(PtrIncr, DL, MVT::i32));
1408      SDValue Elem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT,
1409                                 Value, DAG.getConstant(i, DL, MVT::i32));
1410
1411      Stores[i] = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other,
1412                              Chain, Elem, Ptr,
1413                              DAG.getTargetConstant(Channel, DL, MVT::i32));
1414    }
1415    Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
1416  } else {
1417    if (ValueVT == MVT::i8) {
1418      Value = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Value);
1419    }
1420    Chain = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other, Chain, Value, Ptr,
1421                        DAG.getTargetConstant(0, DL, MVT::i32)); // Channel
1422  }
1423
1424  return Chain;
1425}
1426
1427// Returns 512 + (kc_bank << 12) for the given constant-buffer address space.
1428static int
1429ConstantAddressBlock(unsigned AddressSpace) {
1430  switch (AddressSpace) {
1431  case AMDGPUAS::CONSTANT_BUFFER_0:
1432    return 512;
1433  case AMDGPUAS::CONSTANT_BUFFER_1:
1434    return 512 + 4096;
1435  case AMDGPUAS::CONSTANT_BUFFER_2:
1436    return 512 + 4096 * 2;
1437  case AMDGPUAS::CONSTANT_BUFFER_3:
1438    return 512 + 4096 * 3;
1439  case AMDGPUAS::CONSTANT_BUFFER_4:
1440    return 512 + 4096 * 4;
1441  case AMDGPUAS::CONSTANT_BUFFER_5:
1442    return 512 + 4096 * 5;
1443  case AMDGPUAS::CONSTANT_BUFFER_6:
1444    return 512 + 4096 * 6;
1445  case AMDGPUAS::CONSTANT_BUFFER_7:
1446    return 512 + 4096 * 7;
1447  case AMDGPUAS::CONSTANT_BUFFER_8:
1448    return 512 + 4096 * 8;
1449  case AMDGPUAS::CONSTANT_BUFFER_9:
1450    return 512 + 4096 * 9;
1451  case AMDGPUAS::CONSTANT_BUFFER_10:
1452    return 512 + 4096 * 10;
1453  case AMDGPUAS::CONSTANT_BUFFER_11:
1454    return 512 + 4096 * 11;
1455  case AMDGPUAS::CONSTANT_BUFFER_12:
1456    return 512 + 4096 * 12;
1457  case AMDGPUAS::CONSTANT_BUFFER_13:
1458    return 512 + 4096 * 13;
1459  case AMDGPUAS::CONSTANT_BUFFER_14:
1460    return 512 + 4096 * 14;
1461  case AMDGPUAS::CONSTANT_BUFFER_15:
1462    return 512 + 4096 * 15;
1463  default:
1464    return -1;
1465  }
1466}
1467
1468SDValue R600TargetLowering::LowerLOAD(SDValue Op,
1469                                      SelectionDAG &DAG) const {
1470  EVT VT = Op.getValueType();
1471  SDLoc DL(Op);
1472  LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
1473  SDValue Chain = Op.getOperand(0);
1474  SDValue Ptr = Op.getOperand(1);
1475  SDValue LoweredLoad;
1476
1477  if (SDValue Ret = AMDGPUTargetLowering::LowerLOAD(Op, DAG))
1478    return Ret;
1479
1480  // Lower loads from constant address space global variables.
1481  if (LoadNode->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS &&
1482      isa<GlobalVariable>(GetUnderlyingObject(
1483          LoadNode->getMemOperand()->getValue(), DAG.getDataLayout()))) {
1484
1485    SDValue Ptr = DAG.getZExtOrTrunc(
1486        LoadNode->getBasePtr(), DL,
1487        getPointerTy(DAG.getDataLayout(), AMDGPUAS::PRIVATE_ADDRESS));
1488    Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr,
1489        DAG.getConstant(2, DL, MVT::i32));
1490    return DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op->getVTList(),
1491                       LoadNode->getChain(), Ptr,
1492                       DAG.getTargetConstant(0, DL, MVT::i32),
1493                       Op.getOperand(2));
1494  }
1495
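  // Vector loads from local memory are scalarized here; the scalarized value
  // is merged with the original chain so both results stay available.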
1496  if (LoadNode->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS && VT.isVector()) {
1497    SDValue MergedValues[2] = {
1498      ScalarizeVectorLoad(Op, DAG),
1499      Chain
1500    };
1501    return DAG.getMergeValues(MergedValues, DL);
1502  }
1503
1504  int ConstantBlock = ConstantAddressBlock(LoadNode->getAddressSpace());
1505  if (ConstantBlock > -1 &&
1506      ((LoadNode->getExtensionType() == ISD::NON_EXTLOAD) ||
1507       (LoadNode->getExtensionType() == ISD::ZEXTLOAD))) {
1508    SDValue Result;
1509    if (isa<ConstantExpr>(LoadNode->getMemOperand()->getValue()) ||
1510        isa<Constant>(LoadNode->getMemOperand()->getValue()) ||
1511        isa<ConstantSDNode>(Ptr)) {
1512      SDValue Slots[4];
1513      for (unsigned i = 0; i < 4; i++) {
1514        // We want the const position encoded with the following formula:
1515        // (((512 + (kc_bank << 12) + const_index) << 2) + chan)
1516        // const_index is the Ptr computed by LLVM using an alignment of 16.
1517        // Thus we add ((512 + (kc_bank << 12)) + chan) * 4 here and
1518        // then divide by 4 at the ISel step.
1519        SDValue NewPtr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
1520            DAG.getConstant(4 * i + ConstantBlock * 16, DL, MVT::i32));
1521        Slots[i] = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::i32, NewPtr);
1522      }
1523      EVT NewVT = MVT::v4i32;
1524      unsigned NumElements = 4;
1525      if (VT.isVector()) {
1526        NewVT = VT;
1527        NumElements = VT.getVectorNumElements();
1528      }
1529      Result = DAG.getNode(ISD::BUILD_VECTOR, DL, NewVT,
1530                           makeArrayRef(Slots, NumElements));
1531    } else {
1532      // A non-constant ptr can't be folded; keep it as a v4i32 load.
1533      Result = DAG.getNode(
1534          AMDGPUISD::CONST_ADDRESS, DL, MVT::v4i32,
1535          DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr,
1536                      DAG.getConstant(4, DL, MVT::i32)),
1537          DAG.getConstant(LoadNode->getAddressSpace() -
1538                          AMDGPUAS::CONSTANT_BUFFER_0, DL, MVT::i32));
1539    }
1540
1541    if (!VT.isVector()) {
1542      Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Result,
1543                           DAG.getConstant(0, DL, MVT::i32));
1544    }
1545
1546    SDValue MergedValues[2] = {
1547      Result,
1548      Chain
1549    };
1550    return DAG.getMergeValues(MergedValues, DL);
1551  }
1552
1553  // For most operations returning SDValue() will result in the node being
1554  // expanded by the DAG Legalizer. This is not the case for ISD::LOAD, so we
1555  // need to manually expand loads that may be legal in some address spaces and
1556  // illegal in others. SEXT loads from CONSTANT_BUFFER_0 are supported for
1557  // compute shaders, since the data is sign extended when it is uploaded to the
1558  // buffer. However SEXT loads from other address spaces are not supported, so
1559  // we need to expand them here.
1560  if (LoadNode->getExtensionType() == ISD::SEXTLOAD) {
1561    EVT MemVT = LoadNode->getMemoryVT();
1562    assert(!MemVT.isVector() && (MemVT == MVT::i16 || MemVT == MVT::i8));
1563    SDValue NewLoad = DAG.getExtLoad(ISD::EXTLOAD, DL, VT, Chain, Ptr,
1564                                  LoadNode->getPointerInfo(), MemVT,
1565                                  LoadNode->isVolatile(),
1566                                  LoadNode->isNonTemporal(),
1567                                  LoadNode->isInvariant(),
1568                                  LoadNode->getAlignment());
1569    SDValue Res = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, NewLoad,
1570                              DAG.getValueType(MemVT));
1571
1572    SDValue MergedValues[2] = { Res, Chain };
1573    return DAG.getMergeValues(MergedValues, DL);
1574  }
1575
1576  if (LoadNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
1577    return SDValue();
1578  }
1579
1580  // Lowering for indirect addressing
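  // Mirror of the private-address store path: the value is reassembled from
  // per-channel AMDGPUISD::REGISTER_LOAD nodes addressed through the register
  // index computed by stackPtrToRegIndex.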
1581  const MachineFunction &MF = DAG.getMachineFunction();
1582  const AMDGPUFrameLowering *TFL =
1583      static_cast<const AMDGPUFrameLowering *>(Subtarget->getFrameLowering());
1584  unsigned StackWidth = TFL->getStackWidth(MF);
1585
1586  Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);
1587
1588  if (VT.isVector()) {
1589    unsigned NumElemVT = VT.getVectorNumElements();
1590    EVT ElemVT = VT.getVectorElementType();
1591    SDValue Loads[4];
1592
1593    assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
1594                                      "vector width in load");
1595
1596    for (unsigned i = 0; i < NumElemVT; ++i) {
1597      unsigned Channel, PtrIncr;
1598      getStackAddress(StackWidth, i, Channel, PtrIncr);
1599      Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
1600                        DAG.getConstant(PtrIncr, DL, MVT::i32));
1601      Loads[i] = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, ElemVT,
1602                             Chain, Ptr,
1603                             DAG.getTargetConstant(Channel, DL, MVT::i32),
1604                             Op.getOperand(2));
1605    }
1606    for (unsigned i = NumElemVT; i < 4; ++i) {
1607      Loads[i] = DAG.getUNDEF(ElemVT);
1608    }
1609    EVT TargetVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, 4);
1610    LoweredLoad = DAG.getNode(ISD::BUILD_VECTOR, DL, TargetVT, Loads);
1611  } else {
1612    LoweredLoad = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, VT,
1613                              Chain, Ptr,
1614                              DAG.getTargetConstant(0, DL, MVT::i32), // Channel
1615                              Op.getOperand(2));
1616  }
1617
1618  SDValue Ops[2] = {
1619    LoweredLoad,
1620    Chain
1621  };
1622
1623  return DAG.getMergeValues(Ops, DL);
1624}
1625
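// BRCOND is lowered to the target BRANCH_COND node; note that the jump target
// is placed before the condition in the operand list.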
1626SDValue R600TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
1627  SDValue Chain = Op.getOperand(0);
1628  SDValue Cond  = Op.getOperand(1);
1629  SDValue Jump  = Op.getOperand(2);
1630
1631  return DAG.getNode(AMDGPUISD::BRANCH_COND, SDLoc(Op), Op.getValueType(),
1632                     Chain, Jump, Cond);
1633}
1634
1635/// XXX Only kernel functions are supported, so we can assume for now that
1636/// every function is a kernel function, but in the future we should use
1637/// separate calling conventions for kernel and non-kernel functions.
1638SDValue R600TargetLowering::LowerFormalArguments(
1639                                      SDValue Chain,
1640                                      CallingConv::ID CallConv,
1641                                      bool isVarArg,
1642                                      const SmallVectorImpl<ISD::InputArg> &Ins,
1643                                      SDLoc DL, SelectionDAG &DAG,
1644                                      SmallVectorImpl<SDValue> &InVals) const {
1645  SmallVector<CCValAssign, 16> ArgLocs;
1646  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
1647                 *DAG.getContext());
1648  MachineFunction &MF = DAG.getMachineFunction();
1649  R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
1650
1651  SmallVector<ISD::InputArg, 8> LocalIns;
1652
1653  getOriginalFunctionArgs(DAG, MF.getFunction(), Ins, LocalIns);
1654
1655  AnalyzeFormalArguments(CCInfo, LocalIns);
1656
1657  for (unsigned i = 0, e = Ins.size(); i < e; ++i) {
1658    CCValAssign &VA = ArgLocs[i];
1659    const ISD::InputArg &In = Ins[i];
1660    EVT VT = In.VT;
1661    EVT MemVT = VA.getLocVT();
1662    if (!VT.isVector() && MemVT.isVector()) {
1663      // Get load source type if scalarized.
1664      MemVT = MemVT.getVectorElementType();
1665    }
1666
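    // Non-compute (graphics) shader inputs arrive pre-loaded in vector
    // registers, so just record a copy from the assigned live-in register.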
1667    if (MFI->getShaderType() != ShaderType::COMPUTE) {
1668      unsigned Reg = MF.addLiveIn(VA.getLocReg(), &AMDGPU::R600_Reg128RegClass);
1669      SDValue Register = DAG.getCopyFromReg(Chain, DL, Reg, VT);
1670      InVals.push_back(Register);
1671      continue;
1672    }
1673
1674    PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
1675                                          AMDGPUAS::CONSTANT_BUFFER_0);
1676
1677    // i64 isn't a legal type, so the register type used ends up as i32, which
1678    // isn't expected here. It attempts to create this sextload, but it ends up
1679    // being invalid. Somehow this seems to work with i64 arguments, but breaks
1680    // for <1 x i64>.
1681
1682    // The first 36 bytes of the input buffer contain information about
1683    // thread group and global sizes.
1684    ISD::LoadExtType Ext = ISD::NON_EXTLOAD;
1685    if (MemVT.getScalarSizeInBits() != VT.getScalarSizeInBits()) {
1686      // FIXME: This should really check the extload type, but the handling of
1687      // extload vector parameters seems to be broken.
1688
1689      // Ext = In.Flags.isSExt() ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
1690      Ext = ISD::SEXTLOAD;
1691    }
1692
1693    // Compute the offset from the value.
1694    // XXX - I think PartOffset should give you this, but it seems to give the
1695    // size of the register, which isn't useful.
1696
1697    unsigned ValBase = ArgLocs[In.getOrigArgIndex()].getLocMemOffset();
1698    unsigned PartOffset = VA.getLocMemOffset();
1699    unsigned Offset = 36 + VA.getLocMemOffset();
1700
1701    MachinePointerInfo PtrInfo(UndefValue::get(PtrTy), PartOffset - ValBase);
1702    SDValue Arg = DAG.getLoad(ISD::UNINDEXED, Ext, VT, DL, Chain,
1703                              DAG.getConstant(Offset, DL, MVT::i32),
1704                              DAG.getUNDEF(MVT::i32),
1705                              PtrInfo,
1706                              MemVT, false, true, true, 4);
1707
1708    // 4 is the preferred alignment for the CONSTANT memory space.
1709    InVals.push_back(Arg);
1710    MFI->ABIArgOffset = Offset + MemVT.getStoreSize();
1711  }
1712  return Chain;
1713}
1714
1715EVT R600TargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &,
1716                                           EVT VT) const {
1717  if (!VT.isVector())
1718    return MVT::i32;
1719  return VT.changeVectorElementTypeToInteger();
1720}
1721
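// Replaces BUILD_VECTOR operands that are undef, 0.0, 1.0 or duplicates of an
// earlier lane with the matching special swizzle selects (SEL_MASK_WRITE,
// SEL_0, SEL_1, or the earlier lane index), recording the old -> new lane
// mapping in RemapSwizzle.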
1722static SDValue CompactSwizzlableVector(
1723  SelectionDAG &DAG, SDValue VectorEntry,
1724  DenseMap<unsigned, unsigned> &RemapSwizzle) {
1725  assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR);
1726  assert(RemapSwizzle.empty());
1727  SDValue NewBldVec[4] = {
1728    VectorEntry.getOperand(0),
1729    VectorEntry.getOperand(1),
1730    VectorEntry.getOperand(2),
1731    VectorEntry.getOperand(3)
1732  };
1733
1734  for (unsigned i = 0; i < 4; i++) {
1735    if (NewBldVec[i].getOpcode() == ISD::UNDEF)
1736      // We mask the write here to teach later passes that the ith element of this
1737      // vector is undef. Thus we can use it to reduce 128-bit register usage,
1738      // break false dependencies and additionally make assembly easier to read.
1739      RemapSwizzle[i] = 7; // SEL_MASK_WRITE
1740    if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(NewBldVec[i])) {
1741      if (C->isZero()) {
1742        RemapSwizzle[i] = 4; // SEL_0
1743        NewBldVec[i] = DAG.getUNDEF(MVT::f32);
1744      } else if (C->isExactlyValue(1.0)) {
1745        RemapSwizzle[i] = 5; // SEL_1
1746        NewBldVec[i] = DAG.getUNDEF(MVT::f32);
1747      }
1748    }
1749
1750    if (NewBldVec[i].getOpcode() == ISD::UNDEF)
1751      continue;
1752    for (unsigned j = 0; j < i; j++) {
1753      if (NewBldVec[i] == NewBldVec[j]) {
1754        NewBldVec[i] = DAG.getUNDEF(NewBldVec[i].getValueType());
1755        RemapSwizzle[i] = j;
1756        break;
1757      }
1758    }
1759  }
1760
1761  return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(VectorEntry),
1762                     VectorEntry.getValueType(), NewBldVec);
1763}
1764
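// Reorders BUILD_VECTOR operands coming from EXTRACT_VECTOR_ELT so that, where
// possible, each element ends up in the lane it was extracted from; the
// permutation applied is recorded in RemapSwizzle.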
1765static SDValue ReorganizeVector(SelectionDAG &DAG, SDValue VectorEntry,
1766                                DenseMap<unsigned, unsigned> &RemapSwizzle) {
1767  assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR);
1768  assert(RemapSwizzle.empty());
1769  SDValue NewBldVec[4] = {
1770      VectorEntry.getOperand(0),
1771      VectorEntry.getOperand(1),
1772      VectorEntry.getOperand(2),
1773      VectorEntry.getOperand(3)
1774  };
1775  bool isUnmovable[4] = { false, false, false, false };
1776  for (unsigned i = 0; i < 4; i++) {
1777    RemapSwizzle[i] = i;
1778    if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
1779      unsigned Idx =
1780          cast<ConstantSDNode>(NewBldVec[i].getOperand(1))->getZExtValue();
1781      if (i == Idx)
1782        isUnmovable[Idx] = true;
1783    }
1784  }
1785
1786  for (unsigned i = 0; i < 4; i++) {
1787    if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
1788      unsigned Idx =
1789          cast<ConstantSDNode>(NewBldVec[i].getOperand(1))->getZExtValue();
1790      if (isUnmovable[Idx])
1791        continue;
1792      // Swap i and Idx
1793      std::swap(NewBldVec[Idx], NewBldVec[i]);
1794      std::swap(RemapSwizzle[i], RemapSwizzle[Idx]);
1795      break;
1796    }
1797  }
1798
1799  return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(VectorEntry),
1800                     VectorEntry.getValueType(), NewBldVec);
1801}
1802
1803
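// Rewrites a BUILD_VECTOR together with its four swizzle operands: the vector
// is first compacted (CompactSwizzlableVector) and then reordered
// (ReorganizeVector), and the Swz[] selects are remapped to stay consistent.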
1804SDValue R600TargetLowering::OptimizeSwizzle(SDValue BuildVector,
1805                                            SDValue Swz[4], SelectionDAG &DAG,
1806                                            SDLoc DL) const {
1807  assert(BuildVector.getOpcode() == ISD::BUILD_VECTOR);
1808  // Old -> New swizzle values
1809  DenseMap<unsigned, unsigned> SwizzleRemap;
1810
1811  BuildVector = CompactSwizzlableVector(DAG, BuildVector, SwizzleRemap);
1812  for (unsigned i = 0; i < 4; i++) {
1813    unsigned Idx = cast<ConstantSDNode>(Swz[i])->getZExtValue();
1814    if (SwizzleRemap.find(Idx) != SwizzleRemap.end())
1815      Swz[i] = DAG.getConstant(SwizzleRemap[Idx], DL, MVT::i32);
1816  }
1817
1818  SwizzleRemap.clear();
1819  BuildVector = ReorganizeVector(DAG, BuildVector, SwizzleRemap);
1820  for (unsigned i = 0; i < 4; i++) {
1821    unsigned Idx = cast<ConstantSDNode>(Swz[i])->getZExtValue();
1822    if (SwizzleRemap.find(Idx) != SwizzleRemap.end())
1823      Swz[i] = DAG.getConstant(SwizzleRemap[Idx], DL, MVT::i32);
1824  }
1825
1826  return BuildVector;
1827}
1828
1829
1830//===----------------------------------------------------------------------===//
1831// Custom DAG Optimizations
1832//===----------------------------------------------------------------------===//
1833
1834SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
1835                                              DAGCombinerInfo &DCI) const {
1836  SelectionDAG &DAG = DCI.DAG;
1837
1838  switch (N->getOpcode()) {
1839  default: return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
1840  // (f32 fp_round (f64 uint_to_fp a)) -> (f32 uint_to_fp a)
1841  case ISD::FP_ROUND: {
1842      SDValue Arg = N->getOperand(0);
1843      if (Arg.getOpcode() == ISD::UINT_TO_FP && Arg.getValueType() == MVT::f64) {
1844        return DAG.getNode(ISD::UINT_TO_FP, SDLoc(N), N->getValueType(0),
1845                           Arg.getOperand(0));
1846      }
1847      break;
1848    }
1849
1850  // (i32 fp_to_sint (fneg (select_cc f32, f32, 1.0, 0.0 cc))) ->
1851  // (i32 select_cc f32, f32, -1, 0 cc)
1852  //
1853  // Mesa's GLSL frontend generates the above pattern a lot and we can lower
1854  // this to one of the SET*_DX10 instructions.
1855  case ISD::FP_TO_SINT: {
1856    SDValue FNeg = N->getOperand(0);
1857    if (FNeg.getOpcode() != ISD::FNEG) {
1858      return SDValue();
1859    }
1860    SDValue SelectCC = FNeg.getOperand(0);
1861    if (SelectCC.getOpcode() != ISD::SELECT_CC ||
1862        SelectCC.getOperand(0).getValueType() != MVT::f32 || // LHS
1863        SelectCC.getOperand(2).getValueType() != MVT::f32 || // True
1864        !isHWTrueValue(SelectCC.getOperand(2)) ||
1865        !isHWFalseValue(SelectCC.getOperand(3))) {
1866      return SDValue();
1867    }
1868
1869    SDLoc dl(N);
1870    return DAG.getNode(ISD::SELECT_CC, dl, N->getValueType(0),
1871                           SelectCC.getOperand(0), // LHS
1872                           SelectCC.getOperand(1), // RHS
1873                           DAG.getConstant(-1, dl, MVT::i32), // True
1874                           DAG.getConstant(0, dl, MVT::i32),  // False
1875                           SelectCC.getOperand(4)); // CC
1876
1877    break;
1878  }
1879
1880  // insert_vector_elt (build_vector elt0, ... , eltN), NewEltIdx, idx
1881  // => build_vector elt0, ... , NewEltIdx, ... , eltN
1882  case ISD::INSERT_VECTOR_ELT: {
1883    SDValue InVec = N->getOperand(0);
1884    SDValue InVal = N->getOperand(1);
1885    SDValue EltNo = N->getOperand(2);
1886    SDLoc dl(N);
1887
1888    // If the inserted element is an UNDEF, just use the input vector.
1889    if (InVal.getOpcode() == ISD::UNDEF)
1890      return InVec;
1891
1892    EVT VT = InVec.getValueType();
1893
1894    // If we can't generate a legal BUILD_VECTOR, exit
1895    if (!isOperationLegal(ISD::BUILD_VECTOR, VT))
1896      return SDValue();
1897
1898    // Check that we know which element is being inserted
1899    if (!isa<ConstantSDNode>(EltNo))
1900      return SDValue();
1901    unsigned Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
1902
1903    // Check that the operand is a BUILD_VECTOR (or UNDEF, which can essentially
1904    // be converted to a BUILD_VECTOR).  Fill in the Ops vector with the
1905    // vector elements.
1906    SmallVector<SDValue, 8> Ops;
1907    if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
1908      Ops.append(InVec.getNode()->op_begin(),
1909                 InVec.getNode()->op_end());
1910    } else if (InVec.getOpcode() == ISD::UNDEF) {
1911      unsigned NElts = VT.getVectorNumElements();
1912      Ops.append(NElts, DAG.getUNDEF(InVal.getValueType()));
1913    } else {
1914      return SDValue();
1915    }
1916
1917    // Insert the element
1918    if (Elt < Ops.size()) {
1919      // All the operands of BUILD_VECTOR must have the same type;
1920      // we enforce that here.
1921      EVT OpVT = Ops[0].getValueType();
1922      if (InVal.getValueType() != OpVT)
1923        InVal = OpVT.bitsGT(InVal.getValueType()) ?
1924          DAG.getNode(ISD::ANY_EXTEND, dl, OpVT, InVal) :
1925          DAG.getNode(ISD::TRUNCATE, dl, OpVT, InVal);
1926      Ops[Elt] = InVal;
1927    }
1928
1929    // Return the new vector
1930    return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
1931  }
1932
1933  // An extract_vector_elt of a build_vector generated by custom lowering
1934  // also needs to be custom combined.
1935  case ISD::EXTRACT_VECTOR_ELT: {
1936    SDValue Arg = N->getOperand(0);
1937    if (Arg.getOpcode() == ISD::BUILD_VECTOR) {
1938      if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
1939        unsigned Element = Const->getZExtValue();
1940        return Arg->getOperand(Element);
1941      }
1942    }
1943    if (Arg.getOpcode() == ISD::BITCAST &&
1944        Arg.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
1945      if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
1946        unsigned Element = Const->getZExtValue();
1947        return DAG.getNode(ISD::BITCAST, SDLoc(N), N->getVTList(),
1948            Arg->getOperand(0).getOperand(Element));
1949      }
1950    }
1951    break;
1952  }
1953
1954  case ISD::SELECT_CC: {
1955    // Try common optimizations
1956    SDValue Ret = AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
1957    if (Ret.getNode())
1958      return Ret;
1959
1960    // fold selectcc (selectcc x, y, a, b, cc), b, a, b, seteq ->
1961    //      selectcc x, y, a, b, inv(cc)
1962    //
1963    // fold selectcc (selectcc x, y, a, b, cc), b, a, b, setne ->
1964    //      selectcc x, y, a, b, cc
1965    SDValue LHS = N->getOperand(0);
1966    if (LHS.getOpcode() != ISD::SELECT_CC) {
1967      return SDValue();
1968    }
1969
1970    SDValue RHS = N->getOperand(1);
1971    SDValue True = N->getOperand(2);
1972    SDValue False = N->getOperand(3);
1973    ISD::CondCode NCC = cast<CondCodeSDNode>(N->getOperand(4))->get();
1974
1975    if (LHS.getOperand(2).getNode() != True.getNode() ||
1976        LHS.getOperand(3).getNode() != False.getNode() ||
1977        RHS.getNode() != False.getNode()) {
1978      return SDValue();
1979    }
1980
1981    switch (NCC) {
1982    default: return SDValue();
1983    case ISD::SETNE: return LHS;
1984    case ISD::SETEQ: {
1985      ISD::CondCode LHSCC = cast<CondCodeSDNode>(LHS.getOperand(4))->get();
1986      LHSCC = ISD::getSetCCInverse(LHSCC,
1987                                  LHS.getOperand(0).getValueType().isInteger());
1988      if (DCI.isBeforeLegalizeOps() ||
1989          isCondCodeLegal(LHSCC, LHS.getOperand(0).getSimpleValueType()))
1990        return DAG.getSelectCC(SDLoc(N),
1991                               LHS.getOperand(0),
1992                               LHS.getOperand(1),
1993                               LHS.getOperand(2),
1994                               LHS.getOperand(3),
1995                               LHSCC);
1996      break;
1997    }
1998    }
1999    return SDValue();
2000  }
2001
2002  case AMDGPUISD::EXPORT: {
2003    SDValue Arg = N->getOperand(1);
2004    if (Arg.getOpcode() != ISD::BUILD_VECTOR)
2005      break;
2006
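    // Rebuild the export: NewArgs[1] gets the (possibly compacted) BUILD_VECTOR
    // and the four SWZ_* operands are rewritten by OptimizeSwizzle to match.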
2007    SDValue NewArgs[8] = {
2008      N->getOperand(0), // Chain
2009      SDValue(),
2010      N->getOperand(2), // ArrayBase
2011      N->getOperand(3), // Type
2012      N->getOperand(4), // SWZ_X
2013      N->getOperand(5), // SWZ_Y
2014      N->getOperand(6), // SWZ_Z
2015      N->getOperand(7) // SWZ_W
2016    };
2017    SDLoc DL(N);
2018    NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[4], DAG, DL);
2019    return DAG.getNode(AMDGPUISD::EXPORT, DL, N->getVTList(), NewArgs);
2020  }
2021  case AMDGPUISD::TEXTURE_FETCH: {
2022    SDValue Arg = N->getOperand(1);
2023    if (Arg.getOpcode() != ISD::BUILD_VECTOR)
2024      break;
2025
2026    SDValue NewArgs[19] = {
2027      N->getOperand(0),
2028      N->getOperand(1),
2029      N->getOperand(2),
2030      N->getOperand(3),
2031      N->getOperand(4),
2032      N->getOperand(5),
2033      N->getOperand(6),
2034      N->getOperand(7),
2035      N->getOperand(8),
2036      N->getOperand(9),
2037      N->getOperand(10),
2038      N->getOperand(11),
2039      N->getOperand(12),
2040      N->getOperand(13),
2041      N->getOperand(14),
2042      N->getOperand(15),
2043      N->getOperand(16),
2044      N->getOperand(17),
2045      N->getOperand(18),
2046    };
2047    SDLoc DL(N);
2048    NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[2], DAG, DL);
2049    return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, N->getVTList(), NewArgs);
2050  }
2051  }
2052
2053  return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
2054}
2055
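// Tries to fold the node feeding source operand Src of ParentNode into the
// instruction itself: FNEG_R600/FABS_R600 become the neg/abs modifier bits,
// CONST_COPY becomes an ALU_CONST register plus sel index (subject to the
// constant-read limitations), and MOV_IMM_* becomes an inline constant
// register or the literal operand. Returns true if anything was rewritten.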
2056static bool
2057FoldOperand(SDNode *ParentNode, unsigned SrcIdx, SDValue &Src, SDValue &Neg,
2058            SDValue &Abs, SDValue &Sel, SDValue &Imm, SelectionDAG &DAG) {
2059  const R600InstrInfo *TII =
2060      static_cast<const R600InstrInfo *>(DAG.getSubtarget().getInstrInfo());
2061  if (!Src.isMachineOpcode())
2062    return false;
2063  switch (Src.getMachineOpcode()) {
2064  case AMDGPU::FNEG_R600:
2065    if (!Neg.getNode())
2066      return false;
2067    Src = Src.getOperand(0);
2068    Neg = DAG.getTargetConstant(1, SDLoc(ParentNode), MVT::i32);
2069    return true;
2070  case AMDGPU::FABS_R600:
2071    if (!Abs.getNode())
2072      return false;
2073    Src = Src.getOperand(0);
2074    Abs = DAG.getTargetConstant(1, SDLoc(ParentNode), MVT::i32);
2075    return true;
2076  case AMDGPU::CONST_COPY: {
2077    unsigned Opcode = ParentNode->getMachineOpcode();
2078    bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
2079
2080    if (!Sel.getNode())
2081      return false;
2082
2083    SDValue CstOffset = Src.getOperand(0);
2084    if (ParentNode->getValueType(0).isVector())
2085      return false;
2086
2087    // Gather constant values.
2088    int SrcIndices[] = {
2089      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0),
2090      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1),
2091      TII->getOperandIdx(Opcode, AMDGPU::OpName::src2),
2092      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X),
2093      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y),
2094      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z),
2095      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W),
2096      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X),
2097      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y),
2098      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z),
2099      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W)
2100    };
2101    std::vector<unsigned> Consts;
2102    for (int OtherSrcIdx : SrcIndices) {
2103      int OtherSelIdx = TII->getSelIdx(Opcode, OtherSrcIdx);
2104      if (OtherSrcIdx < 0 || OtherSelIdx < 0)
2105        continue;
2106      if (HasDst) {
2107        OtherSrcIdx--;
2108        OtherSelIdx--;
2109      }
2110      if (RegisterSDNode *Reg =
2111          dyn_cast<RegisterSDNode>(ParentNode->getOperand(OtherSrcIdx))) {
2112        if (Reg->getReg() == AMDGPU::ALU_CONST) {
2113          ConstantSDNode *Cst
2114            = cast<ConstantSDNode>(ParentNode->getOperand(OtherSelIdx));
2115          Consts.push_back(Cst->getZExtValue());
2116        }
2117      }
2118    }
2119
2120    ConstantSDNode *Cst = cast<ConstantSDNode>(CstOffset);
2121    Consts.push_back(Cst->getZExtValue());
2122    if (!TII->fitsConstReadLimitations(Consts)) {
2123      return false;
2124    }
2125
2126    Sel = CstOffset;
2127    Src = DAG.getRegister(AMDGPU::ALU_CONST, MVT::f32);
2128    return true;
2129  }
2130  case AMDGPU::MOV_IMM_I32:
2131  case AMDGPU::MOV_IMM_F32: {
2132    unsigned ImmReg = AMDGPU::ALU_LITERAL_X;
2133    uint64_t ImmValue = 0;
2134
2135
2136    if (Src.getMachineOpcode() == AMDGPU::MOV_IMM_F32) {
2137      ConstantFPSDNode *FPC = cast<ConstantFPSDNode>(Src.getOperand(0));
2138      float FloatValue = FPC->getValueAPF().convertToFloat();
2139      if (FloatValue == 0.0) {
2140        ImmReg = AMDGPU::ZERO;
2141      } else if (FloatValue == 0.5) {
2142        ImmReg = AMDGPU::HALF;
2143      } else if (FloatValue == 1.0) {
2144        ImmReg = AMDGPU::ONE;
2145      } else {
2146        ImmValue = FPC->getValueAPF().bitcastToAPInt().getZExtValue();
2147      }
2148    } else {
2149      ConstantSDNode *C = cast<ConstantSDNode>(Src.getOperand(0));
2150      uint64_t Value = C->getZExtValue();
2151      if (Value == 0) {
2152        ImmReg = AMDGPU::ZERO;
2153      } else if (Value == 1) {
2154        ImmReg = AMDGPU::ONE_INT;
2155      } else {
2156        ImmValue = Value;
2157      }
2158    }
2159
2160    // Check that we aren't already using an immediate.
2161    // XXX: It's possible for an instruction to have more than one
2162    // immediate operand, but this is not supported yet.
2163    if (ImmReg == AMDGPU::ALU_LITERAL_X) {
2164      if (!Imm.getNode())
2165        return false;
2166      ConstantSDNode *C = dyn_cast<ConstantSDNode>(Imm);
2167      assert(C);
2168      if (C->getZExtValue())
2169        return false;
2170      Imm = DAG.getTargetConstant(ImmValue, SDLoc(ParentNode), MVT::i32);
2171    }
2172    Src = DAG.getRegister(ImmReg, MVT::i32);
2173    return true;
2174  }
2175  default:
2176    return false;
2177  }
2178}
2179
2180
2181/// \brief Fold the instructions after selecting them
2182SDNode *R600TargetLowering::PostISelFolding(MachineSDNode *Node,
2183                                            SelectionDAG &DAG) const {
2184  const R600InstrInfo *TII =
2185      static_cast<const R600InstrInfo *>(DAG.getSubtarget().getInstrInfo());
2186  if (!Node->isMachineOpcode())
2187    return Node;
2188  unsigned Opcode = Node->getMachineOpcode();
2189  SDValue FakeOp;
2190
2191  std::vector<SDValue> Ops(Node->op_begin(), Node->op_end());
2192
2193  if (Opcode == AMDGPU::DOT_4) {
2194    int OperandIdx[] = {
2195      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X),
2196      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y),
2197      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z),
2198      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W),
2199      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X),
2200      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y),
2201      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z),
2202      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W)
2203    };
2204    int NegIdx[] = {
2205      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_X),
2206      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Y),
2207      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Z),
2208      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_W),
2209      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_X),
2210      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Y),
2211      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Z),
2212      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_W)
2213    };
2214    int AbsIdx[] = {
2215      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_X),
2216      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Y),
2217      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Z),
2218      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_W),
2219      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_X),
2220      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Y),
2221      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Z),
2222      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_W)
2223    };
2224    for (unsigned i = 0; i < 8; i++) {
2225      if (OperandIdx[i] < 0)
2226        return Node;
2227      SDValue &Src = Ops[OperandIdx[i] - 1];
2228      SDValue &Neg = Ops[NegIdx[i] - 1];
2229      SDValue &Abs = Ops[AbsIdx[i] - 1];
2230      bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
2231      int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]);
2232      if (HasDst)
2233        SelIdx--;
2234      SDValue &Sel = (SelIdx > -1) ? Ops[SelIdx] : FakeOp;
2235      if (FoldOperand(Node, i, Src, Neg, Abs, Sel, FakeOp, DAG))
2236        return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
2237    }
2238  } else if (Opcode == AMDGPU::REG_SEQUENCE) {
2239    for (unsigned i = 1, e = Node->getNumOperands(); i < e; i += 2) {
2240      SDValue &Src = Ops[i];
2241      if (FoldOperand(Node, i, Src, FakeOp, FakeOp, FakeOp, FakeOp, DAG))
2242        return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
2243    }
2244  } else if (Opcode == AMDGPU::CLAMP_R600) {
2245    SDValue Src = Node->getOperand(0);
2246    if (!Src.isMachineOpcode() ||
2247        !TII->hasInstrModifiers(Src.getMachineOpcode()))
2248      return Node;
2249    int ClampIdx = TII->getOperandIdx(Src.getMachineOpcode(),
2250        AMDGPU::OpName::clamp);
2251    if (ClampIdx < 0)
2252      return Node;
2253    SDLoc DL(Node);
2254    std::vector<SDValue> Ops(Src->op_begin(), Src->op_end());
2255    Ops[ClampIdx - 1] = DAG.getTargetConstant(1, DL, MVT::i32);
2256    return DAG.getMachineNode(Src.getMachineOpcode(), DL,
2257                              Node->getVTList(), Ops);
2258  } else {
2259    if (!TII->hasInstrModifiers(Opcode))
2260      return Node;
2261    int OperandIdx[] = {
2262      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0),
2263      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1),
2264      TII->getOperandIdx(Opcode, AMDGPU::OpName::src2)
2265    };
2266    int NegIdx[] = {
2267      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg),
2268      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg),
2269      TII->getOperandIdx(Opcode, AMDGPU::OpName::src2_neg)
2270    };
2271    int AbsIdx[] = {
2272      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs),
2273      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs),
2274      -1
2275    };
2276    for (unsigned i = 0; i < 3; i++) {
2277      if (OperandIdx[i] < 0)
2278        return Node;
2279      SDValue &Src = Ops[OperandIdx[i] - 1];
2280      SDValue &Neg = Ops[NegIdx[i] - 1];
2281      SDValue FakeAbs;
2282      SDValue &Abs = (AbsIdx[i] > -1) ? Ops[AbsIdx[i] - 1] : FakeAbs;
2283      bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
2284      int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]);
2285      int ImmIdx = TII->getOperandIdx(Opcode, AMDGPU::OpName::literal);
2286      if (HasDst) {
2287        SelIdx--;
2288        ImmIdx--;
2289      }
2290      SDValue &Sel = (SelIdx > -1) ? Ops[SelIdx] : FakeOp;
2291      SDValue &Imm = Ops[ImmIdx];
2292      if (FoldOperand(Node, i, Src, Neg, Abs, Sel, Imm, DAG))
2293        return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
2294    }
2295  }
2296
2297  return Node;
2298}
2299