R600ISelLowering.cpp revision 309124
1//===-- R600ISelLowering.cpp - R600 DAG Lowering Implementation -----------===//
2//
3//                     The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9//
10/// \file
11/// \brief Custom DAG lowering for R600
12//
13//===----------------------------------------------------------------------===//
14
15#include "R600ISelLowering.h"
16#include "AMDGPUFrameLowering.h"
17#include "AMDGPUIntrinsicInfo.h"
18#include "AMDGPUSubtarget.h"
19#include "R600Defines.h"
20#include "R600InstrInfo.h"
21#include "R600MachineFunctionInfo.h"
22#include "llvm/Analysis/ValueTracking.h"
23#include "llvm/CodeGen/CallingConvLower.h"
24#include "llvm/CodeGen/MachineFrameInfo.h"
25#include "llvm/CodeGen/MachineInstrBuilder.h"
26#include "llvm/CodeGen/MachineRegisterInfo.h"
27#include "llvm/CodeGen/SelectionDAG.h"
28#include "llvm/IR/Argument.h"
29#include "llvm/IR/Function.h"
30
31using namespace llvm;
32
33R600TargetLowering::R600TargetLowering(const TargetMachine &TM,
34                                       const R600Subtarget &STI)
35    : AMDGPUTargetLowering(TM, STI), Gen(STI.getGeneration()) {
36  addRegisterClass(MVT::f32, &AMDGPU::R600_Reg32RegClass);
37  addRegisterClass(MVT::i32, &AMDGPU::R600_Reg32RegClass);
38  addRegisterClass(MVT::v2f32, &AMDGPU::R600_Reg64RegClass);
39  addRegisterClass(MVT::v2i32, &AMDGPU::R600_Reg64RegClass);
40  addRegisterClass(MVT::v4f32, &AMDGPU::R600_Reg128RegClass);
41  addRegisterClass(MVT::v4i32, &AMDGPU::R600_Reg128RegClass);
42
43  computeRegisterProperties(STI.getRegisterInfo());
44
45  // Legalize loads and stores to the private address space.
46  setOperationAction(ISD::LOAD, MVT::i32, Custom);
47  setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
48  setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
49
50  // EXTLOAD should be the same as ZEXTLOAD. It is legal for some address
51  // spaces, so it is custom lowered to handle those where it isn't.
52  for (MVT VT : MVT::integer_valuetypes()) {
53    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
54    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Custom);
55    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i16, Custom);
56
57    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
58    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i8, Custom);
59    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i16, Custom);
60
61    setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote);
62    setLoadExtAction(ISD::EXTLOAD, VT, MVT::i8, Custom);
63    setLoadExtAction(ISD::EXTLOAD, VT, MVT::i16, Custom);
64  }
65
66  // Workaround for LegalizeDAG asserting on expansion of i1 vector loads.
67  setLoadExtAction(ISD::EXTLOAD, MVT::v2i32, MVT::v2i1, Expand);
68  setLoadExtAction(ISD::SEXTLOAD, MVT::v2i32, MVT::v2i1, Expand);
69  setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i32, MVT::v2i1, Expand);
70
71  setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i1, Expand);
72  setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i1, Expand);
73  setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i1, Expand);
74
75
76  setOperationAction(ISD::STORE, MVT::i8, Custom);
77  setOperationAction(ISD::STORE, MVT::i32, Custom);
78  setOperationAction(ISD::STORE, MVT::v2i32, Custom);
79  setOperationAction(ISD::STORE, MVT::v4i32, Custom);
80
81  setTruncStoreAction(MVT::i32, MVT::i8, Custom);
82  setTruncStoreAction(MVT::i32, MVT::i16, Custom);
83
84  // Workaround for LegalizeDAG asserting on expansion of i1 vector stores.
85  setTruncStoreAction(MVT::v2i32, MVT::v2i1, Expand);
86  setTruncStoreAction(MVT::v4i32, MVT::v4i1, Expand);
87
88  // Set condition code actions
89  setCondCodeAction(ISD::SETO,   MVT::f32, Expand);
90  setCondCodeAction(ISD::SETUO,  MVT::f32, Expand);
91  setCondCodeAction(ISD::SETLT,  MVT::f32, Expand);
92  setCondCodeAction(ISD::SETLE,  MVT::f32, Expand);
93  setCondCodeAction(ISD::SETOLT, MVT::f32, Expand);
94  setCondCodeAction(ISD::SETOLE, MVT::f32, Expand);
95  setCondCodeAction(ISD::SETONE, MVT::f32, Expand);
96  setCondCodeAction(ISD::SETUEQ, MVT::f32, Expand);
97  setCondCodeAction(ISD::SETUGE, MVT::f32, Expand);
98  setCondCodeAction(ISD::SETUGT, MVT::f32, Expand);
99  setCondCodeAction(ISD::SETULT, MVT::f32, Expand);
100  setCondCodeAction(ISD::SETULE, MVT::f32, Expand);
101
102  setCondCodeAction(ISD::SETLE, MVT::i32, Expand);
103  setCondCodeAction(ISD::SETLT, MVT::i32, Expand);
104  setCondCodeAction(ISD::SETULE, MVT::i32, Expand);
105  setCondCodeAction(ISD::SETULT, MVT::i32, Expand);
106
107  setOperationAction(ISD::FCOS, MVT::f32, Custom);
108  setOperationAction(ISD::FSIN, MVT::f32, Custom);
109
110  setOperationAction(ISD::SETCC, MVT::v4i32, Expand);
111  setOperationAction(ISD::SETCC, MVT::v2i32, Expand);
112
113  setOperationAction(ISD::BR_CC, MVT::i32, Expand);
114  setOperationAction(ISD::BR_CC, MVT::f32, Expand);
115  setOperationAction(ISD::BRCOND, MVT::Other, Custom);
116
117  setOperationAction(ISD::FSUB, MVT::f32, Expand);
118
119  setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
120  setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
121
122  setOperationAction(ISD::SETCC, MVT::i32, Expand);
123  setOperationAction(ISD::SETCC, MVT::f32, Expand);
124  setOperationAction(ISD::FP_TO_UINT, MVT::i1, Custom);
125  setOperationAction(ISD::FP_TO_SINT, MVT::i1, Custom);
126  setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
127  setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
128
129  setOperationAction(ISD::SELECT, MVT::i32, Expand);
130  setOperationAction(ISD::SELECT, MVT::f32, Expand);
131  setOperationAction(ISD::SELECT, MVT::v2i32, Expand);
132  setOperationAction(ISD::SELECT, MVT::v4i32, Expand);
133
134  // ADD, SUB overflow.
135  // TODO: turn these into Legal?
136  if (Subtarget->hasCARRY())
137    setOperationAction(ISD::UADDO, MVT::i32, Custom);
138
139  if (Subtarget->hasBORROW())
140    setOperationAction(ISD::USUBO, MVT::i32, Custom);
141
142  // Expand sign extension of vectors
143  if (!Subtarget->hasBFE())
144    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
145
146  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Expand);
147  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Expand);
148
149  if (!Subtarget->hasBFE())
150    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Expand);
151  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Expand);
152  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Expand);
153
154  if (!Subtarget->hasBFE())
155    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand);
156  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Expand);
157  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Expand);
158
159  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
160  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i32, Expand);
161  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i32, Expand);
162
163  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Expand);
164
165  setOperationAction(ISD::FrameIndex, MVT::i32, Custom);
166
167  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i32, Custom);
168  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f32, Custom);
169  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
170  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
171
172  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i32, Custom);
173  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f32, Custom);
174  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
175  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
176
177  // We don't have 64-bit shifts. Thus we need either SHX i64 or SHX_PARTS i32
178  //  to be Legal/Custom in order to avoid library calls.
179  setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
180  setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
181  setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);
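  // Illustration (not part of the original source): because no 64-bit register
  // class is registered above, an IR-level 'shl i64 %x, %n' is expanded by the
  // legalizer into an SHL_PARTS node on the two i32 halves, which the custom
  // LowerSHLParts hook below then turns into 32-bit shifts and selects.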
182
183  setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
184
185  const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
186  for (MVT VT : ScalarIntVTs) {
187    setOperationAction(ISD::ADDC, VT, Expand);
188    setOperationAction(ISD::SUBC, VT, Expand);
189    setOperationAction(ISD::ADDE, VT, Expand);
190    setOperationAction(ISD::SUBE, VT, Expand);
191  }
192
193  setSchedulingPreference(Sched::Source);
194
195
196  setTargetDAGCombine(ISD::FP_ROUND);
197  setTargetDAGCombine(ISD::FP_TO_SINT);
198  setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
199  setTargetDAGCombine(ISD::SELECT_CC);
200  setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
201}
202
203const R600Subtarget *R600TargetLowering::getSubtarget() const {
204  return static_cast<const R600Subtarget *>(Subtarget);
205}
206
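// Returns true when the instruction following \p I is a RETURN. This is used
// below to decide whether the End-of-Program (EOP) bit should be set on export
// and RAT store instructions.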
207static inline bool isEOP(MachineBasicBlock::iterator I) {
208  return std::next(I)->getOpcode() == AMDGPU::RETURN;
209}
210
211MachineBasicBlock *
212R600TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
213                                                MachineBasicBlock *BB) const {
214  MachineFunction * MF = BB->getParent();
215  MachineRegisterInfo &MRI = MF->getRegInfo();
216  MachineBasicBlock::iterator I = MI;
217  const R600InstrInfo *TII = getSubtarget()->getInstrInfo();
218
219  switch (MI.getOpcode()) {
220  default:
221    // Replace LDS_*_RET instructions that don't have any uses with the
222    // equivalent LDS_*_NORET instructions.
223    if (TII->isLDSRetInstr(MI.getOpcode())) {
224      int DstIdx = TII->getOperandIdx(MI.getOpcode(), AMDGPU::OpName::dst);
225      assert(DstIdx != -1);
226      MachineInstrBuilder NewMI;
227      // FIXME: getLDSNoRetOp method only handles LDS_1A1D LDS ops. Add
228      //        LDS_1A2D support and remove this special case.
229      if (!MRI.use_empty(MI.getOperand(DstIdx).getReg()) ||
230          MI.getOpcode() == AMDGPU::LDS_CMPST_RET)
231        return BB;
232
233      NewMI = BuildMI(*BB, I, BB->findDebugLoc(I),
234                      TII->get(AMDGPU::getLDSNoRetOp(MI.getOpcode())));
235      for (unsigned i = 1, e = MI.getNumOperands(); i < e; ++i) {
236        NewMI.addOperand(MI.getOperand(i));
237      }
238    } else {
239      return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
240    }
241    break;
242  case AMDGPU::CLAMP_R600: {
243    MachineInstr *NewMI = TII->buildDefaultInstruction(
244        *BB, I, AMDGPU::MOV, MI.getOperand(0).getReg(),
245        MI.getOperand(1).getReg());
246    TII->addFlag(*NewMI, 0, MO_FLAG_CLAMP);
247    break;
248  }
249
250  case AMDGPU::FABS_R600: {
251    MachineInstr *NewMI = TII->buildDefaultInstruction(
252        *BB, I, AMDGPU::MOV, MI.getOperand(0).getReg(),
253        MI.getOperand(1).getReg());
254    TII->addFlag(*NewMI, 0, MO_FLAG_ABS);
255    break;
256  }
257
258  case AMDGPU::FNEG_R600: {
259    MachineInstr *NewMI = TII->buildDefaultInstruction(
260        *BB, I, AMDGPU::MOV, MI.getOperand(0).getReg(),
261        MI.getOperand(1).getReg());
262    TII->addFlag(*NewMI, 0, MO_FLAG_NEG);
263    break;
264  }
265
266  case AMDGPU::MASK_WRITE: {
267    unsigned maskedRegister = MI.getOperand(0).getReg();
268    assert(TargetRegisterInfo::isVirtualRegister(maskedRegister));
269    MachineInstr * defInstr = MRI.getVRegDef(maskedRegister);
270    TII->addFlag(*defInstr, 0, MO_FLAG_MASK);
271    break;
272  }
273
274  case AMDGPU::MOV_IMM_F32:
275    TII->buildMovImm(*BB, I, MI.getOperand(0).getReg(), MI.getOperand(1)
276                                                            .getFPImm()
277                                                            ->getValueAPF()
278                                                            .bitcastToAPInt()
279                                                            .getZExtValue());
280    break;
281  case AMDGPU::MOV_IMM_I32:
282    TII->buildMovImm(*BB, I, MI.getOperand(0).getReg(),
283                     MI.getOperand(1).getImm());
284    break;
285  case AMDGPU::MOV_IMM_GLOBAL_ADDR: {
286    //TODO: Perhaps combine this instruction with the next if possible
287    auto MIB = TII->buildDefaultInstruction(
288        *BB, MI, AMDGPU::MOV, MI.getOperand(0).getReg(), AMDGPU::ALU_LITERAL_X);
289    int Idx = TII->getOperandIdx(*MIB, AMDGPU::OpName::literal);
290    //TODO: Ugh this is rather ugly
291    MIB->getOperand(Idx) = MI.getOperand(1);
292    break;
293  }
294  case AMDGPU::CONST_COPY: {
295    MachineInstr *NewMI = TII->buildDefaultInstruction(
296        *BB, MI, AMDGPU::MOV, MI.getOperand(0).getReg(), AMDGPU::ALU_CONST);
297    TII->setImmOperand(*NewMI, AMDGPU::OpName::src0_sel,
298                       MI.getOperand(1).getImm());
299    break;
300  }
301
302  case AMDGPU::RAT_WRITE_CACHELESS_32_eg:
303  case AMDGPU::RAT_WRITE_CACHELESS_64_eg:
304  case AMDGPU::RAT_WRITE_CACHELESS_128_eg: {
305    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI.getOpcode()))
306        .addOperand(MI.getOperand(0))
307        .addOperand(MI.getOperand(1))
308        .addImm(isEOP(I)); // Set End of program bit
309    break;
310  }
311  case AMDGPU::RAT_STORE_TYPED_eg: {
312    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI.getOpcode()))
313        .addOperand(MI.getOperand(0))
314        .addOperand(MI.getOperand(1))
315        .addOperand(MI.getOperand(2))
316        .addImm(isEOP(I)); // Set End of program bit
317    break;
318  }
319
320  case AMDGPU::TXD: {
321    unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
322    unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
323    MachineOperand &RID = MI.getOperand(4);
324    MachineOperand &SID = MI.getOperand(5);
325    unsigned TextureId = MI.getOperand(6).getImm();
326    unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3;
327    unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1;
328
329    switch (TextureId) {
330    case 5: // Rect
331      CTX = CTY = 0;
332      break;
333    case 6: // Shadow1D
334      SrcW = SrcZ;
335      break;
336    case 7: // Shadow2D
337      SrcW = SrcZ;
338      break;
339    case 8: // ShadowRect
340      CTX = CTY = 0;
341      SrcW = SrcZ;
342      break;
343    case 9: // 1DArray
344      SrcZ = SrcY;
345      CTZ = 0;
346      break;
347    case 10: // 2DArray
348      CTZ = 0;
349      break;
350    case 11: // Shadow1DArray
351      SrcZ = SrcY;
352      CTZ = 0;
353      break;
354    case 12: // Shadow2DArray
355      CTZ = 0;
356      break;
357    }
358    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H),
359            T0)
360        .addOperand(MI.getOperand(3))
361        .addImm(SrcX)
362        .addImm(SrcY)
363        .addImm(SrcZ)
364        .addImm(SrcW)
365        .addImm(0)
366        .addImm(0)
367        .addImm(0)
368        .addImm(0)
369        .addImm(1)
370        .addImm(2)
371        .addImm(3)
372        .addOperand(RID)
373        .addOperand(SID)
374        .addImm(CTX)
375        .addImm(CTY)
376        .addImm(CTZ)
377        .addImm(CTW);
378    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V),
379            T1)
380        .addOperand(MI.getOperand(2))
381        .addImm(SrcX)
382        .addImm(SrcY)
383        .addImm(SrcZ)
384        .addImm(SrcW)
385        .addImm(0)
386        .addImm(0)
387        .addImm(0)
388        .addImm(0)
389        .addImm(1)
390        .addImm(2)
391        .addImm(3)
392        .addOperand(RID)
393        .addOperand(SID)
394        .addImm(CTX)
395        .addImm(CTY)
396        .addImm(CTZ)
397        .addImm(CTW);
398    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_G))
399        .addOperand(MI.getOperand(0))
400        .addOperand(MI.getOperand(1))
401        .addImm(SrcX)
402        .addImm(SrcY)
403        .addImm(SrcZ)
404        .addImm(SrcW)
405        .addImm(0)
406        .addImm(0)
407        .addImm(0)
408        .addImm(0)
409        .addImm(1)
410        .addImm(2)
411        .addImm(3)
412        .addOperand(RID)
413        .addOperand(SID)
414        .addImm(CTX)
415        .addImm(CTY)
416        .addImm(CTZ)
417        .addImm(CTW)
418        .addReg(T0, RegState::Implicit)
419        .addReg(T1, RegState::Implicit);
420    break;
421  }
422
423  case AMDGPU::TXD_SHADOW: {
424    unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
425    unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
426    MachineOperand &RID = MI.getOperand(4);
427    MachineOperand &SID = MI.getOperand(5);
428    unsigned TextureId = MI.getOperand(6).getImm();
429    unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3;
430    unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1;
431
432    switch (TextureId) {
433    case 5: // Rect
434      CTX = CTY = 0;
435      break;
436    case 6: // Shadow1D
437      SrcW = SrcZ;
438      break;
439    case 7: // Shadow2D
440      SrcW = SrcZ;
441      break;
442    case 8: // ShadowRect
443      CTX = CTY = 0;
444      SrcW = SrcZ;
445      break;
446    case 9: // 1DArray
447      SrcZ = SrcY;
448      CTZ = 0;
449      break;
450    case 10: // 2DArray
451      CTZ = 0;
452      break;
453    case 11: // Shadow1DArray
454      SrcZ = SrcY;
455      CTZ = 0;
456      break;
457    case 12: // Shadow2DArray
458      CTZ = 0;
459      break;
460    }
461
462    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H),
463            T0)
464        .addOperand(MI.getOperand(3))
465        .addImm(SrcX)
466        .addImm(SrcY)
467        .addImm(SrcZ)
468        .addImm(SrcW)
469        .addImm(0)
470        .addImm(0)
471        .addImm(0)
472        .addImm(0)
473        .addImm(1)
474        .addImm(2)
475        .addImm(3)
476        .addOperand(RID)
477        .addOperand(SID)
478        .addImm(CTX)
479        .addImm(CTY)
480        .addImm(CTZ)
481        .addImm(CTW);
482    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V),
483            T1)
484        .addOperand(MI.getOperand(2))
485        .addImm(SrcX)
486        .addImm(SrcY)
487        .addImm(SrcZ)
488        .addImm(SrcW)
489        .addImm(0)
490        .addImm(0)
491        .addImm(0)
492        .addImm(0)
493        .addImm(1)
494        .addImm(2)
495        .addImm(3)
496        .addOperand(RID)
497        .addOperand(SID)
498        .addImm(CTX)
499        .addImm(CTY)
500        .addImm(CTZ)
501        .addImm(CTW);
502    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_C_G))
503        .addOperand(MI.getOperand(0))
504        .addOperand(MI.getOperand(1))
505        .addImm(SrcX)
506        .addImm(SrcY)
507        .addImm(SrcZ)
508        .addImm(SrcW)
509        .addImm(0)
510        .addImm(0)
511        .addImm(0)
512        .addImm(0)
513        .addImm(1)
514        .addImm(2)
515        .addImm(3)
516        .addOperand(RID)
517        .addOperand(SID)
518        .addImm(CTX)
519        .addImm(CTY)
520        .addImm(CTZ)
521        .addImm(CTW)
522        .addReg(T0, RegState::Implicit)
523        .addReg(T1, RegState::Implicit);
524    break;
525  }
526
527  case AMDGPU::BRANCH:
528    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP))
529        .addOperand(MI.getOperand(0));
530    break;
531
532  case AMDGPU::BRANCH_COND_f32: {
533    MachineInstr *NewMI =
534        BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
535                AMDGPU::PREDICATE_BIT)
536            .addOperand(MI.getOperand(1))
537            .addImm(OPCODE_IS_NOT_ZERO)
538            .addImm(0); // Flags
539    TII->addFlag(*NewMI, 0, MO_FLAG_PUSH);
540    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
541        .addOperand(MI.getOperand(0))
542        .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
543    break;
544  }
545
546  case AMDGPU::BRANCH_COND_i32: {
547    MachineInstr *NewMI =
548        BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
549                AMDGPU::PREDICATE_BIT)
550            .addOperand(MI.getOperand(1))
551            .addImm(OPCODE_IS_NOT_ZERO_INT)
552            .addImm(0); // Flags
553    TII->addFlag(*NewMI, 0, MO_FLAG_PUSH);
554    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
555        .addOperand(MI.getOperand(0))
556        .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
557    break;
558  }
559
560  case AMDGPU::EG_ExportSwz:
561  case AMDGPU::R600_ExportSwz: {
562    // Instruction is left unmodified if it is not the last one of its type.
563    bool isLastInstructionOfItsType = true;
564    unsigned InstExportType = MI.getOperand(1).getImm();
565    for (MachineBasicBlock::iterator NextExportInst = std::next(I),
566         EndBlock = BB->end(); NextExportInst != EndBlock;
567         NextExportInst = std::next(NextExportInst)) {
568      if (NextExportInst->getOpcode() == AMDGPU::EG_ExportSwz ||
569          NextExportInst->getOpcode() == AMDGPU::R600_ExportSwz) {
570        unsigned CurrentInstExportType = NextExportInst->getOperand(1)
571            .getImm();
572        if (CurrentInstExportType == InstExportType) {
573          isLastInstructionOfItsType = false;
574          break;
575        }
576      }
577    }
578    bool EOP = isEOP(I);
579    if (!EOP && !isLastInstructionOfItsType)
580      return BB;
581    unsigned CfInst = (MI.getOpcode() == AMDGPU::EG_ExportSwz) ? 84 : 40;
582    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI.getOpcode()))
583        .addOperand(MI.getOperand(0))
584        .addOperand(MI.getOperand(1))
585        .addOperand(MI.getOperand(2))
586        .addOperand(MI.getOperand(3))
587        .addOperand(MI.getOperand(4))
588        .addOperand(MI.getOperand(5))
589        .addOperand(MI.getOperand(6))
590        .addImm(CfInst)
591        .addImm(EOP);
592    break;
593  }
594  case AMDGPU::RETURN: {
595    // RETURN instructions must have the live-out registers as implicit uses,
596    // otherwise they appear dead.
597    R600MachineFunctionInfo *MFI = MF->getInfo<R600MachineFunctionInfo>();
598    MachineInstrBuilder MIB(*MF, MI);
599    for (unsigned i = 0, e = MFI->LiveOuts.size(); i != e; ++i)
600      MIB.addReg(MFI->LiveOuts[i], RegState::Implicit);
601    return BB;
602  }
603  }
604
605  MI.eraseFromParent();
606  return BB;
607}
608
609//===----------------------------------------------------------------------===//
610// Custom DAG Lowering Operations
611//===----------------------------------------------------------------------===//
612
613SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
614  MachineFunction &MF = DAG.getMachineFunction();
615  R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
616  switch (Op.getOpcode()) {
617  default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
618  case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
619  case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
620  case ISD::SHL_PARTS: return LowerSHLParts(Op, DAG);
621  case ISD::SRA_PARTS:
622  case ISD::SRL_PARTS: return LowerSRXParts(Op, DAG);
623  case ISD::UADDO: return LowerUADDSUBO(Op, DAG, ISD::ADD, AMDGPUISD::CARRY);
624  case ISD::USUBO: return LowerUADDSUBO(Op, DAG, ISD::SUB, AMDGPUISD::BORROW);
625  case ISD::FCOS:
626  case ISD::FSIN: return LowerTrig(Op, DAG);
627  case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
628  case ISD::STORE: return LowerSTORE(Op, DAG);
629  case ISD::LOAD: {
630    SDValue Result = LowerLOAD(Op, DAG);
631    assert((!Result.getNode() ||
632            Result.getNode()->getNumValues() == 2) &&
633           "Load should return a value and a chain");
634    return Result;
635  }
636
637  case ISD::BRCOND: return LowerBRCOND(Op, DAG);
638  case ISD::GlobalAddress: return LowerGlobalAddress(MFI, Op, DAG);
639  case ISD::FrameIndex: return lowerFrameIndex(Op, DAG);
640  case ISD::INTRINSIC_VOID: {
641    SDValue Chain = Op.getOperand(0);
642    unsigned IntrinsicID =
643                         cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
644    switch (IntrinsicID) {
645    case AMDGPUIntrinsic::r600_store_swizzle: {
646      SDLoc DL(Op);
647      const SDValue Args[8] = {
648        Chain,
649        Op.getOperand(2), // Export Value
650        Op.getOperand(3), // ArrayBase
651        Op.getOperand(4), // Type
652        DAG.getConstant(0, DL, MVT::i32), // SWZ_X
653        DAG.getConstant(1, DL, MVT::i32), // SWZ_Y
654        DAG.getConstant(2, DL, MVT::i32), // SWZ_Z
655        DAG.getConstant(3, DL, MVT::i32) // SWZ_W
656      };
657      return DAG.getNode(AMDGPUISD::EXPORT, DL, Op.getValueType(), Args);
658    }
659
660    // default for switch(IntrinsicID)
661    default: break;
662    }
663    // break out of case ISD::INTRINSIC_VOID in switch(Op.getOpcode())
664    break;
665  }
666  case ISD::INTRINSIC_WO_CHAIN: {
667    unsigned IntrinsicID =
668                         cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
669    EVT VT = Op.getValueType();
670    SDLoc DL(Op);
671    switch(IntrinsicID) {
672    default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
673    case AMDGPUIntrinsic::r600_tex:
674    case AMDGPUIntrinsic::r600_texc:
675    case AMDGPUIntrinsic::r600_txl:
676    case AMDGPUIntrinsic::r600_txlc:
677    case AMDGPUIntrinsic::r600_txb:
678    case AMDGPUIntrinsic::r600_txbc:
679    case AMDGPUIntrinsic::r600_txf:
680    case AMDGPUIntrinsic::r600_txq:
681    case AMDGPUIntrinsic::r600_ddx:
682    case AMDGPUIntrinsic::r600_ddy: {
683      unsigned TextureOp;
684      switch (IntrinsicID) {
685      case AMDGPUIntrinsic::r600_tex:
686        TextureOp = 0;
687        break;
688      case AMDGPUIntrinsic::r600_texc:
689        TextureOp = 1;
690        break;
691      case AMDGPUIntrinsic::r600_txl:
692        TextureOp = 2;
693        break;
694      case AMDGPUIntrinsic::r600_txlc:
695        TextureOp = 3;
696        break;
697      case AMDGPUIntrinsic::r600_txb:
698        TextureOp = 4;
699        break;
700      case AMDGPUIntrinsic::r600_txbc:
701        TextureOp = 5;
702        break;
703      case AMDGPUIntrinsic::r600_txf:
704        TextureOp = 6;
705        break;
706      case AMDGPUIntrinsic::r600_txq:
707        TextureOp = 7;
708        break;
709      case AMDGPUIntrinsic::r600_ddx:
710        TextureOp = 8;
711        break;
712      case AMDGPUIntrinsic::r600_ddy:
713        TextureOp = 9;
714        break;
715      default:
716        llvm_unreachable("Unknow Texture Operation");
717      }
718
719      SDValue TexArgs[19] = {
720        DAG.getConstant(TextureOp, DL, MVT::i32),
721        Op.getOperand(1),
722        DAG.getConstant(0, DL, MVT::i32),
723        DAG.getConstant(1, DL, MVT::i32),
724        DAG.getConstant(2, DL, MVT::i32),
725        DAG.getConstant(3, DL, MVT::i32),
726        Op.getOperand(2),
727        Op.getOperand(3),
728        Op.getOperand(4),
729        DAG.getConstant(0, DL, MVT::i32),
730        DAG.getConstant(1, DL, MVT::i32),
731        DAG.getConstant(2, DL, MVT::i32),
732        DAG.getConstant(3, DL, MVT::i32),
733        Op.getOperand(5),
734        Op.getOperand(6),
735        Op.getOperand(7),
736        Op.getOperand(8),
737        Op.getOperand(9),
738        Op.getOperand(10)
739      };
740      return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, MVT::v4f32, TexArgs);
741    }
742    case AMDGPUIntrinsic::r600_dot4: {
743      SDValue Args[8] = {
744      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
745          DAG.getConstant(0, DL, MVT::i32)),
746      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
747          DAG.getConstant(0, DL, MVT::i32)),
748      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
749          DAG.getConstant(1, DL, MVT::i32)),
750      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
751          DAG.getConstant(1, DL, MVT::i32)),
752      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
753          DAG.getConstant(2, DL, MVT::i32)),
754      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
755          DAG.getConstant(2, DL, MVT::i32)),
756      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
757          DAG.getConstant(3, DL, MVT::i32)),
758      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
759          DAG.getConstant(3, DL, MVT::i32))
760      };
761      return DAG.getNode(AMDGPUISD::DOT4, DL, MVT::f32, Args);
762    }
763
764    case Intrinsic::r600_implicitarg_ptr: {
765      MVT PtrVT = getPointerTy(DAG.getDataLayout(), AMDGPUAS::PARAM_I_ADDRESS);
766      uint32_t ByteOffset = getImplicitParameterOffset(MFI, FIRST_IMPLICIT);
767      return DAG.getConstant(ByteOffset, DL, PtrVT);
768    }
769    case Intrinsic::r600_read_ngroups_x:
770      return LowerImplicitParameter(DAG, VT, DL, 0);
771    case Intrinsic::r600_read_ngroups_y:
772      return LowerImplicitParameter(DAG, VT, DL, 1);
773    case Intrinsic::r600_read_ngroups_z:
774      return LowerImplicitParameter(DAG, VT, DL, 2);
775    case Intrinsic::r600_read_global_size_x:
776      return LowerImplicitParameter(DAG, VT, DL, 3);
777    case Intrinsic::r600_read_global_size_y:
778      return LowerImplicitParameter(DAG, VT, DL, 4);
779    case Intrinsic::r600_read_global_size_z:
780      return LowerImplicitParameter(DAG, VT, DL, 5);
781    case Intrinsic::r600_read_local_size_x:
782      return LowerImplicitParameter(DAG, VT, DL, 6);
783    case Intrinsic::r600_read_local_size_y:
784      return LowerImplicitParameter(DAG, VT, DL, 7);
785    case Intrinsic::r600_read_local_size_z:
786      return LowerImplicitParameter(DAG, VT, DL, 8);
787
788    case Intrinsic::r600_read_workdim:
789    case AMDGPUIntrinsic::AMDGPU_read_workdim: { // Legacy name.
790      uint32_t ByteOffset = getImplicitParameterOffset(MFI, GRID_DIM);
791      return LowerImplicitParameter(DAG, VT, DL, ByteOffset / 4);
792    }
793
794    case Intrinsic::r600_read_tgid_x:
795      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
796                                  AMDGPU::T1_X, VT);
797    case Intrinsic::r600_read_tgid_y:
798      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
799                                  AMDGPU::T1_Y, VT);
800    case Intrinsic::r600_read_tgid_z:
801      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
802                                  AMDGPU::T1_Z, VT);
803    case Intrinsic::r600_read_tidig_x:
804      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
805                                  AMDGPU::T0_X, VT);
806    case Intrinsic::r600_read_tidig_y:
807      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
808                                  AMDGPU::T0_Y, VT);
809    case Intrinsic::r600_read_tidig_z:
810      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
811                                  AMDGPU::T0_Z, VT);
812
813    case Intrinsic::r600_recipsqrt_ieee:
814      return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
815
816    case Intrinsic::r600_recipsqrt_clamped:
817      return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1));
818    }
819
820    // break out of case ISD::INTRINSIC_WO_CHAIN in switch(Op.getOpcode())
821    break;
822  }
823  } // end switch(Op.getOpcode())
824  return SDValue();
825}
826
827void R600TargetLowering::ReplaceNodeResults(SDNode *N,
828                                            SmallVectorImpl<SDValue> &Results,
829                                            SelectionDAG &DAG) const {
830  switch (N->getOpcode()) {
831  default:
832    AMDGPUTargetLowering::ReplaceNodeResults(N, Results, DAG);
833    return;
834  case ISD::FP_TO_UINT:
835    if (N->getValueType(0) == MVT::i1) {
836      Results.push_back(lowerFP_TO_UINT(N->getOperand(0), DAG));
837      return;
838    }
839    // Fall-through. Since we don't care about out of bounds values
840    // we can use FP_TO_SINT for uints too. The DAGLegalizer code for uint
841    // considers some extra cases which are not necessary here.
842  case ISD::FP_TO_SINT: {
843    if (N->getValueType(0) == MVT::i1) {
844      Results.push_back(lowerFP_TO_SINT(N->getOperand(0), DAG));
845      return;
846    }
847
848    SDValue Result;
849    if (expandFP_TO_SINT(N, Result, DAG))
850      Results.push_back(Result);
851    return;
852  }
853  case ISD::SDIVREM: {
854    SDValue Op = SDValue(N, 1);
855    SDValue RES = LowerSDIVREM(Op, DAG);
856    Results.push_back(RES);
857    Results.push_back(RES.getValue(1));
858    break;
859  }
860  case ISD::UDIVREM: {
861    SDValue Op = SDValue(N, 0);
862    LowerUDIVREM64(Op, DAG, Results);
863    break;
864  }
865  }
866}
867
868SDValue R600TargetLowering::vectorToVerticalVector(SelectionDAG &DAG,
869                                                   SDValue Vector) const {
870
871  SDLoc DL(Vector);
872  EVT VecVT = Vector.getValueType();
873  EVT EltVT = VecVT.getVectorElementType();
874  SmallVector<SDValue, 8> Args;
875
876  for (unsigned i = 0, e = VecVT.getVectorNumElements();
877       i != e; ++i) {
878    Args.push_back(DAG.getNode(
879        ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Vector,
880        DAG.getConstant(i, DL, getVectorIdxTy(DAG.getDataLayout()))));
881  }
882
883  return DAG.getNode(AMDGPUISD::BUILD_VERTICAL_VECTOR, DL, VecVT, Args);
884}
885
886SDValue R600TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
887                                                    SelectionDAG &DAG) const {
888
889  SDLoc DL(Op);
890  SDValue Vector = Op.getOperand(0);
891  SDValue Index = Op.getOperand(1);
892
893  if (isa<ConstantSDNode>(Index) ||
894      Vector.getOpcode() == AMDGPUISD::BUILD_VERTICAL_VECTOR)
895    return Op;
896
897  Vector = vectorToVerticalVector(DAG, Vector);
898  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getValueType(),
899                     Vector, Index);
900}
901
902SDValue R600TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
903                                                   SelectionDAG &DAG) const {
904  SDLoc DL(Op);
905  SDValue Vector = Op.getOperand(0);
906  SDValue Value = Op.getOperand(1);
907  SDValue Index = Op.getOperand(2);
908
909  if (isa<ConstantSDNode>(Index) ||
910      Vector.getOpcode() == AMDGPUISD::BUILD_VERTICAL_VECTOR)
911    return Op;
912
913  Vector = vectorToVerticalVector(DAG, Vector);
914  SDValue Insert = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, Op.getValueType(),
915                               Vector, Value, Index);
916  return vectorToVerticalVector(DAG, Insert);
917}
918
919SDValue R600TargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
920                                               SDValue Op,
921                                               SelectionDAG &DAG) const {
922
923  GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
924  if (GSD->getAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS)
925    return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);
926
927  const DataLayout &DL = DAG.getDataLayout();
928  const GlobalValue *GV = GSD->getGlobal();
929  MVT ConstPtrVT = getPointerTy(DL, AMDGPUAS::CONSTANT_ADDRESS);
930
931  SDValue GA = DAG.getTargetGlobalAddress(GV, SDLoc(GSD), ConstPtrVT);
932  return DAG.getNode(AMDGPUISD::CONST_DATA_PTR, SDLoc(GSD), ConstPtrVT, GA);
933}
934
935SDValue R600TargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
936  // On hw >= R700, the COS/SIN input must be between -1.0 and 1.0.
937  // Thus we lower them to TRIG(FRACT(x / (2 * Pi) + 0.5) - 0.5).
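  // Worked example (illustrative only): for x = 2*Pi, x * (1 / (2*Pi)) + 0.5
  // = 1.5, FRACT(1.5) = 0.5, and 0.5 - 0.5 = 0.0, so 2*Pi is reduced into the
  // half-turn range the hardware expects.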
938  EVT VT = Op.getValueType();
939  SDValue Arg = Op.getOperand(0);
940  SDLoc DL(Op);
941
942  // TODO: Should this propagate fast-math-flags?
943  SDValue FractPart = DAG.getNode(AMDGPUISD::FRACT, DL, VT,
944      DAG.getNode(ISD::FADD, DL, VT,
945        DAG.getNode(ISD::FMUL, DL, VT, Arg,
946          DAG.getConstantFP(0.15915494309, DL, MVT::f32)),
947        DAG.getConstantFP(0.5, DL, MVT::f32)));
948  unsigned TrigNode;
949  switch (Op.getOpcode()) {
950  case ISD::FCOS:
951    TrigNode = AMDGPUISD::COS_HW;
952    break;
953  case ISD::FSIN:
954    TrigNode = AMDGPUISD::SIN_HW;
955    break;
956  default:
957    llvm_unreachable("Wrong trig opcode");
958  }
959  SDValue TrigVal = DAG.getNode(TrigNode, DL, VT,
960      DAG.getNode(ISD::FADD, DL, VT, FractPart,
961        DAG.getConstantFP(-0.5, DL, MVT::f32)));
962  if (Gen >= R600Subtarget::R700)
963    return TrigVal;
964  // On R600 hw, COS/SIN input must be between -Pi and Pi.
965  return DAG.getNode(ISD::FMUL, DL, VT, TrigVal,
966      DAG.getConstantFP(3.14159265359, DL, MVT::f32));
967}
968
969SDValue R600TargetLowering::LowerSHLParts(SDValue Op, SelectionDAG &DAG) const {
970  SDLoc DL(Op);
971  EVT VT = Op.getValueType();
972
973  SDValue Lo = Op.getOperand(0);
974  SDValue Hi = Op.getOperand(1);
975  SDValue Shift = Op.getOperand(2);
976  SDValue Zero = DAG.getConstant(0, DL, VT);
977  SDValue One  = DAG.getConstant(1, DL, VT);
978
979  SDValue Width  = DAG.getConstant(VT.getSizeInBits(), DL, VT);
980  SDValue Width1 = DAG.getConstant(VT.getSizeInBits() - 1, DL, VT);
981  SDValue BigShift  = DAG.getNode(ISD::SUB, DL, VT, Shift, Width);
982  SDValue CompShift = DAG.getNode(ISD::SUB, DL, VT, Width1, Shift);
983
984  // The dance around Width1 is necessary for the Shift == 0 special case.
985  // Without it, CompShift would be Width - Shift, which can be 32 and would
986  // produce an incorrect Overflow. So we do the shift in two steps; the
987  // alternative is to add a conditional to filter out the special case.
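  // Illustration (not in the original source): with Shift == 0, CompShift is
  // 31, so Overflow = (Lo >> 31) >> 1 == 0 as required, whereas the one-step
  // alternative Lo >> (32 - Shift) would be an out-of-range shift by 32.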
988
989  SDValue Overflow = DAG.getNode(ISD::SRL, DL, VT, Lo, CompShift);
990  Overflow = DAG.getNode(ISD::SRL, DL, VT, Overflow, One);
991
992  SDValue HiSmall = DAG.getNode(ISD::SHL, DL, VT, Hi, Shift);
993  HiSmall = DAG.getNode(ISD::OR, DL, VT, HiSmall, Overflow);
994  SDValue LoSmall = DAG.getNode(ISD::SHL, DL, VT, Lo, Shift);
995
996  SDValue HiBig = DAG.getNode(ISD::SHL, DL, VT, Lo, BigShift);
997  SDValue LoBig = Zero;
998
999  Hi = DAG.getSelectCC(DL, Shift, Width, HiSmall, HiBig, ISD::SETULT);
1000  Lo = DAG.getSelectCC(DL, Shift, Width, LoSmall, LoBig, ISD::SETULT);
1001
1002  return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT,VT), Lo, Hi);
1003}
1004
1005SDValue R600TargetLowering::LowerSRXParts(SDValue Op, SelectionDAG &DAG) const {
1006  SDLoc DL(Op);
1007  EVT VT = Op.getValueType();
1008
1009  SDValue Lo = Op.getOperand(0);
1010  SDValue Hi = Op.getOperand(1);
1011  SDValue Shift = Op.getOperand(2);
1012  SDValue Zero = DAG.getConstant(0, DL, VT);
1013  SDValue One  = DAG.getConstant(1, DL, VT);
1014
1015  const bool SRA = Op.getOpcode() == ISD::SRA_PARTS;
1016
1017  SDValue Width  = DAG.getConstant(VT.getSizeInBits(), DL, VT);
1018  SDValue Width1 = DAG.getConstant(VT.getSizeInBits() - 1, DL, VT);
1019  SDValue BigShift  = DAG.getNode(ISD::SUB, DL, VT, Shift, Width);
1020  SDValue CompShift = DAG.getNode(ISD::SUB, DL, VT, Width1, Shift);
1021
1022  // The dance around Width1 is necessary for the Shift == 0 special case.
1023  // Without it, CompShift would be Width - Shift, which can be 32 and would
1024  // produce an incorrect Overflow. So we do the shift in two steps; the
1025  // alternative is to add a conditional to filter out the special case.
1026
1027  SDValue Overflow = DAG.getNode(ISD::SHL, DL, VT, Hi, CompShift);
1028  Overflow = DAG.getNode(ISD::SHL, DL, VT, Overflow, One);
1029
1030  SDValue HiSmall = DAG.getNode(SRA ? ISD::SRA : ISD::SRL, DL, VT, Hi, Shift);
1031  SDValue LoSmall = DAG.getNode(ISD::SRL, DL, VT, Lo, Shift);
1032  LoSmall = DAG.getNode(ISD::OR, DL, VT, LoSmall, Overflow);
1033
1034  SDValue LoBig = DAG.getNode(SRA ? ISD::SRA : ISD::SRL, DL, VT, Hi, BigShift);
1035  SDValue HiBig = SRA ? DAG.getNode(ISD::SRA, DL, VT, Hi, Width1) : Zero;
1036
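  // When Shift >= 32 the selects below pick the "big" values: the low half
  // comes from Hi shifted by (Shift - 32), and the high half is either all
  // sign bits (Hi >> 31 for SRA_PARTS) or zero (for SRL_PARTS).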
1037  Hi = DAG.getSelectCC(DL, Shift, Width, HiSmall, HiBig, ISD::SETULT);
1038  Lo = DAG.getSelectCC(DL, Shift, Width, LoSmall, LoBig, ISD::SETULT);
1039
1040  return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT,VT), Lo, Hi);
1041}
1042
1043SDValue R600TargetLowering::LowerUADDSUBO(SDValue Op, SelectionDAG &DAG,
1044                                          unsigned mainop, unsigned ovf) const {
1045  SDLoc DL(Op);
1046  EVT VT = Op.getValueType();
1047
1048  SDValue Lo = Op.getOperand(0);
1049  SDValue Hi = Op.getOperand(1);
1050
1051  SDValue OVF = DAG.getNode(ovf, DL, VT, Lo, Hi);
1052  // Extend sign.
1053  OVF = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, OVF,
1054                    DAG.getValueType(MVT::i1));
1055
1056  SDValue Res = DAG.getNode(mainop, DL, VT, Lo, Hi);
1057
1058  return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT, VT), Res, OVF);
1059}
1060
1061SDValue R600TargetLowering::lowerFP_TO_UINT(SDValue Op, SelectionDAG &DAG) const {
1062  SDLoc DL(Op);
1063  return DAG.getNode(
1064      ISD::SETCC,
1065      DL,
1066      MVT::i1,
1067      Op, DAG.getConstantFP(1.0f, DL, MVT::f32),
1068      DAG.getCondCode(ISD::SETEQ));
1069}
1070
1071SDValue R600TargetLowering::lowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) const {
1072  SDLoc DL(Op);
1073  return DAG.getNode(
1074      ISD::SETCC,
1075      DL,
1076      MVT::i1,
1077      Op, DAG.getConstantFP(-1.0f, DL, MVT::f32),
1078      DAG.getCondCode(ISD::SETEQ));
1079}
1080
1081SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT,
1082                                                   const SDLoc &DL,
1083                                                   unsigned DwordOffset) const {
1084  unsigned ByteOffset = DwordOffset * 4;
1085  PointerType * PtrType = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
1086                                      AMDGPUAS::CONSTANT_BUFFER_0);
1087
1088  // We shouldn't be using an offset wider than 16 bits for implicit parameters.
1089  assert(isInt<16>(ByteOffset));
1090
1091  return DAG.getLoad(VT, DL, DAG.getEntryNode(),
1092                     DAG.getConstant(ByteOffset, DL, MVT::i32), // PTR
1093                     MachinePointerInfo(ConstantPointerNull::get(PtrType)));
1094}
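// Usage note (illustrative): r600.read.ngroups.x above is lowered with
// DwordOffset 0, i.e. a load from byte offset 0 of CONSTANT_BUFFER_0, while
// r600.read.local.size.z uses DwordOffset 8 (byte offset 32).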
1095
1096bool R600TargetLowering::isZero(SDValue Op) const {
1097  if(ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
1098    return Cst->isNullValue();
1099  } else if(ConstantFPSDNode *CstFP = dyn_cast<ConstantFPSDNode>(Op)){
1100    return CstFP->isZero();
1101  } else {
1102    return false;
1103  }
1104}
1105
1106bool R600TargetLowering::isHWTrueValue(SDValue Op) const {
1107  if (ConstantFPSDNode * CFP = dyn_cast<ConstantFPSDNode>(Op)) {
1108    return CFP->isExactlyValue(1.0);
1109  }
1110  return isAllOnesConstant(Op);
1111}
1112
1113bool R600TargetLowering::isHWFalseValue(SDValue Op) const {
1114  if (ConstantFPSDNode * CFP = dyn_cast<ConstantFPSDNode>(Op)) {
1115    return CFP->getValueAPF().isZero();
1116  }
1117  return isNullConstant(Op);
1118}
1119
1120SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
1121  SDLoc DL(Op);
1122  EVT VT = Op.getValueType();
1123
1124  SDValue LHS = Op.getOperand(0);
1125  SDValue RHS = Op.getOperand(1);
1126  SDValue True = Op.getOperand(2);
1127  SDValue False = Op.getOperand(3);
1128  SDValue CC = Op.getOperand(4);
1129  SDValue Temp;
1130
1131  if (VT == MVT::f32) {
1132    DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);
1133    SDValue MinMax = CombineFMinMaxLegacy(DL, VT, LHS, RHS, True, False, CC, DCI);
1134    if (MinMax)
1135      return MinMax;
1136  }
1137
1138  // LHS and RHS are guaranteed to be the same value type
1139  EVT CompareVT = LHS.getValueType();
1140
1141  // Check if we can lower this to a native operation.
1142
1143  // Try to lower to a SET* instruction:
1144  //
1145  // SET* can match the following patterns:
1146  //
1147  // select_cc f32, f32, -1,  0, cc_supported
1148  // select_cc f32, f32, 1.0f, 0.0f, cc_supported
1149  // select_cc i32, i32, -1,  0, cc_supported
1150  //
1151
1152  // Move hardware True/False values to the correct operand.
1153  ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
1154  ISD::CondCode InverseCC =
1155     ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
1156  if (isHWTrueValue(False) && isHWFalseValue(True)) {
1157    if (isCondCodeLegal(InverseCC, CompareVT.getSimpleVT())) {
1158      std::swap(False, True);
1159      CC = DAG.getCondCode(InverseCC);
1160    } else {
1161      ISD::CondCode SwapInvCC = ISD::getSetCCSwappedOperands(InverseCC);
1162      if (isCondCodeLegal(SwapInvCC, CompareVT.getSimpleVT())) {
1163        std::swap(False, True);
1164        std::swap(LHS, RHS);
1165        CC = DAG.getCondCode(SwapInvCC);
1166      }
1167    }
1168  }
1169
1170  if (isHWTrueValue(True) && isHWFalseValue(False) &&
1171      (CompareVT == VT || VT == MVT::i32)) {
1172    // This can be matched by a SET* instruction.
1173    return DAG.getNode(ISD::SELECT_CC, DL, VT, LHS, RHS, True, False, CC);
1174  }
1175
1176  // Try to lower to a CND* instruction:
1177  //
1178  // CND* can match the following patterns:
1179  //
1180  // select_cc f32, 0.0, f32, f32, cc_supported
1181  // select_cc f32, 0.0, i32, i32, cc_supported
1182  // select_cc i32, 0,   f32, f32, cc_supported
1183  // select_cc i32, 0,   i32, i32, cc_supported
1184  //
1185
1186  // Try to move the zero value to the RHS
1187  if (isZero(LHS)) {
1188    ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
1189    // Try swapping the operands
1190    ISD::CondCode CCSwapped = ISD::getSetCCSwappedOperands(CCOpcode);
1191    if (isCondCodeLegal(CCSwapped, CompareVT.getSimpleVT())) {
1192      std::swap(LHS, RHS);
1193      CC = DAG.getCondCode(CCSwapped);
1194    } else {
1195      // Try inverting the condition and then swapping the operands
1196      ISD::CondCode CCInv = ISD::getSetCCInverse(CCOpcode, CompareVT.isInteger());
1197      CCSwapped = ISD::getSetCCSwappedOperands(CCInv);
1198      if (isCondCodeLegal(CCSwapped, CompareVT.getSimpleVT())) {
1199        std::swap(True, False);
1200        std::swap(LHS, RHS);
1201        CC = DAG.getCondCode(CCSwapped);
1202      }
1203    }
1204  }
1205  if (isZero(RHS)) {
1206    SDValue Cond = LHS;
1207    SDValue Zero = RHS;
1208    ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
1209    if (CompareVT != VT) {
1210      // Bitcast True / False to the correct types.  This will end up being
1211      // a nop, but it allows us to define only a single pattern in the
1212      // .TD files for each CND* instruction rather than having to have
1213      // one pattern for integer True/False and one for fp True/False
1214      True = DAG.getNode(ISD::BITCAST, DL, CompareVT, True);
1215      False = DAG.getNode(ISD::BITCAST, DL, CompareVT, False);
1216    }
1217
1218    switch (CCOpcode) {
1219    case ISD::SETONE:
1220    case ISD::SETUNE:
1221    case ISD::SETNE:
1222      CCOpcode = ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
1223      Temp = True;
1224      True = False;
1225      False = Temp;
1226      break;
1227    default:
1228      break;
1229    }
1230    SDValue SelectNode = DAG.getNode(ISD::SELECT_CC, DL, CompareVT,
1231        Cond, Zero,
1232        True, False,
1233        DAG.getCondCode(CCOpcode));
1234    return DAG.getNode(ISD::BITCAST, DL, VT, SelectNode);
1235  }
1236
1237  // If we make it this far, it means we have no native instructions to handle
1238  // this SELECT_CC, so we must lower it.
1239  SDValue HWTrue, HWFalse;
1240
1241  if (CompareVT == MVT::f32) {
1242    HWTrue = DAG.getConstantFP(1.0f, DL, CompareVT);
1243    HWFalse = DAG.getConstantFP(0.0f, DL, CompareVT);
1244  } else if (CompareVT == MVT::i32) {
1245    HWTrue = DAG.getConstant(-1, DL, CompareVT);
1246    HWFalse = DAG.getConstant(0, DL, CompareVT);
1247  }
1248  else {
1249    llvm_unreachable("Unhandled value type in LowerSELECT_CC");
1250  }
1251
1252  // Lower this unsupported SELECT_CC into a combination of two supported
1253  // SELECT_CC operations.
1254  SDValue Cond = DAG.getNode(ISD::SELECT_CC, DL, CompareVT, LHS, RHS, HWTrue, HWFalse, CC);
1255
1256  return DAG.getNode(ISD::SELECT_CC, DL, VT,
1257      Cond, HWFalse,
1258      True, False,
1259      DAG.getCondCode(ISD::SETNE));
1260}
1261
1262/// LLVM generates byte-addressed pointers.  For indirect addressing, we need to
1263/// convert these pointers to a register index.  Each register holds
1264/// 16 bytes (4 x 32-bit sub-registers), but we need to take into account the
1265/// \p StackWidth, which tells us how many of the 4 sub-registers will be used
1266/// for indirect addressing.
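/// For example: with StackWidth == 1 each index addresses one 4-byte channel,
/// so the byte address is shifted right by 2; StackWidth == 2 covers 8 bytes
/// per index (shift by 3) and StackWidth == 4 covers all 16 bytes (shift by 4).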
1267SDValue R600TargetLowering::stackPtrToRegIndex(SDValue Ptr,
1268                                               unsigned StackWidth,
1269                                               SelectionDAG &DAG) const {
1270  unsigned SRLPad;
1271  switch(StackWidth) {
1272  case 1:
1273    SRLPad = 2;
1274    break;
1275  case 2:
1276    SRLPad = 3;
1277    break;
1278  case 4:
1279    SRLPad = 4;
1280    break;
1281  default: llvm_unreachable("Invalid stack width");
1282  }
1283
1284  SDLoc DL(Ptr);
1285  return DAG.getNode(ISD::SRL, DL, Ptr.getValueType(), Ptr,
1286                     DAG.getConstant(SRLPad, DL, MVT::i32));
1287}
1288
1289void R600TargetLowering::getStackAddress(unsigned StackWidth,
1290                                         unsigned ElemIdx,
1291                                         unsigned &Channel,
1292                                         unsigned &PtrIncr) const {
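  // Example (illustrative): with StackWidth == 2, element 2 of a vector maps
  // to Channel 0 with PtrIncr 1, i.e. the first channel of the next register.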
1293  switch (StackWidth) {
1294  default:
1295  case 1:
1296    Channel = 0;
1297    if (ElemIdx > 0) {
1298      PtrIncr = 1;
1299    } else {
1300      PtrIncr = 0;
1301    }
1302    break;
1303  case 2:
1304    Channel = ElemIdx % 2;
1305    if (ElemIdx == 2) {
1306      PtrIncr = 1;
1307    } else {
1308      PtrIncr = 0;
1309    }
1310    break;
1311  case 4:
1312    Channel = ElemIdx;
1313    PtrIncr = 0;
1314    break;
1315  }
1316}
1317
1318SDValue R600TargetLowering::lowerPrivateTruncStore(StoreSDNode *Store,
1319                                                   SelectionDAG &DAG) const {
1320  SDLoc DL(Store);
1321
1322  unsigned Mask = 0;
1323  if (Store->getMemoryVT() == MVT::i8) {
1324    Mask = 0xff;
1325  } else if (Store->getMemoryVT() == MVT::i16) {
1326    Mask = 0xffff;
1327  }
1328
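  // What follows is a read-modify-write of the 32-bit register containing the
  // sub-dword value. Worked example (illustrative): an i8 store to byte
  // address 5 loads the register at index 5 >> 2 == 1, computes ByteIdx == 1
  // and ShiftAmt == 8, clears the old byte with DstMask == ~(0xff << 8), and
  // ORs in the new value shifted left by 8.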
1329  SDValue Chain = Store->getChain();
1330  SDValue BasePtr = Store->getBasePtr();
1331  EVT MemVT = Store->getMemoryVT();
1332
1333  SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, BasePtr,
1334                            DAG.getConstant(2, DL, MVT::i32));
1335  SDValue Dst = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, MVT::i32,
1336                            Chain, Ptr,
1337                            DAG.getTargetConstant(0, DL, MVT::i32));
1338
1339  SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32, BasePtr,
1340                                DAG.getConstant(0x3, DL, MVT::i32));
1341
1342  SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx,
1343                                 DAG.getConstant(3, DL, MVT::i32));
1344
1345  SDValue SExtValue = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i32,
1346                                  Store->getValue());
1347
1348  SDValue MaskedValue = DAG.getZeroExtendInReg(SExtValue, DL, MemVT);
1349
1350  SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, MVT::i32,
1351                                     MaskedValue, ShiftAmt);
1352
1353  SDValue DstMask = DAG.getNode(ISD::SHL, DL, MVT::i32,
1354                                DAG.getConstant(Mask, DL, MVT::i32),
1355                                ShiftAmt);
1356  DstMask = DAG.getNode(ISD::XOR, DL, MVT::i32, DstMask,
1357                        DAG.getConstant(0xffffffff, DL, MVT::i32));
1358  Dst = DAG.getNode(ISD::AND, DL, MVT::i32, Dst, DstMask);
1359
1360  SDValue Value = DAG.getNode(ISD::OR, DL, MVT::i32, Dst, ShiftedValue);
1361  return DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other,
1362                     Chain, Value, Ptr,
1363                     DAG.getTargetConstant(0, DL, MVT::i32));
1364}
1365
1366SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
1367  if (SDValue Result = AMDGPUTargetLowering::MergeVectorStore(Op, DAG))
1368    return Result;
1369
1370  StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
1371  unsigned AS = StoreNode->getAddressSpace();
1372  SDValue Value = StoreNode->getValue();
1373  EVT ValueVT = Value.getValueType();
1374
1375  if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS) &&
1376      ValueVT.isVector()) {
1377    return SplitVectorStore(Op, DAG);
1378  }
1379
1380  SDLoc DL(Op);
1381  SDValue Chain = StoreNode->getChain();
1382  SDValue Ptr = StoreNode->getBasePtr();
1383
1384  if (AS == AMDGPUAS::GLOBAL_ADDRESS) {
1385    if (StoreNode->isTruncatingStore()) {
1386      EVT VT = Value.getValueType();
1387      assert(VT.bitsLE(MVT::i32));
1388      EVT MemVT = StoreNode->getMemoryVT();
1389      SDValue MaskConstant;
1390      if (MemVT == MVT::i8) {
1391        MaskConstant = DAG.getConstant(0xFF, DL, MVT::i32);
1392      } else {
1393        assert(MemVT == MVT::i16);
1394        MaskConstant = DAG.getConstant(0xFFFF, DL, MVT::i32);
1395      }
1396      SDValue DWordAddr = DAG.getNode(ISD::SRL, DL, VT, Ptr,
1397                                      DAG.getConstant(2, DL, MVT::i32));
1398      SDValue ByteIndex = DAG.getNode(ISD::AND, DL, Ptr.getValueType(), Ptr,
1399                                      DAG.getConstant(0x00000003, DL, VT));
1400      SDValue TruncValue = DAG.getNode(ISD::AND, DL, VT, Value, MaskConstant);
1401      SDValue Shift = DAG.getNode(ISD::SHL, DL, VT, ByteIndex,
1402                                   DAG.getConstant(3, DL, VT));
1403      SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, VT, TruncValue, Shift);
1404      SDValue Mask = DAG.getNode(ISD::SHL, DL, VT, MaskConstant, Shift);
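      // STORE_MSKOR performs the masked read-modify-write in memory; the
      // intended effect is roughly dst = (dst & ~Mask) | ShiftedValue (a
      // sketch of the MSKOR semantics, with the operands packed into the Src
      // vector below).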
1405      // XXX: If we add a 64-bit ZW register class, then we could use a 2 x i32
1406      // vector instead.
1407      SDValue Src[4] = {
1408        ShiftedValue,
1409        DAG.getConstant(0, DL, MVT::i32),
1410        DAG.getConstant(0, DL, MVT::i32),
1411        Mask
1412      };
1413      SDValue Input = DAG.getBuildVector(MVT::v4i32, DL, Src);
1414      SDValue Args[3] = { Chain, Input, DWordAddr };
1415      return DAG.getMemIntrinsicNode(AMDGPUISD::STORE_MSKOR, DL,
1416                                     Op->getVTList(), Args, MemVT,
1417                                     StoreNode->getMemOperand());
1418    } else if (Ptr->getOpcode() != AMDGPUISD::DWORDADDR &&
1419               ValueVT.bitsGE(MVT::i32)) {
1420      // Convert pointer from byte address to dword address.
1421      Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, Ptr.getValueType(),
1422                        DAG.getNode(ISD::SRL, DL, Ptr.getValueType(),
1423                                    Ptr, DAG.getConstant(2, DL, MVT::i32)));
1424
1425      if (StoreNode->isTruncatingStore() || StoreNode->isIndexed()) {
1426        llvm_unreachable("Truncated and indexed stores not supported yet");
1427      } else {
1428        Chain = DAG.getStore(Chain, DL, Value, Ptr, StoreNode->getMemOperand());
1429      }
1430      return Chain;
1431    }
1432  }
1433
1434  if (AS != AMDGPUAS::PRIVATE_ADDRESS)
1435    return SDValue();
1436
1437  EVT MemVT = StoreNode->getMemoryVT();
1438  if (MemVT.bitsLT(MVT::i32))
1439    return lowerPrivateTruncStore(StoreNode, DAG);
1440
1441  // Lowering for indirect addressing
1442  const MachineFunction &MF = DAG.getMachineFunction();
1443  const R600FrameLowering *TFL = getSubtarget()->getFrameLowering();
1444  unsigned StackWidth = TFL->getStackWidth(MF);
1445
1446  Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);
1447
1448  if (ValueVT.isVector()) {
1449    unsigned NumElemVT = ValueVT.getVectorNumElements();
1450    EVT ElemVT = ValueVT.getVectorElementType();
1451    SmallVector<SDValue, 4> Stores(NumElemVT);
1452
1453    assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
1454                                      "vector width in store");
1455
1456    for (unsigned i = 0; i < NumElemVT; ++i) {
1457      unsigned Channel, PtrIncr;
1458      getStackAddress(StackWidth, i, Channel, PtrIncr);
1459      Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
1460                        DAG.getConstant(PtrIncr, DL, MVT::i32));
1461      SDValue Elem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT,
1462                                 Value, DAG.getConstant(i, DL, MVT::i32));
1463
1464      Stores[i] = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other,
1465                              Chain, Elem, Ptr,
1466                              DAG.getTargetConstant(Channel, DL, MVT::i32));
1467    }
1468     Chain =  DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
1469   } else {
1470    if (ValueVT == MVT::i8) {
1471      Value = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Value);
1472    }
1473    Chain = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other, Chain, Value,
1474                        Ptr, DAG.getTargetConstant(0, DL, MVT::i32)); // Channel
1475  }
1476
1477  return Chain;
1478}
1479
1480// Return 512 + (kc_bank << 12), or -1 for a non-constant-buffer address space.
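// For example (illustrative arithmetic only): CONSTANT_BUFFER_0 (kc_bank 0)
// maps to 512, and CONSTANT_BUFFER_5 (kc_bank 5) maps to 512 + (5 << 12) = 20992.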
1481static int
1482ConstantAddressBlock(unsigned AddressSpace) {
1483  switch (AddressSpace) {
1484  case AMDGPUAS::CONSTANT_BUFFER_0:
1485    return 512;
1486  case AMDGPUAS::CONSTANT_BUFFER_1:
1487    return 512 + 4096;
1488  case AMDGPUAS::CONSTANT_BUFFER_2:
1489    return 512 + 4096 * 2;
1490  case AMDGPUAS::CONSTANT_BUFFER_3:
1491    return 512 + 4096 * 3;
1492  case AMDGPUAS::CONSTANT_BUFFER_4:
1493    return 512 + 4096 * 4;
1494  case AMDGPUAS::CONSTANT_BUFFER_5:
1495    return 512 + 4096 * 5;
1496  case AMDGPUAS::CONSTANT_BUFFER_6:
1497    return 512 + 4096 * 6;
1498  case AMDGPUAS::CONSTANT_BUFFER_7:
1499    return 512 + 4096 * 7;
1500  case AMDGPUAS::CONSTANT_BUFFER_8:
1501    return 512 + 4096 * 8;
1502  case AMDGPUAS::CONSTANT_BUFFER_9:
1503    return 512 + 4096 * 9;
1504  case AMDGPUAS::CONSTANT_BUFFER_10:
1505    return 512 + 4096 * 10;
1506  case AMDGPUAS::CONSTANT_BUFFER_11:
1507    return 512 + 4096 * 11;
1508  case AMDGPUAS::CONSTANT_BUFFER_12:
1509    return 512 + 4096 * 12;
1510  case AMDGPUAS::CONSTANT_BUFFER_13:
1511    return 512 + 4096 * 13;
1512  case AMDGPUAS::CONSTANT_BUFFER_14:
1513    return 512 + 4096 * 14;
1514  case AMDGPUAS::CONSTANT_BUFFER_15:
1515    return 512 + 4096 * 15;
1516  default:
1517    return -1;
1518  }
1519}
1520
1521SDValue R600TargetLowering::lowerPrivateExtLoad(SDValue Op,
1522                                                SelectionDAG &DAG) const {
1523  SDLoc DL(Op);
1524  LoadSDNode *Load = cast<LoadSDNode>(Op);
1525  ISD::LoadExtType ExtType = Load->getExtensionType();
1526  EVT MemVT = Load->getMemoryVT();
1527
1528  // Pre-SI, a private-address extload of a sub-32-bit value is lowered to a
1529  // 32-bit register load followed by a (1- or 2-)byte extract.
1530
1531  // Get Register holding the target.
1532  SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Load->getBasePtr(),
1533                            DAG.getConstant(2, DL, MVT::i32));
1534  // Load the Register.
1535  SDValue Ret = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op.getValueType(),
1536                            Load->getChain(),
1537                            Ptr,
1538                            DAG.getTargetConstant(0, DL, MVT::i32),
1539                            Op.getOperand(2));
1540
1541  // Get offset within the register.
1542  SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32,
1543                                Load->getBasePtr(),
1544                                DAG.getConstant(0x3, DL, MVT::i32));
1545
1546  // Bit offset of target byte (byteIdx * 8).
1547  SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx,
1548                                 DAG.getConstant(3, DL, MVT::i32));
1549
1550  // Shift to the right.
1551  Ret = DAG.getNode(ISD::SRL, DL, MVT::i32, Ret, ShiftAmt);
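  // Illustrative example only: for an i8 load from byte address 6, Ptr becomes
  // dword index 6 >> 2 = 1, ByteIdx = 6 & 3 = 2, and ShiftAmt = 2 << 3 = 16, so
  // the loaded dword is shifted right by 16 before the extension below.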
1552
1553  // Eliminate the upper bits by setting them to ...
1554  EVT MemEltVT = MemVT.getScalarType();
1555
1556  // ... ones.
1557  if (ExtType == ISD::SEXTLOAD) {
1558    SDValue MemEltVTNode = DAG.getValueType(MemEltVT);
1559
1560    SDValue Ops[] = {
1561      DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, Ret, MemEltVTNode),
1562      Load->getChain()
1563    };
1564
1565    return DAG.getMergeValues(Ops, DL);
1566  }
1567
1568  // ... or zeros.
1569  SDValue Ops[] = {
1570    DAG.getZeroExtendInReg(Ret, DL, MemEltVT),
1571    Load->getChain()
1572  };
1573
1574  return DAG.getMergeValues(Ops, DL);
1575}
1576
1577SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
1578  LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
1579  unsigned AS = LoadNode->getAddressSpace();
1580  EVT MemVT = LoadNode->getMemoryVT();
1581  ISD::LoadExtType ExtType = LoadNode->getExtensionType();
1582
1583  if (AS == AMDGPUAS::PRIVATE_ADDRESS &&
1584      ExtType != ISD::NON_EXTLOAD && MemVT.bitsLT(MVT::i32)) {
1585    return lowerPrivateExtLoad(Op, DAG);
1586  }
1587
1588  SDLoc DL(Op);
1589  EVT VT = Op.getValueType();
1590  SDValue Chain = LoadNode->getChain();
1591  SDValue Ptr = LoadNode->getBasePtr();
1592
1593  if (LoadNode->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS && VT.isVector()) {
1594    SDValue MergedValues[2] = {
1595      scalarizeVectorLoad(LoadNode, DAG),
1596      Chain
1597    };
1598    return DAG.getMergeValues(MergedValues, DL);
1599  }
1600
1601  int ConstantBlock = ConstantAddressBlock(LoadNode->getAddressSpace());
1602  if (ConstantBlock > -1 &&
1603      ((LoadNode->getExtensionType() == ISD::NON_EXTLOAD) ||
1604       (LoadNode->getExtensionType() == ISD::ZEXTLOAD))) {
1605    SDValue Result;
1606    if (isa<ConstantExpr>(LoadNode->getMemOperand()->getValue()) ||
1607        isa<Constant>(LoadNode->getMemOperand()->getValue()) ||
1608        isa<ConstantSDNode>(Ptr)) {
1609      SDValue Slots[4];
1610      for (unsigned i = 0; i < 4; i++) {
1611        // We want the constant position encoded with the following formula:
1612        // (((512 + (kc_bank << 12) + const_index) << 2) + chan)
1613        // const_index is the Ptr computed by LLVM using an alignment of 16.
1614        // Thus we add (512 + (kc_bank << 12)) * 16 + chan * 4 here and then
1615        // divide by 4 at the ISel step.
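        // Illustrative numbers only (not from the original code): with kc_bank 0
        // (ConstantBlock == 512), Ptr == 32 (const_index 2) and channel i == 1,
        // NewPtr = 32 + 4 * 1 + 512 * 16 = 8228; dividing by 4 at ISel yields
        // 2057 == ((512 + 0 + 2) << 2) + 1, matching the formula above.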
1616        SDValue NewPtr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
1617            DAG.getConstant(4 * i + ConstantBlock * 16, DL, MVT::i32));
1618        Slots[i] = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::i32, NewPtr);
1619      }
1620      EVT NewVT = MVT::v4i32;
1621      unsigned NumElements = 4;
1622      if (VT.isVector()) {
1623        NewVT = VT;
1624        NumElements = VT.getVectorNumElements();
1625      }
1626      Result = DAG.getBuildVector(NewVT, DL, makeArrayRef(Slots, NumElements));
1627    } else {
1628      // A non-constant Ptr can't be folded, so keep it as a v4i32 load.
1629      Result = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::v4i32,
1630          DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr,
1631                      DAG.getConstant(4, DL, MVT::i32)),
1632                      DAG.getConstant(LoadNode->getAddressSpace() -
1633                                      AMDGPUAS::CONSTANT_BUFFER_0, DL, MVT::i32)
1634          );
1635    }
1636
1637    if (!VT.isVector()) {
1638      Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Result,
1639                           DAG.getConstant(0, DL, MVT::i32));
1640    }
1641
1642    SDValue MergedValues[2] = {
1643      Result,
1644      Chain
1645    };
1646    return DAG.getMergeValues(MergedValues, DL);
1647  }
1648
1649  SDValue LoweredLoad;
1650
1651  // For most operations, returning SDValue() will result in the node being
1652  // expanded by the DAG Legalizer. This is not the case for ISD::LOAD, so we
1653  // need to manually expand loads that may be legal in some address spaces and
1654  // illegal in others. SEXT loads from CONSTANT_BUFFER_0 are supported for
1655  // compute shaders, since the data is sign-extended when it is uploaded to the
1656  // buffer. However, SEXT loads from other address spaces are not supported, so
1657  // we need to expand them here.
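  // A rough sketch of the expansion performed below, assuming a SEXT load of an
  // i8 value into an i32 result (illustrative shape only):
  //   Tmp = any-extending load of the i8 memory value to i32
  //   Res = SIGN_EXTEND_INREG Tmp, ValueType:i8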
1658  if (LoadNode->getExtensionType() == ISD::SEXTLOAD) {
1659    EVT MemVT = LoadNode->getMemoryVT();
1660    assert(!MemVT.isVector() && (MemVT == MVT::i16 || MemVT == MVT::i8));
1661    SDValue NewLoad = DAG.getExtLoad(
1662        ISD::EXTLOAD, DL, VT, Chain, Ptr, LoadNode->getPointerInfo(), MemVT,
1663        LoadNode->getAlignment(), LoadNode->getMemOperand()->getFlags());
1664    SDValue Res = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, NewLoad,
1665                              DAG.getValueType(MemVT));
1666
1667    SDValue MergedValues[2] = { Res, Chain };
1668    return DAG.getMergeValues(MergedValues, DL);
1669  }
1670
1671  if (LoadNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
1672    return SDValue();
1673  }
1674
1675  // Lowering for indirect addressing
1676  const MachineFunction &MF = DAG.getMachineFunction();
1677  const R600FrameLowering *TFL = getSubtarget()->getFrameLowering();
1678  unsigned StackWidth = TFL->getStackWidth(MF);
1679
1680  Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);
1681
1682  if (VT.isVector()) {
1683    unsigned NumElemVT = VT.getVectorNumElements();
1684    EVT ElemVT = VT.getVectorElementType();
1685    SDValue Loads[4];
1686
1687    assert(NumElemVT <= 4);
1688    assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
1689                                      "vector width in load");
1690
1691    for (unsigned i = 0; i < NumElemVT; ++i) {
1692      unsigned Channel, PtrIncr;
1693      getStackAddress(StackWidth, i, Channel, PtrIncr);
1694      Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
1695                        DAG.getConstant(PtrIncr, DL, MVT::i32));
1696      Loads[i] = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, ElemVT,
1697                             Chain, Ptr,
1698                             DAG.getTargetConstant(Channel, DL, MVT::i32),
1699                             Op.getOperand(2));
1700    }
1701    EVT TargetVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, NumElemVT);
1702    LoweredLoad = DAG.getBuildVector(TargetVT, DL, makeArrayRef(Loads, NumElemVT));
1703  } else {
1704    LoweredLoad = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, VT,
1705                              Chain, Ptr,
1706                              DAG.getTargetConstant(0, DL, MVT::i32), // Channel
1707                              Op.getOperand(2));
1708  }
1709
1710  SDValue Ops[2] = {
1711    LoweredLoad,
1712    Chain
1713  };
1714
1715  return DAG.getMergeValues(Ops, DL);
1716}
1717
1718SDValue R600TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
1719  SDValue Chain = Op.getOperand(0);
1720  SDValue Cond  = Op.getOperand(1);
1721  SDValue Jump  = Op.getOperand(2);
1722
1723  return DAG.getNode(AMDGPUISD::BRANCH_COND, SDLoc(Op), Op.getValueType(),
1724                     Chain, Jump, Cond);
1725}
1726
1727SDValue R600TargetLowering::lowerFrameIndex(SDValue Op,
1728                                            SelectionDAG &DAG) const {
1729  MachineFunction &MF = DAG.getMachineFunction();
1730  const R600FrameLowering *TFL = getSubtarget()->getFrameLowering();
1731
1732  FrameIndexSDNode *FIN = cast<FrameIndexSDNode>(Op);
1733
1734  unsigned FrameIndex = FIN->getIndex();
1735  unsigned IgnoredFrameReg;
1736  unsigned Offset =
1737    TFL->getFrameIndexReference(MF, FrameIndex, IgnoredFrameReg);
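  // Illustrative values only: with Offset == 3 and a stack width of 1, the
  // frame index lowers to the constant 3 * 4 * 1 = 12.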
1738  return DAG.getConstant(Offset * 4 * TFL->getStackWidth(MF), SDLoc(Op),
1739                         Op.getValueType());
1740}
1741
1742/// XXX Only kernel functions are supported, so we can assume for now that
1743/// every function is a kernel function, but in the future we should use
1744/// separate calling conventions for kernel and non-kernel functions.
1745SDValue R600TargetLowering::LowerFormalArguments(
1746    SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
1747    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
1748    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
1749  SmallVector<CCValAssign, 16> ArgLocs;
1750  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
1751                 *DAG.getContext());
1752  MachineFunction &MF = DAG.getMachineFunction();
1753  R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
1754
1755  SmallVector<ISD::InputArg, 8> LocalIns;
1756
1757  getOriginalFunctionArgs(DAG, MF.getFunction(), Ins, LocalIns);
1758
1759  AnalyzeFormalArguments(CCInfo, LocalIns);
1760
1761  for (unsigned i = 0, e = Ins.size(); i < e; ++i) {
1762    CCValAssign &VA = ArgLocs[i];
1763    const ISD::InputArg &In = Ins[i];
1764    EVT VT = In.VT;
1765    EVT MemVT = VA.getLocVT();
1766    if (!VT.isVector() && MemVT.isVector()) {
1767      // Get load source type if scalarized.
1768      MemVT = MemVT.getVectorElementType();
1769    }
1770
1771    if (AMDGPU::isShader(CallConv)) {
1772      unsigned Reg = MF.addLiveIn(VA.getLocReg(), &AMDGPU::R600_Reg128RegClass);
1773      SDValue Register = DAG.getCopyFromReg(Chain, DL, Reg, VT);
1774      InVals.push_back(Register);
1775      continue;
1776    }
1777
1778    PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
1779                                          AMDGPUAS::CONSTANT_BUFFER_0);
1780
1781    // i64 isn't a legal type, so the register type used ends up as i32, which
1782    // isn't expected here. It attempts to create this sextload, but it ends up
1783    // being invalid. Somehow this seems to work with i64 arguments, but breaks
1784    // for <1 x i64>.
1785
1786    // The first 36 bytes of the input buffer contain information about
1787    // thread group and global sizes.
1788    ISD::LoadExtType Ext = ISD::NON_EXTLOAD;
1789    if (MemVT.getScalarSizeInBits() != VT.getScalarSizeInBits()) {
1790      // FIXME: This should really check the extload type, but the handling of
1791      // extload vector parameters seems to be broken.
1792
1793      // Ext = In.Flags.isSExt() ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
1794      Ext = ISD::SEXTLOAD;
1795    }
1796
1797    // Compute the offset from the value.
1798    // XXX - I think PartOffset should give you this, but it seems to give the
1799    // size of the register, which isn't useful.
1800
1801    unsigned ValBase = ArgLocs[In.getOrigArgIndex()].getLocMemOffset();
1802    unsigned PartOffset = VA.getLocMemOffset();
1803    unsigned Offset = 36 + VA.getLocMemOffset();
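    // For example (illustrative only): the first kernel argument has
    // LocMemOffset 0 and is therefore loaded from constant-buffer byte offset
    // 36 + 0 = 36.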
1804
1805    MachinePointerInfo PtrInfo(UndefValue::get(PtrTy), PartOffset - ValBase);
1806    SDValue Arg = DAG.getLoad(
1807        ISD::UNINDEXED, Ext, VT, DL, Chain,
1808        DAG.getConstant(Offset, DL, MVT::i32), DAG.getUNDEF(MVT::i32), PtrInfo,
1809        MemVT, /* Alignment = */ 4,
1810        MachineMemOperand::MONonTemporal | MachineMemOperand::MOInvariant);
1811
1812    // 4 is the preferred alignment for the CONSTANT memory space.
1813    InVals.push_back(Arg);
1814    MFI->ABIArgOffset = Offset + MemVT.getStoreSize();
1815  }
1816  return Chain;
1817}
1818
1819EVT R600TargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &,
1820                                           EVT VT) const {
1821  if (!VT.isVector())
1822    return MVT::i32;
1823  return VT.changeVectorElementTypeToInteger();
1824}
1825
1826bool R600TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
1827                                                        unsigned AddrSpace,
1828                                                        unsigned Align,
1829                                                        bool *IsFast) const {
1830  if (IsFast)
1831    *IsFast = false;
1832
1833  if (!VT.isSimple() || VT == MVT::Other)
1834    return false;
1835
1836  if (VT.bitsLT(MVT::i32))
1837    return false;
1838
1839  // TODO: This is a rough estimate.
1840  if (IsFast)
1841    *IsFast = true;
1842
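  // Illustrative reading of the return expression below: a misaligned i32
  // access is rejected (bitsGT fails), while a v2i32 or v4i32 access with
  // 4-byte alignment is accepted.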
1843  return VT.bitsGT(MVT::i32) && Align % 4 == 0;
1844}
1845
1846static SDValue CompactSwizzlableVector(
1847  SelectionDAG &DAG, SDValue VectorEntry,
1848  DenseMap<unsigned, unsigned> &RemapSwizzle) {
1849  assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR);
1850  assert(RemapSwizzle.empty());
1851  SDValue NewBldVec[4] = {
1852    VectorEntry.getOperand(0),
1853    VectorEntry.getOperand(1),
1854    VectorEntry.getOperand(2),
1855    VectorEntry.getOperand(3)
1856  };
1857
1858  for (unsigned i = 0; i < 4; i++) {
1859    if (NewBldVec[i].isUndef())
1860      // We mask the write here to teach later passes that the ith element of this
1861      // vector is undef. Thus we can use it to reduce 128-bit register usage,
1862      // break false dependencies and additionally make assembly easier to read.
1863      RemapSwizzle[i] = 7; // SEL_MASK_WRITE
1864    if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(NewBldVec[i])) {
1865      if (C->isZero()) {
1866        RemapSwizzle[i] = 4; // SEL_0
1867        NewBldVec[i] = DAG.getUNDEF(MVT::f32);
1868      } else if (C->isExactlyValue(1.0)) {
1869        RemapSwizzle[i] = 5; // SEL_1
1870        NewBldVec[i] = DAG.getUNDEF(MVT::f32);
1871      }
1872    }
1873
1874    if (NewBldVec[i].isUndef())
1875      continue;
1876    for (unsigned j = 0; j < i; j++) {
1877      if (NewBldVec[i] == NewBldVec[j]) {
1878        NewBldVec[i] = DAG.getUNDEF(NewBldVec[i].getValueType());
1879        RemapSwizzle[i] = j;
1880        break;
1881      }
1882    }
1883  }
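  // Illustrative example only: for build_vector(x, 0.0f, x, undef) the loop
  // above records SEL_0 for lane 1, remaps lane 2 onto lane 0, marks lane 3
  // with SEL_MASK_WRITE, and rebuilds the vector as (x, undef, undef, undef).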
1884
1885  return DAG.getBuildVector(VectorEntry.getValueType(), SDLoc(VectorEntry),
1886                            NewBldVec);
1887}
1888
1889static SDValue ReorganizeVector(SelectionDAG &DAG, SDValue VectorEntry,
1890                                DenseMap<unsigned, unsigned> &RemapSwizzle) {
1891  assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR);
1892  assert(RemapSwizzle.empty());
1893  SDValue NewBldVec[4] = {
1894      VectorEntry.getOperand(0),
1895      VectorEntry.getOperand(1),
1896      VectorEntry.getOperand(2),
1897      VectorEntry.getOperand(3)
1898  };
1899  bool isUnmovable[4] = { false, false, false, false };
1900  for (unsigned i = 0; i < 4; i++) {
1901    RemapSwizzle[i] = i;
1902    if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
1903      unsigned Idx = cast<ConstantSDNode>(NewBldVec[i].getOperand(1))
1904          ->getZExtValue();
1905      if (i == Idx)
1906        isUnmovable[Idx] = true;
1907    }
1908  }
1909
1910  for (unsigned i = 0; i < 4; i++) {
1911    if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
1912      unsigned Idx = cast<ConstantSDNode>(NewBldVec[i].getOperand(1))
1913          ->getZExtValue();
1914      if (isUnmovable[Idx])
1915        continue;
1916      // Swap i and Idx
1917      std::swap(NewBldVec[Idx], NewBldVec[i]);
1918      std::swap(RemapSwizzle[i], RemapSwizzle[Idx]);
1919      break;
1920    }
1921  }
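  // Illustrative example only: if lane 0 holds extract_vector_elt(V, 2) and
  // lane 2 is not pinned by isUnmovable, the two lanes are swapped so the
  // extracted element ends up in the channel it was extracted from, and
  // RemapSwizzle records the 0 <-> 2 exchange.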
1922
1923  return DAG.getBuildVector(VectorEntry.getValueType(), SDLoc(VectorEntry),
1924                            NewBldVec);
1925}
1926
1927SDValue R600TargetLowering::OptimizeSwizzle(SDValue BuildVector, SDValue Swz[4],
1928                                            SelectionDAG &DAG,
1929                                            const SDLoc &DL) const {
1930  assert(BuildVector.getOpcode() == ISD::BUILD_VECTOR);
1931  // Old -> New swizzle values
1932  DenseMap<unsigned, unsigned> SwizzleRemap;
1933
1934  BuildVector = CompactSwizzlableVector(DAG, BuildVector, SwizzleRemap);
1935  for (unsigned i = 0; i < 4; i++) {
1936    unsigned Idx = cast<ConstantSDNode>(Swz[i])->getZExtValue();
1937    if (SwizzleRemap.find(Idx) != SwizzleRemap.end())
1938      Swz[i] = DAG.getConstant(SwizzleRemap[Idx], DL, MVT::i32);
1939  }
1940
1941  SwizzleRemap.clear();
1942  BuildVector = ReorganizeVector(DAG, BuildVector, SwizzleRemap);
1943  for (unsigned i = 0; i < 4; i++) {
1944    unsigned Idx = cast<ConstantSDNode>(Swz[i])->getZExtValue();
1945    if (SwizzleRemap.find(Idx) != SwizzleRemap.end())
1946      Swz[i] = DAG.getConstant(SwizzleRemap[Idx], DL, MVT::i32);
1947  }
1948
1949  return BuildVector;
1950}
1951
1952
1953//===----------------------------------------------------------------------===//
1954// Custom DAG Optimizations
1955//===----------------------------------------------------------------------===//
1956
1957SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
1958                                              DAGCombinerInfo &DCI) const {
1959  SelectionDAG &DAG = DCI.DAG;
1960
1961  switch (N->getOpcode()) {
1962  default: return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
1963  // (f32 fp_round (f64 uint_to_fp a)) -> (f32 uint_to_fp a)
1964  case ISD::FP_ROUND: {
1965      SDValue Arg = N->getOperand(0);
1966      if (Arg.getOpcode() == ISD::UINT_TO_FP && Arg.getValueType() == MVT::f64) {
1967        return DAG.getNode(ISD::UINT_TO_FP, SDLoc(N), N->getValueType(0),
1968                           Arg.getOperand(0));
1969      }
1970      break;
1971    }
1972
1973  // (i32 fp_to_sint (fneg (select_cc f32, f32, 1.0, 0.0 cc))) ->
1974  // (i32 select_cc f32, f32, -1, 0 cc)
1975  //
1976  // Mesa's GLSL frontend generates the above pattern a lot and we can lower
1977  // this to one of the SET*_DX10 instructions.
1978  case ISD::FP_TO_SINT: {
1979    SDValue FNeg = N->getOperand(0);
1980    if (FNeg.getOpcode() != ISD::FNEG) {
1981      return SDValue();
1982    }
1983    SDValue SelectCC = FNeg.getOperand(0);
1984    if (SelectCC.getOpcode() != ISD::SELECT_CC ||
1985        SelectCC.getOperand(0).getValueType() != MVT::f32 || // LHS
1986        SelectCC.getOperand(2).getValueType() != MVT::f32 || // True
1987        !isHWTrueValue(SelectCC.getOperand(2)) ||
1988        !isHWFalseValue(SelectCC.getOperand(3))) {
1989      return SDValue();
1990    }
1991
1992    SDLoc dl(N);
1993    return DAG.getNode(ISD::SELECT_CC, dl, N->getValueType(0),
1994                           SelectCC.getOperand(0), // LHS
1995                           SelectCC.getOperand(1), // RHS
1996                           DAG.getConstant(-1, dl, MVT::i32), // True
1997                           DAG.getConstant(0, dl, MVT::i32),  // False
1998                           SelectCC.getOperand(4)); // CC
1999
2000    break;
2001  }
2002
2003  // insert_vector_elt (build_vector elt0, ... , eltN), NewEltIdx, idx
2004  // => build_vector elt0, ... , NewEltIdx, ... , eltN
2005  case ISD::INSERT_VECTOR_ELT: {
2006    SDValue InVec = N->getOperand(0);
2007    SDValue InVal = N->getOperand(1);
2008    SDValue EltNo = N->getOperand(2);
2009    SDLoc dl(N);
2010
2011    // If the inserted element is an UNDEF, just use the input vector.
2012    if (InVal.isUndef())
2013      return InVec;
2014
2015    EVT VT = InVec.getValueType();
2016
2017    // If we can't generate a legal BUILD_VECTOR, exit
2018    if (!isOperationLegal(ISD::BUILD_VECTOR, VT))
2019      return SDValue();
2020
2021    // Check that we know which element is being inserted
2022    if (!isa<ConstantSDNode>(EltNo))
2023      return SDValue();
2024    unsigned Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
2025
2026    // Check that the operand is a BUILD_VECTOR (or UNDEF, which can essentially
2027    // be converted to a BUILD_VECTOR).  Fill in the Ops vector with the
2028    // vector elements.
2029    SmallVector<SDValue, 8> Ops;
2030    if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
2031      Ops.append(InVec.getNode()->op_begin(),
2032                 InVec.getNode()->op_end());
2033    } else if (InVec.isUndef()) {
2034      unsigned NElts = VT.getVectorNumElements();
2035      Ops.append(NElts, DAG.getUNDEF(InVal.getValueType()));
2036    } else {
2037      return SDValue();
2038    }
2039
2040    // Insert the element
2041    if (Elt < Ops.size()) {
2042      // All the operands of BUILD_VECTOR must have the same type;
2043      // we enforce that here.
2044      EVT OpVT = Ops[0].getValueType();
2045      if (InVal.getValueType() != OpVT)
2046        InVal = OpVT.bitsGT(InVal.getValueType()) ?
2047          DAG.getNode(ISD::ANY_EXTEND, dl, OpVT, InVal) :
2048          DAG.getNode(ISD::TRUNCATE, dl, OpVT, InVal);
2049      Ops[Elt] = InVal;
2050    }
2051
2052    // Return the new vector
2053    return DAG.getBuildVector(VT, dl, Ops);
2054  }
2055
2056  // An extract_vector_elt of a build_vector generated by custom lowering
2057  // also needs to be combined here.
2058  case ISD::EXTRACT_VECTOR_ELT: {
2059    SDValue Arg = N->getOperand(0);
2060    if (Arg.getOpcode() == ISD::BUILD_VECTOR) {
2061      if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
2062        unsigned Element = Const->getZExtValue();
2063        return Arg->getOperand(Element);
2064      }
2065    }
2066    if (Arg.getOpcode() == ISD::BITCAST &&
2067        Arg.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
2068      if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
2069        unsigned Element = Const->getZExtValue();
2070        return DAG.getNode(ISD::BITCAST, SDLoc(N), N->getVTList(),
2071            Arg->getOperand(0).getOperand(Element));
2072      }
2073    }
2074    break;
2075  }
2076
2077  case ISD::SELECT_CC: {
2078    // Try common optimizations
2079    if (SDValue Ret = AMDGPUTargetLowering::PerformDAGCombine(N, DCI))
2080      return Ret;
2081
2082    // fold selectcc (selectcc x, y, a, b, cc), b, a, b, seteq ->
2083    //      selectcc x, y, a, b, inv(cc)
2084    //
2085    // fold selectcc (selectcc x, y, a, b, cc), b, a, b, setne ->
2086    //      selectcc x, y, a, b, cc
2087    SDValue LHS = N->getOperand(0);
2088    if (LHS.getOpcode() != ISD::SELECT_CC) {
2089      return SDValue();
2090    }
2091
2092    SDValue RHS = N->getOperand(1);
2093    SDValue True = N->getOperand(2);
2094    SDValue False = N->getOperand(3);
2095    ISD::CondCode NCC = cast<CondCodeSDNode>(N->getOperand(4))->get();
2096
2097    if (LHS.getOperand(2).getNode() != True.getNode() ||
2098        LHS.getOperand(3).getNode() != False.getNode() ||
2099        RHS.getNode() != False.getNode()) {
2100      return SDValue();
2101    }
2102
2103    switch (NCC) {
2104    default: return SDValue();
2105    case ISD::SETNE: return LHS;
2106    case ISD::SETEQ: {
2107      ISD::CondCode LHSCC = cast<CondCodeSDNode>(LHS.getOperand(4))->get();
2108      LHSCC = ISD::getSetCCInverse(LHSCC,
2109                                  LHS.getOperand(0).getValueType().isInteger());
2110      if (DCI.isBeforeLegalizeOps() ||
2111          isCondCodeLegal(LHSCC, LHS.getOperand(0).getSimpleValueType()))
2112        return DAG.getSelectCC(SDLoc(N),
2113                               LHS.getOperand(0),
2114                               LHS.getOperand(1),
2115                               LHS.getOperand(2),
2116                               LHS.getOperand(3),
2117                               LHSCC);
2118      break;
2119    }
2120    }
2121    return SDValue();
2122  }
2123
2124  case AMDGPUISD::EXPORT: {
2125    SDValue Arg = N->getOperand(1);
2126    if (Arg.getOpcode() != ISD::BUILD_VECTOR)
2127      break;
2128
2129    SDValue NewArgs[8] = {
2130      N->getOperand(0), // Chain
2131      SDValue(),
2132      N->getOperand(2), // ArrayBase
2133      N->getOperand(3), // Type
2134      N->getOperand(4), // SWZ_X
2135      N->getOperand(5), // SWZ_Y
2136      N->getOperand(6), // SWZ_Z
2137      N->getOperand(7) // SWZ_W
2138    };
2139    SDLoc DL(N);
2140    NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[4], DAG, DL);
2141    return DAG.getNode(AMDGPUISD::EXPORT, DL, N->getVTList(), NewArgs);
2142  }
2143  case AMDGPUISD::TEXTURE_FETCH: {
2144    SDValue Arg = N->getOperand(1);
2145    if (Arg.getOpcode() != ISD::BUILD_VECTOR)
2146      break;
2147
2148    SDValue NewArgs[19] = {
2149      N->getOperand(0),
2150      N->getOperand(1),
2151      N->getOperand(2),
2152      N->getOperand(3),
2153      N->getOperand(4),
2154      N->getOperand(5),
2155      N->getOperand(6),
2156      N->getOperand(7),
2157      N->getOperand(8),
2158      N->getOperand(9),
2159      N->getOperand(10),
2160      N->getOperand(11),
2161      N->getOperand(12),
2162      N->getOperand(13),
2163      N->getOperand(14),
2164      N->getOperand(15),
2165      N->getOperand(16),
2166      N->getOperand(17),
2167      N->getOperand(18),
2168    };
2169    SDLoc DL(N);
2170    NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[2], DAG, DL);
2171    return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, N->getVTList(), NewArgs);
2172  }
2173  }
2174
2175  return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
2176}
2177
2178bool R600TargetLowering::FoldOperand(SDNode *ParentNode, unsigned SrcIdx,
2179                                     SDValue &Src, SDValue &Neg, SDValue &Abs,
2180                                     SDValue &Sel, SDValue &Imm,
2181                                     SelectionDAG &DAG) const {
2182  const R600InstrInfo *TII = getSubtarget()->getInstrInfo();
2183  if (!Src.isMachineOpcode())
2184    return false;
2185
2186  switch (Src.getMachineOpcode()) {
2187  case AMDGPU::FNEG_R600:
2188    if (!Neg.getNode())
2189      return false;
2190    Src = Src.getOperand(0);
2191    Neg = DAG.getTargetConstant(1, SDLoc(ParentNode), MVT::i32);
2192    return true;
2193  case AMDGPU::FABS_R600:
2194    if (!Abs.getNode())
2195      return false;
2196    Src = Src.getOperand(0);
2197    Abs = DAG.getTargetConstant(1, SDLoc(ParentNode), MVT::i32);
2198    return true;
2199  case AMDGPU::CONST_COPY: {
2200    unsigned Opcode = ParentNode->getMachineOpcode();
2201    bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
2202
2203    if (!Sel.getNode())
2204      return false;
2205
2206    SDValue CstOffset = Src.getOperand(0);
2207    if (ParentNode->getValueType(0).isVector())
2208      return false;
2209
2210    // Gather constant values.
2211    int SrcIndices[] = {
2212      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0),
2213      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1),
2214      TII->getOperandIdx(Opcode, AMDGPU::OpName::src2),
2215      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X),
2216      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y),
2217      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z),
2218      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W),
2219      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X),
2220      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y),
2221      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z),
2222      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W)
2223    };
2224    std::vector<unsigned> Consts;
2225    for (int OtherSrcIdx : SrcIndices) {
2226      int OtherSelIdx = TII->getSelIdx(Opcode, OtherSrcIdx);
2227      if (OtherSrcIdx < 0 || OtherSelIdx < 0)
2228        continue;
2229      if (HasDst) {
2230        OtherSrcIdx--;
2231        OtherSelIdx--;
2232      }
2233      if (RegisterSDNode *Reg =
2234          dyn_cast<RegisterSDNode>(ParentNode->getOperand(OtherSrcIdx))) {
2235        if (Reg->getReg() == AMDGPU::ALU_CONST) {
2236          ConstantSDNode *Cst
2237            = cast<ConstantSDNode>(ParentNode->getOperand(OtherSelIdx));
2238          Consts.push_back(Cst->getZExtValue());
2239        }
2240      }
2241    }
2242
2243    ConstantSDNode *Cst = cast<ConstantSDNode>(CstOffset);
2244    Consts.push_back(Cst->getZExtValue());
2245    if (!TII->fitsConstReadLimitations(Consts)) {
2246      return false;
2247    }
2248
2249    Sel = CstOffset;
2250    Src = DAG.getRegister(AMDGPU::ALU_CONST, MVT::f32);
2251    return true;
2252  }
2253  case AMDGPU::MOV_IMM_GLOBAL_ADDR:
2254    // Check if the Imm slot is already in use, as in the MOV_IMM_* case below.
2255    if (cast<ConstantSDNode>(Imm)->getZExtValue())
2256      return false;
2257    Imm = Src.getOperand(0);
2258    Src = DAG.getRegister(AMDGPU::ALU_LITERAL_X, MVT::i32);
2259    return true;
2260  case AMDGPU::MOV_IMM_I32:
2261  case AMDGPU::MOV_IMM_F32: {
2262    unsigned ImmReg = AMDGPU::ALU_LITERAL_X;
2263    uint64_t ImmValue = 0;
2264
2266    if (Src.getMachineOpcode() == AMDGPU::MOV_IMM_F32) {
2267      ConstantFPSDNode *FPC = cast<ConstantFPSDNode>(Src.getOperand(0));
2268      float FloatValue = FPC->getValueAPF().convertToFloat();
2269      if (FloatValue == 0.0) {
2270        ImmReg = AMDGPU::ZERO;
2271      } else if (FloatValue == 0.5) {
2272        ImmReg = AMDGPU::HALF;
2273      } else if (FloatValue == 1.0) {
2274        ImmReg = AMDGPU::ONE;
2275      } else {
2276        ImmValue = FPC->getValueAPF().bitcastToAPInt().getZExtValue();
2277      }
2278    } else {
2279      ConstantSDNode *C = cast<ConstantSDNode>(Src.getOperand(0));
2280      uint64_t Value = C->getZExtValue();
2281      if (Value == 0) {
2282        ImmReg = AMDGPU::ZERO;
2283      } else if (Value == 1) {
2284        ImmReg = AMDGPU::ONE_INT;
2285      } else {
2286        ImmValue = Value;
2287      }
2288    }
2289
2290    // Check that we aren't already using an immediate.
2291    // XXX: It's possible for an instruction to have more than one
2292    // immediate operand, but this is not supported yet.
2293    if (ImmReg == AMDGPU::ALU_LITERAL_X) {
2294      if (!Imm.getNode())
2295        return false;
2296      ConstantSDNode *C = dyn_cast<ConstantSDNode>(Imm);
2297      assert(C);
2298      if (C->getZExtValue())
2299        return false;
2300      Imm = DAG.getTargetConstant(ImmValue, SDLoc(ParentNode), MVT::i32);
2301    }
2302    Src = DAG.getRegister(ImmReg, MVT::i32);
2303    return true;
2304  }
2305  default:
2306    return false;
2307  }
2308}
2309
2310/// \brief Fold the instructions after selecting them
2311SDNode *R600TargetLowering::PostISelFolding(MachineSDNode *Node,
2312                                            SelectionDAG &DAG) const {
2313  const R600InstrInfo *TII = getSubtarget()->getInstrInfo();
2314  if (!Node->isMachineOpcode())
2315    return Node;
2316
2317  unsigned Opcode = Node->getMachineOpcode();
2318  SDValue FakeOp;
2319
2320  std::vector<SDValue> Ops(Node->op_begin(), Node->op_end());
2321
2322  if (Opcode == AMDGPU::DOT_4) {
2323    int OperandIdx[] = {
2324      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X),
2325      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y),
2326      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z),
2327      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W),
2328      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X),
2329      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y),
2330      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z),
2331      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W)
2332        };
2333    int NegIdx[] = {
2334      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_X),
2335      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Y),
2336      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Z),
2337      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_W),
2338      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_X),
2339      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Y),
2340      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Z),
2341      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_W)
2342    };
2343    int AbsIdx[] = {
2344      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_X),
2345      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Y),
2346      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Z),
2347      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_W),
2348      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_X),
2349      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Y),
2350      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Z),
2351      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_W)
2352    };
2353    for (unsigned i = 0; i < 8; i++) {
2354      if (OperandIdx[i] < 0)
2355        return Node;
2356      SDValue &Src = Ops[OperandIdx[i] - 1];
2357      SDValue &Neg = Ops[NegIdx[i] - 1];
2358      SDValue &Abs = Ops[AbsIdx[i] - 1];
2359      bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
2360      int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]);
2361      if (HasDst)
2362        SelIdx--;
2363      SDValue &Sel = (SelIdx > -1) ? Ops[SelIdx] : FakeOp;
2364      if (FoldOperand(Node, i, Src, Neg, Abs, Sel, FakeOp, DAG))
2365        return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
2366    }
2367  } else if (Opcode == AMDGPU::REG_SEQUENCE) {
2368    for (unsigned i = 1, e = Node->getNumOperands(); i < e; i += 2) {
2369      SDValue &Src = Ops[i];
2370      if (FoldOperand(Node, i, Src, FakeOp, FakeOp, FakeOp, FakeOp, DAG))
2371        return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
2372    }
2373  } else if (Opcode == AMDGPU::CLAMP_R600) {
2374    SDValue Src = Node->getOperand(0);
2375    if (!Src.isMachineOpcode() ||
2376        !TII->hasInstrModifiers(Src.getMachineOpcode()))
2377      return Node;
2378    int ClampIdx = TII->getOperandIdx(Src.getMachineOpcode(),
2379        AMDGPU::OpName::clamp);
2380    if (ClampIdx < 0)
2381      return Node;
2382    SDLoc DL(Node);
2383    std::vector<SDValue> Ops(Src->op_begin(), Src->op_end());
2384    Ops[ClampIdx - 1] = DAG.getTargetConstant(1, DL, MVT::i32);
2385    return DAG.getMachineNode(Src.getMachineOpcode(), DL,
2386                              Node->getVTList(), Ops);
2387  } else {
2388    if (!TII->hasInstrModifiers(Opcode))
2389      return Node;
2390    int OperandIdx[] = {
2391      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0),
2392      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1),
2393      TII->getOperandIdx(Opcode, AMDGPU::OpName::src2)
2394    };
2395    int NegIdx[] = {
2396      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg),
2397      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg),
2398      TII->getOperandIdx(Opcode, AMDGPU::OpName::src2_neg)
2399    };
2400    int AbsIdx[] = {
2401      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs),
2402      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs),
2403      -1
2404    };
2405    for (unsigned i = 0; i < 3; i++) {
2406      if (OperandIdx[i] < 0)
2407        return Node;
2408      SDValue &Src = Ops[OperandIdx[i] - 1];
2409      SDValue &Neg = Ops[NegIdx[i] - 1];
2410      SDValue FakeAbs;
2411      SDValue &Abs = (AbsIdx[i] > -1) ? Ops[AbsIdx[i] - 1] : FakeAbs;
2412      bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
2413      int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]);
2414      int ImmIdx = TII->getOperandIdx(Opcode, AMDGPU::OpName::literal);
2415      if (HasDst) {
2416        SelIdx--;
2417        ImmIdx--;
2418      }
2419      SDValue &Sel = (SelIdx > -1) ? Ops[SelIdx] : FakeOp;
2420      SDValue &Imm = Ops[ImmIdx];
2421      if (FoldOperand(Node, i, Src, Neg, Abs, Sel, Imm, DAG))
2422        return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
2423    }
2424  }
2425
2426  return Node;
2427}
2428