//===-- R600ISelLowering.cpp - R600 DAG Lowering Implementation -----------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief Custom DAG lowering for R600
//
//===----------------------------------------------------------------------===//

#include "R600ISelLowering.h"
#include "R600Defines.h"
#include "R600InstrInfo.h"
#include "R600MachineFunctionInfo.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/IR/Argument.h"
#include "llvm/IR/Function.h"

using namespace llvm;

R600TargetLowering::R600TargetLowering(TargetMachine &TM) :
    AMDGPUTargetLowering(TM),
    TII(static_cast<const R600InstrInfo*>(TM.getInstrInfo())) {
  addRegisterClass(MVT::v4f32, &AMDGPU::R600_Reg128RegClass);
  addRegisterClass(MVT::f32, &AMDGPU::R600_Reg32RegClass);
  addRegisterClass(MVT::v4i32, &AMDGPU::R600_Reg128RegClass);
  addRegisterClass(MVT::i32, &AMDGPU::R600_Reg32RegClass);
  computeRegisterProperties();

  setOperationAction(ISD::FADD, MVT::v4f32, Expand);
  setOperationAction(ISD::FMUL, MVT::v4f32, Expand);
  setOperationAction(ISD::FDIV, MVT::v4f32, Expand);
  setOperationAction(ISD::FSUB, MVT::v4f32, Expand);

  setOperationAction(ISD::ADD,  MVT::v4i32, Expand);
  setOperationAction(ISD::AND,  MVT::v4i32, Expand);
  setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Expand);
  setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Expand);
  setOperationAction(ISD::MUL,  MVT::v2i32, Expand);
  setOperationAction(ISD::MUL,  MVT::v4i32, Expand);
  setOperationAction(ISD::OR, MVT::v4i32, Expand);
  setOperationAction(ISD::OR, MVT::v2i32, Expand);
  setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Expand);
  setOperationAction(ISD::SHL, MVT::v4i32, Expand);
  setOperationAction(ISD::SHL, MVT::v2i32, Expand);
  setOperationAction(ISD::SRL, MVT::v4i32, Expand);
  setOperationAction(ISD::SRL, MVT::v2i32, Expand);
  setOperationAction(ISD::SRA, MVT::v4i32, Expand);
  setOperationAction(ISD::SRA, MVT::v2i32, Expand);
  setOperationAction(ISD::SUB, MVT::v4i32, Expand);
  setOperationAction(ISD::SUB, MVT::v2i32, Expand);
  setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Expand);
  setOperationAction(ISD::UDIV, MVT::v4i32, Expand);
  setOperationAction(ISD::UREM, MVT::v4i32, Expand);
  setOperationAction(ISD::SETCC, MVT::v4i32, Expand);
  setOperationAction(ISD::XOR, MVT::v4i32, Expand);
  setOperationAction(ISD::XOR, MVT::v2i32, Expand);

  setOperationAction(ISD::BR_CC, MVT::i32, Expand);
  setOperationAction(ISD::BR_CC, MVT::f32, Expand);

  setOperationAction(ISD::FSUB, MVT::f32, Expand);

  setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i1, Custom);

  setOperationAction(ISD::ROTL, MVT::i32, Custom);

  setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);

  setOperationAction(ISD::SETCC, MVT::i32, Expand);
  setOperationAction(ISD::SETCC, MVT::f32, Expand);
  setOperationAction(ISD::FP_TO_UINT, MVT::i1, Custom);

  setOperationAction(ISD::SELECT, MVT::i32, Custom);
  setOperationAction(ISD::SELECT, MVT::f32, Custom);

  setOperationAction(ISD::VSELECT, MVT::v4i32, Expand);
  setOperationAction(ISD::VSELECT, MVT::v2i32, Expand);

  // Legalize loads and stores to the private address space.
  setOperationAction(ISD::LOAD, MVT::i32, Custom);
  setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
  setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
  setLoadExtAction(ISD::EXTLOAD, MVT::v4i8, Custom);
  setLoadExtAction(ISD::EXTLOAD, MVT::i8, Custom);
  setLoadExtAction(ISD::ZEXTLOAD, MVT::i8, Custom);
  setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i8, Custom);
  setOperationAction(ISD::STORE, MVT::i8, Custom);
  setOperationAction(ISD::STORE, MVT::i32, Custom);
  setOperationAction(ISD::STORE, MVT::v2i32, Custom);
  setOperationAction(ISD::STORE, MVT::v4i32, Custom);

  setOperationAction(ISD::LOAD, MVT::i32, Custom);
  setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
  setOperationAction(ISD::FrameIndex, MVT::i32, Custom);

  setTargetDAGCombine(ISD::FP_ROUND);
  setTargetDAGCombine(ISD::FP_TO_SINT);
  setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
  setTargetDAGCombine(ISD::SELECT_CC);

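  // Comparisons on R600 produce 0 or -1 (see the SET* patterns used in
  // LowerSELECT_CC below), so report booleans as zero-or-negative-one for
  // both scalars and vectors.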
  setBooleanContents(ZeroOrNegativeOneBooleanContent);
  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
  setSchedulingPreference(Sched::VLIW);
}

MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
    MachineInstr * MI, MachineBasicBlock * BB) const {
  MachineFunction * MF = BB->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  MachineBasicBlock::iterator I = *MI;

  switch (MI->getOpcode()) {
  default: return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
  case AMDGPU::CLAMP_R600: {
    MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
                                                   AMDGPU::MOV,
                                                   MI->getOperand(0).getReg(),
                                                   MI->getOperand(1).getReg());
    TII->addFlag(NewMI, 0, MO_FLAG_CLAMP);
    break;
  }

  case AMDGPU::FABS_R600: {
    MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
                                                    AMDGPU::MOV,
                                                    MI->getOperand(0).getReg(),
                                                    MI->getOperand(1).getReg());
    TII->addFlag(NewMI, 0, MO_FLAG_ABS);
    break;
  }

  case AMDGPU::FNEG_R600: {
    MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
                                                    AMDGPU::MOV,
                                                    MI->getOperand(0).getReg(),
                                                    MI->getOperand(1).getReg());
    TII->addFlag(NewMI, 0, MO_FLAG_NEG);
    break;
  }

  case AMDGPU::MASK_WRITE: {
    unsigned maskedRegister = MI->getOperand(0).getReg();
    assert(TargetRegisterInfo::isVirtualRegister(maskedRegister));
    MachineInstr * defInstr = MRI.getVRegDef(maskedRegister);
    TII->addFlag(defInstr, 0, MO_FLAG_MASK);
    break;
  }

  case AMDGPU::MOV_IMM_F32:
    TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
                     MI->getOperand(1).getFPImm()->getValueAPF()
                         .bitcastToAPInt().getZExtValue());
    break;
  case AMDGPU::MOV_IMM_I32:
    TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
                     MI->getOperand(1).getImm());
    break;
  case AMDGPU::CONST_COPY: {
    MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, MI, AMDGPU::MOV,
        MI->getOperand(0).getReg(), AMDGPU::ALU_CONST);
    TII->setImmOperand(NewMI, R600Operands::SRC0_SEL,
        MI->getOperand(1).getImm());
    break;
  }

  case AMDGPU::RAT_WRITE_CACHELESS_32_eg:
  case AMDGPU::RAT_WRITE_CACHELESS_128_eg: {
    unsigned EOP = (llvm::next(I)->getOpcode() == AMDGPU::RETURN) ? 1 : 0;

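    // Re-emit the RAT write with the EOP immediate appended, so a store that
    // is immediately followed by RETURN also terminates the program.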
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
            .addOperand(MI->getOperand(0))
            .addOperand(MI->getOperand(1))
            .addImm(EOP); // Set End of program bit
    break;
  }

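  // TXD / TXD_SHADOW (texture sample with explicit derivatives) are expanded
  // into the two gradient-set instructions followed by the gradient sample.
  // T0 and T1 are added as implicit uses so the gradient writes stay live.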
  case AMDGPU::TXD: {
    unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
    unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);

    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0)
            .addOperand(MI->getOperand(3))
            .addOperand(MI->getOperand(4))
            .addOperand(MI->getOperand(5))
            .addOperand(MI->getOperand(6));
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1)
            .addOperand(MI->getOperand(2))
            .addOperand(MI->getOperand(4))
            .addOperand(MI->getOperand(5))
            .addOperand(MI->getOperand(6));
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_G))
            .addOperand(MI->getOperand(0))
            .addOperand(MI->getOperand(1))
            .addOperand(MI->getOperand(4))
            .addOperand(MI->getOperand(5))
            .addOperand(MI->getOperand(6))
            .addReg(T0, RegState::Implicit)
            .addReg(T1, RegState::Implicit);
    break;
  }

  case AMDGPU::TXD_SHADOW: {
    unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
    unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);

    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0)
            .addOperand(MI->getOperand(3))
            .addOperand(MI->getOperand(4))
            .addOperand(MI->getOperand(5))
            .addOperand(MI->getOperand(6));
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1)
            .addOperand(MI->getOperand(2))
            .addOperand(MI->getOperand(4))
            .addOperand(MI->getOperand(5))
            .addOperand(MI->getOperand(6));
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_C_G))
            .addOperand(MI->getOperand(0))
            .addOperand(MI->getOperand(1))
            .addOperand(MI->getOperand(4))
            .addOperand(MI->getOperand(5))
            .addOperand(MI->getOperand(6))
            .addReg(T0, RegState::Implicit)
            .addReg(T1, RegState::Implicit);
    break;
  }

  case AMDGPU::BRANCH:
      BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP))
              .addOperand(MI->getOperand(0));
      break;

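  // Conditional branches are split into PRED_X, which evaluates the condition
  // into the predicate bit, followed by a predicated JUMP_COND.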
  case AMDGPU::BRANCH_COND_f32: {
    MachineInstr *NewMI =
      BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
              AMDGPU::PREDICATE_BIT)
              .addOperand(MI->getOperand(1))
              .addImm(OPCODE_IS_NOT_ZERO)
              .addImm(0); // Flags
    TII->addFlag(NewMI, 0, MO_FLAG_PUSH);
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
            .addOperand(MI->getOperand(0))
            .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
    break;
  }

  case AMDGPU::BRANCH_COND_i32: {
    MachineInstr *NewMI =
      BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
            AMDGPU::PREDICATE_BIT)
            .addOperand(MI->getOperand(1))
            .addImm(OPCODE_IS_NOT_ZERO_INT)
            .addImm(0); // Flags
    TII->addFlag(NewMI, 0, MO_FLAG_PUSH);
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
            .addOperand(MI->getOperand(0))
            .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
    break;
  }

  case AMDGPU::EG_ExportSwz:
  case AMDGPU::R600_ExportSwz: {
    // Instruction is left unmodified if it's not the last one of its type.
    bool isLastInstructionOfItsType = true;
    unsigned InstExportType = MI->getOperand(1).getImm();
    for (MachineBasicBlock::iterator NextExportInst = llvm::next(I),
         EndBlock = BB->end(); NextExportInst != EndBlock;
         NextExportInst = llvm::next(NextExportInst)) {
      if (NextExportInst->getOpcode() == AMDGPU::EG_ExportSwz ||
          NextExportInst->getOpcode() == AMDGPU::R600_ExportSwz) {
        unsigned CurrentInstExportType = NextExportInst->getOperand(1)
            .getImm();
        if (CurrentInstExportType == InstExportType) {
          isLastInstructionOfItsType = false;
          break;
        }
      }
    }
    bool EOP = (llvm::next(I)->getOpcode() == AMDGPU::RETURN) ? 1 : 0;
    if (!EOP && !isLastInstructionOfItsType)
      return BB;
    unsigned CfInst = (MI->getOpcode() == AMDGPU::EG_ExportSwz) ? 84 : 40;
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
            .addOperand(MI->getOperand(0))
            .addOperand(MI->getOperand(1))
            .addOperand(MI->getOperand(2))
            .addOperand(MI->getOperand(3))
            .addOperand(MI->getOperand(4))
            .addOperand(MI->getOperand(5))
            .addOperand(MI->getOperand(6))
            .addImm(CfInst)
            .addImm(EOP);
    break;
  }
  case AMDGPU::RETURN: {
    // RETURN instructions must have the live-out registers as implicit uses,
    // otherwise they appear dead.
    R600MachineFunctionInfo *MFI = MF->getInfo<R600MachineFunctionInfo>();
    MachineInstrBuilder MIB(*MF, MI);
    for (unsigned i = 0, e = MFI->LiveOuts.size(); i != e; ++i)
      MIB.addReg(MFI->LiveOuts[i], RegState::Implicit);
    return BB;
  }
  }

  MI->eraseFromParent();
  return BB;
}

//===----------------------------------------------------------------------===//
// Custom DAG Lowering Operations
//===----------------------------------------------------------------------===//

using namespace llvm::Intrinsic;
using namespace llvm::AMDGPUIntrinsic;

SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
  switch (Op.getOpcode()) {
  default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
  case ISD::ROTL: return LowerROTL(Op, DAG);
  case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
  case ISD::SELECT: return LowerSELECT(Op, DAG);
  case ISD::STORE: return LowerSTORE(Op, DAG);
  case ISD::LOAD: return LowerLOAD(Op, DAG);
  case ISD::FrameIndex: return LowerFrameIndex(Op, DAG);
  case ISD::INTRINSIC_VOID: {
    SDValue Chain = Op.getOperand(0);
    unsigned IntrinsicID =
                         cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
    switch (IntrinsicID) {
    case AMDGPUIntrinsic::AMDGPU_store_output: {
      MachineFunction &MF = DAG.getMachineFunction();
      R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
      int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
      unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
      MFI->LiveOuts.push_back(Reg);
      return DAG.getCopyToReg(Chain, Op.getDebugLoc(), Reg, Op.getOperand(2));
    }
    case AMDGPUIntrinsic::R600_store_swizzle: {
      const SDValue Args[8] = {
        Chain,
        Op.getOperand(2), // Export Value
        Op.getOperand(3), // ArrayBase
        Op.getOperand(4), // Type
        DAG.getConstant(0, MVT::i32), // SWZ_X
        DAG.getConstant(1, MVT::i32), // SWZ_Y
        DAG.getConstant(2, MVT::i32), // SWZ_Z
        DAG.getConstant(3, MVT::i32) // SWZ_W
      };
      return DAG.getNode(AMDGPUISD::EXPORT, Op.getDebugLoc(), Op.getValueType(),
          Args, 8);
    }

    // default for switch(IntrinsicID)
    default: break;
    }
    // break out of case ISD::INTRINSIC_VOID in switch(Op.getOpcode())
    break;
  }
  case ISD::INTRINSIC_WO_CHAIN: {
    unsigned IntrinsicID =
                         cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
    EVT VT = Op.getValueType();
    DebugLoc DL = Op.getDebugLoc();
    switch (IntrinsicID) {
    default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
    case AMDGPUIntrinsic::R600_load_input: {
      int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
      unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, Reg, VT);
    }

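    // R600_interp_input: slot / 4 selects the parameter vector and slot % 4
    // the channel.  A negative ijb takes the INTERP_VEC_LOAD path; otherwise
    // the interpolation coefficients are expected as live-ins in registers
    // 2 * ijb and 2 * ijb + 1.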
    case AMDGPUIntrinsic::R600_interp_input: {
      int slot = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
      int ijb = cast<ConstantSDNode>(Op.getOperand(2))->getSExtValue();
      MachineSDNode *interp;
      if (ijb < 0) {
        interp = DAG.getMachineNode(AMDGPU::INTERP_VEC_LOAD, DL,
            MVT::v4f32, DAG.getTargetConstant(slot / 4, MVT::i32));
        return DAG.getTargetExtractSubreg(
            TII->getRegisterInfo().getSubRegFromChannel(slot % 4),
            DL, MVT::f32, SDValue(interp, 0));
      }

      if (slot % 4 < 2)
        interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_XY, DL,
            MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4, MVT::i32),
            CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb + 1), MVT::f32),
            CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb), MVT::f32));
      else
        interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_ZW, DL,
            MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4, MVT::i32),
            CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb + 1), MVT::f32),
            CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb), MVT::f32));

      return SDValue(interp, slot % 2);
    }

    case r600_read_ngroups_x:
      return LowerImplicitParameter(DAG, VT, DL, 0);
    case r600_read_ngroups_y:
      return LowerImplicitParameter(DAG, VT, DL, 1);
    case r600_read_ngroups_z:
      return LowerImplicitParameter(DAG, VT, DL, 2);
    case r600_read_global_size_x:
      return LowerImplicitParameter(DAG, VT, DL, 3);
    case r600_read_global_size_y:
      return LowerImplicitParameter(DAG, VT, DL, 4);
    case r600_read_global_size_z:
      return LowerImplicitParameter(DAG, VT, DL, 5);
    case r600_read_local_size_x:
      return LowerImplicitParameter(DAG, VT, DL, 6);
    case r600_read_local_size_y:
      return LowerImplicitParameter(DAG, VT, DL, 7);
    case r600_read_local_size_z:
      return LowerImplicitParameter(DAG, VT, DL, 8);

    case r600_read_tgid_x:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T1_X, VT);
    case r600_read_tgid_y:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T1_Y, VT);
    case r600_read_tgid_z:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T1_Z, VT);
    case r600_read_tidig_x:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T0_X, VT);
    case r600_read_tidig_y:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T0_Y, VT);
    case r600_read_tidig_z:
      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                  AMDGPU::T0_Z, VT);
    }
    // break out of case ISD::INTRINSIC_WO_CHAIN in switch(Op.getOpcode())
    break;
  }
  } // end switch(Op.getOpcode())
  return SDValue();
}

void R600TargetLowering::ReplaceNodeResults(SDNode *N,
                                            SmallVectorImpl<SDValue> &Results,
                                            SelectionDAG &DAG) const {
  switch (N->getOpcode()) {
  default: return;
  case ISD::FP_TO_UINT: Results.push_back(LowerFPTOUINT(N->getOperand(0), DAG));
    return;
  case ISD::LOAD: {
    SDNode *Node = LowerLOAD(SDValue(N, 0), DAG).getNode();
    Results.push_back(SDValue(Node, 0));
    Results.push_back(SDValue(Node, 1));
    // XXX: LLVM seems not to replace Chain Value inside CustomWidenLowerNode
    // function
    DAG.ReplaceAllUsesOfValueWith(SDValue(N,1), SDValue(Node, 1));
    return;
  }
  case ISD::STORE:
    SDNode *Node = LowerSTORE(SDValue(N, 0), DAG).getNode();
    Results.push_back(SDValue(Node, 0));
    return;
  }
}

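// FP_TO_UINT is only custom-lowered for an i1 result (see the constructor):
// any nonzero input maps to true, so a setne comparison against 0.0 suffices.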
SDValue R600TargetLowering::LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const {
  return DAG.getNode(
      ISD::SETCC,
      Op.getDebugLoc(),
      MVT::i1,
      Op, DAG.getConstantFP(0.0f, MVT::f32),
      DAG.getCondCode(ISD::SETNE)
      );
}

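// Implicit kernel parameters (ngroups, global size, local size) are loaded
// from the PARAM_I constant address space at small dword offsets; see the
// intrinsic lowering in LowerOperation above.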
SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT,
                                                   DebugLoc DL,
                                                   unsigned DwordOffset) const {
  unsigned ByteOffset = DwordOffset * 4;
  PointerType * PtrType = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
                                      AMDGPUAS::PARAM_I_ADDRESS);

  // We shouldn't be using an offset wider than 16 bits for implicit parameters.
  assert(isInt<16>(ByteOffset));

  return DAG.getLoad(VT, DL, DAG.getEntryNode(),
                     DAG.getConstant(ByteOffset, MVT::i32), // PTR
                     MachinePointerInfo(ConstantPointerNull::get(PtrType)),
                     false, false, false, 0);
}


SDValue R600TargetLowering::LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const {

  MachineFunction &MF = DAG.getMachineFunction();
  const AMDGPUFrameLowering *TFL =
   static_cast<const AMDGPUFrameLowering*>(getTargetMachine().getFrameLowering());

  FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Op);
  assert(FIN);

  unsigned FrameIndex = FIN->getIndex();
  unsigned Offset = TFL->getFrameIndexOffset(MF, FrameIndex);
  return DAG.getConstant(Offset * 4 * TFL->getStackWidth(MF), MVT::i32);
}

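// ROTL is expressed with BITALIGN, used here as a funnel shift of the value
// with itself: selecting a 32-bit window from x:x at bit offset (32 - y)
// yields rotl(x, y).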
SDValue R600TargetLowering::LowerROTL(SDValue Op, SelectionDAG &DAG) const {
  DebugLoc DL = Op.getDebugLoc();
  EVT VT = Op.getValueType();

  return DAG.getNode(AMDGPUISD::BITALIGN, DL, VT,
                     Op.getOperand(0),
                     Op.getOperand(0),
                     DAG.getNode(ISD::SUB, DL, VT,
                                 DAG.getConstant(32, MVT::i32),
                                 Op.getOperand(1)));
}

bool R600TargetLowering::isZero(SDValue Op) const {
  if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
    return Cst->isNullValue();
  } else if (ConstantFPSDNode *CstFP = dyn_cast<ConstantFPSDNode>(Op)) {
    return CstFP->isZero();
  } else {
    return false;
  }
}

SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
  DebugLoc DL = Op.getDebugLoc();
  EVT VT = Op.getValueType();

  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  SDValue True = Op.getOperand(2);
  SDValue False = Op.getOperand(3);
  SDValue CC = Op.getOperand(4);
  SDValue Temp;

  // LHS and RHS are guaranteed to be the same value type
  EVT CompareVT = LHS.getValueType();

  // Check if we can lower this to a native operation.

  // Try to lower to a SET* instruction:
  //
  // SET* can match the following patterns:
  //
  // select_cc f32, f32, -1,  0, cc_any
  // select_cc f32, f32, 1.0f, 0.0f, cc_any
  // select_cc i32, i32, -1,  0, cc_any
  //

  // Move hardware True/False values to the correct operand.
  if (isHWTrueValue(False) && isHWFalseValue(True)) {
    ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
    std::swap(False, True);
    CC = DAG.getCondCode(ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32));
  }

  if (isHWTrueValue(True) && isHWFalseValue(False) &&
      (CompareVT == VT || VT == MVT::i32)) {
    // This can be matched by a SET* instruction.
    return DAG.getNode(ISD::SELECT_CC, DL, VT, LHS, RHS, True, False, CC);
  }

  // Try to lower to a CND* instruction:
  //
  // CND* can match the following patterns:
  //
  // select_cc f32, 0.0, f32, f32, cc_any
  // select_cc f32, 0.0, i32, i32, cc_any
  // select_cc i32, 0,   f32, f32, cc_any
  // select_cc i32, 0,   i32, i32, cc_any
  //
  if (isZero(LHS) || isZero(RHS)) {
    SDValue Cond = (isZero(LHS) ? RHS : LHS);
    SDValue Zero = (isZero(LHS) ? LHS : RHS);
    ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
    if (CompareVT != VT) {
      // Bitcast True / False to the correct types.  This will end up being
      // a nop, but it allows us to define only a single pattern in the
      // .TD files for each CND* instruction rather than having to have
      // one pattern for integer True/False and one for fp True/False
      True = DAG.getNode(ISD::BITCAST, DL, CompareVT, True);
      False = DAG.getNode(ISD::BITCAST, DL, CompareVT, False);
    }
    if (isZero(LHS)) {
      CCOpcode = ISD::getSetCCSwappedOperands(CCOpcode);
    }

    switch (CCOpcode) {
    case ISD::SETONE:
    case ISD::SETUNE:
    case ISD::SETNE:
    case ISD::SETULE:
    case ISD::SETULT:
    case ISD::SETOLE:
    case ISD::SETOLT:
    case ISD::SETLE:
    case ISD::SETLT:
      CCOpcode = ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
      Temp = True;
      True = False;
      False = Temp;
      break;
    default:
      break;
    }
    SDValue SelectNode = DAG.getNode(ISD::SELECT_CC, DL, CompareVT,
        Cond, Zero,
        True, False,
        DAG.getCondCode(CCOpcode));
    return DAG.getNode(ISD::BITCAST, DL, VT, SelectNode);
  }


  // Possible Min/Max pattern
  SDValue MinMax = LowerMinMax(Op, DAG);
  if (MinMax.getNode()) {
    return MinMax;
  }

  // If we make it this far, it means we have no native instructions to handle
  // this SELECT_CC, so we must lower it.
  SDValue HWTrue, HWFalse;

  if (CompareVT == MVT::f32) {
    HWTrue = DAG.getConstantFP(1.0f, CompareVT);
    HWFalse = DAG.getConstantFP(0.0f, CompareVT);
  } else if (CompareVT == MVT::i32) {
    HWTrue = DAG.getConstant(-1, CompareVT);
    HWFalse = DAG.getConstant(0, CompareVT);
  } else {
    assert(!"Unhandled value type in LowerSELECT_CC");
  }

  // Lower this unsupported SELECT_CC into a combination of two supported
  // SELECT_CC operations.
  SDValue Cond = DAG.getNode(ISD::SELECT_CC, DL, CompareVT, LHS, RHS, HWTrue, HWFalse, CC);

  return DAG.getNode(ISD::SELECT_CC, DL, VT,
      Cond, HWFalse,
      True, False,
      DAG.getCondCode(ISD::SETNE));
}

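// SELECT is lowered as a SELECT_CC of the condition against zero with a setne
// condition code, which LowerSELECT_CC and the CND* patterns can then handle.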
SDValue R600TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
  return DAG.getNode(ISD::SELECT_CC,
      Op.getDebugLoc(),
      Op.getValueType(),
      Op.getOperand(0),
      DAG.getConstant(0, MVT::i32),
      Op.getOperand(1),
      Op.getOperand(2),
      DAG.getCondCode(ISD::SETNE));
}

/// LLVM generates byte-addressed pointers.  For indirect addressing, we need to
/// convert these pointers to a register index.  Each register holds
/// 16 bytes (4 x 32-bit sub-registers), but we need to take into account the
/// \p StackWidth, which tells us how many of the 4 sub-registers will be used
/// for indirect addressing.
SDValue R600TargetLowering::stackPtrToRegIndex(SDValue Ptr,
                                               unsigned StackWidth,
                                               SelectionDAG &DAG) const {
  unsigned SRLPad;
  switch (StackWidth) {
  case 1:
    SRLPad = 2;
    break;
  case 2:
    SRLPad = 3;
    break;
  case 4:
    SRLPad = 4;
    break;
  default: llvm_unreachable("Invalid stack width");
  }

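  // Each stack slot spans StackWidth channels of 4 bytes each, so the byte
  // pointer is shifted right by 2, 3 or 4 for widths of 1, 2 or 4.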
  return DAG.getNode(ISD::SRL, Ptr.getDebugLoc(), Ptr.getValueType(), Ptr,
                     DAG.getConstant(SRLPad, MVT::i32));
}

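// Map a flattened element index onto a (channel, pointer increment) pair.
// PtrIncr is a relative bump that callers accumulate into Ptr; e.g. with a
// StackWidth of 2, elements 0-3 map to (0, +0), (1, +0), (0, +1), (1, +0),
// so element 2 advances to the next register and element 3 stays on it.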
void R600TargetLowering::getStackAddress(unsigned StackWidth,
                                         unsigned ElemIdx,
                                         unsigned &Channel,
                                         unsigned &PtrIncr) const {
  switch (StackWidth) {
  default:
  case 1:
    Channel = 0;
    if (ElemIdx > 0) {
      PtrIncr = 1;
    } else {
      PtrIncr = 0;
    }
    break;
  case 2:
    Channel = ElemIdx % 2;
    if (ElemIdx == 2) {
      PtrIncr = 1;
    } else {
      PtrIncr = 0;
    }
    break;
  case 4:
    Channel = ElemIdx;
    PtrIncr = 0;
    break;
  }
}

SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
  DebugLoc DL = Op.getDebugLoc();
  StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
  SDValue Chain = Op.getOperand(0);
  SDValue Value = Op.getOperand(1);
  SDValue Ptr = Op.getOperand(2);

  if (StoreNode->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS &&
      Ptr->getOpcode() != AMDGPUISD::DWORDADDR) {
    // Convert pointer from byte address to dword address.
    Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, Ptr.getValueType(),
                      DAG.getNode(ISD::SRL, DL, Ptr.getValueType(),
                                  Ptr, DAG.getConstant(2, MVT::i32)));

    if (StoreNode->isTruncatingStore() || StoreNode->isIndexed()) {
      assert(!"Truncated and indexed stores not supported yet");
    } else {
      Chain = DAG.getStore(Chain, DL, Value, Ptr, StoreNode->getMemOperand());
    }
    return Chain;
  }

  EVT ValueVT = Value.getValueType();

  if (StoreNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
    return SDValue();
  }

  // Lowering for indirect addressing

  const MachineFunction &MF = DAG.getMachineFunction();
  const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering*>(
                                         getTargetMachine().getFrameLowering());
  unsigned StackWidth = TFL->getStackWidth(MF);

  Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);

  if (ValueVT.isVector()) {
    unsigned NumElemVT = ValueVT.getVectorNumElements();
    EVT ElemVT = ValueVT.getVectorElementType();
    SDValue Stores[4];

    assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
                                      "vector width in store");

    for (unsigned i = 0; i < NumElemVT; ++i) {
      unsigned Channel, PtrIncr;
      getStackAddress(StackWidth, i, Channel, PtrIncr);
      Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
                        DAG.getConstant(PtrIncr, MVT::i32));
      SDValue Elem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT,
                                 Value, DAG.getConstant(i, MVT::i32));

      Stores[i] = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other,
                              Chain, Elem, Ptr,
                              DAG.getTargetConstant(Channel, MVT::i32));
    }
    Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores, NumElemVT);
  } else {
    if (ValueVT == MVT::i8) {
      Value = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Value);
    }
    Chain = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other, Chain, Value, Ptr,
                        DAG.getTargetConstant(0, MVT::i32)); // Channel
  }

  return Chain;
}

// Return 512 + (kc_bank << 12).
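// Constant buffer bank N starts at dword slot 512 + 4096 * N; a return value
// of -1 means the address space is not a constant buffer.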
static int
ConstantAddressBlock(unsigned AddressSpace) {
  switch (AddressSpace) {
  case AMDGPUAS::CONSTANT_BUFFER_0:
    return 512;
  case AMDGPUAS::CONSTANT_BUFFER_1:
    return 512 + 4096;
  case AMDGPUAS::CONSTANT_BUFFER_2:
    return 512 + 4096 * 2;
  case AMDGPUAS::CONSTANT_BUFFER_3:
    return 512 + 4096 * 3;
  case AMDGPUAS::CONSTANT_BUFFER_4:
    return 512 + 4096 * 4;
  case AMDGPUAS::CONSTANT_BUFFER_5:
    return 512 + 4096 * 5;
  case AMDGPUAS::CONSTANT_BUFFER_6:
    return 512 + 4096 * 6;
  case AMDGPUAS::CONSTANT_BUFFER_7:
    return 512 + 4096 * 7;
  case AMDGPUAS::CONSTANT_BUFFER_8:
    return 512 + 4096 * 8;
  case AMDGPUAS::CONSTANT_BUFFER_9:
    return 512 + 4096 * 9;
  case AMDGPUAS::CONSTANT_BUFFER_10:
    return 512 + 4096 * 10;
  case AMDGPUAS::CONSTANT_BUFFER_11:
    return 512 + 4096 * 11;
  case AMDGPUAS::CONSTANT_BUFFER_12:
    return 512 + 4096 * 12;
  case AMDGPUAS::CONSTANT_BUFFER_13:
    return 512 + 4096 * 13;
  case AMDGPUAS::CONSTANT_BUFFER_14:
    return 512 + 4096 * 14;
  case AMDGPUAS::CONSTANT_BUFFER_15:
    return 512 + 4096 * 15;
  default:
    return -1;
  }
}

SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  DebugLoc DL = Op.getDebugLoc();
  LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
  SDValue Chain = Op.getOperand(0);
  SDValue Ptr = Op.getOperand(1);
  SDValue LoweredLoad;

  int ConstantBlock = ConstantAddressBlock(LoadNode->getAddressSpace());
  if (ConstantBlock > -1) {
    SDValue Result;
    if (dyn_cast<ConstantExpr>(LoadNode->getSrcValue()) ||
        dyn_cast<Constant>(LoadNode->getSrcValue()) ||
        dyn_cast<ConstantSDNode>(Ptr)) {
      SDValue Slots[4];
      for (unsigned i = 0; i < 4; i++) {
        // We want the Const position encoded with the following formula:
        //   (((512 + (kc_bank << 12) + const_index) << 2) + chan)
        // where const_index is the Ptr value computed by LLVM using an
        // alignment of 16.  Thus we add (512 + (kc_bank << 12)) * 16 + 4 * chan
        // here and then divide by 4 at the ISel step.
        SDValue NewPtr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
            DAG.getConstant(4 * i + ConstantBlock * 16, MVT::i32));
        Slots[i] = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::i32, NewPtr);
      }
      Result = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i32, Slots, 4);
    } else {
      // A non-constant Ptr can't be folded; keep it as a v4i32 load.
      Result = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::v4i32,
          DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr, DAG.getConstant(4, MVT::i32)),
          DAG.getConstant(LoadNode->getAddressSpace() -
                          AMDGPUAS::CONSTANT_BUFFER_0, MVT::i32)
          );
    }

    if (!VT.isVector()) {
      Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Result,
          DAG.getConstant(0, MVT::i32));
    }

    SDValue MergedValues[2] = {
        Result,
        Chain
    };
    return DAG.getMergeValues(MergedValues, 2, DL);
  }

  if (LoadNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
    return SDValue();
  }

  // Lowering for indirect addressing
  const MachineFunction &MF = DAG.getMachineFunction();
  const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering*>(
                                         getTargetMachine().getFrameLowering());
  unsigned StackWidth = TFL->getStackWidth(MF);

  Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);

  if (VT.isVector()) {
    unsigned NumElemVT = VT.getVectorNumElements();
    EVT ElemVT = VT.getVectorElementType();
    SDValue Loads[4];

    assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
                                      "vector width in load");

    for (unsigned i = 0; i < NumElemVT; ++i) {
      unsigned Channel, PtrIncr;
      getStackAddress(StackWidth, i, Channel, PtrIncr);
      Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
                        DAG.getConstant(PtrIncr, MVT::i32));
      Loads[i] = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, ElemVT,
                             Chain, Ptr,
                             DAG.getTargetConstant(Channel, MVT::i32),
                             Op.getOperand(2));
    }
    for (unsigned i = NumElemVT; i < 4; ++i) {
      Loads[i] = DAG.getUNDEF(ElemVT);
    }
    EVT TargetVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, 4);
    LoweredLoad = DAG.getNode(ISD::BUILD_VECTOR, DL, TargetVT, Loads, 4);
  } else {
    LoweredLoad = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, VT,
                              Chain, Ptr,
                              DAG.getTargetConstant(0, MVT::i32), // Channel
                              Op.getOperand(2));
  }

  SDValue Ops[2];
  Ops[0] = LoweredLoad;
  Ops[1] = Chain;

  return DAG.getMergeValues(Ops, 2, DL);
}

/// XXX Only kernel functions are supported, so we can assume for now that
/// every function is a kernel function, but in the future we should use
/// separate calling conventions for kernel and non-kernel functions.
SDValue R600TargetLowering::LowerFormalArguments(
                                      SDValue Chain,
                                      CallingConv::ID CallConv,
                                      bool isVarArg,
                                      const SmallVectorImpl<ISD::InputArg> &Ins,
                                      DebugLoc DL, SelectionDAG &DAG,
                                      SmallVectorImpl<SDValue> &InVals) const {
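  // The first 36 bytes (9 dwords) of the parameter space hold the implicit
  // ngroups / global size / local size values read by LowerImplicitParameter,
  // so explicit kernel arguments start at byte offset 36.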
  unsigned ParamOffsetBytes = 36;
  Function::const_arg_iterator FuncArg =
                            DAG.getMachineFunction().getFunction()->arg_begin();
  for (unsigned i = 0, e = Ins.size(); i < e; ++i, ++FuncArg) {
    EVT VT = Ins[i].VT;
    Type *ArgType = FuncArg->getType();
    unsigned ArgSizeInBits = ArgType->isPointerTy() ?
                             32 : ArgType->getPrimitiveSizeInBits();
    unsigned ArgBytes = ArgSizeInBits >> 3;
    EVT ArgVT;
    if (ArgSizeInBits < VT.getSizeInBits()) {
      assert(!ArgType->isFloatTy() &&
             "Extending floating point arguments not supported yet");
      ArgVT = MVT::getIntegerVT(ArgSizeInBits);
    } else {
      ArgVT = VT;
    }
    PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
                                          AMDGPUAS::PARAM_I_ADDRESS);
    SDValue Arg = DAG.getExtLoad(ISD::ZEXTLOAD, DL, VT, DAG.getRoot(),
                                 DAG.getConstant(ParamOffsetBytes, MVT::i32),
                                 MachinePointerInfo(UndefValue::get(PtrTy)),
                                 ArgVT, false, false, ArgBytes);
    InVals.push_back(Arg);
    ParamOffsetBytes += ArgBytes;
  }
  return Chain;
}

EVT R600TargetLowering::getSetCCResultType(EVT VT) const {
  if (!VT.isVector()) return MVT::i32;
  return VT.changeVectorElementTypeToInteger();
}

//===----------------------------------------------------------------------===//
// Custom DAG Optimizations
//===----------------------------------------------------------------------===//

SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
                                              DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;

  switch (N->getOpcode()) {
  // (f32 fp_round (f64 uint_to_fp a)) -> (f32 uint_to_fp a)
  case ISD::FP_ROUND: {
      SDValue Arg = N->getOperand(0);
      if (Arg.getOpcode() == ISD::UINT_TO_FP && Arg.getValueType() == MVT::f64) {
        return DAG.getNode(ISD::UINT_TO_FP, N->getDebugLoc(), N->getValueType(0),
                           Arg.getOperand(0));
      }
      break;
    }

  // (i32 fp_to_sint (fneg (select_cc f32, f32, 1.0, 0.0 cc))) ->
  // (i32 select_cc f32, f32, -1, 0 cc)
  //
  // Mesa's GLSL frontend generates the above pattern a lot and we can lower
  // this to one of the SET*_DX10 instructions.
  case ISD::FP_TO_SINT: {
    SDValue FNeg = N->getOperand(0);
    if (FNeg.getOpcode() != ISD::FNEG) {
      return SDValue();
    }
    SDValue SelectCC = FNeg.getOperand(0);
    if (SelectCC.getOpcode() != ISD::SELECT_CC ||
        SelectCC.getOperand(0).getValueType() != MVT::f32 || // LHS
        SelectCC.getOperand(2).getValueType() != MVT::f32 || // True
        !isHWTrueValue(SelectCC.getOperand(2)) ||
        !isHWFalseValue(SelectCC.getOperand(3))) {
      return SDValue();
    }

    return DAG.getNode(ISD::SELECT_CC, N->getDebugLoc(), N->getValueType(0),
                           SelectCC.getOperand(0), // LHS
                           SelectCC.getOperand(1), // RHS
                           DAG.getConstant(-1, MVT::i32), // True
                           DAG.getConstant(0, MVT::i32),  // False
                           SelectCC.getOperand(4)); // CC

    break;
  }
  // An EXTRACT_VECTOR_ELT of a BUILD_VECTOR produced by our custom lowering
  // also needs to be combined here.
  case ISD::EXTRACT_VECTOR_ELT: {
    SDValue Arg = N->getOperand(0);
    if (Arg.getOpcode() == ISD::BUILD_VECTOR) {
      if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
        unsigned Element = Const->getZExtValue();
        return Arg->getOperand(Element);
      }
    }
    if (Arg.getOpcode() == ISD::BITCAST &&
        Arg.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
      if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
        unsigned Element = Const->getZExtValue();
        return DAG.getNode(ISD::BITCAST, N->getDebugLoc(), N->getVTList(),
            Arg->getOperand(0).getOperand(Element));
      }
    }
  }

  case ISD::SELECT_CC: {
    // fold selectcc (selectcc x, y, a, b, cc), b, a, b, seteq ->
    //      selectcc x, y, a, b, inv(cc)
    //
    // fold selectcc (selectcc x, y, a, b, cc), b, a, b, setne ->
    //      selectcc x, y, a, b, cc
    SDValue LHS = N->getOperand(0);
    if (LHS.getOpcode() != ISD::SELECT_CC) {
      return SDValue();
    }

    SDValue RHS = N->getOperand(1);
    SDValue True = N->getOperand(2);
    SDValue False = N->getOperand(3);
    ISD::CondCode NCC = cast<CondCodeSDNode>(N->getOperand(4))->get();

    if (LHS.getOperand(2).getNode() != True.getNode() ||
        LHS.getOperand(3).getNode() != False.getNode() ||
        RHS.getNode() != False.getNode()) {
      return SDValue();
    }

    switch (NCC) {
    default: return SDValue();
    case ISD::SETNE: return LHS;
    case ISD::SETEQ: {
      ISD::CondCode LHSCC = cast<CondCodeSDNode>(LHS.getOperand(4))->get();
      LHSCC = ISD::getSetCCInverse(LHSCC,
                                  LHS.getOperand(0).getValueType().isInteger());
      return DAG.getSelectCC(N->getDebugLoc(),
                             LHS.getOperand(0),
                             LHS.getOperand(1),
                             LHS.getOperand(2),
                             LHS.getOperand(3),
                             LHSCC);
    }
    }
  }
  case AMDGPUISD::EXPORT: {
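    // Fold constant 0.0 / 1.0 components of the exported vector into the
    // SEL_0 / SEL_1 swizzle selectors so they do not need a register channel.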
    SDValue Arg = N->getOperand(1);
    if (Arg.getOpcode() != ISD::BUILD_VECTOR)
      break;
    SDValue NewBldVec[4] = {
        DAG.getUNDEF(MVT::f32),
        DAG.getUNDEF(MVT::f32),
        DAG.getUNDEF(MVT::f32),
        DAG.getUNDEF(MVT::f32)
      };
    SDValue NewArgs[8] = {
      N->getOperand(0), // Chain
      SDValue(),
      N->getOperand(2), // ArrayBase
      N->getOperand(3), // Type
      N->getOperand(4), // SWZ_X
      N->getOperand(5), // SWZ_Y
      N->getOperand(6), // SWZ_Z
      N->getOperand(7) // SWZ_W
    };
    for (unsigned i = 0; i < Arg.getNumOperands(); i++) {
      if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Arg.getOperand(i))) {
        if (C->isZero()) {
          NewArgs[4 + i] = DAG.getConstant(4, MVT::i32); // SEL_0
        } else if (C->isExactlyValue(1.0)) {
          NewArgs[4 + i] = DAG.getConstant(5, MVT::i32); // SEL_1
        } else {
          NewBldVec[i] = Arg.getOperand(i);
        }
      } else {
        NewBldVec[i] = Arg.getOperand(i);
      }
    }
    DebugLoc DL = N->getDebugLoc();
    NewArgs[1] = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4f32, NewBldVec, 4);
    return DAG.getNode(AMDGPUISD::EXPORT, DL, N->getVTList(), NewArgs, 8);
  }
  }
  return SDValue();
}
