1//===-- SPUISelLowering.cpp - Cell SPU DAG Lowering Implementation --------===//
2//                     The LLVM Compiler Infrastructure
3//
4// This file is distributed under the University of Illinois Open Source
5// License. See LICENSE.TXT for details.
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements the SPUTargetLowering class.
10//
11//===----------------------------------------------------------------------===//
12
13#include "SPUISelLowering.h"
14#include "SPUTargetMachine.h"
15#include "SPUFrameLowering.h"
16#include "SPUMachineFunction.h"
17#include "llvm/Constants.h"
18#include "llvm/Function.h"
19#include "llvm/Intrinsics.h"
20#include "llvm/CallingConv.h"
21#include "llvm/Type.h"
22#include "llvm/CodeGen/CallingConvLower.h"
23#include "llvm/CodeGen/MachineFrameInfo.h"
24#include "llvm/CodeGen/MachineFunction.h"
25#include "llvm/CodeGen/MachineInstrBuilder.h"
26#include "llvm/CodeGen/MachineRegisterInfo.h"
27#include "llvm/CodeGen/SelectionDAG.h"
28#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
29#include "llvm/Target/TargetOptions.h"
30#include "llvm/Support/Debug.h"
31#include "llvm/Support/ErrorHandling.h"
32#include "llvm/Support/MathExtras.h"
33#include "llvm/Support/raw_ostream.h"
34
35using namespace llvm;
36
37namespace {
38  // Byte offset of the preferred slot (counted from the MSB)
39  int prefslotOffset(EVT VT) {
40    int retval=0;
41    if (VT==MVT::i1) retval=3;
42    if (VT==MVT::i8) retval=3;
43    if (VT==MVT::i16) retval=2;
44
45    return retval;
46  }
47
48  //! Expand a library call into an actual call DAG node
49  /*!
50   \note
51   This code is taken from SelectionDAGLegalize, since it is not exposed as
52   part of the LLVM SelectionDAG API.
53   */
54
55  SDValue
56  ExpandLibCall(RTLIB::Libcall LC, SDValue Op, SelectionDAG &DAG,
57                bool isSigned, SDValue &Hi, const SPUTargetLowering &TLI) {
58    // The input chain to this libcall is the entry node of the function.
59    // Legalizing the call will automatically add the previous call to the
60    // dependence.
61    SDValue InChain = DAG.getEntryNode();
62
63    TargetLowering::ArgListTy Args;
64    TargetLowering::ArgListEntry Entry;
65    for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
66      EVT ArgVT = Op.getOperand(i).getValueType();
67      Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
68      Entry.Node = Op.getOperand(i);
69      Entry.Ty = ArgTy;
70      Entry.isSExt = isSigned;
71      Entry.isZExt = !isSigned;
72      Args.push_back(Entry);
73    }
74    SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC),
75                                           TLI.getPointerTy());
76
77    // Splice the libcall in wherever FindInputOutputChains tells us to.
78    Type *RetTy =
79                Op.getNode()->getValueType(0).getTypeForEVT(*DAG.getContext());
80    TargetLowering::CallLoweringInfo CLI(InChain, RetTy, isSigned, !isSigned,
81                                         false, false,
82                            0, TLI.getLibcallCallingConv(LC),
83                            /*isTailCall=*/false,
84                                         /*doesNotRet=*/false,
85                                         /*isReturnValueUsed=*/true,
86                            Callee, Args, DAG, Op.getDebugLoc());
87    std::pair<SDValue, SDValue> CallInfo = TLI.LowerCallTo(CLI);
88
89    return CallInfo.first;
90  }
91}
92
93SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM)
94  : TargetLowering(TM, new TargetLoweringObjectFileELF()),
95    SPUTM(TM) {
96
97  // Use _setjmp/_longjmp instead of setjmp/longjmp.
98  setUseUnderscoreSetJmp(true);
99  setUseUnderscoreLongJmp(true);
100
101  // Set RTLIB libcall names as used by SPU:
102  setLibcallName(RTLIB::DIV_F64, "__fast_divdf3");
103
104  // Set up the SPU's register classes:
105  addRegisterClass(MVT::i8,   &SPU::R8CRegClass);
106  addRegisterClass(MVT::i16,  &SPU::R16CRegClass);
107  addRegisterClass(MVT::i32,  &SPU::R32CRegClass);
108  addRegisterClass(MVT::i64,  &SPU::R64CRegClass);
109  addRegisterClass(MVT::f32,  &SPU::R32FPRegClass);
110  addRegisterClass(MVT::f64,  &SPU::R64FPRegClass);
111  addRegisterClass(MVT::i128, &SPU::GPRCRegClass);
112
113  // SPU has no sign or zero extended loads for i1, i8, i16:
114  setLoadExtAction(ISD::EXTLOAD,  MVT::i1, Promote);
115  setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
116  setLoadExtAction(ISD::ZEXTLOAD, MVT::i1, Promote);
117
118  setLoadExtAction(ISD::EXTLOAD,  MVT::f32, Expand);
119  setLoadExtAction(ISD::EXTLOAD,  MVT::f64, Expand);
120
121  setTruncStoreAction(MVT::i128, MVT::i64, Expand);
122  setTruncStoreAction(MVT::i128, MVT::i32, Expand);
123  setTruncStoreAction(MVT::i128, MVT::i16, Expand);
124  setTruncStoreAction(MVT::i128, MVT::i8, Expand);
125
126  setTruncStoreAction(MVT::f64, MVT::f32, Expand);
127
128  // SPU constant load actions are custom lowered:
129  setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
130  setOperationAction(ISD::ConstantFP, MVT::f64, Custom);
131
132  // SPU's loads and stores have to be custom lowered:
133  for (unsigned sctype = (unsigned) MVT::i8; sctype < (unsigned) MVT::i128;
134       ++sctype) {
135    MVT::SimpleValueType VT = (MVT::SimpleValueType)sctype;
136
137    setOperationAction(ISD::LOAD,   VT, Custom);
138    setOperationAction(ISD::STORE,  VT, Custom);
139    setLoadExtAction(ISD::EXTLOAD,  VT, Custom);
140    setLoadExtAction(ISD::ZEXTLOAD, VT, Custom);
141    setLoadExtAction(ISD::SEXTLOAD, VT, Custom);
142
143    for (unsigned stype = sctype - 1; stype >= (unsigned) MVT::i8; --stype) {
144      MVT::SimpleValueType StoreVT = (MVT::SimpleValueType) stype;
145      setTruncStoreAction(VT, StoreVT, Expand);
146    }
147  }
148
149  for (unsigned sctype = (unsigned) MVT::f32; sctype < (unsigned) MVT::f64;
150       ++sctype) {
151    MVT::SimpleValueType VT = (MVT::SimpleValueType) sctype;
152
153    setOperationAction(ISD::LOAD,   VT, Custom);
154    setOperationAction(ISD::STORE,  VT, Custom);
155
156    for (unsigned stype = sctype - 1; stype >= (unsigned) MVT::f32; --stype) {
157      MVT::SimpleValueType StoreVT = (MVT::SimpleValueType) stype;
158      setTruncStoreAction(VT, StoreVT, Expand);
159    }
160  }
161
162  // Expand the jumptable branches
163  setOperationAction(ISD::BR_JT,        MVT::Other, Expand);
164  setOperationAction(ISD::BR_CC,        MVT::Other, Expand);
165
166  // Custom lower SELECT_CC for most cases, but expand by default
167  setOperationAction(ISD::SELECT_CC,    MVT::Other, Expand);
168  setOperationAction(ISD::SELECT_CC,    MVT::i8,    Custom);
169  setOperationAction(ISD::SELECT_CC,    MVT::i16,   Custom);
170  setOperationAction(ISD::SELECT_CC,    MVT::i32,   Custom);
171  setOperationAction(ISD::SELECT_CC,    MVT::i64,   Custom);
172
173  // SPU has no intrinsics for these particular operations:
174  setOperationAction(ISD::MEMBARRIER, MVT::Other, Expand);
175  setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Expand);
176
177  // SPU has no division/remainder instructions
178  setOperationAction(ISD::SREM,    MVT::i8,   Expand);
179  setOperationAction(ISD::UREM,    MVT::i8,   Expand);
180  setOperationAction(ISD::SDIV,    MVT::i8,   Expand);
181  setOperationAction(ISD::UDIV,    MVT::i8,   Expand);
182  setOperationAction(ISD::SDIVREM, MVT::i8,   Expand);
183  setOperationAction(ISD::UDIVREM, MVT::i8,   Expand);
184  setOperationAction(ISD::SREM,    MVT::i16,  Expand);
185  setOperationAction(ISD::UREM,    MVT::i16,  Expand);
186  setOperationAction(ISD::SDIV,    MVT::i16,  Expand);
187  setOperationAction(ISD::UDIV,    MVT::i16,  Expand);
188  setOperationAction(ISD::SDIVREM, MVT::i16,  Expand);
189  setOperationAction(ISD::UDIVREM, MVT::i16,  Expand);
190  setOperationAction(ISD::SREM,    MVT::i32,  Expand);
191  setOperationAction(ISD::UREM,    MVT::i32,  Expand);
192  setOperationAction(ISD::SDIV,    MVT::i32,  Expand);
193  setOperationAction(ISD::UDIV,    MVT::i32,  Expand);
194  setOperationAction(ISD::SDIVREM, MVT::i32,  Expand);
195  setOperationAction(ISD::UDIVREM, MVT::i32,  Expand);
196  setOperationAction(ISD::SREM,    MVT::i64,  Expand);
197  setOperationAction(ISD::UREM,    MVT::i64,  Expand);
198  setOperationAction(ISD::SDIV,    MVT::i64,  Expand);
199  setOperationAction(ISD::UDIV,    MVT::i64,  Expand);
200  setOperationAction(ISD::SDIVREM, MVT::i64,  Expand);
201  setOperationAction(ISD::UDIVREM, MVT::i64,  Expand);
202  setOperationAction(ISD::SREM,    MVT::i128, Expand);
203  setOperationAction(ISD::UREM,    MVT::i128, Expand);
204  setOperationAction(ISD::SDIV,    MVT::i128, Expand);
205  setOperationAction(ISD::UDIV,    MVT::i128, Expand);
206  setOperationAction(ISD::SDIVREM, MVT::i128, Expand);
207  setOperationAction(ISD::UDIVREM, MVT::i128, Expand);
208
209  // We don't support sin/cos/sqrt/fmod
210  setOperationAction(ISD::FSIN , MVT::f64, Expand);
211  setOperationAction(ISD::FCOS , MVT::f64, Expand);
212  setOperationAction(ISD::FREM , MVT::f64, Expand);
213  setOperationAction(ISD::FSIN , MVT::f32, Expand);
214  setOperationAction(ISD::FCOS , MVT::f32, Expand);
215  setOperationAction(ISD::FREM , MVT::f32, Expand);
216
217  // Expand fsqrt to the appropriate libcall (NOTE: should use h/w fsqrt
218  // for f32!)
219  setOperationAction(ISD::FSQRT, MVT::f64, Expand);
220  setOperationAction(ISD::FSQRT, MVT::f32, Expand);
221
222  setOperationAction(ISD::FMA, MVT::f64, Expand);
223  setOperationAction(ISD::FMA, MVT::f32, Expand);
224
225  setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
226  setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
227
228  // SPU can do rotate right and left, so legalize it... but customize for i8
229  // because instructions don't exist.
230
231  // FIXME: Change from "expand" to appropriate type once ROTR is supported in
232  //        .td files.
233  setOperationAction(ISD::ROTR, MVT::i32,    Expand /*Legal*/);
234  setOperationAction(ISD::ROTR, MVT::i16,    Expand /*Legal*/);
235  setOperationAction(ISD::ROTR, MVT::i8,     Expand /*Custom*/);
236
237  setOperationAction(ISD::ROTL, MVT::i32,    Legal);
238  setOperationAction(ISD::ROTL, MVT::i16,    Legal);
239  setOperationAction(ISD::ROTL, MVT::i8,     Custom);
240
241  // SPU has no native version of shift left/right for i8
242  setOperationAction(ISD::SHL,  MVT::i8,     Custom);
243  setOperationAction(ISD::SRL,  MVT::i8,     Custom);
244  setOperationAction(ISD::SRA,  MVT::i8,     Custom);
245
246  // Make these operations legal and handle them during instruction selection:
247  setOperationAction(ISD::SHL,  MVT::i64,    Legal);
248  setOperationAction(ISD::SRL,  MVT::i64,    Legal);
249  setOperationAction(ISD::SRA,  MVT::i64,    Legal);
250
251  // Custom lower i8, i32 and i64 multiplications
252  setOperationAction(ISD::MUL,  MVT::i8,     Custom);
253  setOperationAction(ISD::MUL,  MVT::i32,    Legal);
254  setOperationAction(ISD::MUL,  MVT::i64,    Legal);
255
256  // Expand double-width multiplication
257  // FIXME: It would probably be reasonable to support some of these operations
258  setOperationAction(ISD::UMUL_LOHI, MVT::i8,  Expand);
259  setOperationAction(ISD::SMUL_LOHI, MVT::i8,  Expand);
260  setOperationAction(ISD::MULHU,     MVT::i8,  Expand);
261  setOperationAction(ISD::MULHS,     MVT::i8,  Expand);
262  setOperationAction(ISD::UMUL_LOHI, MVT::i16, Expand);
263  setOperationAction(ISD::SMUL_LOHI, MVT::i16, Expand);
264  setOperationAction(ISD::MULHU,     MVT::i16, Expand);
265  setOperationAction(ISD::MULHS,     MVT::i16, Expand);
266  setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand);
267  setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand);
268  setOperationAction(ISD::MULHU,     MVT::i32, Expand);
269  setOperationAction(ISD::MULHS,     MVT::i32, Expand);
270  setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
271  setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
272  setOperationAction(ISD::MULHU,     MVT::i64, Expand);
273  setOperationAction(ISD::MULHS,     MVT::i64, Expand);
274
275  // Need to custom handle (some) common i8, i64 math ops
276  setOperationAction(ISD::ADD,  MVT::i8,     Custom);
277  setOperationAction(ISD::ADD,  MVT::i64,    Legal);
278  setOperationAction(ISD::SUB,  MVT::i8,     Custom);
279  setOperationAction(ISD::SUB,  MVT::i64,    Legal);
280
281  // SPU does not have BSWAP. It does have i32 support CTLZ.
282  // CTPOP has to be custom lowered.
283  setOperationAction(ISD::BSWAP, MVT::i32,   Expand);
284  setOperationAction(ISD::BSWAP, MVT::i64,   Expand);
285
286  setOperationAction(ISD::CTPOP, MVT::i8,    Custom);
287  setOperationAction(ISD::CTPOP, MVT::i16,   Custom);
288  setOperationAction(ISD::CTPOP, MVT::i32,   Custom);
289  setOperationAction(ISD::CTPOP, MVT::i64,   Custom);
290  setOperationAction(ISD::CTPOP, MVT::i128,  Expand);
291
292  setOperationAction(ISD::CTTZ , MVT::i8,    Expand);
293  setOperationAction(ISD::CTTZ , MVT::i16,   Expand);
294  setOperationAction(ISD::CTTZ , MVT::i32,   Expand);
295  setOperationAction(ISD::CTTZ , MVT::i64,   Expand);
296  setOperationAction(ISD::CTTZ , MVT::i128,  Expand);
297  setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i8,    Expand);
298  setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16,   Expand);
299  setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32,   Expand);
300  setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64,   Expand);
301  setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i128,  Expand);
302
303  setOperationAction(ISD::CTLZ , MVT::i8,    Promote);
304  setOperationAction(ISD::CTLZ , MVT::i16,   Promote);
305  setOperationAction(ISD::CTLZ , MVT::i32,   Legal);
306  setOperationAction(ISD::CTLZ , MVT::i64,   Expand);
307  setOperationAction(ISD::CTLZ , MVT::i128,  Expand);
308  setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8,    Expand);
309  setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16,   Expand);
310  setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32,   Expand);
311  setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64,   Expand);
312  setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i128,  Expand);
313
314  // SPU has a version of select that implements (a&~c)|(b&c), just like
315  // select ought to work:
316  setOperationAction(ISD::SELECT, MVT::i8,   Legal);
317  setOperationAction(ISD::SELECT, MVT::i16,  Legal);
318  setOperationAction(ISD::SELECT, MVT::i32,  Legal);
319  setOperationAction(ISD::SELECT, MVT::i64,  Legal);
320
321  setOperationAction(ISD::SETCC, MVT::i8,    Legal);
322  setOperationAction(ISD::SETCC, MVT::i16,   Legal);
323  setOperationAction(ISD::SETCC, MVT::i32,   Legal);
324  setOperationAction(ISD::SETCC, MVT::i64,   Legal);
325  setOperationAction(ISD::SETCC, MVT::f64,   Custom);
326
327  // Custom lower i128 -> i64 truncates
328  setOperationAction(ISD::TRUNCATE, MVT::i64, Custom);
329
330  // Custom lower i32/i64 -> i128 sign extend
331  setOperationAction(ISD::SIGN_EXTEND, MVT::i128, Custom);
332
333  setOperationAction(ISD::FP_TO_SINT, MVT::i8, Promote);
334  setOperationAction(ISD::FP_TO_UINT, MVT::i8, Promote);
335  setOperationAction(ISD::FP_TO_SINT, MVT::i16, Promote);
336  setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote);
337  // SPU has a legal FP -> signed INT instruction for f32, but for f64, need
338  // to expand to a libcall, hence the custom lowering:
339  setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
340  setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
341  setOperationAction(ISD::FP_TO_SINT, MVT::i64, Expand);
342  setOperationAction(ISD::FP_TO_UINT, MVT::i64, Expand);
343  setOperationAction(ISD::FP_TO_SINT, MVT::i128, Expand);
344  setOperationAction(ISD::FP_TO_UINT, MVT::i128, Expand);
345
346  // FDIV on SPU requires custom lowering
347  setOperationAction(ISD::FDIV, MVT::f64, Expand);      // to libcall
348
349  // SPU has [U|S]INT_TO_FP for f32->i32, but not for f64->i32, f64->i64:
350  setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
351  setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote);
352  setOperationAction(ISD::SINT_TO_FP, MVT::i8,  Promote);
353  setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
354  setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote);
355  setOperationAction(ISD::UINT_TO_FP, MVT::i8,  Promote);
356  setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
357  setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
358
359  setOperationAction(ISD::BITCAST, MVT::i32, Legal);
360  setOperationAction(ISD::BITCAST, MVT::f32, Legal);
361  setOperationAction(ISD::BITCAST, MVT::i64, Legal);
362  setOperationAction(ISD::BITCAST, MVT::f64, Legal);
363
364  // We cannot sextinreg(i1).  Expand to shifts.
365  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
366
367  // We want to legalize GlobalAddress and ConstantPool nodes into the
368  // appropriate instructions to materialize the address.
369  for (unsigned sctype = (unsigned) MVT::i8; sctype < (unsigned) MVT::f128;
370       ++sctype) {
371    MVT::SimpleValueType VT = (MVT::SimpleValueType)sctype;
372
373    setOperationAction(ISD::GlobalAddress,  VT, Custom);
374    setOperationAction(ISD::ConstantPool,   VT, Custom);
375    setOperationAction(ISD::JumpTable,      VT, Custom);
376  }
377
378  // VASTART needs to be custom lowered to use the VarArgsFrameIndex
379  setOperationAction(ISD::VASTART           , MVT::Other, Custom);
380
381  // Use the default implementation.
382  setOperationAction(ISD::VAARG             , MVT::Other, Expand);
383  setOperationAction(ISD::VACOPY            , MVT::Other, Expand);
384  setOperationAction(ISD::VAEND             , MVT::Other, Expand);
385  setOperationAction(ISD::STACKSAVE         , MVT::Other, Expand);
386  setOperationAction(ISD::STACKRESTORE      , MVT::Other, Expand);
387  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32  , Expand);
388  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64  , Expand);
389
390  // Cell SPU has instructions for converting between i64 and fp.
391  setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
392  setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
393
394  // To take advantage of the above i64 FP_TO_SINT, promote i32 FP_TO_UINT
395  setOperationAction(ISD::FP_TO_UINT, MVT::i32, Promote);
396
397  // BUILD_PAIR can't be handled natively, and should be expanded to shl/or
398  setOperationAction(ISD::BUILD_PAIR, MVT::i64, Expand);
399
400  // First set operation action for all vector types to expand. Then we
401  // will selectively turn on ones that can be effectively codegen'd.
402  addRegisterClass(MVT::v16i8, &SPU::VECREGRegClass);
403  addRegisterClass(MVT::v8i16, &SPU::VECREGRegClass);
404  addRegisterClass(MVT::v4i32, &SPU::VECREGRegClass);
405  addRegisterClass(MVT::v2i64, &SPU::VECREGRegClass);
406  addRegisterClass(MVT::v4f32, &SPU::VECREGRegClass);
407  addRegisterClass(MVT::v2f64, &SPU::VECREGRegClass);
408
409  for (unsigned i = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
410       i <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++i) {
411    MVT::SimpleValueType VT = (MVT::SimpleValueType)i;
412
413    // Set operation actions to legal types only.
414    if (!isTypeLegal(VT)) continue;
415
416    // add/sub are legal for all supported vector VT's.
417    setOperationAction(ISD::ADD,     VT, Legal);
418    setOperationAction(ISD::SUB,     VT, Legal);
419    // mul has to be custom lowered.
420    setOperationAction(ISD::MUL,     VT, Legal);
421
422    setOperationAction(ISD::AND,     VT, Legal);
423    setOperationAction(ISD::OR,      VT, Legal);
424    setOperationAction(ISD::XOR,     VT, Legal);
425    setOperationAction(ISD::LOAD,    VT, Custom);
426    setOperationAction(ISD::SELECT,  VT, Legal);
427    setOperationAction(ISD::STORE,   VT, Custom);
428
429    // These operations need to be expanded:
430    setOperationAction(ISD::SDIV,    VT, Expand);
431    setOperationAction(ISD::SREM,    VT, Expand);
432    setOperationAction(ISD::UDIV,    VT, Expand);
433    setOperationAction(ISD::UREM,    VT, Expand);
434
435    // Expand all trunc stores
436    for (unsigned j = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
437         j <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++j) {
438      MVT::SimpleValueType TargetVT = (MVT::SimpleValueType)j;
439    setTruncStoreAction(VT, TargetVT, Expand);
440    }
441
442    // Custom lower build_vector, constant pool spills, insert and
443    // extract vector elements:
444    setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
445    setOperationAction(ISD::ConstantPool, VT, Custom);
446    setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
447    setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
448    setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
449    setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
450  }
451
452  setOperationAction(ISD::SHL, MVT::v2i64, Expand);
453
454  setOperationAction(ISD::AND, MVT::v16i8, Custom);
455  setOperationAction(ISD::OR,  MVT::v16i8, Custom);
456  setOperationAction(ISD::XOR, MVT::v16i8, Custom);
457  setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4f32, Custom);
458
459  setOperationAction(ISD::FDIV, MVT::v4f32, Legal);
460
461  setBooleanContents(ZeroOrNegativeOneBooleanContent);
462  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent); // FIXME: Is this correct?
463
464  setStackPointerRegisterToSaveRestore(SPU::R1);
465
466  // We have target-specific dag combine patterns for the following nodes:
467  setTargetDAGCombine(ISD::ADD);
468  setTargetDAGCombine(ISD::ZERO_EXTEND);
469  setTargetDAGCombine(ISD::SIGN_EXTEND);
470  setTargetDAGCombine(ISD::ANY_EXTEND);
471
472  setMinFunctionAlignment(3);
473
474  computeRegisterProperties();
475
476  // Set pre-RA register scheduler default to BURR, which produces slightly
477  // better code than the default (could also be TDRR, but TargetLowering.h
478  // needs a mod to support that model):
479  setSchedulingPreference(Sched::RegPressure);
480}
481
482const char *SPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
483  switch (Opcode) {
484  default: return 0;
485  case SPUISD::RET_FLAG: return "SPUISD::RET_FLAG";
486  case SPUISD::Hi: return "SPUISD::Hi";
487  case SPUISD::Lo: return "SPUISD::Lo";
488  case SPUISD::PCRelAddr: return "SPUISD::PCRelAddr";
489  case SPUISD::AFormAddr: return "SPUISD::AFormAddr";
490  case SPUISD::IndirectAddr: return "SPUISD::IndirectAddr";
491  case SPUISD::LDRESULT: return "SPUISD::LDRESULT";
492  case SPUISD::CALL: return "SPUISD::CALL";
493  case SPUISD::SHUFB: return "SPUISD::SHUFB";
494  case SPUISD::SHUFFLE_MASK: return "SPUISD::SHUFFLE_MASK";
495  case SPUISD::CNTB: return "SPUISD::CNTB";
496  case SPUISD::PREFSLOT2VEC: return "SPUISD::PREFSLOT2VEC";
497  case SPUISD::VEC2PREFSLOT: return "SPUISD::VEC2PREFSLOT";
498  case SPUISD::SHL_BITS: return "SPUISD::SHL_BITS";
499  case SPUISD::SHL_BYTES: return "SPUISD::SHL_BYTES";
500  case SPUISD::VEC_ROTL: return "SPUISD::VEC_ROTL";
501  case SPUISD::VEC_ROTR: return "SPUISD::VEC_ROTR";
502  case SPUISD::ROTBYTES_LEFT: return "SPUISD::ROTBYTES_LEFT";
503  case SPUISD::ROTBYTES_LEFT_BITS: return "SPUISD::ROTBYTES_LEFT_BITS";
504  case SPUISD::SELECT_MASK: return "SPUISD::SELECT_MASK";
505  case SPUISD::SELB: return "SPUISD::SELB";
506  case SPUISD::ADD64_MARKER: return "SPUISD::ADD64_MARKER";
507  case SPUISD::SUB64_MARKER: return "SPUISD::SUB64_MARKER";
508  case SPUISD::MUL64_MARKER: return "SPUISD::MUL64_MARKER";
509  }
510}
511
512//===----------------------------------------------------------------------===//
513// Return the Cell SPU's SETCC result type
514//===----------------------------------------------------------------------===//
515
516EVT SPUTargetLowering::getSetCCResultType(EVT VT) const {
517  // i8, i16 and i32 are valid SETCC result types
518  MVT::SimpleValueType retval;
519
520  switch(VT.getSimpleVT().SimpleTy){
521    case MVT::i1:
522    case MVT::i8:
523      retval = MVT::i8; break;
524    case MVT::i16:
525      retval = MVT::i16; break;
526    case MVT::i32:
527    default:
528      retval = MVT::i32;
529  }
530  return retval;
531}
532
533//===----------------------------------------------------------------------===//
534// Calling convention code:
535//===----------------------------------------------------------------------===//
536
537#include "SPUGenCallingConv.inc"
538
539//===----------------------------------------------------------------------===//
540//  LowerOperation implementation
541//===----------------------------------------------------------------------===//
542
543/// Custom lower loads for CellSPU
544/*!
545 All CellSPU loads and stores are aligned to 16-byte boundaries, so for elements
546 within a 16-byte block, we have to rotate to extract the requested element.
547
548 For extending loads, we also want to ensure that the following sequence is
549 emitted, e.g. for MVT::f32 extending load to MVT::f64:
550
551\verbatim
552%1  v16i8,ch = load
553%2  v16i8,ch = rotate %1
554%3  v4f8, ch = bitconvert %2
555%4  f32      = vec2perfslot %3
556%5  f64      = fp_extend %4
557\endverbatim
558*/
559static SDValue
560LowerLOAD(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
561  LoadSDNode *LN = cast<LoadSDNode>(Op);
562  SDValue the_chain = LN->getChain();
563  EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
564  EVT InVT = LN->getMemoryVT();
565  EVT OutVT = Op.getValueType();
566  ISD::LoadExtType ExtType = LN->getExtensionType();
567  unsigned alignment = LN->getAlignment();
568  int pso = prefslotOffset(InVT);
569  DebugLoc dl = Op.getDebugLoc();
570  EVT vecVT = InVT.isVector()? InVT: EVT::getVectorVT(*DAG.getContext(), InVT,
571                                                  (128 / InVT.getSizeInBits()));
572
573  // two sanity checks
574  assert( LN->getAddressingMode() == ISD::UNINDEXED
575          && "we should get only UNINDEXED adresses");
576  // clean aligned loads can be selected as-is
577  if (InVT.getSizeInBits() == 128 && (alignment%16) == 0)
578    return SDValue();
579
580  // Get pointerinfos to the memory chunk(s) that contain the data to load
581  uint64_t mpi_offset = LN->getPointerInfo().Offset;
582  mpi_offset -= mpi_offset%16;
583  MachinePointerInfo lowMemPtr(LN->getPointerInfo().V, mpi_offset);
584  MachinePointerInfo highMemPtr(LN->getPointerInfo().V, mpi_offset+16);
585
586  SDValue result;
587  SDValue basePtr = LN->getBasePtr();
588  SDValue rotate;
589
590  if ((alignment%16) == 0) {
591    ConstantSDNode *CN;
592
593    // Special cases for a known aligned load to simplify the base pointer
594    // and the rotation amount:
595    if (basePtr.getOpcode() == ISD::ADD
596        && (CN = dyn_cast<ConstantSDNode > (basePtr.getOperand(1))) != 0) {
597      // Known offset into basePtr
598      int64_t offset = CN->getSExtValue();
599      int64_t rotamt = int64_t((offset & 0xf) - pso);
600
601      if (rotamt < 0)
602        rotamt += 16;
603
604      rotate = DAG.getConstant(rotamt, MVT::i16);
605
606      // Simplify the base pointer for this case:
607      basePtr = basePtr.getOperand(0);
608      if ((offset & ~0xf) > 0) {
609        basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT,
610                              basePtr,
611                              DAG.getConstant((offset & ~0xf), PtrVT));
612      }
613    } else if ((basePtr.getOpcode() == SPUISD::AFormAddr)
614               || (basePtr.getOpcode() == SPUISD::IndirectAddr
615                   && basePtr.getOperand(0).getOpcode() == SPUISD::Hi
616                   && basePtr.getOperand(1).getOpcode() == SPUISD::Lo)) {
617      // Plain aligned a-form address: rotate into preferred slot
618      // Same for (SPUindirect (SPUhi ...), (SPUlo ...))
619      int64_t rotamt = -pso;
620      if (rotamt < 0)
621        rotamt += 16;
622      rotate = DAG.getConstant(rotamt, MVT::i16);
623    } else {
624      // Offset the rotate amount by the basePtr and the preferred slot
625      // byte offset
626      int64_t rotamt = -pso;
627      if (rotamt < 0)
628        rotamt += 16;
629      rotate = DAG.getNode(ISD::ADD, dl, PtrVT,
630                           basePtr,
631                           DAG.getConstant(rotamt, PtrVT));
632    }
633  } else {
634    // Unaligned load: must be more pessimistic about addressing modes:
635    if (basePtr.getOpcode() == ISD::ADD) {
636      MachineFunction &MF = DAG.getMachineFunction();
637      MachineRegisterInfo &RegInfo = MF.getRegInfo();
638      unsigned VReg = RegInfo.createVirtualRegister(&SPU::R32CRegClass);
639      SDValue Flag;
640
641      SDValue Op0 = basePtr.getOperand(0);
642      SDValue Op1 = basePtr.getOperand(1);
643
644      if (isa<ConstantSDNode>(Op1)) {
645        // Convert the (add <ptr>, <const>) to an indirect address contained
646        // in a register. Note that this is done because we need to avoid
647        // creating a 0(reg) d-form address due to the SPU's block loads.
648        basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, Op0, Op1);
649        the_chain = DAG.getCopyToReg(the_chain, dl, VReg, basePtr, Flag);
650        basePtr = DAG.getCopyFromReg(the_chain, dl, VReg, PtrVT);
651      } else {
652        // Convert the (add <arg1>, <arg2>) to an indirect address, which
653        // will likely be lowered as a reg(reg) x-form address.
654        basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, Op0, Op1);
655      }
656    } else {
657      basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT,
658                            basePtr,
659                            DAG.getConstant(0, PtrVT));
660   }
661
662    // Offset the rotate amount by the basePtr and the preferred slot
663    // byte offset
664    rotate = DAG.getNode(ISD::ADD, dl, PtrVT,
665                         basePtr,
666                         DAG.getConstant(-pso, PtrVT));
667  }
668
669  // Do the load as a i128 to allow possible shifting
670  SDValue low = DAG.getLoad(MVT::i128, dl, the_chain, basePtr,
671                       lowMemPtr,
672                       LN->isVolatile(), LN->isNonTemporal(), false, 16);
673
674  // When the size is not greater than alignment we get all data with just
675  // one load
676  if (alignment >= InVT.getSizeInBits()/8) {
677    // Update the chain
678    the_chain = low.getValue(1);
679
680    // Rotate into the preferred slot:
681    result = DAG.getNode(SPUISD::ROTBYTES_LEFT, dl, MVT::i128,
682                         low.getValue(0), rotate);
683
684    // Convert the loaded v16i8 vector to the appropriate vector type
685    // specified by the operand:
686    EVT vecVT = EVT::getVectorVT(*DAG.getContext(),
687                                 InVT, (128 / InVT.getSizeInBits()));
688    result = DAG.getNode(SPUISD::VEC2PREFSLOT, dl, InVT,
689                         DAG.getNode(ISD::BITCAST, dl, vecVT, result));
690  }
691  // When alignment is less than the size, we might need (known only at
692  // run-time) two loads
693  // TODO: if the memory address is composed only from constants, we have
694  // extra kowledge, and might avoid the second load
695  else {
696    // storage position offset from lower 16 byte aligned memory chunk
697    SDValue offset = DAG.getNode(ISD::AND, dl, MVT::i32,
698                                  basePtr, DAG.getConstant( 0xf, MVT::i32 ) );
699    // get a registerfull of ones. (this implementation is a workaround: LLVM
700    // cannot handle 128 bit signed int constants)
701    SDValue ones = DAG.getConstant(-1, MVT::v4i32 );
702    ones = DAG.getNode(ISD::BITCAST, dl, MVT::i128, ones);
703
704    SDValue high = DAG.getLoad(MVT::i128, dl, the_chain,
705                               DAG.getNode(ISD::ADD, dl, PtrVT,
706                                           basePtr,
707                                           DAG.getConstant(16, PtrVT)),
708                               highMemPtr,
709                               LN->isVolatile(), LN->isNonTemporal(), false,
710                               16);
711
712    the_chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, low.getValue(1),
713                                                              high.getValue(1));
714
715    // Shift the (possible) high part right to compensate the misalignemnt.
716    // if there is no highpart (i.e. value is i64 and offset is 4), this
717    // will zero out the high value.
718    high = DAG.getNode(SPUISD::SRL_BYTES, dl, MVT::i128, high,
719                                     DAG.getNode(ISD::SUB, dl, MVT::i32,
720                                                 DAG.getConstant( 16, MVT::i32),
721                                                 offset
722                                                ));
723
724    // Shift the low similarly
725    // TODO: add SPUISD::SHL_BYTES
726    low = DAG.getNode(SPUISD::SHL_BYTES, dl, MVT::i128, low, offset );
727
728    // Merge the two parts
729    result = DAG.getNode(ISD::BITCAST, dl, vecVT,
730                          DAG.getNode(ISD::OR, dl, MVT::i128, low, high));
731
732    if (!InVT.isVector()) {
733      result = DAG.getNode(SPUISD::VEC2PREFSLOT, dl, InVT, result );
734     }
735
736  }
737    // Handle extending loads by extending the scalar result:
738    if (ExtType == ISD::SEXTLOAD) {
739      result = DAG.getNode(ISD::SIGN_EXTEND, dl, OutVT, result);
740    } else if (ExtType == ISD::ZEXTLOAD) {
741      result = DAG.getNode(ISD::ZERO_EXTEND, dl, OutVT, result);
742    } else if (ExtType == ISD::EXTLOAD) {
743      unsigned NewOpc = ISD::ANY_EXTEND;
744
745      if (OutVT.isFloatingPoint())
746        NewOpc = ISD::FP_EXTEND;
747
748      result = DAG.getNode(NewOpc, dl, OutVT, result);
749    }
750
751    SDVTList retvts = DAG.getVTList(OutVT, MVT::Other);
752    SDValue retops[2] = {
753      result,
754      the_chain
755    };
756
757    result = DAG.getNode(SPUISD::LDRESULT, dl, retvts,
758                         retops, sizeof(retops) / sizeof(retops[0]));
759    return result;
760}
761
762/// Custom lower stores for CellSPU
763/*!
764 All CellSPU stores are aligned to 16-byte boundaries, so for elements
765 within a 16-byte block, we have to generate a shuffle to insert the
766 requested element into its place, then store the resulting block.
767 */
768static SDValue
769LowerSTORE(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
770  StoreSDNode *SN = cast<StoreSDNode>(Op);
771  SDValue Value = SN->getValue();
772  EVT VT = Value.getValueType();
773  EVT StVT = (!SN->isTruncatingStore() ? VT : SN->getMemoryVT());
774  EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
775  DebugLoc dl = Op.getDebugLoc();
776  unsigned alignment = SN->getAlignment();
777  SDValue result;
778  EVT vecVT = StVT.isVector()? StVT: EVT::getVectorVT(*DAG.getContext(), StVT,
779                                                 (128 / StVT.getSizeInBits()));
780  // Get pointerinfos to the memory chunk(s) that contain the data to load
781  uint64_t mpi_offset = SN->getPointerInfo().Offset;
782  mpi_offset -= mpi_offset%16;
783  MachinePointerInfo lowMemPtr(SN->getPointerInfo().V, mpi_offset);
784  MachinePointerInfo highMemPtr(SN->getPointerInfo().V, mpi_offset+16);
785
786
787  // two sanity checks
788  assert( SN->getAddressingMode() == ISD::UNINDEXED
789          && "we should get only UNINDEXED adresses");
790  // clean aligned loads can be selected as-is
791  if (StVT.getSizeInBits() == 128 && (alignment%16) == 0)
792    return SDValue();
793
794  SDValue alignLoadVec;
795  SDValue basePtr = SN->getBasePtr();
796  SDValue the_chain = SN->getChain();
797  SDValue insertEltOffs;
798
799  if ((alignment%16) == 0) {
800    ConstantSDNode *CN;
801    // Special cases for a known aligned load to simplify the base pointer
802    // and insertion byte:
803    if (basePtr.getOpcode() == ISD::ADD
804        && (CN = dyn_cast<ConstantSDNode>(basePtr.getOperand(1))) != 0) {
805      // Known offset into basePtr
806      int64_t offset = CN->getSExtValue();
807
808      // Simplify the base pointer for this case:
809      basePtr = basePtr.getOperand(0);
810      insertEltOffs = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT,
811                                  basePtr,
812                                  DAG.getConstant((offset & 0xf), PtrVT));
813
814      if ((offset & ~0xf) > 0) {
815        basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT,
816                              basePtr,
817                              DAG.getConstant((offset & ~0xf), PtrVT));
818      }
819    } else {
820      // Otherwise, assume it's at byte 0 of basePtr
821      insertEltOffs = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT,
822                                  basePtr,
823                                  DAG.getConstant(0, PtrVT));
824      basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT,
825                                  basePtr,
826                                  DAG.getConstant(0, PtrVT));
827    }
828  } else {
829    // Unaligned load: must be more pessimistic about addressing modes:
830    if (basePtr.getOpcode() == ISD::ADD) {
831      MachineFunction &MF = DAG.getMachineFunction();
832      MachineRegisterInfo &RegInfo = MF.getRegInfo();
833      unsigned VReg = RegInfo.createVirtualRegister(&SPU::R32CRegClass);
834      SDValue Flag;
835
836      SDValue Op0 = basePtr.getOperand(0);
837      SDValue Op1 = basePtr.getOperand(1);
838
839      if (isa<ConstantSDNode>(Op1)) {
840        // Convert the (add <ptr>, <const>) to an indirect address contained
841        // in a register. Note that this is done because we need to avoid
842        // creating a 0(reg) d-form address due to the SPU's block loads.
843        basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, Op0, Op1);
844        the_chain = DAG.getCopyToReg(the_chain, dl, VReg, basePtr, Flag);
845        basePtr = DAG.getCopyFromReg(the_chain, dl, VReg, PtrVT);
846      } else {
847        // Convert the (add <arg1>, <arg2>) to an indirect address, which
848        // will likely be lowered as a reg(reg) x-form address.
849        basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, Op0, Op1);
850      }
851    } else {
852      basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT,
853                            basePtr,
854                            DAG.getConstant(0, PtrVT));
855    }
856
857    // Insertion point is solely determined by basePtr's contents
858    insertEltOffs = DAG.getNode(ISD::ADD, dl, PtrVT,
859                                basePtr,
860                                DAG.getConstant(0, PtrVT));
861  }
862
863  // Load the lower part of the memory to which to store.
864  SDValue low = DAG.getLoad(vecVT, dl, the_chain, basePtr,
865                          lowMemPtr, SN->isVolatile(), SN->isNonTemporal(),
866                            false, 16);
867
868  // if we don't need to store over the 16 byte boundary, one store suffices
869  if (alignment >= StVT.getSizeInBits()/8) {
870    // Update the chain
871    the_chain = low.getValue(1);
872
873    LoadSDNode *LN = cast<LoadSDNode>(low);
874    SDValue theValue = SN->getValue();
875
876    if (StVT != VT
877        && (theValue.getOpcode() == ISD::AssertZext
878            || theValue.getOpcode() == ISD::AssertSext)) {
879      // Drill down and get the value for zero- and sign-extended
880      // quantities
881      theValue = theValue.getOperand(0);
882    }
883
884    // If the base pointer is already a D-form address, then just create
885    // a new D-form address with a slot offset and the orignal base pointer.
886    // Otherwise generate a D-form address with the slot offset relative
887    // to the stack pointer, which is always aligned.
888#if !defined(NDEBUG)
889      if (DebugFlag && isCurrentDebugType(DEBUG_TYPE)) {
890        errs() << "CellSPU LowerSTORE: basePtr = ";
891        basePtr.getNode()->dump(&DAG);
892        errs() << "\n";
893      }
894#endif
895
896    SDValue insertEltOp = DAG.getNode(SPUISD::SHUFFLE_MASK, dl, vecVT,
897                                      insertEltOffs);
898    SDValue vectorizeOp = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, vecVT,
899                                      theValue);
900
901    result = DAG.getNode(SPUISD::SHUFB, dl, vecVT,
902                         vectorizeOp, low,
903                         DAG.getNode(ISD::BITCAST, dl,
904                                     MVT::v4i32, insertEltOp));
905
906    result = DAG.getStore(the_chain, dl, result, basePtr,
907                          lowMemPtr,
908                          LN->isVolatile(), LN->isNonTemporal(),
909                          16);
910
911  }
912  // do the store when it might cross the 16 byte memory access boundary.
913  else {
914    // TODO issue a warning if SN->isVolatile()== true? This is likely not
915    // what the user wanted.
916
917    // address offset from nearest lower 16byte alinged address
918    SDValue offset = DAG.getNode(ISD::AND, dl, MVT::i32,
919                                    SN->getBasePtr(),
920                                    DAG.getConstant(0xf, MVT::i32));
921    // 16 - offset
922    SDValue offset_compl = DAG.getNode(ISD::SUB, dl, MVT::i32,
923                                           DAG.getConstant( 16, MVT::i32),
924                                           offset);
925    // 16 - sizeof(Value)
926    SDValue surplus = DAG.getNode(ISD::SUB, dl, MVT::i32,
927                                     DAG.getConstant( 16, MVT::i32),
928                                     DAG.getConstant( VT.getSizeInBits()/8,
929                                                      MVT::i32));
930    // get a registerfull of ones
931    SDValue ones = DAG.getConstant(-1, MVT::v4i32);
932    ones = DAG.getNode(ISD::BITCAST, dl, MVT::i128, ones);
933
934    // Create the 128 bit masks that have ones where the data to store is
935    // located.
936    SDValue lowmask, himask;
937    // if the value to store don't fill up the an entire 128 bits, zero
938    // out the last bits of the mask so that only the value we want to store
939    // is masked.
940    // this is e.g. in the case of store i32, align 2
941    if (!VT.isVector()){
942      Value = DAG.getNode(SPUISD::PREFSLOT2VEC, dl, vecVT, Value);
943      lowmask = DAG.getNode(SPUISD::SRL_BYTES, dl, MVT::i128, ones, surplus);
944      lowmask = DAG.getNode(SPUISD::SHL_BYTES, dl, MVT::i128, lowmask,
945                                                               surplus);
946      Value = DAG.getNode(ISD::BITCAST, dl, MVT::i128, Value);
947      Value = DAG.getNode(ISD::AND, dl, MVT::i128, Value, lowmask);
948
949    }
950    else {
951      lowmask = ones;
952      Value = DAG.getNode(ISD::BITCAST, dl, MVT::i128, Value);
953    }
954    // this will zero, if there are no data that goes to the high quad
955    himask = DAG.getNode(SPUISD::SHL_BYTES, dl, MVT::i128, lowmask,
956                                                            offset_compl);
957    lowmask = DAG.getNode(SPUISD::SRL_BYTES, dl, MVT::i128, lowmask,
958                                                             offset);
959
960    // Load in the old data and zero out the parts that will be overwritten with
961    // the new data to store.
962    SDValue hi = DAG.getLoad(MVT::i128, dl, the_chain,
963                               DAG.getNode(ISD::ADD, dl, PtrVT, basePtr,
964                                           DAG.getConstant( 16, PtrVT)),
965                               highMemPtr,
966                               SN->isVolatile(), SN->isNonTemporal(),
967                               false, 16);
968    the_chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, low.getValue(1),
969                                                              hi.getValue(1));
970
971    low = DAG.getNode(ISD::AND, dl, MVT::i128,
972                        DAG.getNode( ISD::BITCAST, dl, MVT::i128, low),
973                        DAG.getNode( ISD::XOR, dl, MVT::i128, lowmask, ones));
974    hi = DAG.getNode(ISD::AND, dl, MVT::i128,
975                        DAG.getNode( ISD::BITCAST, dl, MVT::i128, hi),
976                        DAG.getNode( ISD::XOR, dl, MVT::i128, himask, ones));
977
978    // Shift the Value to store into place. rlow contains the parts that go to
979    // the lower memory chunk, rhi has the parts that go to the upper one.
980    SDValue rlow = DAG.getNode(SPUISD::SRL_BYTES, dl, MVT::i128, Value, offset);
981    rlow = DAG.getNode(ISD::AND, dl, MVT::i128, rlow, lowmask);
982    SDValue rhi = DAG.getNode(SPUISD::SHL_BYTES, dl, MVT::i128, Value,
983                                                            offset_compl);
984
985    // Merge the old data and the new data and store the results
986    // Need to convert vectors here to integer as 'OR'ing floats assert
987    rlow = DAG.getNode(ISD::OR, dl, MVT::i128,
988                          DAG.getNode(ISD::BITCAST, dl, MVT::i128, low),
989                          DAG.getNode(ISD::BITCAST, dl, MVT::i128, rlow));
990    rhi = DAG.getNode(ISD::OR, dl, MVT::i128,
991                         DAG.getNode(ISD::BITCAST, dl, MVT::i128, hi),
992                         DAG.getNode(ISD::BITCAST, dl, MVT::i128, rhi));
993
994    low = DAG.getStore(the_chain, dl, rlow, basePtr,
995                          lowMemPtr,
996                          SN->isVolatile(), SN->isNonTemporal(), 16);
997    hi  = DAG.getStore(the_chain, dl, rhi,
998                            DAG.getNode(ISD::ADD, dl, PtrVT, basePtr,
999                                        DAG.getConstant( 16, PtrVT)),
1000                            highMemPtr,
1001                            SN->isVolatile(), SN->isNonTemporal(), 16);
1002    result = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, low.getValue(0),
1003                                                           hi.getValue(0));
1004  }
1005
1006  return result;
1007}
1008
1009//! Generate the address of a constant pool entry.
1010static SDValue
1011LowerConstantPool(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
1012  EVT PtrVT = Op.getValueType();
1013  ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
1014  const Constant *C = CP->getConstVal();
1015  SDValue CPI = DAG.getTargetConstantPool(C, PtrVT, CP->getAlignment());
1016  SDValue Zero = DAG.getConstant(0, PtrVT);
1017  const TargetMachine &TM = DAG.getTarget();
1018  // FIXME there is no actual debug info here
1019  DebugLoc dl = Op.getDebugLoc();
1020
1021  if (TM.getRelocationModel() == Reloc::Static) {
1022    if (!ST->usingLargeMem()) {
1023      // Just return the SDValue with the constant pool address in it.
1024      return DAG.getNode(SPUISD::AFormAddr, dl, PtrVT, CPI, Zero);
1025    } else {
1026      SDValue Hi = DAG.getNode(SPUISD::Hi, dl, PtrVT, CPI, Zero);
1027      SDValue Lo = DAG.getNode(SPUISD::Lo, dl, PtrVT, CPI, Zero);
1028      return DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, Hi, Lo);
1029    }
1030  }
1031
1032  llvm_unreachable("LowerConstantPool: Relocation model other than static"
1033                   " not supported.");
1034}
1035
1036//! Alternate entry point for generating the address of a constant pool entry
1037SDValue
1038SPU::LowerConstantPool(SDValue Op, SelectionDAG &DAG, const SPUTargetMachine &TM) {
1039  return ::LowerConstantPool(Op, DAG, TM.getSubtargetImpl());
1040}
1041
1042static SDValue
1043LowerJumpTable(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
1044  EVT PtrVT = Op.getValueType();
1045  JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
1046  SDValue JTI = DAG.getTargetJumpTable(JT->getIndex(), PtrVT);
1047  SDValue Zero = DAG.getConstant(0, PtrVT);
1048  const TargetMachine &TM = DAG.getTarget();
1049  // FIXME there is no actual debug info here
1050  DebugLoc dl = Op.getDebugLoc();
1051
1052  if (TM.getRelocationModel() == Reloc::Static) {
1053    if (!ST->usingLargeMem()) {
1054      return DAG.getNode(SPUISD::AFormAddr, dl, PtrVT, JTI, Zero);
1055    } else {
1056      SDValue Hi = DAG.getNode(SPUISD::Hi, dl, PtrVT, JTI, Zero);
1057      SDValue Lo = DAG.getNode(SPUISD::Lo, dl, PtrVT, JTI, Zero);
1058      return DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, Hi, Lo);
1059    }
1060  }
1061
1062  llvm_unreachable("LowerJumpTable: Relocation model other than static"
1063                   " not supported.");
1064}
1065
1066static SDValue
1067LowerGlobalAddress(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
1068  EVT PtrVT = Op.getValueType();
1069  GlobalAddressSDNode *GSDN = cast<GlobalAddressSDNode>(Op);
1070  const GlobalValue *GV = GSDN->getGlobal();
1071  SDValue GA = DAG.getTargetGlobalAddress(GV, Op.getDebugLoc(),
1072                                          PtrVT, GSDN->getOffset());
1073  const TargetMachine &TM = DAG.getTarget();
1074  SDValue Zero = DAG.getConstant(0, PtrVT);
1075  // FIXME there is no actual debug info here
1076  DebugLoc dl = Op.getDebugLoc();
1077
1078  if (TM.getRelocationModel() == Reloc::Static) {
1079    if (!ST->usingLargeMem()) {
1080      return DAG.getNode(SPUISD::AFormAddr, dl, PtrVT, GA, Zero);
1081    } else {
1082      SDValue Hi = DAG.getNode(SPUISD::Hi, dl, PtrVT, GA, Zero);
1083      SDValue Lo = DAG.getNode(SPUISD::Lo, dl, PtrVT, GA, Zero);
1084      return DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, Hi, Lo);
1085    }
1086  } else {
1087    report_fatal_error("LowerGlobalAddress: Relocation model other than static"
1088                      "not supported.");
1089    /*NOTREACHED*/
1090  }
1091}
1092
1093//! Custom lower double precision floating point constants
1094static SDValue
1095LowerConstantFP(SDValue Op, SelectionDAG &DAG) {
1096  EVT VT = Op.getValueType();
1097  // FIXME there is no actual debug info here
1098  DebugLoc dl = Op.getDebugLoc();
1099
1100  if (VT == MVT::f64) {
1101    ConstantFPSDNode *FP = cast<ConstantFPSDNode>(Op.getNode());
1102
1103    assert((FP != 0) &&
1104           "LowerConstantFP: Node is not ConstantFPSDNode");
1105
1106    uint64_t dbits = DoubleToBits(FP->getValueAPF().convertToDouble());
1107    SDValue T = DAG.getConstant(dbits, MVT::i64);
1108    SDValue Tvec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i64, T, T);
1109    return DAG.getNode(SPUISD::VEC2PREFSLOT, dl, VT,
1110                       DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Tvec));
1111  }
1112
1113  return SDValue();
1114}
1115
1116SDValue
1117SPUTargetLowering::LowerFormalArguments(SDValue Chain,
1118                                        CallingConv::ID CallConv, bool isVarArg,
1119                                        const SmallVectorImpl<ISD::InputArg>
1120                                          &Ins,
1121                                        DebugLoc dl, SelectionDAG &DAG,
1122                                        SmallVectorImpl<SDValue> &InVals)
1123                                          const {
1124
1125  MachineFunction &MF = DAG.getMachineFunction();
1126  MachineFrameInfo *MFI = MF.getFrameInfo();
1127  MachineRegisterInfo &RegInfo = MF.getRegInfo();
1128  SPUFunctionInfo *FuncInfo = MF.getInfo<SPUFunctionInfo>();
1129
1130  unsigned ArgOffset = SPUFrameLowering::minStackSize();
1131  unsigned ArgRegIdx = 0;
1132  unsigned StackSlotSize = SPUFrameLowering::stackSlotSize();
1133
1134  EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
1135
1136  SmallVector<CCValAssign, 16> ArgLocs;
1137  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
1138                 getTargetMachine(), ArgLocs, *DAG.getContext());
1139  // FIXME: allow for other calling conventions
1140  CCInfo.AnalyzeFormalArguments(Ins, CCC_SPU);
1141
1142  // Add DAG nodes to load the arguments or copy them out of registers.
1143  for (unsigned ArgNo = 0, e = Ins.size(); ArgNo != e; ++ArgNo) {
1144    EVT ObjectVT = Ins[ArgNo].VT;
1145    unsigned ObjSize = ObjectVT.getSizeInBits()/8;
1146    SDValue ArgVal;
1147    CCValAssign &VA = ArgLocs[ArgNo];
1148
1149    if (VA.isRegLoc()) {
1150      const TargetRegisterClass *ArgRegClass;
1151
1152      switch (ObjectVT.getSimpleVT().SimpleTy) {
1153      default:
1154        report_fatal_error("LowerFormalArguments Unhandled argument type: " +
1155                           Twine(ObjectVT.getEVTString()));
1156      case MVT::i8:
1157        ArgRegClass = &SPU::R8CRegClass;
1158        break;
1159      case MVT::i16:
1160        ArgRegClass = &SPU::R16CRegClass;
1161        break;
1162      case MVT::i32:
1163        ArgRegClass = &SPU::R32CRegClass;
1164        break;
1165      case MVT::i64:
1166        ArgRegClass = &SPU::R64CRegClass;
1167        break;
1168      case MVT::i128:
1169        ArgRegClass = &SPU::GPRCRegClass;
1170        break;
1171      case MVT::f32:
1172        ArgRegClass = &SPU::R32FPRegClass;
1173        break;
1174      case MVT::f64:
1175        ArgRegClass = &SPU::R64FPRegClass;
1176        break;
1177      case MVT::v2f64:
1178      case MVT::v4f32:
1179      case MVT::v2i64:
1180      case MVT::v4i32:
1181      case MVT::v8i16:
1182      case MVT::v16i8:
1183        ArgRegClass = &SPU::VECREGRegClass;
1184        break;
1185      }
1186
1187      unsigned VReg = RegInfo.createVirtualRegister(ArgRegClass);
1188      RegInfo.addLiveIn(VA.getLocReg(), VReg);
1189      ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT);
1190      ++ArgRegIdx;
1191    } else {
1192      // We need to load the argument to a virtual register if we determined
1193      // above that we ran out of physical registers of the appropriate type
1194      // or we're forced to do vararg
1195      int FI = MFI->CreateFixedObject(ObjSize, ArgOffset, true);
1196      SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
1197      ArgVal = DAG.getLoad(ObjectVT, dl, Chain, FIN, MachinePointerInfo(),
1198                           false, false, false, 0);
1199      ArgOffset += StackSlotSize;
1200    }
1201
1202    InVals.push_back(ArgVal);
1203    // Update the chain
1204    Chain = ArgVal.getOperand(0);
1205  }
1206
1207  // vararg handling:
1208  if (isVarArg) {
1209    // FIXME: we should be able to query the argument registers from
1210    //        tablegen generated code.
1211    static const uint16_t ArgRegs[] = {
1212      SPU::R3,  SPU::R4,  SPU::R5,  SPU::R6,  SPU::R7,  SPU::R8,  SPU::R9,
1213      SPU::R10, SPU::R11, SPU::R12, SPU::R13, SPU::R14, SPU::R15, SPU::R16,
1214      SPU::R17, SPU::R18, SPU::R19, SPU::R20, SPU::R21, SPU::R22, SPU::R23,
1215      SPU::R24, SPU::R25, SPU::R26, SPU::R27, SPU::R28, SPU::R29, SPU::R30,
1216      SPU::R31, SPU::R32, SPU::R33, SPU::R34, SPU::R35, SPU::R36, SPU::R37,
1217      SPU::R38, SPU::R39, SPU::R40, SPU::R41, SPU::R42, SPU::R43, SPU::R44,
1218      SPU::R45, SPU::R46, SPU::R47, SPU::R48, SPU::R49, SPU::R50, SPU::R51,
1219      SPU::R52, SPU::R53, SPU::R54, SPU::R55, SPU::R56, SPU::R57, SPU::R58,
1220      SPU::R59, SPU::R60, SPU::R61, SPU::R62, SPU::R63, SPU::R64, SPU::R65,
1221      SPU::R66, SPU::R67, SPU::R68, SPU::R69, SPU::R70, SPU::R71, SPU::R72,
1222      SPU::R73, SPU::R74, SPU::R75, SPU::R76, SPU::R77, SPU::R78, SPU::R79
1223    };
1224    // size of ArgRegs array
1225    const unsigned NumArgRegs = 77;
1226
1227    // We will spill (79-3)+1 registers to the stack
1228    SmallVector<SDValue, 79-3+1> MemOps;
1229
1230    // Create the frame slot
1231    for (; ArgRegIdx != NumArgRegs; ++ArgRegIdx) {
1232      FuncInfo->setVarArgsFrameIndex(
1233        MFI->CreateFixedObject(StackSlotSize, ArgOffset, true));
1234      SDValue FIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
1235      unsigned VReg = MF.addLiveIn(ArgRegs[ArgRegIdx], &SPU::VECREGRegClass);
1236      SDValue ArgVal = DAG.getRegister(VReg, MVT::v16i8);
1237      SDValue Store = DAG.getStore(Chain, dl, ArgVal, FIN, MachinePointerInfo(),
1238                                   false, false, 0);
1239      Chain = Store.getOperand(0);
1240      MemOps.push_back(Store);
1241
1242      // Increment address by stack slot size for the next stored argument
1243      ArgOffset += StackSlotSize;
1244    }
1245    if (!MemOps.empty())
1246      Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
1247                          &MemOps[0], MemOps.size());
1248  }
1249
1250  return Chain;
1251}
1252
1253/// isLSAAddress - Return the immediate to use if the specified
1254/// value is representable as a LSA address.
1255static SDNode *isLSAAddress(SDValue Op, SelectionDAG &DAG) {
1256  ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
1257  if (!C) return 0;
1258
1259  int Addr = C->getZExtValue();
1260  if ((Addr & 3) != 0 ||  // Low 2 bits are implicitly zero.
1261      (Addr << 14 >> 14) != Addr)
1262    return 0;  // Top 14 bits have to be sext of immediate.
1263
1264  return DAG.getConstant((int)C->getZExtValue() >> 2, MVT::i32).getNode();
1265}
1266
1267SDValue
1268SPUTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
1269                             SmallVectorImpl<SDValue> &InVals) const {
1270  SelectionDAG &DAG                     = CLI.DAG;
1271  DebugLoc &dl                          = CLI.DL;
1272  SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
1273  SmallVector<SDValue, 32> &OutVals     = CLI.OutVals;
1274  SmallVector<ISD::InputArg, 32> &Ins   = CLI.Ins;
1275  SDValue Chain                         = CLI.Chain;
1276  SDValue Callee                        = CLI.Callee;
1277  bool &isTailCall                      = CLI.IsTailCall;
1278  CallingConv::ID CallConv              = CLI.CallConv;
1279  bool isVarArg                         = CLI.IsVarArg;
1280
1281  // CellSPU target does not yet support tail call optimization.
1282  isTailCall = false;
1283
1284  const SPUSubtarget *ST = SPUTM.getSubtargetImpl();
1285  unsigned NumOps     = Outs.size();
1286  unsigned StackSlotSize = SPUFrameLowering::stackSlotSize();
1287
1288  SmallVector<CCValAssign, 16> ArgLocs;
1289  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
1290                 getTargetMachine(), ArgLocs, *DAG.getContext());
1291  // FIXME: allow for other calling conventions
1292  CCInfo.AnalyzeCallOperands(Outs, CCC_SPU);
1293
1294  const unsigned NumArgRegs = ArgLocs.size();
1295
1296
1297  // Handy pointer type
1298  EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
1299
1300  // Set up a copy of the stack pointer for use loading and storing any
1301  // arguments that may not fit in the registers available for argument
1302  // passing.
1303  SDValue StackPtr = DAG.getRegister(SPU::R1, MVT::i32);
1304
1305  // Figure out which arguments are going to go in registers, and which in
1306  // memory.
1307  unsigned ArgOffset = SPUFrameLowering::minStackSize(); // Just below [LR]
1308  unsigned ArgRegIdx = 0;
1309
1310  // Keep track of registers passing arguments
1311  std::vector<std::pair<unsigned, SDValue> > RegsToPass;
1312  // And the arguments passed on the stack
1313  SmallVector<SDValue, 8> MemOpChains;
1314
1315  for (; ArgRegIdx != NumOps; ++ArgRegIdx) {
1316    SDValue Arg = OutVals[ArgRegIdx];
1317    CCValAssign &VA = ArgLocs[ArgRegIdx];
1318
1319    // PtrOff will be used to store the current argument to the stack if a
1320    // register cannot be found for it.
1321    SDValue PtrOff = DAG.getConstant(ArgOffset, StackPtr.getValueType());
1322    PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, PtrOff);
1323
1324    switch (Arg.getValueType().getSimpleVT().SimpleTy) {
1325    default: llvm_unreachable("Unexpected ValueType for argument!");
1326    case MVT::i8:
1327    case MVT::i16:
1328    case MVT::i32:
1329    case MVT::i64:
1330    case MVT::i128:
1331    case MVT::f32:
1332    case MVT::f64:
1333    case MVT::v2i64:
1334    case MVT::v2f64:
1335    case MVT::v4f32:
1336    case MVT::v4i32:
1337    case MVT::v8i16:
1338    case MVT::v16i8:
1339      if (ArgRegIdx != NumArgRegs) {
1340        RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
1341      } else {
1342        MemOpChains.push_back(DAG.getStore(Chain, dl, Arg, PtrOff,
1343                                           MachinePointerInfo(),
1344                                           false, false, 0));
1345        ArgOffset += StackSlotSize;
1346      }
1347      break;
1348    }
1349  }
1350
1351  // Accumulate how many bytes are to be pushed on the stack, including the
1352  // linkage area, and parameter passing area.  According to the SPU ABI,
1353  // we minimally need space for [LR] and [SP].
1354  unsigned NumStackBytes = ArgOffset - SPUFrameLowering::minStackSize();
1355
1356  // Insert a call sequence start
1357  Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumStackBytes,
1358                                                            true));
1359
1360  if (!MemOpChains.empty()) {
1361    // Adjust the stack pointer for the stack arguments.
1362    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
1363                        &MemOpChains[0], MemOpChains.size());
1364  }
1365
1366  // Build a sequence of copy-to-reg nodes chained together with token chain
1367  // and flag operands which copy the outgoing args into the appropriate regs.
1368  SDValue InFlag;
1369  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
1370    Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
1371                             RegsToPass[i].second, InFlag);
1372    InFlag = Chain.getValue(1);
1373  }
1374
1375  SmallVector<SDValue, 8> Ops;
1376  unsigned CallOpc = SPUISD::CALL;
1377
1378  // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
1379  // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
1380  // node so that legalize doesn't hack it.
1381  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
1382    const GlobalValue *GV = G->getGlobal();
1383    EVT CalleeVT = Callee.getValueType();
1384    SDValue Zero = DAG.getConstant(0, PtrVT);
1385    SDValue GA = DAG.getTargetGlobalAddress(GV, dl, CalleeVT);
1386
1387    if (!ST->usingLargeMem()) {
1388      // Turn calls to targets that are defined (i.e., have bodies) into BRSL
1389      // style calls, otherwise, external symbols are BRASL calls. This assumes
1390      // that declared/defined symbols are in the same compilation unit and can
1391      // be reached through PC-relative jumps.
1392      //
1393      // NOTE:
1394      // This may be an unsafe assumption for JIT and really large compilation
1395      // units.
1396      if (GV->isDeclaration()) {
1397        Callee = DAG.getNode(SPUISD::AFormAddr, dl, CalleeVT, GA, Zero);
1398      } else {
1399        Callee = DAG.getNode(SPUISD::PCRelAddr, dl, CalleeVT, GA, Zero);
1400      }
1401    } else {
1402      // "Large memory" mode: Turn all calls into indirect calls with a X-form
1403      // address pairs:
1404      Callee = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, GA, Zero);
1405    }
1406  } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
1407    EVT CalleeVT = Callee.getValueType();
1408    SDValue Zero = DAG.getConstant(0, PtrVT);
1409    SDValue ExtSym = DAG.getTargetExternalSymbol(S->getSymbol(),
1410        Callee.getValueType());
1411
1412    if (!ST->usingLargeMem()) {
1413      Callee = DAG.getNode(SPUISD::AFormAddr, dl, CalleeVT, ExtSym, Zero);
1414    } else {
1415      Callee = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, ExtSym, Zero);
1416    }
1417  } else if (SDNode *Dest = isLSAAddress(Callee, DAG)) {
1418    // If this is an absolute destination address that appears to be a legal
1419    // local store address, use the munged value.
1420    Callee = SDValue(Dest, 0);
1421  }
1422
1423  Ops.push_back(Chain);
1424  Ops.push_back(Callee);
1425
1426  // Add argument registers to the end of the list so that they are known live
1427  // into the call.
1428  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
1429    Ops.push_back(DAG.getRegister(RegsToPass[i].first,
1430                                  RegsToPass[i].second.getValueType()));
1431
1432  if (InFlag.getNode())
1433    Ops.push_back(InFlag);
1434  // Returns a chain and a flag for retval copy to use.
1435  Chain = DAG.getNode(CallOpc, dl, DAG.getVTList(MVT::Other, MVT::Glue),
1436                      &Ops[0], Ops.size());
1437  InFlag = Chain.getValue(1);
1438
1439  Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumStackBytes, true),
1440                             DAG.getIntPtrConstant(0, true), InFlag);
1441  if (!Ins.empty())
1442    InFlag = Chain.getValue(1);
1443
1444  // If the function returns void, just return the chain.
1445  if (Ins.empty())
1446    return Chain;
1447
1448  // Now handle the return value(s)
1449  SmallVector<CCValAssign, 16> RVLocs;
1450  CCState CCRetInfo(CallConv, isVarArg, DAG.getMachineFunction(),
1451                    getTargetMachine(), RVLocs, *DAG.getContext());
1452  CCRetInfo.AnalyzeCallResult(Ins, CCC_SPU);
1453
1454
1455  // If the call has results, copy the values out of the ret val registers.
1456  for (unsigned i = 0; i != RVLocs.size(); ++i) {
1457    CCValAssign VA = RVLocs[i];
1458
1459    SDValue Val = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), VA.getLocVT(),
1460                                     InFlag);
1461    Chain = Val.getValue(1);
1462    InFlag = Val.getValue(2);
1463    InVals.push_back(Val);
1464   }
1465
1466  return Chain;
1467}
1468
1469SDValue
1470SPUTargetLowering::LowerReturn(SDValue Chain,
1471                               CallingConv::ID CallConv, bool isVarArg,
1472                               const SmallVectorImpl<ISD::OutputArg> &Outs,
1473                               const SmallVectorImpl<SDValue> &OutVals,
1474                               DebugLoc dl, SelectionDAG &DAG) const {
1475
1476  SmallVector<CCValAssign, 16> RVLocs;
1477  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
1478                 getTargetMachine(), RVLocs, *DAG.getContext());
1479  CCInfo.AnalyzeReturn(Outs, RetCC_SPU);
1480
1481  // If this is the first return lowered for this function, add the regs to the
1482  // liveout set for the function.
1483  if (DAG.getMachineFunction().getRegInfo().liveout_empty()) {
1484    for (unsigned i = 0; i != RVLocs.size(); ++i)
1485      DAG.getMachineFunction().getRegInfo().addLiveOut(RVLocs[i].getLocReg());
1486  }
1487
1488  SDValue Flag;
1489
1490  // Copy the result values into the output registers.
1491  for (unsigned i = 0; i != RVLocs.size(); ++i) {
1492    CCValAssign &VA = RVLocs[i];
1493    assert(VA.isRegLoc() && "Can only return in registers!");
1494    Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
1495                             OutVals[i], Flag);
1496    Flag = Chain.getValue(1);
1497  }
1498
1499  if (Flag.getNode())
1500    return DAG.getNode(SPUISD::RET_FLAG, dl, MVT::Other, Chain, Flag);
1501  else
1502    return DAG.getNode(SPUISD::RET_FLAG, dl, MVT::Other, Chain);
1503}
1504
1505
1506//===----------------------------------------------------------------------===//
1507// Vector related lowering:
1508//===----------------------------------------------------------------------===//
1509
1510static ConstantSDNode *
1511getVecImm(SDNode *N) {
1512  SDValue OpVal(0, 0);
1513
1514  // Check to see if this buildvec has a single non-undef value in its elements.
1515  for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
1516    if (N->getOperand(i).getOpcode() == ISD::UNDEF) continue;
1517    if (OpVal.getNode() == 0)
1518      OpVal = N->getOperand(i);
1519    else if (OpVal != N->getOperand(i))
1520      return 0;
1521  }
1522
1523  if (OpVal.getNode() != 0) {
1524    if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(OpVal)) {
1525      return CN;
1526    }
1527  }
1528
1529  return 0;
1530}
1531
1532/// get_vec_i18imm - Test if this vector is a vector filled with the same value
1533/// and the value fits into an unsigned 18-bit constant, and if so, return the
1534/// constant
1535SDValue SPU::get_vec_u18imm(SDNode *N, SelectionDAG &DAG,
1536                              EVT ValueType) {
1537  if (ConstantSDNode *CN = getVecImm(N)) {
1538    uint64_t Value = CN->getZExtValue();
1539    if (ValueType == MVT::i64) {
1540      uint64_t UValue = CN->getZExtValue();
1541      uint32_t upper = uint32_t(UValue >> 32);
1542      uint32_t lower = uint32_t(UValue);
1543      if (upper != lower)
1544        return SDValue();
1545      Value = Value >> 32;
1546    }
1547    if (Value <= 0x3ffff)
1548      return DAG.getTargetConstant(Value, ValueType);
1549  }
1550
1551  return SDValue();
1552}
1553
1554/// get_vec_i16imm - Test if this vector is a vector filled with the same value
1555/// and the value fits into a signed 16-bit constant, and if so, return the
1556/// constant
1557SDValue SPU::get_vec_i16imm(SDNode *N, SelectionDAG &DAG,
1558                              EVT ValueType) {
1559  if (ConstantSDNode *CN = getVecImm(N)) {
1560    int64_t Value = CN->getSExtValue();
1561    if (ValueType == MVT::i64) {
1562      uint64_t UValue = CN->getZExtValue();
1563      uint32_t upper = uint32_t(UValue >> 32);
1564      uint32_t lower = uint32_t(UValue);
1565      if (upper != lower)
1566        return SDValue();
1567      Value = Value >> 32;
1568    }
1569    if (Value >= -(1 << 15) && Value <= ((1 << 15) - 1)) {
1570      return DAG.getTargetConstant(Value, ValueType);
1571    }
1572  }
1573
1574  return SDValue();
1575}
1576
1577/// get_vec_i10imm - Test if this vector is a vector filled with the same value
1578/// and the value fits into a signed 10-bit constant, and if so, return the
1579/// constant
1580SDValue SPU::get_vec_i10imm(SDNode *N, SelectionDAG &DAG,
1581                              EVT ValueType) {
1582  if (ConstantSDNode *CN = getVecImm(N)) {
1583    int64_t Value = CN->getSExtValue();
1584    if (ValueType == MVT::i64) {
1585      uint64_t UValue = CN->getZExtValue();
1586      uint32_t upper = uint32_t(UValue >> 32);
1587      uint32_t lower = uint32_t(UValue);
1588      if (upper != lower)
1589        return SDValue();
1590      Value = Value >> 32;
1591    }
1592    if (isInt<10>(Value))
1593      return DAG.getTargetConstant(Value, ValueType);
1594  }
1595
1596  return SDValue();
1597}
1598
1599/// get_vec_i8imm - Test if this vector is a vector filled with the same value
1600/// and the value fits into a signed 8-bit constant, and if so, return the
1601/// constant.
1602///
1603/// @note: The incoming vector is v16i8 because that's the only way we can load
1604/// constant vectors. Thus, we test to see if the upper and lower bytes are the
1605/// same value.
1606SDValue SPU::get_vec_i8imm(SDNode *N, SelectionDAG &DAG,
1607                             EVT ValueType) {
1608  if (ConstantSDNode *CN = getVecImm(N)) {
1609    int Value = (int) CN->getZExtValue();
1610    if (ValueType == MVT::i16
1611        && Value <= 0xffff                 /* truncated from uint64_t */
1612        && ((short) Value >> 8) == ((short) Value & 0xff))
1613      return DAG.getTargetConstant(Value & 0xff, ValueType);
1614    else if (ValueType == MVT::i8
1615             && (Value & 0xff) == Value)
1616      return DAG.getTargetConstant(Value, ValueType);
1617  }
1618
1619  return SDValue();
1620}
1621
1622/// get_ILHUvec_imm - Test if this vector is a vector filled with the same value
1623/// and the value fits into a signed 16-bit constant, and if so, return the
1624/// constant
1625SDValue SPU::get_ILHUvec_imm(SDNode *N, SelectionDAG &DAG,
1626                               EVT ValueType) {
1627  if (ConstantSDNode *CN = getVecImm(N)) {
1628    uint64_t Value = CN->getZExtValue();
1629    if ((ValueType == MVT::i32
1630          && ((unsigned) Value & 0xffff0000) == (unsigned) Value)
1631        || (ValueType == MVT::i64 && (Value & 0xffff0000) == Value))
1632      return DAG.getTargetConstant(Value >> 16, ValueType);
1633  }
1634
1635  return SDValue();
1636}
1637
1638/// get_v4i32_imm - Catch-all for general 32-bit constant vectors
1639SDValue SPU::get_v4i32_imm(SDNode *N, SelectionDAG &DAG) {
1640  if (ConstantSDNode *CN = getVecImm(N)) {
1641    return DAG.getTargetConstant((unsigned) CN->getZExtValue(), MVT::i32);
1642  }
1643
1644  return SDValue();
1645}
1646
1647/// get_v4i32_imm - Catch-all for general 64-bit constant vectors
1648SDValue SPU::get_v2i64_imm(SDNode *N, SelectionDAG &DAG) {
1649  if (ConstantSDNode *CN = getVecImm(N)) {
1650    return DAG.getTargetConstant((unsigned) CN->getZExtValue(), MVT::i64);
1651  }
1652
1653  return SDValue();
1654}
1655
1656//! Lower a BUILD_VECTOR instruction creatively:
1657static SDValue
1658LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) {
1659  EVT VT = Op.getValueType();
1660  EVT EltVT = VT.getVectorElementType();
1661  DebugLoc dl = Op.getDebugLoc();
1662  BuildVectorSDNode *BCN = dyn_cast<BuildVectorSDNode>(Op.getNode());
1663  assert(BCN != 0 && "Expected BuildVectorSDNode in SPU LowerBUILD_VECTOR");
1664  unsigned minSplatBits = EltVT.getSizeInBits();
1665
1666  if (minSplatBits < 16)
1667    minSplatBits = 16;
1668
1669  APInt APSplatBits, APSplatUndef;
1670  unsigned SplatBitSize;
1671  bool HasAnyUndefs;
1672
1673  if (!BCN->isConstantSplat(APSplatBits, APSplatUndef, SplatBitSize,
1674                            HasAnyUndefs, minSplatBits)
1675      || minSplatBits < SplatBitSize)
1676    return SDValue();   // Wasn't a constant vector or splat exceeded min
1677
1678  uint64_t SplatBits = APSplatBits.getZExtValue();
1679
1680  switch (VT.getSimpleVT().SimpleTy) {
1681  default:
1682    report_fatal_error("CellSPU: Unhandled VT in LowerBUILD_VECTOR, VT = " +
1683                       Twine(VT.getEVTString()));
1684    /*NOTREACHED*/
1685  case MVT::v4f32: {
1686    uint32_t Value32 = uint32_t(SplatBits);
1687    assert(SplatBitSize == 32
1688           && "LowerBUILD_VECTOR: Unexpected floating point vector element.");
1689    // NOTE: pretend the constant is an integer. LLVM won't load FP constants
1690    SDValue T = DAG.getConstant(Value32, MVT::i32);
1691    return DAG.getNode(ISD::BITCAST, dl, MVT::v4f32,
1692                       DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, T,T,T,T));
1693  }
1694  case MVT::v2f64: {
1695    uint64_t f64val = uint64_t(SplatBits);
1696    assert(SplatBitSize == 64
1697           && "LowerBUILD_VECTOR: 64-bit float vector size > 8 bytes.");
1698    // NOTE: pretend the constant is an integer. LLVM won't load FP constants
1699    SDValue T = DAG.getConstant(f64val, MVT::i64);
1700    return DAG.getNode(ISD::BITCAST, dl, MVT::v2f64,
1701                       DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i64, T, T));
1702  }
1703  case MVT::v16i8: {
1704   // 8-bit constants have to be expanded to 16-bits
1705   unsigned short Value16 = SplatBits /* | (SplatBits << 8) */;
1706   SmallVector<SDValue, 8> Ops;
1707
1708   Ops.assign(8, DAG.getConstant(Value16, MVT::i16));
1709   return DAG.getNode(ISD::BITCAST, dl, VT,
1710                      DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i16, &Ops[0], Ops.size()));
1711  }
1712  case MVT::v8i16: {
1713    unsigned short Value16 = SplatBits;
1714    SDValue T = DAG.getConstant(Value16, EltVT);
1715    SmallVector<SDValue, 8> Ops;
1716
1717    Ops.assign(8, T);
1718    return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &Ops[0], Ops.size());
1719  }
1720  case MVT::v4i32: {
1721    SDValue T = DAG.getConstant(unsigned(SplatBits), VT.getVectorElementType());
1722    return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, T, T, T, T);
1723  }
1724  case MVT::v2i64: {
1725    return SPU::LowerV2I64Splat(VT, DAG, SplatBits, dl);
1726  }
1727  }
1728}
1729
1730/*!
1731 */
1732SDValue
1733SPU::LowerV2I64Splat(EVT OpVT, SelectionDAG& DAG, uint64_t SplatVal,
1734                     DebugLoc dl) {
1735  uint32_t upper = uint32_t(SplatVal >> 32);
1736  uint32_t lower = uint32_t(SplatVal);
1737
1738  if (upper == lower) {
1739    // Magic constant that can be matched by IL, ILA, et. al.
1740    SDValue Val = DAG.getTargetConstant(upper, MVT::i32);
1741    return DAG.getNode(ISD::BITCAST, dl, OpVT,
1742                       DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
1743                                   Val, Val, Val, Val));
1744  } else {
1745    bool upper_special, lower_special;
1746
1747    // NOTE: This code creates common-case shuffle masks that can be easily
1748    // detected as common expressions. It is not attempting to create highly
1749    // specialized masks to replace any and all 0's, 0xff's and 0x80's.
1750
1751    // Detect if the upper or lower half is a special shuffle mask pattern:
1752    upper_special = (upper == 0 || upper == 0xffffffff || upper == 0x80000000);
1753    lower_special = (lower == 0 || lower == 0xffffffff || lower == 0x80000000);
1754
1755    // Both upper and lower are special, lower to a constant pool load:
1756    if (lower_special && upper_special) {
1757      SDValue UpperVal = DAG.getConstant(upper, MVT::i32);
1758      SDValue LowerVal = DAG.getConstant(lower, MVT::i32);
1759      SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
1760                         UpperVal, LowerVal, UpperVal, LowerVal);
1761      return DAG.getNode(ISD::BITCAST, dl, OpVT, BV);
1762    }
1763
1764    SDValue LO32;
1765    SDValue HI32;
1766    SmallVector<SDValue, 16> ShufBytes;
1767    SDValue Result;
1768
1769    // Create lower vector if not a special pattern
1770    if (!lower_special) {
1771      SDValue LO32C = DAG.getConstant(lower, MVT::i32);
1772      LO32 = DAG.getNode(ISD::BITCAST, dl, OpVT,
1773                         DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
1774                                     LO32C, LO32C, LO32C, LO32C));
1775    }
1776
1777    // Create upper vector if not a special pattern
1778    if (!upper_special) {
1779      SDValue HI32C = DAG.getConstant(upper, MVT::i32);
1780      HI32 = DAG.getNode(ISD::BITCAST, dl, OpVT,
1781                         DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
1782                                     HI32C, HI32C, HI32C, HI32C));
1783    }
1784
1785    // If either upper or lower are special, then the two input operands are
1786    // the same (basically, one of them is a "don't care")
1787    if (lower_special)
1788      LO32 = HI32;
1789    if (upper_special)
1790      HI32 = LO32;
1791
1792    for (int i = 0; i < 4; ++i) {
1793      uint64_t val = 0;
1794      for (int j = 0; j < 4; ++j) {
1795        SDValue V;
1796        bool process_upper, process_lower;
1797        val <<= 8;
1798        process_upper = (upper_special && (i & 1) == 0);
1799        process_lower = (lower_special && (i & 1) == 1);
1800
1801        if (process_upper || process_lower) {
1802          if ((process_upper && upper == 0)
1803                  || (process_lower && lower == 0))
1804            val |= 0x80;
1805          else if ((process_upper && upper == 0xffffffff)
1806                  || (process_lower && lower == 0xffffffff))
1807            val |= 0xc0;
1808          else if ((process_upper && upper == 0x80000000)
1809                  || (process_lower && lower == 0x80000000))
1810            val |= (j == 0 ? 0xe0 : 0x80);
1811        } else
1812          val |= i * 4 + j + ((i & 1) * 16);
1813      }
1814
1815      ShufBytes.push_back(DAG.getConstant(val, MVT::i32));
1816    }
1817
1818    return DAG.getNode(SPUISD::SHUFB, dl, OpVT, HI32, LO32,
1819                       DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
1820                                   &ShufBytes[0], ShufBytes.size()));
1821  }
1822}
1823
1824/// LowerVECTOR_SHUFFLE - Lower a vector shuffle (V1, V2, V3) to something on
1825/// which the Cell can operate. The code inspects V3 to ascertain whether the
1826/// permutation vector, V3, is monotonically increasing with one "exception"
1827/// element, e.g., (0, 1, _, 3). If this is the case, then generate a
1828/// SHUFFLE_MASK synthetic instruction. Otherwise, spill V3 to the constant pool.
1829/// In either case, the net result is going to eventually invoke SHUFB to
1830/// permute/shuffle the bytes from V1 and V2.
1831/// \note
1832/// SHUFFLE_MASK is eventually selected as one of the C*D instructions, generate
1833/// control word for byte/halfword/word insertion. This takes care of a single
1834/// element move from V2 into V1.
1835/// \note
1836/// SPUISD::SHUFB is eventually selected as Cell's <i>shufb</i> instructions.
1837static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
1838  const ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op);
1839  SDValue V1 = Op.getOperand(0);
1840  SDValue V2 = Op.getOperand(1);
1841  DebugLoc dl = Op.getDebugLoc();
1842
1843  if (V2.getOpcode() == ISD::UNDEF) V2 = V1;
1844
1845  // If we have a single element being moved from V1 to V2, this can be handled
1846  // using the C*[DX] compute mask instructions, but the vector elements have
1847  // to be monotonically increasing with one exception element, and the source
1848  // slot of the element to move must be the same as the destination.
1849  EVT VecVT = V1.getValueType();
1850  EVT EltVT = VecVT.getVectorElementType();
1851  unsigned EltsFromV2 = 0;
1852  unsigned V2EltOffset = 0;
1853  unsigned V2EltIdx0 = 0;
1854  unsigned CurrElt = 0;
1855  unsigned MaxElts = VecVT.getVectorNumElements();
1856  unsigned PrevElt = 0;
1857  bool monotonic = true;
1858  bool rotate = true;
1859  int rotamt=0;
1860  EVT maskVT;             // which of the c?d instructions to use
1861
1862  if (EltVT == MVT::i8) {
1863    V2EltIdx0 = 16;
1864    maskVT = MVT::v16i8;
1865  } else if (EltVT == MVT::i16) {
1866    V2EltIdx0 = 8;
1867    maskVT = MVT::v8i16;
1868  } else if (EltVT == MVT::i32 || EltVT == MVT::f32) {
1869    V2EltIdx0 = 4;
1870    maskVT = MVT::v4i32;
1871  } else if (EltVT == MVT::i64 || EltVT == MVT::f64) {
1872    V2EltIdx0 = 2;
1873    maskVT = MVT::v2i64;
1874  } else
1875    llvm_unreachable("Unhandled vector type in LowerVECTOR_SHUFFLE");
1876
1877  for (unsigned i = 0; i != MaxElts; ++i) {
1878    if (SVN->getMaskElt(i) < 0)
1879      continue;
1880
1881    unsigned SrcElt = SVN->getMaskElt(i);
1882
1883    if (monotonic) {
1884      if (SrcElt >= V2EltIdx0) {
1885        // TODO: optimize for the monotonic case when several consecutive
1886        // elements are taken form V2. Do we ever get such a case?
1887        if (EltsFromV2 == 0 && CurrElt == (SrcElt - V2EltIdx0))
1888          V2EltOffset = (SrcElt - V2EltIdx0) * (EltVT.getSizeInBits()/8);
1889        else
1890          monotonic = false;
1891        ++EltsFromV2;
1892      } else if (CurrElt != SrcElt) {
1893        monotonic = false;
1894      }
1895
1896      ++CurrElt;
1897    }
1898
1899    if (rotate) {
1900      if (PrevElt > 0 && SrcElt < MaxElts) {
1901        if ((PrevElt == SrcElt - 1)
1902            || (PrevElt == MaxElts - 1 && SrcElt == 0)) {
1903          PrevElt = SrcElt;
1904        } else {
1905          rotate = false;
1906        }
1907      } else if (i == 0 || (PrevElt==0 && SrcElt==1)) {
1908        // First time or after a "wrap around"
1909        rotamt = SrcElt-i;
1910        PrevElt = SrcElt;
1911      } else {
1912        // This isn't a rotation, takes elements from vector 2
1913        rotate = false;
1914      }
1915    }
1916  }
1917
1918  if (EltsFromV2 == 1 && monotonic) {
1919    // Compute mask and shuffle
1920    EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
1921
1922    // As SHUFFLE_MASK becomes a c?d instruction, feed it an address
1923    // R1 ($sp) is used here only as it is guaranteed to have last bits zero
1924    SDValue Pointer = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT,
1925                                DAG.getRegister(SPU::R1, PtrVT),
1926                                DAG.getConstant(V2EltOffset, MVT::i32));
1927    SDValue ShufMaskOp = DAG.getNode(SPUISD::SHUFFLE_MASK, dl,
1928                                     maskVT, Pointer);
1929
1930    // Use shuffle mask in SHUFB synthetic instruction:
1931    return DAG.getNode(SPUISD::SHUFB, dl, V1.getValueType(), V2, V1,
1932                       ShufMaskOp);
1933  } else if (rotate) {
1934    if (rotamt < 0)
1935      rotamt +=MaxElts;
1936    rotamt *= EltVT.getSizeInBits()/8;
1937    return DAG.getNode(SPUISD::ROTBYTES_LEFT, dl, V1.getValueType(),
1938                       V1, DAG.getConstant(rotamt, MVT::i16));
1939  } else {
1940   // Convert the SHUFFLE_VECTOR mask's input element units to the
1941   // actual bytes.
1942    unsigned BytesPerElement = EltVT.getSizeInBits()/8;
1943
1944    SmallVector<SDValue, 16> ResultMask;
1945    for (unsigned i = 0, e = MaxElts; i != e; ++i) {
1946      unsigned SrcElt = SVN->getMaskElt(i) < 0 ? 0 : SVN->getMaskElt(i);
1947
1948      for (unsigned j = 0; j < BytesPerElement; ++j)
1949        ResultMask.push_back(DAG.getConstant(SrcElt*BytesPerElement+j,MVT::i8));
1950    }
1951    SDValue VPermMask = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i8,
1952                                    &ResultMask[0], ResultMask.size());
1953    return DAG.getNode(SPUISD::SHUFB, dl, V1.getValueType(), V1, V2, VPermMask);
1954  }
1955}
1956
1957static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) {
1958  SDValue Op0 = Op.getOperand(0);                     // Op0 = the scalar
1959  DebugLoc dl = Op.getDebugLoc();
1960
1961  if (Op0.getNode()->getOpcode() == ISD::Constant) {
1962    // For a constant, build the appropriate constant vector, which will
1963    // eventually simplify to a vector register load.
1964
1965    ConstantSDNode *CN = cast<ConstantSDNode>(Op0.getNode());
1966    SmallVector<SDValue, 16> ConstVecValues;
1967    EVT VT;
1968    size_t n_copies;
1969
1970    // Create a constant vector:
1971    switch (Op.getValueType().getSimpleVT().SimpleTy) {
1972    default: llvm_unreachable("Unexpected constant value type in "
1973                              "LowerSCALAR_TO_VECTOR");
1974    case MVT::v16i8: n_copies = 16; VT = MVT::i8; break;
1975    case MVT::v8i16: n_copies = 8; VT = MVT::i16; break;
1976    case MVT::v4i32: n_copies = 4; VT = MVT::i32; break;
1977    case MVT::v4f32: n_copies = 4; VT = MVT::f32; break;
1978    case MVT::v2i64: n_copies = 2; VT = MVT::i64; break;
1979    case MVT::v2f64: n_copies = 2; VT = MVT::f64; break;
1980    }
1981
1982    SDValue CValue = DAG.getConstant(CN->getZExtValue(), VT);
1983    for (size_t j = 0; j < n_copies; ++j)
1984      ConstVecValues.push_back(CValue);
1985
1986    return DAG.getNode(ISD::BUILD_VECTOR, dl, Op.getValueType(),
1987                       &ConstVecValues[0], ConstVecValues.size());
1988  } else {
1989    // Otherwise, copy the value from one register to another:
1990    switch (Op0.getValueType().getSimpleVT().SimpleTy) {
1991    default: llvm_unreachable("Unexpected value type in LowerSCALAR_TO_VECTOR");
1992    case MVT::i8:
1993    case MVT::i16:
1994    case MVT::i32:
1995    case MVT::i64:
1996    case MVT::f32:
1997    case MVT::f64:
1998      return DAG.getNode(SPUISD::PREFSLOT2VEC, dl, Op.getValueType(), Op0, Op0);
1999    }
2000  }
2001}
2002
2003static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) {
2004  EVT VT = Op.getValueType();
2005  SDValue N = Op.getOperand(0);
2006  SDValue Elt = Op.getOperand(1);
2007  DebugLoc dl = Op.getDebugLoc();
2008  SDValue retval;
2009
2010  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) {
2011    // Constant argument:
2012    int EltNo = (int) C->getZExtValue();
2013
2014    // sanity checks:
2015    if (VT == MVT::i8 && EltNo >= 16)
2016      llvm_unreachable("SPU LowerEXTRACT_VECTOR_ELT: i8 extraction slot > 15");
2017    else if (VT == MVT::i16 && EltNo >= 8)
2018      llvm_unreachable("SPU LowerEXTRACT_VECTOR_ELT: i16 extraction slot > 7");
2019    else if (VT == MVT::i32 && EltNo >= 4)
2020      llvm_unreachable("SPU LowerEXTRACT_VECTOR_ELT: i32 extraction slot > 4");
2021    else if (VT == MVT::i64 && EltNo >= 2)
2022      llvm_unreachable("SPU LowerEXTRACT_VECTOR_ELT: i64 extraction slot > 2");
2023
2024    if (EltNo == 0 && (VT == MVT::i32 || VT == MVT::i64)) {
2025      // i32 and i64: Element 0 is the preferred slot
2026      return DAG.getNode(SPUISD::VEC2PREFSLOT, dl, VT, N);
2027    }
2028
2029    // Need to generate shuffle mask and extract:
2030    int prefslot_begin = -1, prefslot_end = -1;
2031    int elt_byte = EltNo * VT.getSizeInBits() / 8;
2032
2033    switch (VT.getSimpleVT().SimpleTy) {
2034    default: llvm_unreachable("Invalid value type!");
2035    case MVT::i8: {
2036      prefslot_begin = prefslot_end = 3;
2037      break;
2038    }
2039    case MVT::i16: {
2040      prefslot_begin = 2; prefslot_end = 3;
2041      break;
2042    }
2043    case MVT::i32:
2044    case MVT::f32: {
2045      prefslot_begin = 0; prefslot_end = 3;
2046      break;
2047    }
2048    case MVT::i64:
2049    case MVT::f64: {
2050      prefslot_begin = 0; prefslot_end = 7;
2051      break;
2052    }
2053    }
2054
2055    assert(prefslot_begin != -1 && prefslot_end != -1 &&
2056           "LowerEXTRACT_VECTOR_ELT: preferred slots uninitialized");
2057
2058    unsigned int ShufBytes[16] = {
2059      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
2060    };
2061    for (int i = 0; i < 16; ++i) {
2062      // zero fill uppper part of preferred slot, don't care about the
2063      // other slots:
2064      unsigned int mask_val;
2065      if (i <= prefslot_end) {
2066        mask_val =
2067          ((i < prefslot_begin)
2068           ? 0x80
2069           : elt_byte + (i - prefslot_begin));
2070
2071        ShufBytes[i] = mask_val;
2072      } else
2073        ShufBytes[i] = ShufBytes[i % (prefslot_end + 1)];
2074    }
2075
2076    SDValue ShufMask[4];
2077    for (unsigned i = 0; i < sizeof(ShufMask)/sizeof(ShufMask[0]); ++i) {
2078      unsigned bidx = i * 4;
2079      unsigned int bits = ((ShufBytes[bidx] << 24) |
2080                           (ShufBytes[bidx+1] << 16) |
2081                           (ShufBytes[bidx+2] << 8) |
2082                           ShufBytes[bidx+3]);
2083      ShufMask[i] = DAG.getConstant(bits, MVT::i32);
2084    }
2085
2086    SDValue ShufMaskVec =
2087      DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
2088                  &ShufMask[0], sizeof(ShufMask)/sizeof(ShufMask[0]));
2089
2090    retval = DAG.getNode(SPUISD::VEC2PREFSLOT, dl, VT,
2091                         DAG.getNode(SPUISD::SHUFB, dl, N.getValueType(),
2092                                     N, N, ShufMaskVec));
2093  } else {
2094    // Variable index: Rotate the requested element into slot 0, then replicate
2095    // slot 0 across the vector
2096    EVT VecVT = N.getValueType();
2097    if (!VecVT.isSimple() || !VecVT.isVector()) {
2098      report_fatal_error("LowerEXTRACT_VECTOR_ELT: Must have a simple, 128-bit"
2099                        "vector type!");
2100    }
2101
2102    // Make life easier by making sure the index is zero-extended to i32
2103    if (Elt.getValueType() != MVT::i32)
2104      Elt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Elt);
2105
2106    // Scale the index to a bit/byte shift quantity
2107    APInt scaleFactor =
2108            APInt(32, uint64_t(16 / N.getValueType().getVectorNumElements()), false);
2109    unsigned scaleShift = scaleFactor.logBase2();
2110    SDValue vecShift;
2111
2112    if (scaleShift > 0) {
2113      // Scale the shift factor:
2114      Elt = DAG.getNode(ISD::SHL, dl, MVT::i32, Elt,
2115                        DAG.getConstant(scaleShift, MVT::i32));
2116    }
2117
2118    vecShift = DAG.getNode(SPUISD::SHL_BYTES, dl, VecVT, N, Elt);
2119
2120    // Replicate the bytes starting at byte 0 across the entire vector (for
2121    // consistency with the notion of a unified register set)
2122    SDValue replicate;
2123
2124    switch (VT.getSimpleVT().SimpleTy) {
2125    default:
2126      report_fatal_error("LowerEXTRACT_VECTOR_ELT(varable): Unhandled vector"
2127                        "type");
2128      /*NOTREACHED*/
2129    case MVT::i8: {
2130      SDValue factor = DAG.getConstant(0x00000000, MVT::i32);
2131      replicate = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
2132                              factor, factor, factor, factor);
2133      break;
2134    }
2135    case MVT::i16: {
2136      SDValue factor = DAG.getConstant(0x00010001, MVT::i32);
2137      replicate = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
2138                              factor, factor, factor, factor);
2139      break;
2140    }
2141    case MVT::i32:
2142    case MVT::f32: {
2143      SDValue factor = DAG.getConstant(0x00010203, MVT::i32);
2144      replicate = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
2145                              factor, factor, factor, factor);
2146      break;
2147    }
2148    case MVT::i64:
2149    case MVT::f64: {
2150      SDValue loFactor = DAG.getConstant(0x00010203, MVT::i32);
2151      SDValue hiFactor = DAG.getConstant(0x04050607, MVT::i32);
2152      replicate = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
2153                              loFactor, hiFactor, loFactor, hiFactor);
2154      break;
2155    }
2156    }
2157
2158    retval = DAG.getNode(SPUISD::VEC2PREFSLOT, dl, VT,
2159                         DAG.getNode(SPUISD::SHUFB, dl, VecVT,
2160                                     vecShift, vecShift, replicate));
2161  }
2162
2163  return retval;
2164}
2165
2166static SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) {
2167  SDValue VecOp = Op.getOperand(0);
2168  SDValue ValOp = Op.getOperand(1);
2169  SDValue IdxOp = Op.getOperand(2);
2170  DebugLoc dl = Op.getDebugLoc();
2171  EVT VT = Op.getValueType();
2172  EVT eltVT = ValOp.getValueType();
2173
2174  // use 0 when the lane to insert to is 'undef'
2175  int64_t Offset=0;
2176  if (IdxOp.getOpcode() != ISD::UNDEF) {
2177    ConstantSDNode *CN = cast<ConstantSDNode>(IdxOp);
2178    assert(CN != 0 && "LowerINSERT_VECTOR_ELT: Index is not constant!");
2179    Offset = (CN->getSExtValue()) * eltVT.getSizeInBits()/8;
2180  }
2181
2182  EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
2183  // Use $sp ($1) because it's always 16-byte aligned and it's available:
2184  SDValue Pointer = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT,
2185                                DAG.getRegister(SPU::R1, PtrVT),
2186                                DAG.getConstant(Offset, PtrVT));
2187  // widen the mask when dealing with half vectors
2188  EVT maskVT = EVT::getVectorVT(*(DAG.getContext()), VT.getVectorElementType(),
2189                                128/ VT.getVectorElementType().getSizeInBits());
2190  SDValue ShufMask = DAG.getNode(SPUISD::SHUFFLE_MASK, dl, maskVT, Pointer);
2191
2192  SDValue result =
2193    DAG.getNode(SPUISD::SHUFB, dl, VT,
2194                DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, ValOp),
2195                VecOp,
2196                DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, ShufMask));
2197
2198  return result;
2199}
2200
2201static SDValue LowerI8Math(SDValue Op, SelectionDAG &DAG, unsigned Opc,
2202                           const TargetLowering &TLI)
2203{
2204  SDValue N0 = Op.getOperand(0);      // Everything has at least one operand
2205  DebugLoc dl = Op.getDebugLoc();
2206  EVT ShiftVT = TLI.getShiftAmountTy(N0.getValueType());
2207
2208  assert(Op.getValueType() == MVT::i8);
2209  switch (Opc) {
2210  default:
2211    llvm_unreachable("Unhandled i8 math operator");
2212  case ISD::ADD: {
2213    // 8-bit addition: Promote the arguments up to 16-bits and truncate
2214    // the result:
2215    SDValue N1 = Op.getOperand(1);
2216    N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i16, N0);
2217    N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i16, N1);
2218    return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8,
2219                       DAG.getNode(Opc, dl, MVT::i16, N0, N1));
2220
2221  }
2222
2223  case ISD::SUB: {
2224    // 8-bit subtraction: Promote the arguments up to 16-bits and truncate
2225    // the result:
2226    SDValue N1 = Op.getOperand(1);
2227    N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i16, N0);
2228    N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i16, N1);
2229    return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8,
2230                       DAG.getNode(Opc, dl, MVT::i16, N0, N1));
2231  }
2232  case ISD::ROTR:
2233  case ISD::ROTL: {
2234    SDValue N1 = Op.getOperand(1);
2235    EVT N1VT = N1.getValueType();
2236
2237    N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, N0);
2238    if (!N1VT.bitsEq(ShiftVT)) {
2239      unsigned N1Opc = N1.getValueType().bitsLT(ShiftVT)
2240                       ? ISD::ZERO_EXTEND
2241                       : ISD::TRUNCATE;
2242      N1 = DAG.getNode(N1Opc, dl, ShiftVT, N1);
2243    }
2244
2245    // Replicate lower 8-bits into upper 8:
2246    SDValue ExpandArg =
2247      DAG.getNode(ISD::OR, dl, MVT::i16, N0,
2248                  DAG.getNode(ISD::SHL, dl, MVT::i16,
2249                              N0, DAG.getConstant(8, MVT::i32)));
2250
2251    // Truncate back down to i8
2252    return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8,
2253                       DAG.getNode(Opc, dl, MVT::i16, ExpandArg, N1));
2254  }
2255  case ISD::SRL:
2256  case ISD::SHL: {
2257    SDValue N1 = Op.getOperand(1);
2258    EVT N1VT = N1.getValueType();
2259
2260    N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, N0);
2261    if (!N1VT.bitsEq(ShiftVT)) {
2262      unsigned N1Opc = ISD::ZERO_EXTEND;
2263
2264      if (N1.getValueType().bitsGT(ShiftVT))
2265        N1Opc = ISD::TRUNCATE;
2266
2267      N1 = DAG.getNode(N1Opc, dl, ShiftVT, N1);
2268    }
2269
2270    return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8,
2271                       DAG.getNode(Opc, dl, MVT::i16, N0, N1));
2272  }
2273  case ISD::SRA: {
2274    SDValue N1 = Op.getOperand(1);
2275    EVT N1VT = N1.getValueType();
2276
2277    N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i16, N0);
2278    if (!N1VT.bitsEq(ShiftVT)) {
2279      unsigned N1Opc = ISD::SIGN_EXTEND;
2280
2281      if (N1VT.bitsGT(ShiftVT))
2282        N1Opc = ISD::TRUNCATE;
2283      N1 = DAG.getNode(N1Opc, dl, ShiftVT, N1);
2284    }
2285
2286    return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8,
2287                       DAG.getNode(Opc, dl, MVT::i16, N0, N1));
2288  }
2289  case ISD::MUL: {
2290    SDValue N1 = Op.getOperand(1);
2291
2292    N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i16, N0);
2293    N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i16, N1);
2294    return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8,
2295                       DAG.getNode(Opc, dl, MVT::i16, N0, N1));
2296  }
2297  }
2298}
2299
2300//! Lower byte immediate operations for v16i8 vectors:
2301static SDValue
2302LowerByteImmed(SDValue Op, SelectionDAG &DAG) {
2303  SDValue ConstVec;
2304  SDValue Arg;
2305  EVT VT = Op.getValueType();
2306  DebugLoc dl = Op.getDebugLoc();
2307
2308  ConstVec = Op.getOperand(0);
2309  Arg = Op.getOperand(1);
2310  if (ConstVec.getNode()->getOpcode() != ISD::BUILD_VECTOR) {
2311    if (ConstVec.getNode()->getOpcode() == ISD::BITCAST) {
2312      ConstVec = ConstVec.getOperand(0);
2313    } else {
2314      ConstVec = Op.getOperand(1);
2315      Arg = Op.getOperand(0);
2316      if (ConstVec.getNode()->getOpcode() == ISD::BITCAST) {
2317        ConstVec = ConstVec.getOperand(0);
2318      }
2319    }
2320  }
2321
2322  if (ConstVec.getNode()->getOpcode() == ISD::BUILD_VECTOR) {
2323    BuildVectorSDNode *BCN = dyn_cast<BuildVectorSDNode>(ConstVec.getNode());
2324    assert(BCN != 0 && "Expected BuildVectorSDNode in SPU LowerByteImmed");
2325
2326    APInt APSplatBits, APSplatUndef;
2327    unsigned SplatBitSize;
2328    bool HasAnyUndefs;
2329    unsigned minSplatBits = VT.getVectorElementType().getSizeInBits();
2330
2331    if (BCN->isConstantSplat(APSplatBits, APSplatUndef, SplatBitSize,
2332                              HasAnyUndefs, minSplatBits)
2333        && minSplatBits <= SplatBitSize) {
2334      uint64_t SplatBits = APSplatBits.getZExtValue();
2335      SDValue tc = DAG.getTargetConstant(SplatBits & 0xff, MVT::i8);
2336
2337      SmallVector<SDValue, 16> tcVec;
2338      tcVec.assign(16, tc);
2339      return DAG.getNode(Op.getNode()->getOpcode(), dl, VT, Arg,
2340                         DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &tcVec[0], tcVec.size()));
2341    }
2342  }
2343
2344  // These operations (AND, OR, XOR) are legal, they just couldn't be custom
2345  // lowered.  Return the operation, rather than a null SDValue.
2346  return Op;
2347}
2348
2349//! Custom lowering for CTPOP (count population)
2350/*!
2351  Custom lowering code that counts the number ones in the input
2352  operand. SPU has such an instruction, but it counts the number of
2353  ones per byte, which then have to be accumulated.
2354*/
2355static SDValue LowerCTPOP(SDValue Op, SelectionDAG &DAG) {
2356  EVT VT = Op.getValueType();
2357  EVT vecVT = EVT::getVectorVT(*DAG.getContext(),
2358                               VT, (128 / VT.getSizeInBits()));
2359  DebugLoc dl = Op.getDebugLoc();
2360
2361  switch (VT.getSimpleVT().SimpleTy) {
2362  default: llvm_unreachable("Invalid value type!");
2363  case MVT::i8: {
2364    SDValue N = Op.getOperand(0);
2365    SDValue Elt0 = DAG.getConstant(0, MVT::i32);
2366
2367    SDValue Promote = DAG.getNode(SPUISD::PREFSLOT2VEC, dl, vecVT, N, N);
2368    SDValue CNTB = DAG.getNode(SPUISD::CNTB, dl, vecVT, Promote);
2369
2370    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i8, CNTB, Elt0);
2371  }
2372
2373  case MVT::i16: {
2374    MachineFunction &MF = DAG.getMachineFunction();
2375    MachineRegisterInfo &RegInfo = MF.getRegInfo();
2376
2377    unsigned CNTB_reg = RegInfo.createVirtualRegister(&SPU::R16CRegClass);
2378
2379    SDValue N = Op.getOperand(0);
2380    SDValue Elt0 = DAG.getConstant(0, MVT::i16);
2381    SDValue Mask0 = DAG.getConstant(0x0f, MVT::i16);
2382    SDValue Shift1 = DAG.getConstant(8, MVT::i32);
2383
2384    SDValue Promote = DAG.getNode(SPUISD::PREFSLOT2VEC, dl, vecVT, N, N);
2385    SDValue CNTB = DAG.getNode(SPUISD::CNTB, dl, vecVT, Promote);
2386
2387    // CNTB_result becomes the chain to which all of the virtual registers
2388    // CNTB_reg, SUM1_reg become associated:
2389    SDValue CNTB_result =
2390      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, CNTB, Elt0);
2391
2392    SDValue CNTB_rescopy =
2393      DAG.getCopyToReg(CNTB_result, dl, CNTB_reg, CNTB_result);
2394
2395    SDValue Tmp1 = DAG.getCopyFromReg(CNTB_rescopy, dl, CNTB_reg, MVT::i16);
2396
2397    return DAG.getNode(ISD::AND, dl, MVT::i16,
2398                       DAG.getNode(ISD::ADD, dl, MVT::i16,
2399                                   DAG.getNode(ISD::SRL, dl, MVT::i16,
2400                                               Tmp1, Shift1),
2401                                   Tmp1),
2402                       Mask0);
2403  }
2404
2405  case MVT::i32: {
2406    MachineFunction &MF = DAG.getMachineFunction();
2407    MachineRegisterInfo &RegInfo = MF.getRegInfo();
2408
2409    unsigned CNTB_reg = RegInfo.createVirtualRegister(&SPU::R32CRegClass);
2410    unsigned SUM1_reg = RegInfo.createVirtualRegister(&SPU::R32CRegClass);
2411
2412    SDValue N = Op.getOperand(0);
2413    SDValue Elt0 = DAG.getConstant(0, MVT::i32);
2414    SDValue Mask0 = DAG.getConstant(0xff, MVT::i32);
2415    SDValue Shift1 = DAG.getConstant(16, MVT::i32);
2416    SDValue Shift2 = DAG.getConstant(8, MVT::i32);
2417
2418    SDValue Promote = DAG.getNode(SPUISD::PREFSLOT2VEC, dl, vecVT, N, N);
2419    SDValue CNTB = DAG.getNode(SPUISD::CNTB, dl, vecVT, Promote);
2420
2421    // CNTB_result becomes the chain to which all of the virtual registers
2422    // CNTB_reg, SUM1_reg become associated:
2423    SDValue CNTB_result =
2424      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, CNTB, Elt0);
2425
2426    SDValue CNTB_rescopy =
2427      DAG.getCopyToReg(CNTB_result, dl, CNTB_reg, CNTB_result);
2428
2429    SDValue Comp1 =
2430      DAG.getNode(ISD::SRL, dl, MVT::i32,
2431                  DAG.getCopyFromReg(CNTB_rescopy, dl, CNTB_reg, MVT::i32),
2432                  Shift1);
2433
2434    SDValue Sum1 =
2435      DAG.getNode(ISD::ADD, dl, MVT::i32, Comp1,
2436                  DAG.getCopyFromReg(CNTB_rescopy, dl, CNTB_reg, MVT::i32));
2437
2438    SDValue Sum1_rescopy =
2439      DAG.getCopyToReg(CNTB_result, dl, SUM1_reg, Sum1);
2440
2441    SDValue Comp2 =
2442      DAG.getNode(ISD::SRL, dl, MVT::i32,
2443                  DAG.getCopyFromReg(Sum1_rescopy, dl, SUM1_reg, MVT::i32),
2444                  Shift2);
2445    SDValue Sum2 =
2446      DAG.getNode(ISD::ADD, dl, MVT::i32, Comp2,
2447                  DAG.getCopyFromReg(Sum1_rescopy, dl, SUM1_reg, MVT::i32));
2448
2449    return DAG.getNode(ISD::AND, dl, MVT::i32, Sum2, Mask0);
2450  }
2451
2452  case MVT::i64:
2453    break;
2454  }
2455
2456  return SDValue();
2457}
2458
2459//! Lower ISD::FP_TO_SINT, ISD::FP_TO_UINT for i32
2460/*!
2461 f32->i32 passes through unchanged, whereas f64->i32 expands to a libcall.
2462 All conversions to i64 are expanded to a libcall.
2463 */
2464static SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG,
2465                              const SPUTargetLowering &TLI) {
2466  EVT OpVT = Op.getValueType();
2467  SDValue Op0 = Op.getOperand(0);
2468  EVT Op0VT = Op0.getValueType();
2469
2470  if ((OpVT == MVT::i32 && Op0VT == MVT::f64)
2471      || OpVT == MVT::i64) {
2472    // Convert f32 / f64 to i32 / i64 via libcall.
2473    RTLIB::Libcall LC =
2474            (Op.getOpcode() == ISD::FP_TO_SINT)
2475             ? RTLIB::getFPTOSINT(Op0VT, OpVT)
2476             : RTLIB::getFPTOUINT(Op0VT, OpVT);
2477    assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpectd fp-to-int conversion!");
2478    SDValue Dummy;
2479    return ExpandLibCall(LC, Op, DAG, false, Dummy, TLI);
2480  }
2481
2482  return Op;
2483}
2484
2485//! Lower ISD::SINT_TO_FP, ISD::UINT_TO_FP for i32
2486/*!
2487 i32->f32 passes through unchanged, whereas i32->f64 is expanded to a libcall.
2488 All conversions from i64 are expanded to a libcall.
2489 */
2490static SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG,
2491                              const SPUTargetLowering &TLI) {
2492  EVT OpVT = Op.getValueType();
2493  SDValue Op0 = Op.getOperand(0);
2494  EVT Op0VT = Op0.getValueType();
2495
2496  if ((OpVT == MVT::f64 && Op0VT == MVT::i32)
2497      || Op0VT == MVT::i64) {
2498    // Convert i32, i64 to f64 via libcall:
2499    RTLIB::Libcall LC =
2500            (Op.getOpcode() == ISD::SINT_TO_FP)
2501             ? RTLIB::getSINTTOFP(Op0VT, OpVT)
2502             : RTLIB::getUINTTOFP(Op0VT, OpVT);
2503    assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpectd int-to-fp conversion!");
2504    SDValue Dummy;
2505    return ExpandLibCall(LC, Op, DAG, false, Dummy, TLI);
2506  }
2507
2508  return Op;
2509}
2510
2511//! Lower ISD::SETCC
2512/*!
2513 This handles MVT::f64 (double floating point) condition lowering
2514 */
2515static SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG,
2516                          const TargetLowering &TLI) {
2517  CondCodeSDNode *CC = dyn_cast<CondCodeSDNode>(Op.getOperand(2));
2518  DebugLoc dl = Op.getDebugLoc();
2519  assert(CC != 0 && "LowerSETCC: CondCodeSDNode should not be null here!\n");
2520
2521  SDValue lhs = Op.getOperand(0);
2522  SDValue rhs = Op.getOperand(1);
2523  EVT lhsVT = lhs.getValueType();
2524  assert(lhsVT == MVT::f64 && "LowerSETCC: type other than MVT::64\n");
2525
2526  EVT ccResultVT = TLI.getSetCCResultType(lhs.getValueType());
2527  APInt ccResultOnes = APInt::getAllOnesValue(ccResultVT.getSizeInBits());
2528  EVT IntVT(MVT::i64);
2529
2530  // Take advantage of the fact that (truncate (sra arg, 32)) is efficiently
2531  // selected to a NOP:
2532  SDValue i64lhs = DAG.getNode(ISD::BITCAST, dl, IntVT, lhs);
2533  SDValue lhsHi32 =
2534          DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
2535                      DAG.getNode(ISD::SRL, dl, IntVT,
2536                                  i64lhs, DAG.getConstant(32, MVT::i32)));
2537  SDValue lhsHi32abs =
2538          DAG.getNode(ISD::AND, dl, MVT::i32,
2539                      lhsHi32, DAG.getConstant(0x7fffffff, MVT::i32));
2540  SDValue lhsLo32 =
2541          DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, i64lhs);
2542
2543  // SETO and SETUO only use the lhs operand:
2544  if (CC->get() == ISD::SETO) {
2545    // Evaluates to true if Op0 is not [SQ]NaN - lowers to the inverse of
2546    // SETUO
2547    APInt ccResultAllOnes = APInt::getAllOnesValue(ccResultVT.getSizeInBits());
2548    return DAG.getNode(ISD::XOR, dl, ccResultVT,
2549                       DAG.getSetCC(dl, ccResultVT,
2550                                    lhs, DAG.getConstantFP(0.0, lhsVT),
2551                                    ISD::SETUO),
2552                       DAG.getConstant(ccResultAllOnes, ccResultVT));
2553  } else if (CC->get() == ISD::SETUO) {
2554    // Evaluates to true if Op0 is [SQ]NaN
2555    return DAG.getNode(ISD::AND, dl, ccResultVT,
2556                       DAG.getSetCC(dl, ccResultVT,
2557                                    lhsHi32abs,
2558                                    DAG.getConstant(0x7ff00000, MVT::i32),
2559                                    ISD::SETGE),
2560                       DAG.getSetCC(dl, ccResultVT,
2561                                    lhsLo32,
2562                                    DAG.getConstant(0, MVT::i32),
2563                                    ISD::SETGT));
2564  }
2565
2566  SDValue i64rhs = DAG.getNode(ISD::BITCAST, dl, IntVT, rhs);
2567  SDValue rhsHi32 =
2568          DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
2569                      DAG.getNode(ISD::SRL, dl, IntVT,
2570                                  i64rhs, DAG.getConstant(32, MVT::i32)));
2571
2572  // If a value is negative, subtract from the sign magnitude constant:
2573  SDValue signMag2TC = DAG.getConstant(0x8000000000000000ULL, IntVT);
2574
2575  // Convert the sign-magnitude representation into 2's complement:
2576  SDValue lhsSelectMask = DAG.getNode(ISD::SRA, dl, ccResultVT,
2577                                      lhsHi32, DAG.getConstant(31, MVT::i32));
2578  SDValue lhsSignMag2TC = DAG.getNode(ISD::SUB, dl, IntVT, signMag2TC, i64lhs);
2579  SDValue lhsSelect =
2580          DAG.getNode(ISD::SELECT, dl, IntVT,
2581                      lhsSelectMask, lhsSignMag2TC, i64lhs);
2582
2583  SDValue rhsSelectMask = DAG.getNode(ISD::SRA, dl, ccResultVT,
2584                                      rhsHi32, DAG.getConstant(31, MVT::i32));
2585  SDValue rhsSignMag2TC = DAG.getNode(ISD::SUB, dl, IntVT, signMag2TC, i64rhs);
2586  SDValue rhsSelect =
2587          DAG.getNode(ISD::SELECT, dl, IntVT,
2588                      rhsSelectMask, rhsSignMag2TC, i64rhs);
2589
2590  unsigned compareOp;
2591
2592  switch (CC->get()) {
2593  case ISD::SETOEQ:
2594  case ISD::SETUEQ:
2595    compareOp = ISD::SETEQ; break;
2596  case ISD::SETOGT:
2597  case ISD::SETUGT:
2598    compareOp = ISD::SETGT; break;
2599  case ISD::SETOGE:
2600  case ISD::SETUGE:
2601    compareOp = ISD::SETGE; break;
2602  case ISD::SETOLT:
2603  case ISD::SETULT:
2604    compareOp = ISD::SETLT; break;
2605  case ISD::SETOLE:
2606  case ISD::SETULE:
2607    compareOp = ISD::SETLE; break;
2608  case ISD::SETUNE:
2609  case ISD::SETONE:
2610    compareOp = ISD::SETNE; break;
2611  default:
2612    report_fatal_error("CellSPU ISel Select: unimplemented f64 condition");
2613  }
2614
2615  SDValue result =
2616          DAG.getSetCC(dl, ccResultVT, lhsSelect, rhsSelect,
2617                       (ISD::CondCode) compareOp);
2618
2619  if ((CC->get() & 0x8) == 0) {
2620    // Ordered comparison:
2621    SDValue lhsNaN = DAG.getSetCC(dl, ccResultVT,
2622                                  lhs, DAG.getConstantFP(0.0, MVT::f64),
2623                                  ISD::SETO);
2624    SDValue rhsNaN = DAG.getSetCC(dl, ccResultVT,
2625                                  rhs, DAG.getConstantFP(0.0, MVT::f64),
2626                                  ISD::SETO);
2627    SDValue ordered = DAG.getNode(ISD::AND, dl, ccResultVT, lhsNaN, rhsNaN);
2628
2629    result = DAG.getNode(ISD::AND, dl, ccResultVT, ordered, result);
2630  }
2631
2632  return result;
2633}
2634
2635//! Lower ISD::SELECT_CC
2636/*!
2637  ISD::SELECT_CC can (generally) be implemented directly on the SPU using the
2638  SELB instruction.
2639
2640  \note Need to revisit this in the future: if the code path through the true
2641  and false value computations is longer than the latency of a branch (6
2642  cycles), then it would be more advantageous to branch and insert a new basic
2643  block and branch on the condition. However, this code does not make that
2644  assumption, given the simplisitc uses so far.
2645 */
2646
2647static SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG,
2648                              const TargetLowering &TLI) {
2649  EVT VT = Op.getValueType();
2650  SDValue lhs = Op.getOperand(0);
2651  SDValue rhs = Op.getOperand(1);
2652  SDValue trueval = Op.getOperand(2);
2653  SDValue falseval = Op.getOperand(3);
2654  SDValue condition = Op.getOperand(4);
2655  DebugLoc dl = Op.getDebugLoc();
2656
2657  // NOTE: SELB's arguments: $rA, $rB, $mask
2658  //
2659  // SELB selects bits from $rA where bits in $mask are 0, bits from $rB
2660  // where bits in $mask are 1. CCond will be inverted, having 1s where the
2661  // condition was true and 0s where the condition was false. Hence, the
2662  // arguments to SELB get reversed.
2663
2664  // Note: Really should be ISD::SELECT instead of SPUISD::SELB, but LLVM's
2665  // legalizer insists on combining SETCC/SELECT into SELECT_CC, so we end up
2666  // with another "cannot select select_cc" assert:
2667
2668  SDValue compare = DAG.getNode(ISD::SETCC, dl,
2669                                TLI.getSetCCResultType(Op.getValueType()),
2670                                lhs, rhs, condition);
2671  return DAG.getNode(SPUISD::SELB, dl, VT, falseval, trueval, compare);
2672}
2673
2674//! Custom lower ISD::TRUNCATE
2675static SDValue LowerTRUNCATE(SDValue Op, SelectionDAG &DAG)
2676{
2677  // Type to truncate to
2678  EVT VT = Op.getValueType();
2679  MVT simpleVT = VT.getSimpleVT();
2680  EVT VecVT = EVT::getVectorVT(*DAG.getContext(),
2681                               VT, (128 / VT.getSizeInBits()));
2682  DebugLoc dl = Op.getDebugLoc();
2683
2684  // Type to truncate from
2685  SDValue Op0 = Op.getOperand(0);
2686  EVT Op0VT = Op0.getValueType();
2687
2688  if (Op0VT == MVT::i128 && simpleVT == MVT::i64) {
2689    // Create shuffle mask, least significant doubleword of quadword
2690    unsigned maskHigh = 0x08090a0b;
2691    unsigned maskLow = 0x0c0d0e0f;
2692    // Use a shuffle to perform the truncation
2693    SDValue shufMask = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
2694                                   DAG.getConstant(maskHigh, MVT::i32),
2695                                   DAG.getConstant(maskLow, MVT::i32),
2696                                   DAG.getConstant(maskHigh, MVT::i32),
2697                                   DAG.getConstant(maskLow, MVT::i32));
2698
2699    SDValue truncShuffle = DAG.getNode(SPUISD::SHUFB, dl, VecVT,
2700                                       Op0, Op0, shufMask);
2701
2702    return DAG.getNode(SPUISD::VEC2PREFSLOT, dl, VT, truncShuffle);
2703  }
2704
2705  return SDValue();             // Leave the truncate unmolested
2706}
2707
2708/*!
2709 * Emit the instruction sequence for i64/i32 -> i128 sign extend. The basic
2710 * algorithm is to duplicate the sign bit using rotmai to generate at
2711 * least one byte full of sign bits. Then propagate the "sign-byte" into
2712 * the leftmost words and the i64/i32 into the rightmost words using shufb.
2713 *
2714 * @param Op The sext operand
2715 * @param DAG The current DAG
2716 * @return The SDValue with the entire instruction sequence
2717 */
2718static SDValue LowerSIGN_EXTEND(SDValue Op, SelectionDAG &DAG)
2719{
2720  DebugLoc dl = Op.getDebugLoc();
2721
2722  // Type to extend to
2723  MVT OpVT = Op.getValueType().getSimpleVT();
2724
2725  // Type to extend from
2726  SDValue Op0 = Op.getOperand(0);
2727  MVT Op0VT = Op0.getValueType().getSimpleVT();
2728
2729  // extend i8 & i16 via i32
2730  if (Op0VT == MVT::i8 || Op0VT == MVT::i16) {
2731    Op0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i32, Op0);
2732    Op0VT = MVT::i32;
2733  }
2734
2735  // The type to extend to needs to be a i128 and
2736  // the type to extend from needs to be i64 or i32.
2737  assert((OpVT == MVT::i128 && (Op0VT == MVT::i64 || Op0VT == MVT::i32)) &&
2738          "LowerSIGN_EXTEND: input and/or output operand have wrong size");
2739  (void)OpVT;
2740
2741  // Create shuffle mask
2742  unsigned mask1 = 0x10101010; // byte 0 - 3 and 4 - 7
2743  unsigned mask2 = Op0VT == MVT::i64 ? 0x00010203 : 0x10101010; // byte  8 - 11
2744  unsigned mask3 = Op0VT == MVT::i64 ? 0x04050607 : 0x00010203; // byte 12 - 15
2745  SDValue shufMask = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
2746                                 DAG.getConstant(mask1, MVT::i32),
2747                                 DAG.getConstant(mask1, MVT::i32),
2748                                 DAG.getConstant(mask2, MVT::i32),
2749                                 DAG.getConstant(mask3, MVT::i32));
2750
2751  // Word wise arithmetic right shift to generate at least one byte
2752  // that contains sign bits.
2753  MVT mvt = Op0VT == MVT::i64 ? MVT::v2i64 : MVT::v4i32;
2754  SDValue sraVal = DAG.getNode(ISD::SRA,
2755                 dl,
2756                 mvt,
2757                 DAG.getNode(SPUISD::PREFSLOT2VEC, dl, mvt, Op0, Op0),
2758                 DAG.getConstant(31, MVT::i32));
2759
2760  // reinterpret as a i128 (SHUFB requires it). This gets lowered away.
2761  SDValue extended = SDValue(DAG.getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
2762                                        dl, Op0VT, Op0,
2763                                        DAG.getTargetConstant(
2764                                                  SPU::GPRCRegClass.getID(),
2765                                                  MVT::i32)), 0);
2766  // Shuffle bytes - Copy the sign bits into the upper 64 bits
2767  // and the input value into the lower 64 bits.
2768  SDValue extShuffle = DAG.getNode(SPUISD::SHUFB, dl, mvt,
2769        extended, sraVal, shufMask);
2770  return DAG.getNode(ISD::BITCAST, dl, MVT::i128, extShuffle);
2771}
2772
2773//! Custom (target-specific) lowering entry point
2774/*!
2775  This is where LLVM's DAG selection process calls to do target-specific
2776  lowering of nodes.
2777 */
2778SDValue
2779SPUTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const
2780{
2781  unsigned Opc = (unsigned) Op.getOpcode();
2782  EVT VT = Op.getValueType();
2783
2784  switch (Opc) {
2785  default: {
2786#ifndef NDEBUG
2787    errs() << "SPUTargetLowering::LowerOperation(): need to lower this!\n";
2788    errs() << "Op.getOpcode() = " << Opc << "\n";
2789    errs() << "*Op.getNode():\n";
2790    Op.getNode()->dump();
2791#endif
2792    llvm_unreachable(0);
2793  }
2794  case ISD::LOAD:
2795  case ISD::EXTLOAD:
2796  case ISD::SEXTLOAD:
2797  case ISD::ZEXTLOAD:
2798    return LowerLOAD(Op, DAG, SPUTM.getSubtargetImpl());
2799  case ISD::STORE:
2800    return LowerSTORE(Op, DAG, SPUTM.getSubtargetImpl());
2801  case ISD::ConstantPool:
2802    return LowerConstantPool(Op, DAG, SPUTM.getSubtargetImpl());
2803  case ISD::GlobalAddress:
2804    return LowerGlobalAddress(Op, DAG, SPUTM.getSubtargetImpl());
2805  case ISD::JumpTable:
2806    return LowerJumpTable(Op, DAG, SPUTM.getSubtargetImpl());
2807  case ISD::ConstantFP:
2808    return LowerConstantFP(Op, DAG);
2809
2810  // i8, i64 math ops:
2811  case ISD::ADD:
2812  case ISD::SUB:
2813  case ISD::ROTR:
2814  case ISD::ROTL:
2815  case ISD::SRL:
2816  case ISD::SHL:
2817  case ISD::SRA: {
2818    if (VT == MVT::i8)
2819      return LowerI8Math(Op, DAG, Opc, *this);
2820    break;
2821  }
2822
2823  case ISD::FP_TO_SINT:
2824  case ISD::FP_TO_UINT:
2825    return LowerFP_TO_INT(Op, DAG, *this);
2826
2827  case ISD::SINT_TO_FP:
2828  case ISD::UINT_TO_FP:
2829    return LowerINT_TO_FP(Op, DAG, *this);
2830
2831  // Vector-related lowering.
2832  case ISD::BUILD_VECTOR:
2833    return LowerBUILD_VECTOR(Op, DAG);
2834  case ISD::SCALAR_TO_VECTOR:
2835    return LowerSCALAR_TO_VECTOR(Op, DAG);
2836  case ISD::VECTOR_SHUFFLE:
2837    return LowerVECTOR_SHUFFLE(Op, DAG);
2838  case ISD::EXTRACT_VECTOR_ELT:
2839    return LowerEXTRACT_VECTOR_ELT(Op, DAG);
2840  case ISD::INSERT_VECTOR_ELT:
2841    return LowerINSERT_VECTOR_ELT(Op, DAG);
2842
2843  // Look for ANDBI, ORBI and XORBI opportunities and lower appropriately:
2844  case ISD::AND:
2845  case ISD::OR:
2846  case ISD::XOR:
2847    return LowerByteImmed(Op, DAG);
2848
2849  // Vector and i8 multiply:
2850  case ISD::MUL:
2851    if (VT == MVT::i8)
2852      return LowerI8Math(Op, DAG, Opc, *this);
2853
2854  case ISD::CTPOP:
2855    return LowerCTPOP(Op, DAG);
2856
2857  case ISD::SELECT_CC:
2858    return LowerSELECT_CC(Op, DAG, *this);
2859
2860  case ISD::SETCC:
2861    return LowerSETCC(Op, DAG, *this);
2862
2863  case ISD::TRUNCATE:
2864    return LowerTRUNCATE(Op, DAG);
2865
2866  case ISD::SIGN_EXTEND:
2867    return LowerSIGN_EXTEND(Op, DAG);
2868  }
2869
2870  return SDValue();
2871}
2872
2873void SPUTargetLowering::ReplaceNodeResults(SDNode *N,
2874                                           SmallVectorImpl<SDValue>&Results,
2875                                           SelectionDAG &DAG) const
2876{
2877#if 0
2878  unsigned Opc = (unsigned) N->getOpcode();
2879  EVT OpVT = N->getValueType(0);
2880
2881  switch (Opc) {
2882  default: {
2883    errs() << "SPUTargetLowering::ReplaceNodeResults(): need to fix this!\n";
2884    errs() << "Op.getOpcode() = " << Opc << "\n";
2885    errs() << "*Op.getNode():\n";
2886    N->dump();
2887    abort();
2888    /*NOTREACHED*/
2889  }
2890  }
2891#endif
2892
2893  /* Otherwise, return unchanged */
2894}
2895
2896//===----------------------------------------------------------------------===//
2897// Target Optimization Hooks
2898//===----------------------------------------------------------------------===//
2899
2900SDValue
2901SPUTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const
2902{
2903#if 0
2904  TargetMachine &TM = getTargetMachine();
2905#endif
2906  const SPUSubtarget *ST = SPUTM.getSubtargetImpl();
2907  SelectionDAG &DAG = DCI.DAG;
2908  SDValue Op0 = N->getOperand(0);       // everything has at least one operand
2909  EVT NodeVT = N->getValueType(0);      // The node's value type
2910  EVT Op0VT = Op0.getValueType();       // The first operand's result
2911  SDValue Result;                       // Initially, empty result
2912  DebugLoc dl = N->getDebugLoc();
2913
2914  switch (N->getOpcode()) {
2915  default: break;
2916  case ISD::ADD: {
2917    SDValue Op1 = N->getOperand(1);
2918
2919    if (Op0.getOpcode() == SPUISD::IndirectAddr
2920        || Op1.getOpcode() == SPUISD::IndirectAddr) {
2921      // Normalize the operands to reduce repeated code
2922      SDValue IndirectArg = Op0, AddArg = Op1;
2923
2924      if (Op1.getOpcode() == SPUISD::IndirectAddr) {
2925        IndirectArg = Op1;
2926        AddArg = Op0;
2927      }
2928
2929      if (isa<ConstantSDNode>(AddArg)) {
2930        ConstantSDNode *CN0 = cast<ConstantSDNode > (AddArg);
2931        SDValue IndOp1 = IndirectArg.getOperand(1);
2932
2933        if (CN0->isNullValue()) {
2934          // (add (SPUindirect <arg>, <arg>), 0) ->
2935          // (SPUindirect <arg>, <arg>)
2936
2937#if !defined(NDEBUG)
2938          if (DebugFlag && isCurrentDebugType(DEBUG_TYPE)) {
2939            errs() << "\n"
2940                 << "Replace: (add (SPUindirect <arg>, <arg>), 0)\n"
2941                 << "With:    (SPUindirect <arg>, <arg>)\n";
2942          }
2943#endif
2944
2945          return IndirectArg;
2946        } else if (isa<ConstantSDNode>(IndOp1)) {
2947          // (add (SPUindirect <arg>, <const>), <const>) ->
2948          // (SPUindirect <arg>, <const + const>)
2949          ConstantSDNode *CN1 = cast<ConstantSDNode > (IndOp1);
2950          int64_t combinedConst = CN0->getSExtValue() + CN1->getSExtValue();
2951          SDValue combinedValue = DAG.getConstant(combinedConst, Op0VT);
2952
2953#if !defined(NDEBUG)
2954          if (DebugFlag && isCurrentDebugType(DEBUG_TYPE)) {
2955            errs() << "\n"
2956                 << "Replace: (add (SPUindirect <arg>, " << CN1->getSExtValue()
2957                 << "), " << CN0->getSExtValue() << ")\n"
2958                 << "With:    (SPUindirect <arg>, "
2959                 << combinedConst << ")\n";
2960          }
2961#endif
2962
2963          return DAG.getNode(SPUISD::IndirectAddr, dl, Op0VT,
2964                             IndirectArg, combinedValue);
2965        }
2966      }
2967    }
2968    break;
2969  }
2970  case ISD::SIGN_EXTEND:
2971  case ISD::ZERO_EXTEND:
2972  case ISD::ANY_EXTEND: {
2973    if (Op0.getOpcode() == SPUISD::VEC2PREFSLOT && NodeVT == Op0VT) {
2974      // (any_extend (SPUextract_elt0 <arg>)) ->
2975      // (SPUextract_elt0 <arg>)
2976      // Types must match, however...
2977#if !defined(NDEBUG)
2978      if (DebugFlag && isCurrentDebugType(DEBUG_TYPE)) {
2979        errs() << "\nReplace: ";
2980        N->dump(&DAG);
2981        errs() << "\nWith:    ";
2982        Op0.getNode()->dump(&DAG);
2983        errs() << "\n";
2984      }
2985#endif
2986
2987      return Op0;
2988    }
2989    break;
2990  }
2991  case SPUISD::IndirectAddr: {
2992    if (!ST->usingLargeMem() && Op0.getOpcode() == SPUISD::AFormAddr) {
2993      ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N->getOperand(1));
2994      if (CN != 0 && CN->isNullValue()) {
2995        // (SPUindirect (SPUaform <addr>, 0), 0) ->
2996        // (SPUaform <addr>, 0)
2997
2998        DEBUG(errs() << "Replace: ");
2999        DEBUG(N->dump(&DAG));
3000        DEBUG(errs() << "\nWith:    ");
3001        DEBUG(Op0.getNode()->dump(&DAG));
3002        DEBUG(errs() << "\n");
3003
3004        return Op0;
3005      }
3006    } else if (Op0.getOpcode() == ISD::ADD) {
3007      SDValue Op1 = N->getOperand(1);
3008      if (ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(Op1)) {
3009        // (SPUindirect (add <arg>, <arg>), 0) ->
3010        // (SPUindirect <arg>, <arg>)
3011        if (CN1->isNullValue()) {
3012
3013#if !defined(NDEBUG)
3014          if (DebugFlag && isCurrentDebugType(DEBUG_TYPE)) {
3015            errs() << "\n"
3016                 << "Replace: (SPUindirect (add <arg>, <arg>), 0)\n"
3017                 << "With:    (SPUindirect <arg>, <arg>)\n";
3018          }
3019#endif
3020
3021          return DAG.getNode(SPUISD::IndirectAddr, dl, Op0VT,
3022                             Op0.getOperand(0), Op0.getOperand(1));
3023        }
3024      }
3025    }
3026    break;
3027  }
3028  case SPUISD::SHL_BITS:
3029  case SPUISD::SHL_BYTES:
3030  case SPUISD::ROTBYTES_LEFT: {
3031    SDValue Op1 = N->getOperand(1);
3032
3033    // Kill degenerate vector shifts:
3034    if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Op1)) {
3035      if (CN->isNullValue()) {
3036        Result = Op0;
3037      }
3038    }
3039    break;
3040  }
3041  case SPUISD::PREFSLOT2VEC: {
3042    switch (Op0.getOpcode()) {
3043    default:
3044      break;
3045    case ISD::ANY_EXTEND:
3046    case ISD::ZERO_EXTEND:
3047    case ISD::SIGN_EXTEND: {
3048      // (SPUprefslot2vec (any|zero|sign_extend (SPUvec2prefslot <arg>))) ->
3049      // <arg>
3050      // but only if the SPUprefslot2vec and <arg> types match.
3051      SDValue Op00 = Op0.getOperand(0);
3052      if (Op00.getOpcode() == SPUISD::VEC2PREFSLOT) {
3053        SDValue Op000 = Op00.getOperand(0);
3054        if (Op000.getValueType() == NodeVT) {
3055          Result = Op000;
3056        }
3057      }
3058      break;
3059    }
3060    case SPUISD::VEC2PREFSLOT: {
3061      // (SPUprefslot2vec (SPUvec2prefslot <arg>)) ->
3062      // <arg>
3063      Result = Op0.getOperand(0);
3064      break;
3065    }
3066    }
3067    break;
3068  }
3069  }
3070
3071  // Otherwise, return unchanged.
3072#ifndef NDEBUG
3073  if (Result.getNode()) {
3074    DEBUG(errs() << "\nReplace.SPU: ");
3075    DEBUG(N->dump(&DAG));
3076    DEBUG(errs() << "\nWith:        ");
3077    DEBUG(Result.getNode()->dump(&DAG));
3078    DEBUG(errs() << "\n");
3079  }
3080#endif
3081
3082  return Result;
3083}
3084
3085//===----------------------------------------------------------------------===//
3086// Inline Assembly Support
3087//===----------------------------------------------------------------------===//
3088
3089/// getConstraintType - Given a constraint letter, return the type of
3090/// constraint it is for this target.
3091SPUTargetLowering::ConstraintType
3092SPUTargetLowering::getConstraintType(const std::string &ConstraintLetter) const {
3093  if (ConstraintLetter.size() == 1) {
3094    switch (ConstraintLetter[0]) {
3095    default: break;
3096    case 'b':
3097    case 'r':
3098    case 'f':
3099    case 'v':
3100    case 'y':
3101      return C_RegisterClass;
3102    }
3103  }
3104  return TargetLowering::getConstraintType(ConstraintLetter);
3105}
3106
3107/// Examine constraint type and operand type and determine a weight value.
3108/// This object must already have been set up with the operand type
3109/// and the current alternative constraint selected.
3110TargetLowering::ConstraintWeight
3111SPUTargetLowering::getSingleConstraintMatchWeight(
3112    AsmOperandInfo &info, const char *constraint) const {
3113  ConstraintWeight weight = CW_Invalid;
3114  Value *CallOperandVal = info.CallOperandVal;
3115    // If we don't have a value, we can't do a match,
3116    // but allow it at the lowest weight.
3117  if (CallOperandVal == NULL)
3118    return CW_Default;
3119  // Look at the constraint type.
3120  switch (*constraint) {
3121  default:
3122    weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
3123    break;
3124    //FIXME: Seems like the supported constraint letters were just copied
3125    // from PPC, as the following doesn't correspond to the GCC docs.
3126    // I'm leaving it so until someone adds the corresponding lowering support.
3127  case 'b':
3128  case 'r':
3129  case 'f':
3130  case 'd':
3131  case 'v':
3132  case 'y':
3133    weight = CW_Register;
3134    break;
3135  }
3136  return weight;
3137}
3138
3139std::pair<unsigned, const TargetRegisterClass*>
3140SPUTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
3141                                                EVT VT) const
3142{
3143  if (Constraint.size() == 1) {
3144    // GCC RS6000 Constraint Letters
3145    switch (Constraint[0]) {
3146    case 'b':   // R1-R31
3147    case 'r':   // R0-R31
3148      if (VT == MVT::i64)
3149        return std::make_pair(0U, &SPU::R64CRegClass);
3150      return std::make_pair(0U, &SPU::R32CRegClass);
3151    case 'f':
3152      if (VT == MVT::f32)
3153        return std::make_pair(0U, &SPU::R32FPRegClass);
3154      if (VT == MVT::f64)
3155        return std::make_pair(0U, &SPU::R64FPRegClass);
3156      break;
3157    case 'v':
3158      return std::make_pair(0U, &SPU::GPRCRegClass);
3159    }
3160  }
3161
3162  return TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
3163}
3164
3165//! Compute used/known bits for a SPU operand
3166void
3167SPUTargetLowering::computeMaskedBitsForTargetNode(const SDValue Op,
3168                                                  APInt &KnownZero,
3169                                                  APInt &KnownOne,
3170                                                  const SelectionDAG &DAG,
3171                                                  unsigned Depth ) const {
3172#if 0
3173  const uint64_t uint64_sizebits = sizeof(uint64_t) * CHAR_BIT;
3174
3175  switch (Op.getOpcode()) {
3176  default:
3177    // KnownZero = KnownOne = APInt(Mask.getBitWidth(), 0);
3178    break;
3179  case CALL:
3180  case SHUFB:
3181  case SHUFFLE_MASK:
3182  case CNTB:
3183  case SPUISD::PREFSLOT2VEC:
3184  case SPUISD::LDRESULT:
3185  case SPUISD::VEC2PREFSLOT:
3186  case SPUISD::SHLQUAD_L_BITS:
3187  case SPUISD::SHLQUAD_L_BYTES:
3188  case SPUISD::VEC_ROTL:
3189  case SPUISD::VEC_ROTR:
3190  case SPUISD::ROTBYTES_LEFT:
3191  case SPUISD::SELECT_MASK:
3192  case SPUISD::SELB:
3193  }
3194#endif
3195}
3196
3197unsigned
3198SPUTargetLowering::ComputeNumSignBitsForTargetNode(SDValue Op,
3199                                                   unsigned Depth) const {
3200  switch (Op.getOpcode()) {
3201  default:
3202    return 1;
3203
3204  case ISD::SETCC: {
3205    EVT VT = Op.getValueType();
3206
3207    if (VT != MVT::i8 && VT != MVT::i16 && VT != MVT::i32) {
3208      VT = MVT::i32;
3209    }
3210    return VT.getSizeInBits();
3211  }
3212  }
3213}
3214
3215// LowerAsmOperandForConstraint
3216void
3217SPUTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
3218                                                std::string &Constraint,
3219                                                std::vector<SDValue> &Ops,
3220                                                SelectionDAG &DAG) const {
3221  // Default, for the time being, to the base class handler
3222  TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
3223}
3224
3225/// isLegalAddressImmediate - Return true if the integer value can be used
3226/// as the offset of the target addressing mode.
3227bool SPUTargetLowering::isLegalAddressImmediate(int64_t V,
3228                                                Type *Ty) const {
3229  // SPU's addresses are 256K:
3230  return (V > -(1 << 18) && V < (1 << 18) - 1);
3231}
3232
3233bool SPUTargetLowering::isLegalAddressImmediate(GlobalValue* GV) const {
3234  return false;
3235}
3236
3237bool
3238SPUTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
3239  // The SPU target isn't yet aware of offsets.
3240  return false;
3241}
3242
3243// can we compare to Imm without writing it into a register?
3244bool SPUTargetLowering::isLegalICmpImmediate(int64_t Imm) const {
3245  //ceqi, cgti, etc. all take s10 operand
3246  return isInt<10>(Imm);
3247}
3248
3249bool
3250SPUTargetLowering::isLegalAddressingMode(const AddrMode &AM,
3251                                         Type * ) const{
3252
3253  // A-form: 18bit absolute address.
3254  if (AM.BaseGV && !AM.HasBaseReg && AM.Scale == 0 && AM.BaseOffs == 0)
3255    return true;
3256
3257  // D-form: reg + 14bit offset
3258  if (AM.BaseGV ==0 && AM.HasBaseReg && AM.Scale == 0 && isInt<14>(AM.BaseOffs))
3259    return true;
3260
3261  // X-form: reg+reg
3262  if (AM.BaseGV == 0 && AM.HasBaseReg && AM.Scale == 1 && AM.BaseOffs ==0)
3263    return true;
3264
3265  return false;
3266}
3267