NVPTXISelLowering.cpp revision 296417
//===-- NVPTXISelLowering.cpp - NVPTX DAG Lowering Implementation --------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file defines the interfaces that NVPTX uses to lower LLVM code into a
// selection DAG.
//
//===----------------------------------------------------------------------===//

#include "NVPTXISelLowering.h"
#include "NVPTX.h"
#include "NVPTXTargetMachine.h"
#include "NVPTXTargetObjectFile.h"
#include "NVPTXUtilities.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"
#include "llvm/MC/MCSectionELF.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include <sstream>

#undef DEBUG_TYPE
#define DEBUG_TYPE "nvptx-lower"

using namespace llvm;

static unsigned int uniqueCallSite = 0;

static cl::opt<bool> sched4reg(
    "nvptx-sched4reg",
    cl::desc("NVPTX Specific: schedule for register pressure"), cl::init(false));

static cl::opt<unsigned>
FMAContractLevelOpt("nvptx-fma-level", cl::ZeroOrMore, cl::Hidden,
                    cl::desc("NVPTX Specific: FMA contraction (0: don't do it,"
                             " 1: do it, 2: do it aggressively)"),
                    cl::init(2));

static bool IsPTXVectorType(MVT VT) {
  switch (VT.SimpleTy) {
  default:
    return false;
  case MVT::v2i1:
  case MVT::v4i1:
  case MVT::v2i8:
  case MVT::v4i8:
  case MVT::v2i16:
  case MVT::v4i16:
  case MVT::v2i32:
  case MVT::v4i32:
  case MVT::v2i64:
  case MVT::v2f32:
  case MVT::v4f32:
  case MVT::v2f64:
    return true;
  }
}
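// Illustrative note (not in the original source): the types listed above map
// directly to PTX vector memory instructions; e.g. a v2f32 access can be
// emitted as a single ld.v2.f32 / st.v2.f32.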

/// ComputePTXValueVTs - For the given Type \p Ty, returns the set of primitive
/// EVTs that compose it.  Unlike ComputeValueVTs, this will break apart vectors
/// into their primitive components.
/// NOTE: This is a band-aid for code that expects ComputeValueVTs to return the
/// same number of types as the Ins/Outs arrays in LowerFormalArguments,
/// LowerCall, and LowerReturn.
static void ComputePTXValueVTs(const TargetLowering &TLI, const DataLayout &DL,
                               Type *Ty, SmallVectorImpl<EVT> &ValueVTs,
                               SmallVectorImpl<uint64_t> *Offsets = nullptr,
                               uint64_t StartingOffset = 0) {
  SmallVector<EVT, 16> TempVTs;
  SmallVector<uint64_t, 16> TempOffsets;

  ComputeValueVTs(TLI, DL, Ty, TempVTs, &TempOffsets, StartingOffset);
  for (unsigned i = 0, e = TempVTs.size(); i != e; ++i) {
    EVT VT = TempVTs[i];
    uint64_t Off = TempOffsets[i];
    if (VT.isVector())
      for (unsigned j = 0, je = VT.getVectorNumElements(); j != je; ++j) {
        ValueVTs.push_back(VT.getVectorElementType());
        if (Offsets)
          Offsets->push_back(Off + j * VT.getVectorElementType().getStoreSize());
      }
    else {
      ValueVTs.push_back(VT);
      if (Offsets)
        Offsets->push_back(Off);
    }
  }
}
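// Illustrative example (not in the original source): for Ty = <2 x float>,
// ComputeValueVTs yields a single v2f32 part at offset 0, whereas
// ComputePTXValueVTs flattens it into two f32 parts at offsets 0 and 4
// (f32's store size), giving one entry per primitive Ins/Outs component.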

// NVPTXTargetLowering Constructor.
NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
                                         const NVPTXSubtarget &STI)
    : TargetLowering(TM), nvTM(&TM), STI(STI) {

  // Always lower memset, memcpy, and memmove intrinsics to load/store
  // instructions, rather than generating calls to memset, memcpy, or memmove.
  MaxStoresPerMemset = (unsigned) 0xFFFFFFFF;
  MaxStoresPerMemcpy = (unsigned) 0xFFFFFFFF;
  MaxStoresPerMemmove = (unsigned) 0xFFFFFFFF;

  setBooleanContents(ZeroOrNegativeOneBooleanContent);
  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);

  // Jump is Expensive. Don't create extra control flow for 'and', 'or'
  // condition branches.
  setJumpIsExpensive(true);

  // Wide divides are _very_ slow. Try to reduce the width of the divide if
  // possible.
  addBypassSlowDiv(64, 32);
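  // Illustrative sketch (not from the original source): for "%q = udiv i64
  // %a, %b", the bypass transform emits a runtime check of the operands'
  // high bits, roughly:
  //   if (((a | b) >> 32) == 0) q = (uint32_t)a / (uint32_t)b; else q = a / b;
  // so the common narrow case takes the much faster 32-bit divide.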

  // By default, use the Source scheduling
  if (sched4reg)
    setSchedulingPreference(Sched::RegPressure);
  else
    setSchedulingPreference(Sched::Source);

  addRegisterClass(MVT::i1, &NVPTX::Int1RegsRegClass);
  addRegisterClass(MVT::i16, &NVPTX::Int16RegsRegClass);
  addRegisterClass(MVT::i32, &NVPTX::Int32RegsRegClass);
  addRegisterClass(MVT::i64, &NVPTX::Int64RegsRegClass);
  addRegisterClass(MVT::f32, &NVPTX::Float32RegsRegClass);
  addRegisterClass(MVT::f64, &NVPTX::Float64RegsRegClass);

  // Operations not directly supported by NVPTX.
  setOperationAction(ISD::SELECT_CC, MVT::f32, Expand);
  setOperationAction(ISD::SELECT_CC, MVT::f64, Expand);
  setOperationAction(ISD::SELECT_CC, MVT::i1, Expand);
  setOperationAction(ISD::SELECT_CC, MVT::i8, Expand);
  setOperationAction(ISD::SELECT_CC, MVT::i16, Expand);
  setOperationAction(ISD::SELECT_CC, MVT::i32, Expand);
  setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
  setOperationAction(ISD::BR_CC, MVT::f32, Expand);
  setOperationAction(ISD::BR_CC, MVT::f64, Expand);
  setOperationAction(ISD::BR_CC, MVT::i1, Expand);
  setOperationAction(ISD::BR_CC, MVT::i8, Expand);
  setOperationAction(ISD::BR_CC, MVT::i16, Expand);
  setOperationAction(ISD::BR_CC, MVT::i32, Expand);
  setOperationAction(ISD::BR_CC, MVT::i64, Expand);
  // Some SIGN_EXTEND_INREG can be done using cvt instruction.
  // For others we will expand to a SHL/SRA pair.
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i64, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);

  setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SHL_PARTS, MVT::i64, Custom);
  setOperationAction(ISD::SRA_PARTS, MVT::i64, Custom);
  setOperationAction(ISD::SRL_PARTS, MVT::i64, Custom);

  if (STI.hasROT64()) {
    setOperationAction(ISD::ROTL, MVT::i64, Legal);
    setOperationAction(ISD::ROTR, MVT::i64, Legal);
  } else {
    setOperationAction(ISD::ROTL, MVT::i64, Expand);
    setOperationAction(ISD::ROTR, MVT::i64, Expand);
  }
  if (STI.hasROT32()) {
    setOperationAction(ISD::ROTL, MVT::i32, Legal);
    setOperationAction(ISD::ROTR, MVT::i32, Legal);
  } else {
    setOperationAction(ISD::ROTL, MVT::i32, Expand);
    setOperationAction(ISD::ROTR, MVT::i32, Expand);
  }

  setOperationAction(ISD::ROTL, MVT::i16, Expand);
  setOperationAction(ISD::ROTR, MVT::i16, Expand);
  setOperationAction(ISD::ROTL, MVT::i8, Expand);
  setOperationAction(ISD::ROTR, MVT::i8, Expand);
  setOperationAction(ISD::BSWAP, MVT::i16, Expand);
  setOperationAction(ISD::BSWAP, MVT::i32, Expand);
  setOperationAction(ISD::BSWAP, MVT::i64, Expand);

  // Indirect branch is not supported.
  // This also disables Jump Table creation.
  setOperationAction(ISD::BR_JT, MVT::Other, Expand);
  setOperationAction(ISD::BRIND, MVT::Other, Expand);

  setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
  setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);

  // We want to legalize constant-related memmove and memcpy
  // intrinsics.
  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);

  // Turn FP extload into load/fextend
  setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand);
  // Turn FP truncstore into trunc + store.
  // FIXME: vector types should also be expanded
  setTruncStoreAction(MVT::f32, MVT::f16, Expand);
  setTruncStoreAction(MVT::f64, MVT::f16, Expand);
  setTruncStoreAction(MVT::f64, MVT::f32, Expand);

  // PTX does not support load / store predicate registers
  setOperationAction(ISD::LOAD, MVT::i1, Custom);
  setOperationAction(ISD::STORE, MVT::i1, Custom);

  for (MVT VT : MVT::integer_valuetypes()) {
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
    setTruncStoreAction(VT, MVT::i1, Expand);
  }

  // This is legal in NVPTX
  setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
  setOperationAction(ISD::ConstantFP, MVT::f32, Legal);

  // TRAP can be lowered to PTX trap
  setOperationAction(ISD::TRAP, MVT::Other, Legal);

  setOperationAction(ISD::ADDC, MVT::i64, Expand);
  setOperationAction(ISD::ADDE, MVT::i64, Expand);

  // Register custom handling for vector loads/stores
  for (MVT VT : MVT::vector_valuetypes()) {
    if (IsPTXVectorType(VT)) {
      setOperationAction(ISD::LOAD, VT, Custom);
      setOperationAction(ISD::STORE, VT, Custom);
      setOperationAction(ISD::INTRINSIC_W_CHAIN, VT, Custom);
    }
  }

  // Custom handling for i8 intrinsics
  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i8, Custom);

  setOperationAction(ISD::CTLZ, MVT::i16, Legal);
  setOperationAction(ISD::CTLZ, MVT::i32, Legal);
  setOperationAction(ISD::CTLZ, MVT::i64, Legal);
  setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16, Legal);
  setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Legal);
  setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Legal);
  setOperationAction(ISD::CTTZ, MVT::i16, Expand);
  setOperationAction(ISD::CTTZ, MVT::i32, Expand);
  setOperationAction(ISD::CTTZ, MVT::i64, Expand);
  setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16, Expand);
  setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Expand);
  setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand);
  setOperationAction(ISD::CTPOP, MVT::i16, Legal);
  setOperationAction(ISD::CTPOP, MVT::i32, Legal);
  setOperationAction(ISD::CTPOP, MVT::i64, Legal);

  // PTX does not directly support SELP of i1, so promote to i32 first
  setOperationAction(ISD::SELECT, MVT::i1, Custom);

  // We have some custom DAG combine patterns for these nodes
  setTargetDAGCombine(ISD::ADD);
  setTargetDAGCombine(ISD::AND);
  setTargetDAGCombine(ISD::FADD);
  setTargetDAGCombine(ISD::MUL);
  setTargetDAGCombine(ISD::SHL);
  setTargetDAGCombine(ISD::SELECT);

  // Now deduce the information based on the above mentioned
  // actions
  computeRegisterProperties(STI.getRegisterInfo());
}

const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const {
  switch ((NVPTXISD::NodeType)Opcode) {
  case NVPTXISD::FIRST_NUMBER:
    break;
  case NVPTXISD::CALL:
    return "NVPTXISD::CALL";
  case NVPTXISD::RET_FLAG:
    return "NVPTXISD::RET_FLAG";
  case NVPTXISD::LOAD_PARAM:
    return "NVPTXISD::LOAD_PARAM";
  case NVPTXISD::Wrapper:
    return "NVPTXISD::Wrapper";
  case NVPTXISD::DeclareParam:
    return "NVPTXISD::DeclareParam";
  case NVPTXISD::DeclareScalarParam:
    return "NVPTXISD::DeclareScalarParam";
  case NVPTXISD::DeclareRet:
    return "NVPTXISD::DeclareRet";
  case NVPTXISD::DeclareScalarRet:
    return "NVPTXISD::DeclareScalarRet";
  case NVPTXISD::DeclareRetParam:
    return "NVPTXISD::DeclareRetParam";
  case NVPTXISD::PrintCall:
    return "NVPTXISD::PrintCall";
  case NVPTXISD::PrintCallUni:
    return "NVPTXISD::PrintCallUni";
  case NVPTXISD::LoadParam:
    return "NVPTXISD::LoadParam";
  case NVPTXISD::LoadParamV2:
    return "NVPTXISD::LoadParamV2";
  case NVPTXISD::LoadParamV4:
    return "NVPTXISD::LoadParamV4";
  case NVPTXISD::StoreParam:
    return "NVPTXISD::StoreParam";
  case NVPTXISD::StoreParamV2:
    return "NVPTXISD::StoreParamV2";
  case NVPTXISD::StoreParamV4:
    return "NVPTXISD::StoreParamV4";
  case NVPTXISD::StoreParamS32:
    return "NVPTXISD::StoreParamS32";
  case NVPTXISD::StoreParamU32:
    return "NVPTXISD::StoreParamU32";
  case NVPTXISD::CallArgBegin:
    return "NVPTXISD::CallArgBegin";
  case NVPTXISD::CallArg:
    return "NVPTXISD::CallArg";
  case NVPTXISD::LastCallArg:
    return "NVPTXISD::LastCallArg";
  case NVPTXISD::CallArgEnd:
    return "NVPTXISD::CallArgEnd";
  case NVPTXISD::CallVoid:
    return "NVPTXISD::CallVoid";
  case NVPTXISD::CallVal:
    return "NVPTXISD::CallVal";
  case NVPTXISD::CallSymbol:
    return "NVPTXISD::CallSymbol";
  case NVPTXISD::Prototype:
    return "NVPTXISD::Prototype";
  case NVPTXISD::MoveParam:
    return "NVPTXISD::MoveParam";
  case NVPTXISD::StoreRetval:
    return "NVPTXISD::StoreRetval";
  case NVPTXISD::StoreRetvalV2:
    return "NVPTXISD::StoreRetvalV2";
  case NVPTXISD::StoreRetvalV4:
    return "NVPTXISD::StoreRetvalV4";
  case NVPTXISD::PseudoUseParam:
    return "NVPTXISD::PseudoUseParam";
  case NVPTXISD::RETURN:
    return "NVPTXISD::RETURN";
  case NVPTXISD::CallSeqBegin:
    return "NVPTXISD::CallSeqBegin";
  case NVPTXISD::CallSeqEnd:
    return "NVPTXISD::CallSeqEnd";
  case NVPTXISD::CallPrototype:
    return "NVPTXISD::CallPrototype";
  case NVPTXISD::LoadV2:
    return "NVPTXISD::LoadV2";
  case NVPTXISD::LoadV4:
    return "NVPTXISD::LoadV4";
  case NVPTXISD::LDGV2:
    return "NVPTXISD::LDGV2";
  case NVPTXISD::LDGV4:
    return "NVPTXISD::LDGV4";
  case NVPTXISD::LDUV2:
    return "NVPTXISD::LDUV2";
  case NVPTXISD::LDUV4:
    return "NVPTXISD::LDUV4";
  case NVPTXISD::StoreV2:
    return "NVPTXISD::StoreV2";
  case NVPTXISD::StoreV4:
    return "NVPTXISD::StoreV4";
  case NVPTXISD::FUN_SHFL_CLAMP:
    return "NVPTXISD::FUN_SHFL_CLAMP";
  case NVPTXISD::FUN_SHFR_CLAMP:
    return "NVPTXISD::FUN_SHFR_CLAMP";
  case NVPTXISD::IMAD:
    return "NVPTXISD::IMAD";
  case NVPTXISD::Dummy:
    return "NVPTXISD::Dummy";
  case NVPTXISD::MUL_WIDE_SIGNED:
    return "NVPTXISD::MUL_WIDE_SIGNED";
  case NVPTXISD::MUL_WIDE_UNSIGNED:
    return "NVPTXISD::MUL_WIDE_UNSIGNED";
  case NVPTXISD::Tex1DFloatS32:        return "NVPTXISD::Tex1DFloatS32";
  case NVPTXISD::Tex1DFloatFloat:      return "NVPTXISD::Tex1DFloatFloat";
  case NVPTXISD::Tex1DFloatFloatLevel:
    return "NVPTXISD::Tex1DFloatFloatLevel";
  case NVPTXISD::Tex1DFloatFloatGrad:
    return "NVPTXISD::Tex1DFloatFloatGrad";
  case NVPTXISD::Tex1DS32S32:          return "NVPTXISD::Tex1DS32S32";
  case NVPTXISD::Tex1DS32Float:        return "NVPTXISD::Tex1DS32Float";
  case NVPTXISD::Tex1DS32FloatLevel:
    return "NVPTXISD::Tex1DS32FloatLevel";
  case NVPTXISD::Tex1DS32FloatGrad:
    return "NVPTXISD::Tex1DS32FloatGrad";
  case NVPTXISD::Tex1DU32S32:          return "NVPTXISD::Tex1DU32S32";
  case NVPTXISD::Tex1DU32Float:        return "NVPTXISD::Tex1DU32Float";
  case NVPTXISD::Tex1DU32FloatLevel:
    return "NVPTXISD::Tex1DU32FloatLevel";
  case NVPTXISD::Tex1DU32FloatGrad:
    return "NVPTXISD::Tex1DU32FloatGrad";
  case NVPTXISD::Tex1DArrayFloatS32:   return "NVPTXISD::Tex1DArrayFloatS32";
  case NVPTXISD::Tex1DArrayFloatFloat: return "NVPTXISD::Tex1DArrayFloatFloat";
  case NVPTXISD::Tex1DArrayFloatFloatLevel:
    return "NVPTXISD::Tex1DArrayFloatFloatLevel";
  case NVPTXISD::Tex1DArrayFloatFloatGrad:
    return "NVPTXISD::Tex1DArrayFloatFloatGrad";
  case NVPTXISD::Tex1DArrayS32S32:     return "NVPTXISD::Tex1DArrayS32S32";
  case NVPTXISD::Tex1DArrayS32Float:   return "NVPTXISD::Tex1DArrayS32Float";
  case NVPTXISD::Tex1DArrayS32FloatLevel:
    return "NVPTXISD::Tex1DArrayS32FloatLevel";
  case NVPTXISD::Tex1DArrayS32FloatGrad:
    return "NVPTXISD::Tex1DArrayS32FloatGrad";
  case NVPTXISD::Tex1DArrayU32S32:     return "NVPTXISD::Tex1DArrayU32S32";
  case NVPTXISD::Tex1DArrayU32Float:   return "NVPTXISD::Tex1DArrayU32Float";
  case NVPTXISD::Tex1DArrayU32FloatLevel:
    return "NVPTXISD::Tex1DArrayU32FloatLevel";
  case NVPTXISD::Tex1DArrayU32FloatGrad:
    return "NVPTXISD::Tex1DArrayU32FloatGrad";
  case NVPTXISD::Tex2DFloatS32:        return "NVPTXISD::Tex2DFloatS32";
  case NVPTXISD::Tex2DFloatFloat:      return "NVPTXISD::Tex2DFloatFloat";
  case NVPTXISD::Tex2DFloatFloatLevel:
    return "NVPTXISD::Tex2DFloatFloatLevel";
  case NVPTXISD::Tex2DFloatFloatGrad:
    return "NVPTXISD::Tex2DFloatFloatGrad";
  case NVPTXISD::Tex2DS32S32:          return "NVPTXISD::Tex2DS32S32";
  case NVPTXISD::Tex2DS32Float:        return "NVPTXISD::Tex2DS32Float";
  case NVPTXISD::Tex2DS32FloatLevel:
    return "NVPTXISD::Tex2DS32FloatLevel";
  case NVPTXISD::Tex2DS32FloatGrad:
    return "NVPTXISD::Tex2DS32FloatGrad";
  case NVPTXISD::Tex2DU32S32:          return "NVPTXISD::Tex2DU32S32";
  case NVPTXISD::Tex2DU32Float:        return "NVPTXISD::Tex2DU32Float";
  case NVPTXISD::Tex2DU32FloatLevel:
    return "NVPTXISD::Tex2DU32FloatLevel";
  case NVPTXISD::Tex2DU32FloatGrad:
    return "NVPTXISD::Tex2DU32FloatGrad";
  case NVPTXISD::Tex2DArrayFloatS32:   return "NVPTXISD::Tex2DArrayFloatS32";
  case NVPTXISD::Tex2DArrayFloatFloat: return "NVPTXISD::Tex2DArrayFloatFloat";
  case NVPTXISD::Tex2DArrayFloatFloatLevel:
    return "NVPTXISD::Tex2DArrayFloatFloatLevel";
  case NVPTXISD::Tex2DArrayFloatFloatGrad:
    return "NVPTXISD::Tex2DArrayFloatFloatGrad";
  case NVPTXISD::Tex2DArrayS32S32:     return "NVPTXISD::Tex2DArrayS32S32";
  case NVPTXISD::Tex2DArrayS32Float:   return "NVPTXISD::Tex2DArrayS32Float";
  case NVPTXISD::Tex2DArrayS32FloatLevel:
    return "NVPTXISD::Tex2DArrayS32FloatLevel";
  case NVPTXISD::Tex2DArrayS32FloatGrad:
    return "NVPTXISD::Tex2DArrayS32FloatGrad";
  case NVPTXISD::Tex2DArrayU32S32:     return "NVPTXISD::Tex2DArrayU32S32";
  case NVPTXISD::Tex2DArrayU32Float:   return "NVPTXISD::Tex2DArrayU32Float";
  case NVPTXISD::Tex2DArrayU32FloatLevel:
    return "NVPTXISD::Tex2DArrayU32FloatLevel";
  case NVPTXISD::Tex2DArrayU32FloatGrad:
    return "NVPTXISD::Tex2DArrayU32FloatGrad";
  case NVPTXISD::Tex3DFloatS32:        return "NVPTXISD::Tex3DFloatS32";
  case NVPTXISD::Tex3DFloatFloat:      return "NVPTXISD::Tex3DFloatFloat";
  case NVPTXISD::Tex3DFloatFloatLevel:
    return "NVPTXISD::Tex3DFloatFloatLevel";
  case NVPTXISD::Tex3DFloatFloatGrad:
    return "NVPTXISD::Tex3DFloatFloatGrad";
  case NVPTXISD::Tex3DS32S32:          return "NVPTXISD::Tex3DS32S32";
  case NVPTXISD::Tex3DS32Float:        return "NVPTXISD::Tex3DS32Float";
  case NVPTXISD::Tex3DS32FloatLevel:
    return "NVPTXISD::Tex3DS32FloatLevel";
  case NVPTXISD::Tex3DS32FloatGrad:
    return "NVPTXISD::Tex3DS32FloatGrad";
  case NVPTXISD::Tex3DU32S32:          return "NVPTXISD::Tex3DU32S32";
  case NVPTXISD::Tex3DU32Float:        return "NVPTXISD::Tex3DU32Float";
  case NVPTXISD::Tex3DU32FloatLevel:
    return "NVPTXISD::Tex3DU32FloatLevel";
  case NVPTXISD::Tex3DU32FloatGrad:
    return "NVPTXISD::Tex3DU32FloatGrad";
  case NVPTXISD::TexCubeFloatFloat:      return "NVPTXISD::TexCubeFloatFloat";
  case NVPTXISD::TexCubeFloatFloatLevel:
    return "NVPTXISD::TexCubeFloatFloatLevel";
  case NVPTXISD::TexCubeS32Float:        return "NVPTXISD::TexCubeS32Float";
  case NVPTXISD::TexCubeS32FloatLevel:
    return "NVPTXISD::TexCubeS32FloatLevel";
  case NVPTXISD::TexCubeU32Float:        return "NVPTXISD::TexCubeU32Float";
  case NVPTXISD::TexCubeU32FloatLevel:
    return "NVPTXISD::TexCubeU32FloatLevel";
  case NVPTXISD::TexCubeArrayFloatFloat:
    return "NVPTXISD::TexCubeArrayFloatFloat";
  case NVPTXISD::TexCubeArrayFloatFloatLevel:
    return "NVPTXISD::TexCubeArrayFloatFloatLevel";
  case NVPTXISD::TexCubeArrayS32Float:
    return "NVPTXISD::TexCubeArrayS32Float";
  case NVPTXISD::TexCubeArrayS32FloatLevel:
    return "NVPTXISD::TexCubeArrayS32FloatLevel";
  case NVPTXISD::TexCubeArrayU32Float:
    return "NVPTXISD::TexCubeArrayU32Float";
  case NVPTXISD::TexCubeArrayU32FloatLevel:
    return "NVPTXISD::TexCubeArrayU32FloatLevel";
  case NVPTXISD::Tld4R2DFloatFloat:
    return "NVPTXISD::Tld4R2DFloatFloat";
  case NVPTXISD::Tld4G2DFloatFloat:
    return "NVPTXISD::Tld4G2DFloatFloat";
  case NVPTXISD::Tld4B2DFloatFloat:
    return "NVPTXISD::Tld4B2DFloatFloat";
  case NVPTXISD::Tld4A2DFloatFloat:
    return "NVPTXISD::Tld4A2DFloatFloat";
  case NVPTXISD::Tld4R2DS64Float:
    return "NVPTXISD::Tld4R2DS64Float";
  case NVPTXISD::Tld4G2DS64Float:
    return "NVPTXISD::Tld4G2DS64Float";
  case NVPTXISD::Tld4B2DS64Float:
    return "NVPTXISD::Tld4B2DS64Float";
  case NVPTXISD::Tld4A2DS64Float:
    return "NVPTXISD::Tld4A2DS64Float";
  case NVPTXISD::Tld4R2DU64Float:
    return "NVPTXISD::Tld4R2DU64Float";
  case NVPTXISD::Tld4G2DU64Float:
    return "NVPTXISD::Tld4G2DU64Float";
  case NVPTXISD::Tld4B2DU64Float:
    return "NVPTXISD::Tld4B2DU64Float";
  case NVPTXISD::Tld4A2DU64Float:
    return "NVPTXISD::Tld4A2DU64Float";

  case NVPTXISD::TexUnified1DFloatS32:
    return "NVPTXISD::TexUnified1DFloatS32";
  case NVPTXISD::TexUnified1DFloatFloat:
    return "NVPTXISD::TexUnified1DFloatFloat";
  case NVPTXISD::TexUnified1DFloatFloatLevel:
    return "NVPTXISD::TexUnified1DFloatFloatLevel";
  case NVPTXISD::TexUnified1DFloatFloatGrad:
    return "NVPTXISD::TexUnified1DFloatFloatGrad";
  case NVPTXISD::TexUnified1DS32S32:
    return "NVPTXISD::TexUnified1DS32S32";
  case NVPTXISD::TexUnified1DS32Float:
    return "NVPTXISD::TexUnified1DS32Float";
  case NVPTXISD::TexUnified1DS32FloatLevel:
    return "NVPTXISD::TexUnified1DS32FloatLevel";
  case NVPTXISD::TexUnified1DS32FloatGrad:
    return "NVPTXISD::TexUnified1DS32FloatGrad";
  case NVPTXISD::TexUnified1DU32S32:
    return "NVPTXISD::TexUnified1DU32S32";
  case NVPTXISD::TexUnified1DU32Float:
    return "NVPTXISD::TexUnified1DU32Float";
  case NVPTXISD::TexUnified1DU32FloatLevel:
    return "NVPTXISD::TexUnified1DU32FloatLevel";
  case NVPTXISD::TexUnified1DU32FloatGrad:
    return "NVPTXISD::TexUnified1DU32FloatGrad";
  case NVPTXISD::TexUnified1DArrayFloatS32:
    return "NVPTXISD::TexUnified1DArrayFloatS32";
  case NVPTXISD::TexUnified1DArrayFloatFloat:
    return "NVPTXISD::TexUnified1DArrayFloatFloat";
  case NVPTXISD::TexUnified1DArrayFloatFloatLevel:
    return "NVPTXISD::TexUnified1DArrayFloatFloatLevel";
  case NVPTXISD::TexUnified1DArrayFloatFloatGrad:
    return "NVPTXISD::TexUnified1DArrayFloatFloatGrad";
  case NVPTXISD::TexUnified1DArrayS32S32:
    return "NVPTXISD::TexUnified1DArrayS32S32";
  case NVPTXISD::TexUnified1DArrayS32Float:
    return "NVPTXISD::TexUnified1DArrayS32Float";
  case NVPTXISD::TexUnified1DArrayS32FloatLevel:
    return "NVPTXISD::TexUnified1DArrayS32FloatLevel";
  case NVPTXISD::TexUnified1DArrayS32FloatGrad:
    return "NVPTXISD::TexUnified1DArrayS32FloatGrad";
  case NVPTXISD::TexUnified1DArrayU32S32:
    return "NVPTXISD::TexUnified1DArrayU32S32";
  case NVPTXISD::TexUnified1DArrayU32Float:
    return "NVPTXISD::TexUnified1DArrayU32Float";
  case NVPTXISD::TexUnified1DArrayU32FloatLevel:
    return "NVPTXISD::TexUnified1DArrayU32FloatLevel";
  case NVPTXISD::TexUnified1DArrayU32FloatGrad:
    return "NVPTXISD::TexUnified1DArrayU32FloatGrad";
  case NVPTXISD::TexUnified2DFloatS32:
    return "NVPTXISD::TexUnified2DFloatS32";
  case NVPTXISD::TexUnified2DFloatFloat:
    return "NVPTXISD::TexUnified2DFloatFloat";
  case NVPTXISD::TexUnified2DFloatFloatLevel:
    return "NVPTXISD::TexUnified2DFloatFloatLevel";
  case NVPTXISD::TexUnified2DFloatFloatGrad:
    return "NVPTXISD::TexUnified2DFloatFloatGrad";
  case NVPTXISD::TexUnified2DS32S32:
    return "NVPTXISD::TexUnified2DS32S32";
  case NVPTXISD::TexUnified2DS32Float:
    return "NVPTXISD::TexUnified2DS32Float";
  case NVPTXISD::TexUnified2DS32FloatLevel:
    return "NVPTXISD::TexUnified2DS32FloatLevel";
  case NVPTXISD::TexUnified2DS32FloatGrad:
    return "NVPTXISD::TexUnified2DS32FloatGrad";
  case NVPTXISD::TexUnified2DU32S32:
    return "NVPTXISD::TexUnified2DU32S32";
  case NVPTXISD::TexUnified2DU32Float:
    return "NVPTXISD::TexUnified2DU32Float";
  case NVPTXISD::TexUnified2DU32FloatLevel:
    return "NVPTXISD::TexUnified2DU32FloatLevel";
  case NVPTXISD::TexUnified2DU32FloatGrad:
    return "NVPTXISD::TexUnified2DU32FloatGrad";
  case NVPTXISD::TexUnified2DArrayFloatS32:
    return "NVPTXISD::TexUnified2DArrayFloatS32";
  case NVPTXISD::TexUnified2DArrayFloatFloat:
    return "NVPTXISD::TexUnified2DArrayFloatFloat";
  case NVPTXISD::TexUnified2DArrayFloatFloatLevel:
    return "NVPTXISD::TexUnified2DArrayFloatFloatLevel";
  case NVPTXISD::TexUnified2DArrayFloatFloatGrad:
    return "NVPTXISD::TexUnified2DArrayFloatFloatGrad";
  case NVPTXISD::TexUnified2DArrayS32S32:
    return "NVPTXISD::TexUnified2DArrayS32S32";
  case NVPTXISD::TexUnified2DArrayS32Float:
    return "NVPTXISD::TexUnified2DArrayS32Float";
  case NVPTXISD::TexUnified2DArrayS32FloatLevel:
    return "NVPTXISD::TexUnified2DArrayS32FloatLevel";
  case NVPTXISD::TexUnified2DArrayS32FloatGrad:
    return "NVPTXISD::TexUnified2DArrayS32FloatGrad";
  case NVPTXISD::TexUnified2DArrayU32S32:
    return "NVPTXISD::TexUnified2DArrayU32S32";
  case NVPTXISD::TexUnified2DArrayU32Float:
    return "NVPTXISD::TexUnified2DArrayU32Float";
  case NVPTXISD::TexUnified2DArrayU32FloatLevel:
    return "NVPTXISD::TexUnified2DArrayU32FloatLevel";
  case NVPTXISD::TexUnified2DArrayU32FloatGrad:
    return "NVPTXISD::TexUnified2DArrayU32FloatGrad";
  case NVPTXISD::TexUnified3DFloatS32:
    return "NVPTXISD::TexUnified3DFloatS32";
  case NVPTXISD::TexUnified3DFloatFloat:
    return "NVPTXISD::TexUnified3DFloatFloat";
  case NVPTXISD::TexUnified3DFloatFloatLevel:
    return "NVPTXISD::TexUnified3DFloatFloatLevel";
  case NVPTXISD::TexUnified3DFloatFloatGrad:
    return "NVPTXISD::TexUnified3DFloatFloatGrad";
  case NVPTXISD::TexUnified3DS32S32:
    return "NVPTXISD::TexUnified3DS32S32";
  case NVPTXISD::TexUnified3DS32Float:
    return "NVPTXISD::TexUnified3DS32Float";
  case NVPTXISD::TexUnified3DS32FloatLevel:
    return "NVPTXISD::TexUnified3DS32FloatLevel";
  case NVPTXISD::TexUnified3DS32FloatGrad:
    return "NVPTXISD::TexUnified3DS32FloatGrad";
  case NVPTXISD::TexUnified3DU32S32:
    return "NVPTXISD::TexUnified3DU32S32";
  case NVPTXISD::TexUnified3DU32Float:
    return "NVPTXISD::TexUnified3DU32Float";
  case NVPTXISD::TexUnified3DU32FloatLevel:
    return "NVPTXISD::TexUnified3DU32FloatLevel";
  case NVPTXISD::TexUnified3DU32FloatGrad:
    return "NVPTXISD::TexUnified3DU32FloatGrad";
  case NVPTXISD::TexUnifiedCubeFloatFloat:
    return "NVPTXISD::TexUnifiedCubeFloatFloat";
  case NVPTXISD::TexUnifiedCubeFloatFloatLevel:
    return "NVPTXISD::TexUnifiedCubeFloatFloatLevel";
  case NVPTXISD::TexUnifiedCubeS32Float:
    return "NVPTXISD::TexUnifiedCubeS32Float";
  case NVPTXISD::TexUnifiedCubeS32FloatLevel:
    return "NVPTXISD::TexUnifiedCubeS32FloatLevel";
  case NVPTXISD::TexUnifiedCubeU32Float:
    return "NVPTXISD::TexUnifiedCubeU32Float";
  case NVPTXISD::TexUnifiedCubeU32FloatLevel:
    return "NVPTXISD::TexUnifiedCubeU32FloatLevel";
  case NVPTXISD::TexUnifiedCubeArrayFloatFloat:
    return "NVPTXISD::TexUnifiedCubeArrayFloatFloat";
  case NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel:
    return "NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel";
  case NVPTXISD::TexUnifiedCubeArrayS32Float:
    return "NVPTXISD::TexUnifiedCubeArrayS32Float";
  case NVPTXISD::TexUnifiedCubeArrayS32FloatLevel:
    return "NVPTXISD::TexUnifiedCubeArrayS32FloatLevel";
  case NVPTXISD::TexUnifiedCubeArrayU32Float:
    return "NVPTXISD::TexUnifiedCubeArrayU32Float";
  case NVPTXISD::TexUnifiedCubeArrayU32FloatLevel:
    return "NVPTXISD::TexUnifiedCubeArrayU32FloatLevel";
  case NVPTXISD::Tld4UnifiedR2DFloatFloat:
    return "NVPTXISD::Tld4UnifiedR2DFloatFloat";
  case NVPTXISD::Tld4UnifiedG2DFloatFloat:
    return "NVPTXISD::Tld4UnifiedG2DFloatFloat";
  case NVPTXISD::Tld4UnifiedB2DFloatFloat:
    return "NVPTXISD::Tld4UnifiedB2DFloatFloat";
  case NVPTXISD::Tld4UnifiedA2DFloatFloat:
    return "NVPTXISD::Tld4UnifiedA2DFloatFloat";
  case NVPTXISD::Tld4UnifiedR2DS64Float:
    return "NVPTXISD::Tld4UnifiedR2DS64Float";
  case NVPTXISD::Tld4UnifiedG2DS64Float:
    return "NVPTXISD::Tld4UnifiedG2DS64Float";
  case NVPTXISD::Tld4UnifiedB2DS64Float:
    return "NVPTXISD::Tld4UnifiedB2DS64Float";
  case NVPTXISD::Tld4UnifiedA2DS64Float:
    return "NVPTXISD::Tld4UnifiedA2DS64Float";
  case NVPTXISD::Tld4UnifiedR2DU64Float:
    return "NVPTXISD::Tld4UnifiedR2DU64Float";
  case NVPTXISD::Tld4UnifiedG2DU64Float:
    return "NVPTXISD::Tld4UnifiedG2DU64Float";
  case NVPTXISD::Tld4UnifiedB2DU64Float:
    return "NVPTXISD::Tld4UnifiedB2DU64Float";
  case NVPTXISD::Tld4UnifiedA2DU64Float:
    return "NVPTXISD::Tld4UnifiedA2DU64Float";

  case NVPTXISD::Suld1DI8Clamp:          return "NVPTXISD::Suld1DI8Clamp";
  case NVPTXISD::Suld1DI16Clamp:         return "NVPTXISD::Suld1DI16Clamp";
  case NVPTXISD::Suld1DI32Clamp:         return "NVPTXISD::Suld1DI32Clamp";
  case NVPTXISD::Suld1DI64Clamp:         return "NVPTXISD::Suld1DI64Clamp";
  case NVPTXISD::Suld1DV2I8Clamp:        return "NVPTXISD::Suld1DV2I8Clamp";
  case NVPTXISD::Suld1DV2I16Clamp:       return "NVPTXISD::Suld1DV2I16Clamp";
  case NVPTXISD::Suld1DV2I32Clamp:       return "NVPTXISD::Suld1DV2I32Clamp";
  case NVPTXISD::Suld1DV2I64Clamp:       return "NVPTXISD::Suld1DV2I64Clamp";
  case NVPTXISD::Suld1DV4I8Clamp:        return "NVPTXISD::Suld1DV4I8Clamp";
  case NVPTXISD::Suld1DV4I16Clamp:       return "NVPTXISD::Suld1DV4I16Clamp";
  case NVPTXISD::Suld1DV4I32Clamp:       return "NVPTXISD::Suld1DV4I32Clamp";

  case NVPTXISD::Suld1DArrayI8Clamp:   return "NVPTXISD::Suld1DArrayI8Clamp";
  case NVPTXISD::Suld1DArrayI16Clamp:  return "NVPTXISD::Suld1DArrayI16Clamp";
  case NVPTXISD::Suld1DArrayI32Clamp:  return "NVPTXISD::Suld1DArrayI32Clamp";
  case NVPTXISD::Suld1DArrayI64Clamp:  return "NVPTXISD::Suld1DArrayI64Clamp";
  case NVPTXISD::Suld1DArrayV2I8Clamp: return "NVPTXISD::Suld1DArrayV2I8Clamp";
  case NVPTXISD::Suld1DArrayV2I16Clamp:return "NVPTXISD::Suld1DArrayV2I16Clamp";
  case NVPTXISD::Suld1DArrayV2I32Clamp:return "NVPTXISD::Suld1DArrayV2I32Clamp";
  case NVPTXISD::Suld1DArrayV2I64Clamp:return "NVPTXISD::Suld1DArrayV2I64Clamp";
  case NVPTXISD::Suld1DArrayV4I8Clamp: return "NVPTXISD::Suld1DArrayV4I8Clamp";
  case NVPTXISD::Suld1DArrayV4I16Clamp:return "NVPTXISD::Suld1DArrayV4I16Clamp";
  case NVPTXISD::Suld1DArrayV4I32Clamp:return "NVPTXISD::Suld1DArrayV4I32Clamp";

  case NVPTXISD::Suld2DI8Clamp:          return "NVPTXISD::Suld2DI8Clamp";
  case NVPTXISD::Suld2DI16Clamp:         return "NVPTXISD::Suld2DI16Clamp";
  case NVPTXISD::Suld2DI32Clamp:         return "NVPTXISD::Suld2DI32Clamp";
  case NVPTXISD::Suld2DI64Clamp:         return "NVPTXISD::Suld2DI64Clamp";
  case NVPTXISD::Suld2DV2I8Clamp:        return "NVPTXISD::Suld2DV2I8Clamp";
  case NVPTXISD::Suld2DV2I16Clamp:       return "NVPTXISD::Suld2DV2I16Clamp";
  case NVPTXISD::Suld2DV2I32Clamp:       return "NVPTXISD::Suld2DV2I32Clamp";
  case NVPTXISD::Suld2DV2I64Clamp:       return "NVPTXISD::Suld2DV2I64Clamp";
  case NVPTXISD::Suld2DV4I8Clamp:        return "NVPTXISD::Suld2DV4I8Clamp";
  case NVPTXISD::Suld2DV4I16Clamp:       return "NVPTXISD::Suld2DV4I16Clamp";
  case NVPTXISD::Suld2DV4I32Clamp:       return "NVPTXISD::Suld2DV4I32Clamp";

  case NVPTXISD::Suld2DArrayI8Clamp:   return "NVPTXISD::Suld2DArrayI8Clamp";
  case NVPTXISD::Suld2DArrayI16Clamp:  return "NVPTXISD::Suld2DArrayI16Clamp";
  case NVPTXISD::Suld2DArrayI32Clamp:  return "NVPTXISD::Suld2DArrayI32Clamp";
  case NVPTXISD::Suld2DArrayI64Clamp:  return "NVPTXISD::Suld2DArrayI64Clamp";
  case NVPTXISD::Suld2DArrayV2I8Clamp: return "NVPTXISD::Suld2DArrayV2I8Clamp";
  case NVPTXISD::Suld2DArrayV2I16Clamp:return "NVPTXISD::Suld2DArrayV2I16Clamp";
  case NVPTXISD::Suld2DArrayV2I32Clamp:return "NVPTXISD::Suld2DArrayV2I32Clamp";
  case NVPTXISD::Suld2DArrayV2I64Clamp:return "NVPTXISD::Suld2DArrayV2I64Clamp";
  case NVPTXISD::Suld2DArrayV4I8Clamp: return "NVPTXISD::Suld2DArrayV4I8Clamp";
  case NVPTXISD::Suld2DArrayV4I16Clamp:return "NVPTXISD::Suld2DArrayV4I16Clamp";
  case NVPTXISD::Suld2DArrayV4I32Clamp:return "NVPTXISD::Suld2DArrayV4I32Clamp";

  case NVPTXISD::Suld3DI8Clamp:          return "NVPTXISD::Suld3DI8Clamp";
  case NVPTXISD::Suld3DI16Clamp:         return "NVPTXISD::Suld3DI16Clamp";
  case NVPTXISD::Suld3DI32Clamp:         return "NVPTXISD::Suld3DI32Clamp";
  case NVPTXISD::Suld3DI64Clamp:         return "NVPTXISD::Suld3DI64Clamp";
  case NVPTXISD::Suld3DV2I8Clamp:        return "NVPTXISD::Suld3DV2I8Clamp";
  case NVPTXISD::Suld3DV2I16Clamp:       return "NVPTXISD::Suld3DV2I16Clamp";
  case NVPTXISD::Suld3DV2I32Clamp:       return "NVPTXISD::Suld3DV2I32Clamp";
  case NVPTXISD::Suld3DV2I64Clamp:       return "NVPTXISD::Suld3DV2I64Clamp";
  case NVPTXISD::Suld3DV4I8Clamp:        return "NVPTXISD::Suld3DV4I8Clamp";
  case NVPTXISD::Suld3DV4I16Clamp:       return "NVPTXISD::Suld3DV4I16Clamp";
  case NVPTXISD::Suld3DV4I32Clamp:       return "NVPTXISD::Suld3DV4I32Clamp";

  case NVPTXISD::Suld1DI8Trap:          return "NVPTXISD::Suld1DI8Trap";
  case NVPTXISD::Suld1DI16Trap:         return "NVPTXISD::Suld1DI16Trap";
  case NVPTXISD::Suld1DI32Trap:         return "NVPTXISD::Suld1DI32Trap";
  case NVPTXISD::Suld1DI64Trap:         return "NVPTXISD::Suld1DI64Trap";
  case NVPTXISD::Suld1DV2I8Trap:        return "NVPTXISD::Suld1DV2I8Trap";
  case NVPTXISD::Suld1DV2I16Trap:       return "NVPTXISD::Suld1DV2I16Trap";
  case NVPTXISD::Suld1DV2I32Trap:       return "NVPTXISD::Suld1DV2I32Trap";
  case NVPTXISD::Suld1DV2I64Trap:       return "NVPTXISD::Suld1DV2I64Trap";
  case NVPTXISD::Suld1DV4I8Trap:        return "NVPTXISD::Suld1DV4I8Trap";
  case NVPTXISD::Suld1DV4I16Trap:       return "NVPTXISD::Suld1DV4I16Trap";
  case NVPTXISD::Suld1DV4I32Trap:       return "NVPTXISD::Suld1DV4I32Trap";

  case NVPTXISD::Suld1DArrayI8Trap:     return "NVPTXISD::Suld1DArrayI8Trap";
  case NVPTXISD::Suld1DArrayI16Trap:    return "NVPTXISD::Suld1DArrayI16Trap";
  case NVPTXISD::Suld1DArrayI32Trap:    return "NVPTXISD::Suld1DArrayI32Trap";
  case NVPTXISD::Suld1DArrayI64Trap:    return "NVPTXISD::Suld1DArrayI64Trap";
  case NVPTXISD::Suld1DArrayV2I8Trap:   return "NVPTXISD::Suld1DArrayV2I8Trap";
  case NVPTXISD::Suld1DArrayV2I16Trap:  return "NVPTXISD::Suld1DArrayV2I16Trap";
  case NVPTXISD::Suld1DArrayV2I32Trap:  return "NVPTXISD::Suld1DArrayV2I32Trap";
  case NVPTXISD::Suld1DArrayV2I64Trap:  return "NVPTXISD::Suld1DArrayV2I64Trap";
  case NVPTXISD::Suld1DArrayV4I8Trap:   return "NVPTXISD::Suld1DArrayV4I8Trap";
  case NVPTXISD::Suld1DArrayV4I16Trap:  return "NVPTXISD::Suld1DArrayV4I16Trap";
  case NVPTXISD::Suld1DArrayV4I32Trap:  return "NVPTXISD::Suld1DArrayV4I32Trap";

  case NVPTXISD::Suld2DI8Trap:          return "NVPTXISD::Suld2DI8Trap";
  case NVPTXISD::Suld2DI16Trap:         return "NVPTXISD::Suld2DI16Trap";
  case NVPTXISD::Suld2DI32Trap:         return "NVPTXISD::Suld2DI32Trap";
  case NVPTXISD::Suld2DI64Trap:         return "NVPTXISD::Suld2DI64Trap";
  case NVPTXISD::Suld2DV2I8Trap:        return "NVPTXISD::Suld2DV2I8Trap";
  case NVPTXISD::Suld2DV2I16Trap:       return "NVPTXISD::Suld2DV2I16Trap";
  case NVPTXISD::Suld2DV2I32Trap:       return "NVPTXISD::Suld2DV2I32Trap";
  case NVPTXISD::Suld2DV2I64Trap:       return "NVPTXISD::Suld2DV2I64Trap";
  case NVPTXISD::Suld2DV4I8Trap:        return "NVPTXISD::Suld2DV4I8Trap";
  case NVPTXISD::Suld2DV4I16Trap:       return "NVPTXISD::Suld2DV4I16Trap";
  case NVPTXISD::Suld2DV4I32Trap:       return "NVPTXISD::Suld2DV4I32Trap";

  case NVPTXISD::Suld2DArrayI8Trap:     return "NVPTXISD::Suld2DArrayI8Trap";
  case NVPTXISD::Suld2DArrayI16Trap:    return "NVPTXISD::Suld2DArrayI16Trap";
  case NVPTXISD::Suld2DArrayI32Trap:    return "NVPTXISD::Suld2DArrayI32Trap";
  case NVPTXISD::Suld2DArrayI64Trap:    return "NVPTXISD::Suld2DArrayI64Trap";
  case NVPTXISD::Suld2DArrayV2I8Trap:   return "NVPTXISD::Suld2DArrayV2I8Trap";
  case NVPTXISD::Suld2DArrayV2I16Trap:  return "NVPTXISD::Suld2DArrayV2I16Trap";
  case NVPTXISD::Suld2DArrayV2I32Trap:  return "NVPTXISD::Suld2DArrayV2I32Trap";
  case NVPTXISD::Suld2DArrayV2I64Trap:  return "NVPTXISD::Suld2DArrayV2I64Trap";
  case NVPTXISD::Suld2DArrayV4I8Trap:   return "NVPTXISD::Suld2DArrayV4I8Trap";
  case NVPTXISD::Suld2DArrayV4I16Trap:  return "NVPTXISD::Suld2DArrayV4I16Trap";
  case NVPTXISD::Suld2DArrayV4I32Trap:  return "NVPTXISD::Suld2DArrayV4I32Trap";

  case NVPTXISD::Suld3DI8Trap:          return "NVPTXISD::Suld3DI8Trap";
  case NVPTXISD::Suld3DI16Trap:         return "NVPTXISD::Suld3DI16Trap";
  case NVPTXISD::Suld3DI32Trap:         return "NVPTXISD::Suld3DI32Trap";
  case NVPTXISD::Suld3DI64Trap:         return "NVPTXISD::Suld3DI64Trap";
  case NVPTXISD::Suld3DV2I8Trap:        return "NVPTXISD::Suld3DV2I8Trap";
  case NVPTXISD::Suld3DV2I16Trap:       return "NVPTXISD::Suld3DV2I16Trap";
  case NVPTXISD::Suld3DV2I32Trap:       return "NVPTXISD::Suld3DV2I32Trap";
  case NVPTXISD::Suld3DV2I64Trap:       return "NVPTXISD::Suld3DV2I64Trap";
  case NVPTXISD::Suld3DV4I8Trap:        return "NVPTXISD::Suld3DV4I8Trap";
  case NVPTXISD::Suld3DV4I16Trap:       return "NVPTXISD::Suld3DV4I16Trap";
  case NVPTXISD::Suld3DV4I32Trap:       return "NVPTXISD::Suld3DV4I32Trap";

  case NVPTXISD::Suld1DI8Zero:          return "NVPTXISD::Suld1DI8Zero";
  case NVPTXISD::Suld1DI16Zero:         return "NVPTXISD::Suld1DI16Zero";
  case NVPTXISD::Suld1DI32Zero:         return "NVPTXISD::Suld1DI32Zero";
  case NVPTXISD::Suld1DI64Zero:         return "NVPTXISD::Suld1DI64Zero";
  case NVPTXISD::Suld1DV2I8Zero:        return "NVPTXISD::Suld1DV2I8Zero";
  case NVPTXISD::Suld1DV2I16Zero:       return "NVPTXISD::Suld1DV2I16Zero";
  case NVPTXISD::Suld1DV2I32Zero:       return "NVPTXISD::Suld1DV2I32Zero";
  case NVPTXISD::Suld1DV2I64Zero:       return "NVPTXISD::Suld1DV2I64Zero";
  case NVPTXISD::Suld1DV4I8Zero:        return "NVPTXISD::Suld1DV4I8Zero";
  case NVPTXISD::Suld1DV4I16Zero:       return "NVPTXISD::Suld1DV4I16Zero";
  case NVPTXISD::Suld1DV4I32Zero:       return "NVPTXISD::Suld1DV4I32Zero";

  case NVPTXISD::Suld1DArrayI8Zero:     return "NVPTXISD::Suld1DArrayI8Zero";
  case NVPTXISD::Suld1DArrayI16Zero:    return "NVPTXISD::Suld1DArrayI16Zero";
  case NVPTXISD::Suld1DArrayI32Zero:    return "NVPTXISD::Suld1DArrayI32Zero";
  case NVPTXISD::Suld1DArrayI64Zero:    return "NVPTXISD::Suld1DArrayI64Zero";
  case NVPTXISD::Suld1DArrayV2I8Zero:   return "NVPTXISD::Suld1DArrayV2I8Zero";
  case NVPTXISD::Suld1DArrayV2I16Zero:  return "NVPTXISD::Suld1DArrayV2I16Zero";
  case NVPTXISD::Suld1DArrayV2I32Zero:  return "NVPTXISD::Suld1DArrayV2I32Zero";
  case NVPTXISD::Suld1DArrayV2I64Zero:  return "NVPTXISD::Suld1DArrayV2I64Zero";
  case NVPTXISD::Suld1DArrayV4I8Zero:   return "NVPTXISD::Suld1DArrayV4I8Zero";
  case NVPTXISD::Suld1DArrayV4I16Zero:  return "NVPTXISD::Suld1DArrayV4I16Zero";
  case NVPTXISD::Suld1DArrayV4I32Zero:  return "NVPTXISD::Suld1DArrayV4I32Zero";

  case NVPTXISD::Suld2DI8Zero:          return "NVPTXISD::Suld2DI8Zero";
  case NVPTXISD::Suld2DI16Zero:         return "NVPTXISD::Suld2DI16Zero";
  case NVPTXISD::Suld2DI32Zero:         return "NVPTXISD::Suld2DI32Zero";
  case NVPTXISD::Suld2DI64Zero:         return "NVPTXISD::Suld2DI64Zero";
  case NVPTXISD::Suld2DV2I8Zero:        return "NVPTXISD::Suld2DV2I8Zero";
  case NVPTXISD::Suld2DV2I16Zero:       return "NVPTXISD::Suld2DV2I16Zero";
  case NVPTXISD::Suld2DV2I32Zero:       return "NVPTXISD::Suld2DV2I32Zero";
  case NVPTXISD::Suld2DV2I64Zero:       return "NVPTXISD::Suld2DV2I64Zero";
  case NVPTXISD::Suld2DV4I8Zero:        return "NVPTXISD::Suld2DV4I8Zero";
  case NVPTXISD::Suld2DV4I16Zero:       return "NVPTXISD::Suld2DV4I16Zero";
  case NVPTXISD::Suld2DV4I32Zero:       return "NVPTXISD::Suld2DV4I32Zero";

  case NVPTXISD::Suld2DArrayI8Zero:     return "NVPTXISD::Suld2DArrayI8Zero";
  case NVPTXISD::Suld2DArrayI16Zero:    return "NVPTXISD::Suld2DArrayI16Zero";
  case NVPTXISD::Suld2DArrayI32Zero:    return "NVPTXISD::Suld2DArrayI32Zero";
  case NVPTXISD::Suld2DArrayI64Zero:    return "NVPTXISD::Suld2DArrayI64Zero";
  case NVPTXISD::Suld2DArrayV2I8Zero:   return "NVPTXISD::Suld2DArrayV2I8Zero";
  case NVPTXISD::Suld2DArrayV2I16Zero:  return "NVPTXISD::Suld2DArrayV2I16Zero";
  case NVPTXISD::Suld2DArrayV2I32Zero:  return "NVPTXISD::Suld2DArrayV2I32Zero";
  case NVPTXISD::Suld2DArrayV2I64Zero:  return "NVPTXISD::Suld2DArrayV2I64Zero";
  case NVPTXISD::Suld2DArrayV4I8Zero:   return "NVPTXISD::Suld2DArrayV4I8Zero";
  case NVPTXISD::Suld2DArrayV4I16Zero:  return "NVPTXISD::Suld2DArrayV4I16Zero";
  case NVPTXISD::Suld2DArrayV4I32Zero:  return "NVPTXISD::Suld2DArrayV4I32Zero";

  case NVPTXISD::Suld3DI8Zero:          return "NVPTXISD::Suld3DI8Zero";
  case NVPTXISD::Suld3DI16Zero:         return "NVPTXISD::Suld3DI16Zero";
  case NVPTXISD::Suld3DI32Zero:         return "NVPTXISD::Suld3DI32Zero";
  case NVPTXISD::Suld3DI64Zero:         return "NVPTXISD::Suld3DI64Zero";
  case NVPTXISD::Suld3DV2I8Zero:        return "NVPTXISD::Suld3DV2I8Zero";
  case NVPTXISD::Suld3DV2I16Zero:       return "NVPTXISD::Suld3DV2I16Zero";
  case NVPTXISD::Suld3DV2I32Zero:       return "NVPTXISD::Suld3DV2I32Zero";
  case NVPTXISD::Suld3DV2I64Zero:       return "NVPTXISD::Suld3DV2I64Zero";
  case NVPTXISD::Suld3DV4I8Zero:        return "NVPTXISD::Suld3DV4I8Zero";
  case NVPTXISD::Suld3DV4I16Zero:       return "NVPTXISD::Suld3DV4I16Zero";
  case NVPTXISD::Suld3DV4I32Zero:       return "NVPTXISD::Suld3DV4I32Zero";
  }
  return nullptr;
}

TargetLoweringBase::LegalizeTypeAction
NVPTXTargetLowering::getPreferredVectorAction(EVT VT) const {
  if (VT.getVectorNumElements() != 1 && VT.getScalarType() == MVT::i1)
    return TypeSplitVector;

  return TargetLoweringBase::getPreferredVectorAction(VT);
}

SDValue
NVPTXTargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
  SDLoc dl(Op);
  const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
  auto PtrVT = getPointerTy(DAG.getDataLayout());
  Op = DAG.getTargetGlobalAddress(GV, dl, PtrVT);
  return DAG.getNode(NVPTXISD::Wrapper, dl, PtrVT, Op);
}

std::string NVPTXTargetLowering::getPrototype(
    const DataLayout &DL, Type *retTy, const ArgListTy &Args,
    const SmallVectorImpl<ISD::OutputArg> &Outs, unsigned retAlignment,
    const ImmutableCallSite *CS) const {
  auto PtrVT = getPointerTy(DL);

  bool isABI = (STI.getSmVersion() >= 20);
  assert(isABI && "Non-ABI compilation is not supported");
  if (!isABI)
    return "";

  std::stringstream O;
  O << "prototype_" << uniqueCallSite << " : .callprototype ";

  if (retTy->getTypeID() == Type::VoidTyID) {
    O << "()";
  } else {
    O << "(";
    if (retTy->isFloatingPointTy() || retTy->isIntegerTy()) {
      unsigned size = 0;
      if (auto *ITy = dyn_cast<IntegerType>(retTy)) {
        size = ITy->getBitWidth();
        if (size < 32)
          size = 32;
      } else {
        assert(retTy->isFloatingPointTy() &&
               "Floating point type expected here");
        size = retTy->getPrimitiveSizeInBits();
      }

      O << ".param .b" << size << " _";
    } else if (isa<PointerType>(retTy)) {
      O << ".param .b" << PtrVT.getSizeInBits() << " _";
    } else if ((retTy->getTypeID() == Type::StructTyID) ||
               isa<VectorType>(retTy)) {
      auto &DL = CS->getCalledFunction()->getParent()->getDataLayout();
      O << ".param .align " << retAlignment << " .b8 _["
        << DL.getTypeAllocSize(retTy) << "]";
    } else {
      llvm_unreachable("Unknown return type");
    }
    O << ") ";
  }
  O << "_ (";

  bool first = true;

  unsigned OIdx = 0;
  for (unsigned i = 0, e = Args.size(); i != e; ++i, ++OIdx) {
    Type *Ty = Args[i].Ty;
    if (!first) {
      O << ", ";
    }
    first = false;

    if (!Outs[OIdx].Flags.isByVal()) {
      if (Ty->isAggregateType() || Ty->isVectorTy()) {
        unsigned align = 0;
        const CallInst *CallI = cast<CallInst>(CS->getInstruction());
        // +1 because index 0 is reserved for return type alignment
        if (!llvm::getAlign(*CallI, i + 1, align))
          align = DL.getABITypeAlignment(Ty);
        unsigned sz = DL.getTypeAllocSize(Ty);
        O << ".param .align " << align << " .b8 ";
        O << "_";
        O << "[" << sz << "]";
        // update the index for Outs
        SmallVector<EVT, 16> vtparts;
        ComputeValueVTs(*this, DL, Ty, vtparts);
        if (unsigned len = vtparts.size())
          OIdx += len - 1;
        continue;
      }
      // i8 types in IR will be i16 types in SDAG
      assert((getValueType(DL, Ty) == Outs[OIdx].VT ||
              (getValueType(DL, Ty) == MVT::i8 && Outs[OIdx].VT == MVT::i16)) &&
             "type mismatch between callee prototype and arguments");
      // scalar type
      unsigned sz = 0;
      if (isa<IntegerType>(Ty)) {
        sz = cast<IntegerType>(Ty)->getBitWidth();
        if (sz < 32)
          sz = 32;
      } else if (isa<PointerType>(Ty))
        sz = PtrVT.getSizeInBits();
      else
        sz = Ty->getPrimitiveSizeInBits();
      O << ".param .b" << sz << " ";
      O << "_";
      continue;
    }
    auto *PTy = dyn_cast<PointerType>(Ty);
    assert(PTy && "Param with byval attribute should be a pointer type");
    Type *ETy = PTy->getElementType();

    unsigned align = Outs[OIdx].Flags.getByValAlign();
    unsigned sz = DL.getTypeAllocSize(ETy);
    O << ".param .align " << align << " .b8 ";
    O << "_";
    O << "[" << sz << "]";
  }
  O << ");";
  return O.str();
}
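// Illustrative example (not in the original source; assuming a 64-bit target
// and uniqueCallSite == 0): for a callee of type "float (i32, float*)" the
// emitted string would look like
//   prototype_0 : .callprototype (.param .b32 _) _ (.param .b32 _, .param .b64 _);
// i.e. sub-32-bit integers are widened to .b32 and pointers use the pointer
// width.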

unsigned
NVPTXTargetLowering::getArgumentAlignment(SDValue Callee,
                                          const ImmutableCallSite *CS,
                                          Type *Ty,
                                          unsigned Idx) const {
  unsigned Align = 0;
  const Value *DirectCallee = CS->getCalledFunction();

  if (!DirectCallee) {
    // We don't have a direct function symbol, but that may be because of
    // constant cast instructions in the call.
    const Instruction *CalleeI = CS->getInstruction();
    assert(CalleeI && "Call target is not a function or derived value?");

    // With bitcast'd call targets, the instruction will be the call
    if (isa<CallInst>(CalleeI)) {
      // Check if we have call alignment metadata
      if (llvm::getAlign(*cast<CallInst>(CalleeI), Idx, Align))
        return Align;

      const Value *CalleeV = cast<CallInst>(CalleeI)->getCalledValue();
      // Ignore any bitcast instructions
      while (isa<ConstantExpr>(CalleeV)) {
        const ConstantExpr *CE = cast<ConstantExpr>(CalleeV);
        if (!CE->isCast())
          break;
        // Look through the bitcast
        CalleeV = cast<ConstantExpr>(CalleeV)->getOperand(0);
      }

      // We have now looked past all of the bitcasts.  Do we finally have a
      // Function?
      if (isa<Function>(CalleeV))
        DirectCallee = CalleeV;
    }
  }

  // Check for function alignment information if we found that the
  // ultimate target is a Function
  if (DirectCallee)
    if (llvm::getAlign(*cast<Function>(DirectCallee), Idx, Align))
      return Align;

  // Call is indirect or alignment information is not available, fall back to
  // the ABI type alignment
  auto &DL = CS->getCaller()->getParent()->getDataLayout();
  return DL.getABITypeAlignment(Ty);
}
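// Illustrative note (not in the original source): for a call through a
// ConstantExpr cast such as
//   call void bitcast (void (i32*)* @foo to void (i8*)*)(i8* %p)
// the loop above looks through the cast and recovers @foo, so alignment
// metadata attached to @foo is honored before falling back to ABI alignment.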

SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
                                       SmallVectorImpl<SDValue> &InVals) const {
  SelectionDAG &DAG = CLI.DAG;
  SDLoc dl = CLI.DL;
  SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
  SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
  SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
  SDValue Chain = CLI.Chain;
  SDValue Callee = CLI.Callee;
  bool &isTailCall = CLI.IsTailCall;
  ArgListTy &Args = CLI.getArgs();
  Type *retTy = CLI.RetTy;
  ImmutableCallSite *CS = CLI.CS;

  bool isABI = (STI.getSmVersion() >= 20);
  assert(isABI && "Non-ABI compilation is not supported");
  if (!isABI)
    return Chain;
  MachineFunction &MF = DAG.getMachineFunction();
  const Function *F = MF.getFunction();
  auto &DL = MF.getDataLayout();

  SDValue tempChain = Chain;
  Chain = DAG.getCALLSEQ_START(Chain,
                               DAG.getIntPtrConstant(uniqueCallSite, dl, true),
                               dl);
  SDValue InFlag = Chain.getValue(1);

  unsigned paramCount = 0;
  // Args.size() and Outs.size() need not match.
  // Outs.size() will be larger
  //   * if there is an aggregate argument with multiple fields (each field
  //     showing up separately in Outs)
  //   * if there is a vector argument with more than typical vector-length
  //     elements (generally if more than 4) where each vector element is
  //     individually present in Outs.
  // So a different index should be used for indexing into Outs/OutVals.
  // See similar issue in LowerFormalArguments.
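  // For example (illustrative): a single { i32, i32 } argument is one entry
  // in Args but two entries in Outs/OutVals, so OIdx advances twice while i
  // advances once.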
  unsigned OIdx = 0;
  // Declare the .params or .reg needed to pass values
  // to the function.
1093  for (unsigned i = 0, e = Args.size(); i != e; ++i, ++OIdx) {
1094    EVT VT = Outs[OIdx].VT;
1095    Type *Ty = Args[i].Ty;
1096
1097    if (!Outs[OIdx].Flags.isByVal()) {
1098      if (Ty->isAggregateType()) {
1099        // aggregate
1100        SmallVector<EVT, 16> vtparts;
1101        SmallVector<uint64_t, 16> Offsets;
1102        ComputePTXValueVTs(*this, DAG.getDataLayout(), Ty, vtparts, &Offsets,
1103                           0);
1104
1105        unsigned align = getArgumentAlignment(Callee, CS, Ty, paramCount + 1);
1106        // declare .param .align <align> .b8 .param<n>[<size>];
1107        unsigned sz = DL.getTypeAllocSize(Ty);
1108        SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1109        SDValue DeclareParamOps[] = { Chain, DAG.getConstant(align, dl,
1110                                                             MVT::i32),
1111                                      DAG.getConstant(paramCount, dl, MVT::i32),
1112                                      DAG.getConstant(sz, dl, MVT::i32),
1113                                      InFlag };
1114        Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs,
1115                            DeclareParamOps);
1116        InFlag = Chain.getValue(1);
1117        for (unsigned j = 0, je = vtparts.size(); j != je; ++j) {
1118          EVT elemtype = vtparts[j];
1119          unsigned ArgAlign = GreatestCommonDivisor64(align, Offsets[j]);
1120          if (elemtype.isInteger() && (sz < 8))
1121            sz = 8;
1122          SDValue StVal = OutVals[OIdx];
1123          if (elemtype.getSizeInBits() < 16) {
1124            StVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, StVal);
1125          }
1126          SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1127          SDValue CopyParamOps[] = { Chain,
1128                                     DAG.getConstant(paramCount, dl, MVT::i32),
1129                                     DAG.getConstant(Offsets[j], dl, MVT::i32),
1130                                     StVal, InFlag };
1131          Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreParam, dl,
1132                                          CopyParamVTs, CopyParamOps,
1133                                          elemtype, MachinePointerInfo(),
1134                                          ArgAlign);
1135          InFlag = Chain.getValue(1);
1136          ++OIdx;
1137        }
1138        if (vtparts.size() > 0)
1139          --OIdx;
1140        ++paramCount;
1141        continue;
1142      }
1143      if (Ty->isVectorTy()) {
1144        EVT ObjectVT = getValueType(DL, Ty);
1145        unsigned align = getArgumentAlignment(Callee, CS, Ty, paramCount + 1);
1146        // declare .param .align <align> .b8 .param<n>[<size>];
1147        unsigned sz = DL.getTypeAllocSize(Ty);
1148        SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1149        SDValue DeclareParamOps[] = { Chain,
1150                                      DAG.getConstant(align, dl, MVT::i32),
1151                                      DAG.getConstant(paramCount, dl, MVT::i32),
1152                                      DAG.getConstant(sz, dl, MVT::i32),
1153                                      InFlag };
1154        Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs,
1155                            DeclareParamOps);
1156        InFlag = Chain.getValue(1);
1157        unsigned NumElts = ObjectVT.getVectorNumElements();
1158        EVT EltVT = ObjectVT.getVectorElementType();
1159        EVT MemVT = EltVT;
1160        bool NeedExtend = false;
1161        if (EltVT.getSizeInBits() < 16) {
1162          NeedExtend = true;
1163          EltVT = MVT::i16;
1164        }
1165
1166        // V1 store
1167        if (NumElts == 1) {
1168          SDValue Elt = OutVals[OIdx++];
1169          if (NeedExtend)
1170            Elt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Elt);
1171
1172          SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1173          SDValue CopyParamOps[] = { Chain,
1174                                     DAG.getConstant(paramCount, dl, MVT::i32),
1175                                     DAG.getConstant(0, dl, MVT::i32), Elt,
1176                                     InFlag };
1177          Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreParam, dl,
1178                                          CopyParamVTs, CopyParamOps,
1179                                          MemVT, MachinePointerInfo());
1180          InFlag = Chain.getValue(1);
1181        } else if (NumElts == 2) {
1182          SDValue Elt0 = OutVals[OIdx++];
1183          SDValue Elt1 = OutVals[OIdx++];
1184          if (NeedExtend) {
1185            Elt0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Elt0);
1186            Elt1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Elt1);
1187          }
1188
1189          SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1190          SDValue CopyParamOps[] = { Chain,
1191                                     DAG.getConstant(paramCount, dl, MVT::i32),
1192                                     DAG.getConstant(0, dl, MVT::i32), Elt0,
1193                                     Elt1, InFlag };
1194          Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreParamV2, dl,
1195                                          CopyParamVTs, CopyParamOps,
1196                                          MemVT, MachinePointerInfo());
1197          InFlag = Chain.getValue(1);
1198        } else {
1199          unsigned curOffset = 0;
1200          // V4 stores
1201          // We have at least 4 elements (<3 x Ty> expands to 4 elements) and the
1202          // vector will be expanded to a power of 2 elements, so we know we can
1204          // always round up to the next multiple of 4 when creating the vector
1205          // stores.
1206          // e.g.  4 elem => 1 st.v4
1207          //       6 elem => 2 st.v4
1208          //       8 elem => 2 st.v4
1209          //      11 elem => 3 st.v4
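          // Illustrative sketch (not exact output; register names and the
          // alignment depend on the actual call): an <8 x float> argument
          // stored this way would roughly become:
          //   .param .align 32 .b8 param0[32];
          //   st.param.v4.f32 [param0+0],  {%f1, %f2, %f3, %f4};
          //   st.param.v4.f32 [param0+16], {%f5, %f6, %f7, %f8};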
1210          unsigned VecSize = 4;
1211          if (EltVT.getSizeInBits() == 64)
1212            VecSize = 2;
1213
1214          // This is potentially only part of a vector, so assume all elements
1215          // are packed together.
1216          unsigned PerStoreOffset = MemVT.getStoreSizeInBits() / 8 * VecSize;
1217
1218          for (unsigned i = 0; i < NumElts; i += VecSize) {
1219            // Get values
1220            SDValue StoreVal;
1221            SmallVector<SDValue, 8> Ops;
1222            Ops.push_back(Chain);
1223            Ops.push_back(DAG.getConstant(paramCount, dl, MVT::i32));
1224            Ops.push_back(DAG.getConstant(curOffset, dl, MVT::i32));
1225
1226            unsigned Opc = NVPTXISD::StoreParamV2;
1227
1228            StoreVal = OutVals[OIdx++];
1229            if (NeedExtend)
1230              StoreVal = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, StoreVal);
1231            Ops.push_back(StoreVal);
1232
1233            if (i + 1 < NumElts) {
1234              StoreVal = OutVals[OIdx++];
1235              if (NeedExtend)
1236                StoreVal =
1237                    DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, StoreVal);
1238            } else {
1239              StoreVal = DAG.getUNDEF(EltVT);
1240            }
1241            Ops.push_back(StoreVal);
1242
1243            if (VecSize == 4) {
1244              Opc = NVPTXISD::StoreParamV4;
1245              if (i + 2 < NumElts) {
1246                StoreVal = OutVals[OIdx++];
1247                if (NeedExtend)
1248                  StoreVal =
1249                      DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, StoreVal);
1250              } else {
1251                StoreVal = DAG.getUNDEF(EltVT);
1252              }
1253              Ops.push_back(StoreVal);
1254
1255              if (i + 3 < NumElts) {
1256                StoreVal = OutVals[OIdx++];
1257                if (NeedExtend)
1258                  StoreVal =
1259                      DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, StoreVal);
1260              } else {
1261                StoreVal = DAG.getUNDEF(EltVT);
1262              }
1263              Ops.push_back(StoreVal);
1264            }
1265
1266            Ops.push_back(InFlag);
1267
1268            SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1269            Chain = DAG.getMemIntrinsicNode(Opc, dl, CopyParamVTs, Ops,
1270                                            MemVT, MachinePointerInfo());
1271            InFlag = Chain.getValue(1);
1272            curOffset += PerStoreOffset;
1273          }
1274        }
1275        ++paramCount;
1276        --OIdx;
1277        continue;
1278      }
1279      // Plain scalar
1280      // For ABI, declare .param .b<size> .param<n>;
1281      unsigned sz = VT.getSizeInBits();
1282      bool needExtend = false;
1283      if (VT.isInteger()) {
1284        if (sz < 16)
1285          needExtend = true;
1286        if (sz < 32)
1287          sz = 32;
1288      }
1289      SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1290      SDValue DeclareParamOps[] = { Chain,
1291                                    DAG.getConstant(paramCount, dl, MVT::i32),
1292                                    DAG.getConstant(sz, dl, MVT::i32),
1293                                    DAG.getConstant(0, dl, MVT::i32), InFlag };
1294      Chain = DAG.getNode(NVPTXISD::DeclareScalarParam, dl, DeclareParamVTs,
1295                          DeclareParamOps);
1296      InFlag = Chain.getValue(1);
1297      SDValue OutV = OutVals[OIdx];
1298      if (needExtend) {
1299        // zext/sext i1/i8 to i16
1300        unsigned opc = ISD::ZERO_EXTEND;
1301        if (Outs[OIdx].Flags.isSExt())
1302          opc = ISD::SIGN_EXTEND;
1303        OutV = DAG.getNode(opc, dl, MVT::i16, OutV);
1304      }
1305      SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1306      SDValue CopyParamOps[] = { Chain,
1307                                 DAG.getConstant(paramCount, dl, MVT::i32),
1308                                 DAG.getConstant(0, dl, MVT::i32), OutV,
1309                                 InFlag };
1310
1311      unsigned opcode = NVPTXISD::StoreParam;
1312      if (Outs[OIdx].Flags.isZExt())
1313        opcode = NVPTXISD::StoreParamU32;
1314      else if (Outs[OIdx].Flags.isSExt())
1315        opcode = NVPTXISD::StoreParamS32;
1316      Chain = DAG.getMemIntrinsicNode(opcode, dl, CopyParamVTs, CopyParamOps,
1317                                      VT, MachinePointerInfo());
1318
1319      InFlag = Chain.getValue(1);
1320      ++paramCount;
1321      continue;
1322    }
1323    // struct or vector
1324    SmallVector<EVT, 16> vtparts;
1325    SmallVector<uint64_t, 16> Offsets;
1326    auto *PTy = dyn_cast<PointerType>(Args[i].Ty);
1327    assert(PTy && "Type of a byval parameter should be pointer");
1328    ComputePTXValueVTs(*this, DAG.getDataLayout(), PTy->getElementType(),
1329                       vtparts, &Offsets, 0);
1330
1331    // declare .param .align <align> .b8 .param<n>[<size>];
1332    unsigned sz = Outs[OIdx].Flags.getByValSize();
1333    SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1334    unsigned ArgAlign = Outs[OIdx].Flags.getByValAlign();
1335    // The ByValAlign in the Outs[OIdx].Flags is always set at this point,
1336    // so we don't need to worry about natural alignment or not.
1337    // See TargetLowering::LowerCallTo().
1338    SDValue DeclareParamOps[] = {
1339      Chain, DAG.getConstant(Outs[OIdx].Flags.getByValAlign(), dl, MVT::i32),
1340      DAG.getConstant(paramCount, dl, MVT::i32),
1341      DAG.getConstant(sz, dl, MVT::i32), InFlag
1342    };
1343    Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs,
1344                        DeclareParamOps);
1345    InFlag = Chain.getValue(1);
1346    for (unsigned j = 0, je = vtparts.size(); j != je; ++j) {
1347      EVT elemtype = vtparts[j];
1348      int curOffset = Offsets[j];
1349      unsigned PartAlign = GreatestCommonDivisor64(ArgAlign, curOffset);
1350      auto PtrVT = getPointerTy(DAG.getDataLayout());
1351      SDValue srcAddr = DAG.getNode(ISD::ADD, dl, PtrVT, OutVals[OIdx],
1352                                    DAG.getConstant(curOffset, dl, PtrVT));
1353      SDValue theVal = DAG.getLoad(elemtype, dl, tempChain, srcAddr,
1354                                   MachinePointerInfo(), false, false, false,
1355                                   PartAlign);
1356      if (elemtype.getSizeInBits() < 16) {
1357        theVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, theVal);
1358      }
1359      SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1360      SDValue CopyParamOps[] = { Chain,
1361                                 DAG.getConstant(paramCount, dl, MVT::i32),
1362                                 DAG.getConstant(curOffset, dl, MVT::i32),
1363                                 theVal, InFlag };
1364      Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreParam, dl, CopyParamVTs,
1365                                      CopyParamOps, elemtype,
1366                                      MachinePointerInfo());
1367
1368      InFlag = Chain.getValue(1);
1369    }
1370    ++paramCount;
1371  }
1372
1373  GlobalAddressSDNode *Func = dyn_cast<GlobalAddressSDNode>(Callee.getNode());
1374  unsigned retAlignment = 0;
1375
1376  // Handle Result
1377  if (Ins.size() > 0) {
1378    SmallVector<EVT, 16> resvtparts;
1379    ComputeValueVTs(*this, DL, retTy, resvtparts);
1380
1381    // Declare
1382    //  .param .align 16 .b8 retval0[<size-in-bytes>], or
1383    //  .param .b<size-in-bits> retval0
1384    unsigned resultsz = DL.getTypeAllocSizeInBits(retTy);
1385    // Emit ".param .b<size-in-bits> retval0" instead of byte arrays only for
1386    // these three types to match the logic in
1387    // NVPTXAsmPrinter::printReturnValStr and NVPTXTargetLowering::getPrototype.
1388    // Plus, this behavior is consistent with nvcc's.
1389    if (retTy->isFloatingPointTy() || retTy->isIntegerTy() ||
1390        retTy->isPointerTy()) {
1391      // Scalar needs to be at least 32-bit wide
1392      if (resultsz < 32)
1393        resultsz = 32;
1394      SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1395      SDValue DeclareRetOps[] = { Chain, DAG.getConstant(1, dl, MVT::i32),
1396                                  DAG.getConstant(resultsz, dl, MVT::i32),
1397                                  DAG.getConstant(0, dl, MVT::i32), InFlag };
1398      Chain = DAG.getNode(NVPTXISD::DeclareRet, dl, DeclareRetVTs,
1399                          DeclareRetOps);
1400      InFlag = Chain.getValue(1);
1401    } else {
1402      retAlignment = getArgumentAlignment(Callee, CS, retTy, 0);
1403      SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1404      SDValue DeclareRetOps[] = { Chain,
1405                                  DAG.getConstant(retAlignment, dl, MVT::i32),
1406                                  DAG.getConstant(resultsz / 8, dl, MVT::i32),
1407                                  DAG.getConstant(0, dl, MVT::i32), InFlag };
1408      Chain = DAG.getNode(NVPTXISD::DeclareRetParam, dl, DeclareRetVTs,
1409                          DeclareRetOps);
1410      InFlag = Chain.getValue(1);
1411    }
1412  }
1413
1414  if (!Func) {
1415    // This is the indirect function call case: PTX requires a prototype of
1416    // the form
1417    // proto_0 : .callprototype(.param .b32 _) _ (.param .b32 _);
1418    // to be emitted, and the label has to be used as the last arg of the
1419    // call instruction.
1420    // The prototype is embedded in a string and passed as the operand of a
1421    // CallPrototype SDNode, which prints out as the value of the string.
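    // For example (a sketch only; the actual string is built by
    // getPrototype), an indirect call to an "i32 (i32)" function might emit:
    //   prototype_0 : .callprototype (.param .b32 _) _ (.param .b32 _);
    //   call (retval0), %rd1, (param0), prototype_0;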
1422    SDVTList ProtoVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1423    std::string Proto =
1424        getPrototype(DAG.getDataLayout(), retTy, Args, Outs, retAlignment, CS);
1425    const char *ProtoStr =
1426      nvTM->getManagedStrPool()->getManagedString(Proto.c_str())->c_str();
1427    SDValue ProtoOps[] = {
1428      Chain, DAG.getTargetExternalSymbol(ProtoStr, MVT::i32), InFlag,
1429    };
1430    Chain = DAG.getNode(NVPTXISD::CallPrototype, dl, ProtoVTs, ProtoOps);
1431    InFlag = Chain.getValue(1);
1432  }
1433  // Op to just print "call"
1434  SDVTList PrintCallVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1435  SDValue PrintCallOps[] = {
1436    Chain, DAG.getConstant((Ins.size() == 0) ? 0 : 1, dl, MVT::i32), InFlag
1437  };
1438  Chain = DAG.getNode(Func ? (NVPTXISD::PrintCallUni) : (NVPTXISD::PrintCall),
1439                      dl, PrintCallVTs, PrintCallOps);
1440  InFlag = Chain.getValue(1);
1441
1442  // Ops to print out the function name
1443  SDVTList CallVoidVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1444  SDValue CallVoidOps[] = { Chain, Callee, InFlag };
1445  Chain = DAG.getNode(NVPTXISD::CallVoid, dl, CallVoidVTs, CallVoidOps);
1446  InFlag = Chain.getValue(1);
1447
1448  // Ops to print out the param list
1449  SDVTList CallArgBeginVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1450  SDValue CallArgBeginOps[] = { Chain, InFlag };
1451  Chain = DAG.getNode(NVPTXISD::CallArgBegin, dl, CallArgBeginVTs,
1452                      CallArgBeginOps);
1453  InFlag = Chain.getValue(1);
1454
1455  for (unsigned i = 0, e = paramCount; i != e; ++i) {
1456    unsigned opcode;
1457    if (i == (e - 1))
1458      opcode = NVPTXISD::LastCallArg;
1459    else
1460      opcode = NVPTXISD::CallArg;
1461    SDVTList CallArgVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1462    SDValue CallArgOps[] = { Chain, DAG.getConstant(1, dl, MVT::i32),
1463                             DAG.getConstant(i, dl, MVT::i32), InFlag };
1464    Chain = DAG.getNode(opcode, dl, CallArgVTs, CallArgOps);
1465    InFlag = Chain.getValue(1);
1466  }
1467  SDVTList CallArgEndVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1468  SDValue CallArgEndOps[] = { Chain,
1469                              DAG.getConstant(Func ? 1 : 0, dl, MVT::i32),
1470                              InFlag };
1471  Chain = DAG.getNode(NVPTXISD::CallArgEnd, dl, CallArgEndVTs, CallArgEndOps);
1472  InFlag = Chain.getValue(1);
1473
1474  if (!Func) {
1475    SDVTList PrototypeVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1476    SDValue PrototypeOps[] = { Chain,
1477                               DAG.getConstant(uniqueCallSite, dl, MVT::i32),
1478                               InFlag };
1479    Chain = DAG.getNode(NVPTXISD::Prototype, dl, PrototypeVTs, PrototypeOps);
1480    InFlag = Chain.getValue(1);
1481  }
1482
1483  // Generate loads from param memory/moves from registers for result
1484  if (Ins.size() > 0) {
1485    if (retTy && retTy->isVectorTy()) {
1486      EVT ObjectVT = getValueType(DL, retTy);
1487      unsigned NumElts = ObjectVT.getVectorNumElements();
1488      EVT EltVT = ObjectVT.getVectorElementType();
1489      assert(STI.getTargetLowering()->getNumRegisters(F->getContext(),
1490                                                      ObjectVT) == NumElts &&
1491             "Vector was not scalarized");
1492      unsigned sz = EltVT.getSizeInBits();
1493      bool needTruncate = sz < 8;
1494
1495      if (NumElts == 1) {
1496        // Just a simple load
1497        SmallVector<EVT, 4> LoadRetVTs;
1498        if (EltVT == MVT::i1 || EltVT == MVT::i8) {
1499          // If loading i1/i8 result, generate
1500          //   load.b8 i16
1501          //   if i1
1502          //   trunc i16 to i1
1503          LoadRetVTs.push_back(MVT::i16);
1504        } else
1505          LoadRetVTs.push_back(EltVT);
1506        LoadRetVTs.push_back(MVT::Other);
1507        LoadRetVTs.push_back(MVT::Glue);
1508        SDValue LoadRetOps[] = {Chain, DAG.getConstant(1, dl, MVT::i32),
1509                                DAG.getConstant(0, dl, MVT::i32), InFlag};
1510        SDValue retval = DAG.getMemIntrinsicNode(
1511            NVPTXISD::LoadParam, dl,
1512            DAG.getVTList(LoadRetVTs), LoadRetOps, EltVT, MachinePointerInfo());
1513        Chain = retval.getValue(1);
1514        InFlag = retval.getValue(2);
1515        SDValue Ret0 = retval;
1516        if (needTruncate)
1517          Ret0 = DAG.getNode(ISD::TRUNCATE, dl, EltVT, Ret0);
1518        InVals.push_back(Ret0);
1519      } else if (NumElts == 2) {
1520        // LoadV2
1521        SmallVector<EVT, 4> LoadRetVTs;
1522        if (EltVT == MVT::i1 || EltVT == MVT::i8) {
1523          // If loading i1/i8 result, generate
1524          //   load.b8 i16
1525          //   if i1
1526          //   trunc i16 to i1
1527          LoadRetVTs.push_back(MVT::i16);
1528          LoadRetVTs.push_back(MVT::i16);
1529        } else {
1530          LoadRetVTs.push_back(EltVT);
1531          LoadRetVTs.push_back(EltVT);
1532        }
1533        LoadRetVTs.push_back(MVT::Other);
1534        LoadRetVTs.push_back(MVT::Glue);
1535        SDValue LoadRetOps[] = {Chain, DAG.getConstant(1, dl, MVT::i32),
1536                                DAG.getConstant(0, dl, MVT::i32), InFlag};
1537        SDValue retval = DAG.getMemIntrinsicNode(
1538            NVPTXISD::LoadParamV2, dl,
1539            DAG.getVTList(LoadRetVTs), LoadRetOps, EltVT, MachinePointerInfo());
1540        Chain = retval.getValue(2);
1541        InFlag = retval.getValue(3);
1542        SDValue Ret0 = retval.getValue(0);
1543        SDValue Ret1 = retval.getValue(1);
1544        if (needTruncate) {
1545          Ret0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Ret0);
1546          InVals.push_back(Ret0);
1547          Ret1 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Ret1);
1548          InVals.push_back(Ret1);
1549        } else {
1550          InVals.push_back(Ret0);
1551          InVals.push_back(Ret1);
1552        }
1553      } else {
1554        // Split into N LoadV4
1555        unsigned Ofst = 0;
1556        unsigned VecSize = 4;
1557        unsigned Opc = NVPTXISD::LoadParamV4;
1558        if (EltVT.getSizeInBits() == 64) {
1559          VecSize = 2;
1560          Opc = NVPTXISD::LoadParamV2;
1561        }
1562        EVT VecVT = EVT::getVectorVT(F->getContext(), EltVT, VecSize);
1563        for (unsigned i = 0; i < NumElts; i += VecSize) {
1564          SmallVector<EVT, 8> LoadRetVTs;
1565          if (EltVT == MVT::i1 || EltVT == MVT::i8) {
1566            // If loading i1/i8 result, generate
1567            //   load.b8 i16
1568            //   if i1
1569            //   trunc i16 to i1
1570            for (unsigned j = 0; j < VecSize; ++j)
1571              LoadRetVTs.push_back(MVT::i16);
1572          } else {
1573            for (unsigned j = 0; j < VecSize; ++j)
1574              LoadRetVTs.push_back(EltVT);
1575          }
1576          LoadRetVTs.push_back(MVT::Other);
1577          LoadRetVTs.push_back(MVT::Glue);
1578          SDValue LoadRetOps[] = {Chain, DAG.getConstant(1, dl, MVT::i32),
1579                                  DAG.getConstant(Ofst, dl, MVT::i32), InFlag};
1580          SDValue retval = DAG.getMemIntrinsicNode(
1581              Opc, dl, DAG.getVTList(LoadRetVTs),
1582              LoadRetOps, EltVT, MachinePointerInfo());
1583          if (VecSize == 2) {
1584            Chain = retval.getValue(2);
1585            InFlag = retval.getValue(3);
1586          } else {
1587            Chain = retval.getValue(4);
1588            InFlag = retval.getValue(5);
1589          }
1590
1591          for (unsigned j = 0; j < VecSize; ++j) {
1592            if (i + j >= NumElts)
1593              break;
1594            SDValue Elt = retval.getValue(j);
1595            if (needTruncate)
1596              Elt = DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
1597            InVals.push_back(Elt);
1598          }
1599          Ofst += DL.getTypeAllocSize(VecVT.getTypeForEVT(F->getContext()));
1600        }
1601      }
1602    } else {
1603      SmallVector<EVT, 16> VTs;
1604      SmallVector<uint64_t, 16> Offsets;
1605      ComputePTXValueVTs(*this, DAG.getDataLayout(), retTy, VTs, &Offsets, 0);
1606      assert(VTs.size() == Ins.size() && "Bad value decomposition");
1607      unsigned RetAlign = getArgumentAlignment(Callee, CS, retTy, 0);
1608      for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
1609        unsigned sz = VTs[i].getSizeInBits();
1610        unsigned AlignI = GreatestCommonDivisor64(RetAlign, Offsets[i]);
1611        bool needTruncate = sz < 8;
1612        if (VTs[i].isInteger() && (sz < 8))
1613          sz = 8;
1614
1615        SmallVector<EVT, 4> LoadRetVTs;
1616        EVT TheLoadType = VTs[i];
1617        if (retTy->isIntegerTy() && DL.getTypeAllocSizeInBits(retTy) < 32) {
1618          // This is for integer types only, and specifically not for
1619          // aggregates.
1620          LoadRetVTs.push_back(MVT::i32);
1621          TheLoadType = MVT::i32;
1622        } else if (sz < 16) {
1623          // If loading i1/i8 result, generate
1624          //   load i8 (-> i16)
1625          //   trunc i16 to i1/i8
1626          LoadRetVTs.push_back(MVT::i16);
1627        } else
1628          LoadRetVTs.push_back(Ins[i].VT);
1629        LoadRetVTs.push_back(MVT::Other);
1630        LoadRetVTs.push_back(MVT::Glue);
1631
1632        SDValue LoadRetOps[] = {Chain, DAG.getConstant(1, dl, MVT::i32),
1633                                DAG.getConstant(Offsets[i], dl, MVT::i32),
1634                                InFlag};
1635        SDValue retval = DAG.getMemIntrinsicNode(
1636            NVPTXISD::LoadParam, dl,
1637            DAG.getVTList(LoadRetVTs), LoadRetOps,
1638            TheLoadType, MachinePointerInfo(), AlignI);
1639        Chain = retval.getValue(1);
1640        InFlag = retval.getValue(2);
1641        SDValue Ret0 = retval.getValue(0);
1642        if (needTruncate)
1643          Ret0 = DAG.getNode(ISD::TRUNCATE, dl, Ins[i].VT, Ret0);
1644        InVals.push_back(Ret0);
1645      }
1646    }
1647  }
1648
1649  Chain = DAG.getCALLSEQ_END(Chain,
1650                             DAG.getIntPtrConstant(uniqueCallSite, dl, true),
1651                             DAG.getIntPtrConstant(uniqueCallSite + 1, dl,
1652                                                   true),
1653                             InFlag, dl);
1654  uniqueCallSite++;
1655
1656  // Set isTailCall to false for now, until we figure out how to express
1657  // tail call optimization in PTX.
1658  isTailCall = false;
1659  return Chain;
1660}
1661
1662// By default CONCAT_VECTORS is lowered by ExpandVectorBuildThroughStack()
1663// (see LegalizeDAG.cpp). This is slow and uses local memory.
1664// We use extract/insert/build vector just as LegalizeOp() did in LLVM 2.5.
1665SDValue
1666NVPTXTargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const {
1667  SDNode *Node = Op.getNode();
1668  SDLoc dl(Node);
1669  SmallVector<SDValue, 8> Ops;
1670  unsigned NumOperands = Node->getNumOperands();
1671  for (unsigned i = 0; i < NumOperands; ++i) {
1672    SDValue SubOp = Node->getOperand(i);
1673    EVT VVT = SubOp.getNode()->getValueType(0);
1674    EVT EltVT = VVT.getVectorElementType();
1675    unsigned NumSubElem = VVT.getVectorNumElements();
1676    for (unsigned j = 0; j < NumSubElem; ++j) {
1677      Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, SubOp,
1678                                DAG.getIntPtrConstant(j, dl)));
1679    }
1680  }
1681  return DAG.getNode(ISD::BUILD_VECTOR, dl, Node->getValueType(0), Ops);
1682}
1683
1684/// LowerShiftRightParts - Lower SRL_PARTS, SRA_PARTS, which
1685/// 1) return two i32 values and take a 2 x i32 value to shift plus a shift
1686///    amount, or
1687/// 2) return two i64 values and take a 2 x i64 value to shift plus a shift
1688///    amount.
1689SDValue NVPTXTargetLowering::LowerShiftRightParts(SDValue Op,
1690                                                  SelectionDAG &DAG) const {
1691  assert(Op.getNumOperands() == 3 && "Not a double-shift!");
1692  assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS);
1693
1694  EVT VT = Op.getValueType();
1695  unsigned VTBits = VT.getSizeInBits();
1696  SDLoc dl(Op);
1697  SDValue ShOpLo = Op.getOperand(0);
1698  SDValue ShOpHi = Op.getOperand(1);
1699  SDValue ShAmt  = Op.getOperand(2);
1700  unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL;
1701
1702  if (VTBits == 32 && STI.getSmVersion() >= 35) {
1703
1704    // For 32-bit shifts on sm_35+, we can use the funnel shift 'shf' instruction.
1705    // {dHi, dLo} = {aHi, aLo} >> Amt
1706    //   dHi = aHi >> Amt
1707    //   dLo = shf.r.clamp aLo, aHi, Amt
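    //
    // A rough PTX sketch for the 32-bit case (register names illustrative):
    //   shr.s32         %r3, %rHi, %rAmt;        // dHi (shr.u32 for SRL)
    //   shf.r.clamp.b32 %r4, %rLo, %rHi, %rAmt;  // dLo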
1708
1709    SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
1710    SDValue Lo = DAG.getNode(NVPTXISD::FUN_SHFR_CLAMP, dl, VT, ShOpLo, ShOpHi,
1711                             ShAmt);
1712
1713    SDValue Ops[2] = { Lo, Hi };
1714    return DAG.getMergeValues(Ops, dl);
1715  } else {
1717
1718    // {dHi, dLo} = {aHi, aLo} >> Amt
1719    // - if (Amt>=size) then
1720    //      dLo = aHi >> (Amt-size)
1721    //      dHi = aHi >> Amt (this is either all 0 or all 1)
1722    //   else
1723    //      dLo = (aLo >>logic Amt) | (aHi << (size-Amt))
1724    //      dHi = aHi >> Amt
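    //
    // Worked example (a sketch): for size = 32 and Amt = 40 >= 32,
    // dLo = aHi >> (40 - 32) = aHi >> 8, and dHi = aHi >> 40, where PTX
    // clamps the shift amount to the register width, yielding all zero
    // bits (SRL) or all sign bits (SRA).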
1725
1726    SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
1727                                   DAG.getConstant(VTBits, dl, MVT::i32),
1728                                   ShAmt);
1729    SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt);
1730    SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
1731                                     DAG.getConstant(VTBits, dl, MVT::i32));
1732    SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt);
1733    SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
1734    SDValue TrueVal = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt);
1735
1736    SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt,
1737                               DAG.getConstant(VTBits, dl, MVT::i32),
1738                               ISD::SETGE);
1739    SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
1740    SDValue Lo = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal);
1741
1742    SDValue Ops[2] = { Lo, Hi };
1743    return DAG.getMergeValues(Ops, dl);
1744  }
1745}
1746
1747/// LowerShiftLeftParts - Lower SHL_PARTS, which
1748/// 1) returns two i32 values and takes a 2 x i32 value to shift plus a shift
1749///    amount, or
1750/// 2) returns two i64 values and takes a 2 x i64 value to shift plus a shift
1751///    amount.
1752SDValue NVPTXTargetLowering::LowerShiftLeftParts(SDValue Op,
1753                                                 SelectionDAG &DAG) const {
1754  assert(Op.getNumOperands() == 3 && "Not a double-shift!");
1755  assert(Op.getOpcode() == ISD::SHL_PARTS);
1756
1757  EVT VT = Op.getValueType();
1758  unsigned VTBits = VT.getSizeInBits();
1759  SDLoc dl(Op);
1760  SDValue ShOpLo = Op.getOperand(0);
1761  SDValue ShOpHi = Op.getOperand(1);
1762  SDValue ShAmt  = Op.getOperand(2);
1763
1764  if (VTBits == 32 && STI.getSmVersion() >= 35) {
1765
1766    // For 32-bit shifts on sm_35+, we can use the funnel shift 'shf' instruction.
1767    // {dHi, dLo} = {aHi, aLo} << Amt
1768    //   dHi = shf.l.clamp aLo, aHi, Amt
1769    //   dLo = aLo << Amt
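    //
    // A rough PTX sketch for the 32-bit case (register names illustrative):
    //   shf.l.clamp.b32 %r3, %rLo, %rHi, %rAmt;  // dHi
    //   shl.b32         %r4, %rLo, %rAmt;        // dLo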
1770
1771    SDValue Hi = DAG.getNode(NVPTXISD::FUN_SHFL_CLAMP, dl, VT, ShOpLo, ShOpHi,
1772                             ShAmt);
1773    SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
1774
1775    SDValue Ops[2] = { Lo, Hi };
1776    return DAG.getMergeValues(Ops, dl);
1777  } else {
1779
1780    // {dHi, dLo} = {aHi, aLo} << Amt
1781    // - if (Amt>=size) then
1782    //      dLo = aLo << Amt (all 0)
1783    //      dHi = aLo << (Amt-size)
1784    //   else
1785    //      dLo = aLo << Amt
1786    //      dHi = (aHi << Amt) | (aLo >> (size-Amt))
1787
1788    SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
1789                                   DAG.getConstant(VTBits, dl, MVT::i32),
1790                                   ShAmt);
1791    SDValue Tmp1 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt);
1792    SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
1793                                     DAG.getConstant(VTBits, dl, MVT::i32));
1794    SDValue Tmp2 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt);
1795    SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
1796    SDValue TrueVal = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt);
1797
1798    SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt,
1799                               DAG.getConstant(VTBits, dl, MVT::i32),
1800                               ISD::SETGE);
1801    SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
1802    SDValue Hi = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal);
1803
1804    SDValue Ops[2] = { Lo, Hi };
1805    return DAG.getMergeValues(Ops, dl);
1806  }
1807}
1808
1809SDValue
1810NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
1811  switch (Op.getOpcode()) {
1812  case ISD::RETURNADDR:
1813    return SDValue();
1814  case ISD::FRAMEADDR:
1815    return SDValue();
1816  case ISD::GlobalAddress:
1817    return LowerGlobalAddress(Op, DAG);
1818  case ISD::INTRINSIC_W_CHAIN:
1819    return Op;
1820  case ISD::BUILD_VECTOR:
1821  case ISD::EXTRACT_SUBVECTOR:
1822    return Op;
1823  case ISD::CONCAT_VECTORS:
1824    return LowerCONCAT_VECTORS(Op, DAG);
1825  case ISD::STORE:
1826    return LowerSTORE(Op, DAG);
1827  case ISD::LOAD:
1828    return LowerLOAD(Op, DAG);
1829  case ISD::SHL_PARTS:
1830    return LowerShiftLeftParts(Op, DAG);
1831  case ISD::SRA_PARTS:
1832  case ISD::SRL_PARTS:
1833    return LowerShiftRightParts(Op, DAG);
1834  case ISD::SELECT:
1835    return LowerSelect(Op, DAG);
1836  default:
1837    llvm_unreachable("Custom lowering not defined for operation");
1838  }
1839}
1840
1841SDValue NVPTXTargetLowering::LowerSelect(SDValue Op, SelectionDAG &DAG) const {
1842  SDValue Op0 = Op->getOperand(0);
1843  SDValue Op1 = Op->getOperand(1);
1844  SDValue Op2 = Op->getOperand(2);
1845  SDLoc DL(Op.getNode());
1846
1847  assert(Op.getValueType() == MVT::i1 && "Custom lowering enabled only for i1");
1848
1849  Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1);
1850  Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2);
1851  SDValue Select = DAG.getNode(ISD::SELECT, DL, MVT::i32, Op0, Op1, Op2);
1852  SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Select);
1853
1854  return Trunc;
1855}
1856
1857SDValue NVPTXTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
1858  if (Op.getValueType() == MVT::i1)
1859    return LowerLOADi1(Op, DAG);
1860  else
1861    return SDValue();
1862}
1863
1864// v = ld i1* addr
1865//   =>
1866// v1 = ld i8* addr (-> i16)
1867// v = trunc i16 to i1
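//
// Illustrative PTX sketch (register names are not the exact output):
//   ld.u8   %rs1, [%rd1];      // byte load, zero-extended into a 16-bit reg
//   and.b16 %rs2, %rs1, 1;     // truncate to i1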
1868SDValue NVPTXTargetLowering::LowerLOADi1(SDValue Op, SelectionDAG &DAG) const {
1869  SDNode *Node = Op.getNode();
1870  LoadSDNode *LD = cast<LoadSDNode>(Node);
1871  SDLoc dl(Node);
1872  assert(LD->getExtensionType() == ISD::NON_EXTLOAD);
1873  assert(Node->getValueType(0) == MVT::i1 &&
1874         "Custom lowering for i1 load only");
1875  SDValue newLD =
1876      DAG.getLoad(MVT::i16, dl, LD->getChain(), LD->getBasePtr(),
1877                  LD->getPointerInfo(), LD->isVolatile(), LD->isNonTemporal(),
1878                  LD->isInvariant(), LD->getAlignment());
1879  SDValue result = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, newLD);
1880  // The legalizer (the caller) is expecting two values from the legalized
1881  // load, so we build a MergeValues node for it. See ExpandUnalignedLoad()
1882  // in LegalizeDAG.cpp which also uses MergeValues.
1883  SDValue Ops[] = { result, LD->getChain() };
1884  return DAG.getMergeValues(Ops, dl);
1885}
1886
1887SDValue NVPTXTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
1888  EVT ValVT = Op.getOperand(1).getValueType();
1889  if (ValVT == MVT::i1)
1890    return LowerSTOREi1(Op, DAG);
1891  else if (ValVT.isVector())
1892    return LowerSTOREVector(Op, DAG);
1893  else
1894    return SDValue();
1895}
1896
1897SDValue
1898NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const {
1899  SDNode *N = Op.getNode();
1900  SDValue Val = N->getOperand(1);
1901  SDLoc DL(N);
1902  EVT ValVT = Val.getValueType();
1903
1904  if (ValVT.isVector()) {
1905    // We only handle "native" vector sizes for now, e.g. <4 x double> is not
1906    // legal.  We can (and should) split that into 2 stores of <2 x double> here,
1907    // but I'm leaving that as a TODO for now.
1908    if (!ValVT.isSimple())
1909      return SDValue();
1910    switch (ValVT.getSimpleVT().SimpleTy) {
1911    default:
1912      return SDValue();
1913    case MVT::v2i8:
1914    case MVT::v2i16:
1915    case MVT::v2i32:
1916    case MVT::v2i64:
1917    case MVT::v2f32:
1918    case MVT::v2f64:
1919    case MVT::v4i8:
1920    case MVT::v4i16:
1921    case MVT::v4i32:
1922    case MVT::v4f32:
1923      // This is a "native" vector type
1924      break;
1925    }
1926
1927    MemSDNode *MemSD = cast<MemSDNode>(N);
1928    const DataLayout &TD = DAG.getDataLayout();
1929
1930    unsigned Align = MemSD->getAlignment();
1931    unsigned PrefAlign =
1932        TD.getPrefTypeAlignment(ValVT.getTypeForEVT(*DAG.getContext()));
1933    if (Align < PrefAlign) {
1934      // This store is not sufficiently aligned, so bail out and let this vector
1935      // store be scalarized.  Note that we may still be able to emit smaller
1936      // vector stores.  For example, if we are storing a <4 x float> with an
1937      // alignment of 8, this check will fail but the legalizer will try again
1938      // with 2 x <2 x float>, which will succeed with an alignment of 8.
1939      return SDValue();
1940    }
1941
1942    unsigned Opcode = 0;
1943    EVT EltVT = ValVT.getVectorElementType();
1944    unsigned NumElts = ValVT.getVectorNumElements();
1945
1946    // Since StoreV2 is a target node, we cannot rely on DAG type legalization.
1947    // Therefore, we must ensure the type is legal.  For i1 and i8, we set the
1948    // stored type to i16 and propagate the "real" type as the memory type.
1949    bool NeedExt = false;
1950    if (EltVT.getSizeInBits() < 16)
1951      NeedExt = true;
1952
1953    switch (NumElts) {
1954    default:
1955      return SDValue();
1956    case 2:
1957      Opcode = NVPTXISD::StoreV2;
1958      break;
1959    case 4: {
1960      Opcode = NVPTXISD::StoreV4;
1961      break;
1962    }
1963    }
1964
1965    SmallVector<SDValue, 8> Ops;
1966
1967    // First is the chain
1968    Ops.push_back(N->getOperand(0));
1969
1970    // Then the split values
1971    for (unsigned i = 0; i < NumElts; ++i) {
1972      SDValue ExtVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Val,
1973                                   DAG.getIntPtrConstant(i, DL));
1974      if (NeedExt)
1975        ExtVal = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i16, ExtVal);
1976      Ops.push_back(ExtVal);
1977    }
1978
1979    // Then any remaining arguments
1980    Ops.append(N->op_begin() + 2, N->op_end());
1981
1982    SDValue NewSt = DAG.getMemIntrinsicNode(
1983        Opcode, DL, DAG.getVTList(MVT::Other), Ops,
1984        MemSD->getMemoryVT(), MemSD->getMemOperand());
1985
1987    return NewSt;
1988  }
1989
1990  return SDValue();
1991}
1992
1993// st i1 v, addr
1994//    =>
1995// v1 = zxt v to i16
1996// st.u8 i16, addr
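//
// Illustrative PTX sketch (register names are not the exact output):
//   selp.b16 %rs1, 1, 0, %p1;  // zero-extend the i1 into a 16-bit reg
//   st.u8    [%rd1], %rs1;     // truncating byte store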
1997SDValue NVPTXTargetLowering::LowerSTOREi1(SDValue Op, SelectionDAG &DAG) const {
1998  SDNode *Node = Op.getNode();
1999  SDLoc dl(Node);
2000  StoreSDNode *ST = cast<StoreSDNode>(Node);
2001  SDValue Tmp1 = ST->getChain();
2002  SDValue Tmp2 = ST->getBasePtr();
2003  SDValue Tmp3 = ST->getValue();
2004  assert(Tmp3.getValueType() == MVT::i1 && "Custom lowering for i1 store only");
2005  unsigned Alignment = ST->getAlignment();
2006  bool isVolatile = ST->isVolatile();
2007  bool isNonTemporal = ST->isNonTemporal();
2008  Tmp3 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Tmp3);
2009  SDValue Result = DAG.getTruncStore(Tmp1, dl, Tmp3, Tmp2,
2010                                     ST->getPointerInfo(), MVT::i8, isNonTemporal,
2011                                     isVolatile, Alignment);
2012  return Result;
2013}
2014
2015SDValue
2016NVPTXTargetLowering::getParamSymbol(SelectionDAG &DAG, int idx, EVT v) const {
2017  std::string ParamSym;
2018  raw_string_ostream ParamStr(ParamSym);
2019
2020  ParamStr << DAG.getMachineFunction().getName() << "_param_" << idx;
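  // e.g. for a function named "foo" and idx == 1, this produces the symbol
  // "foo_param_1", matching the names used for the function's .param
  // declarations.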
2021  ParamStr.flush();
2022
2023  std::string *SavedStr =
2024    nvTM->getManagedStrPool()->getManagedString(ParamSym.c_str());
2025  return DAG.getTargetExternalSymbol(SavedStr->c_str(), v);
2026}
2027
2028// Check to see if the kernel argument is image*_t or sampler_t
2030bool llvm::isImageOrSamplerVal(const Value *arg, const Module *context) {
2031  static const char *const specialTypes[] = { "struct._image2d_t",
2032                                              "struct._image3d_t",
2033                                              "struct._sampler_t" };
2034
2035  Type *Ty = arg->getType();
2036  auto *PTy = dyn_cast<PointerType>(Ty);
2037
2038  if (!PTy)
2039    return false;
2040
2041  if (!context)
2042    return false;
2043
2044  auto *STy = dyn_cast<StructType>(PTy->getElementType());
2045  const std::string TypeName = STy && !STy->isLiteral() ? STy->getName() : "";
2046
2047  return std::find(std::begin(specialTypes), std::end(specialTypes),
2048                   TypeName) != std::end(specialTypes);
2049}
2050
2051SDValue NVPTXTargetLowering::LowerFormalArguments(
2052    SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
2053    const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc dl, SelectionDAG &DAG,
2054    SmallVectorImpl<SDValue> &InVals) const {
2055  MachineFunction &MF = DAG.getMachineFunction();
2056  const DataLayout &DL = DAG.getDataLayout();
2057  auto PtrVT = getPointerTy(DAG.getDataLayout());
2058
2059  const Function *F = MF.getFunction();
2060  const AttributeSet &PAL = F->getAttributes();
2061  const TargetLowering *TLI = STI.getTargetLowering();
2062
2063  SDValue Root = DAG.getRoot();
2064  std::vector<SDValue> OutChains;
2065
2066  bool isKernel = llvm::isKernelFunction(*F);
2067  bool isABI = (STI.getSmVersion() >= 20);
2068  assert(isABI && "Non-ABI compilation is not supported");
2069  if (!isABI)
2070    return Chain;
2071
2072  std::vector<Type *> argTypes;
2073  std::vector<const Argument *> theArgs;
2074  for (const Argument &I : F->args()) {
2075    theArgs.push_back(&I);
2076    argTypes.push_back(I.getType());
2077  }
2078  // argTypes.size() (or theArgs.size()) and Ins.size() need not match.
2079  // Ins.size() will be larger
2080  //   * if there is an aggregate argument with multiple fields (each field
2081  //     showing up separately in Ins)
2082  //   * if there is a vector argument with more elements than the typical
2083  //     vector length (generally more than 4), where each vector element is
2084  //     individually present in Ins.
2085  // So a different index should be used for indexing into Ins.
2086  // See similar issue in LowerCall.
2087  unsigned InsIdx = 0;
2088
2089  int idx = 0;
2090  for (unsigned i = 0, e = theArgs.size(); i != e; ++i, ++idx, ++InsIdx) {
2091    Type *Ty = argTypes[i];
2092
2093    // If the kernel argument is image*_t or sampler_t, convert it to
2094    // an i32 constant holding the parameter position. This can later be
2095    // matched in the AsmPrinter to output the correct mangled name.
2096    if (isImageOrSamplerVal(
2097            theArgs[i],
2098            (theArgs[i]->getParent() ? theArgs[i]->getParent()->getParent()
2099                                     : nullptr))) {
2100      assert(isKernel && "Only kernels can have image/sampler params");
2101      InVals.push_back(DAG.getConstant(i + 1, dl, MVT::i32));
2102      continue;
2103    }
2104
2105    if (theArgs[i]->use_empty()) {
2106      // argument is dead
2107      if (Ty->isAggregateType()) {
2108        SmallVector<EVT, 16> vtparts;
2109
2110        ComputePTXValueVTs(*this, DAG.getDataLayout(), Ty, vtparts);
2111        assert(vtparts.size() > 0 && "empty aggregate type not expected");
2112        for (unsigned parti = 0, parte = vtparts.size(); parti != parte;
2113             ++parti) {
2114          InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT));
2115          ++InsIdx;
2116        }
2117        if (vtparts.size() > 0)
2118          --InsIdx;
2119        continue;
2120      }
2121      if (Ty->isVectorTy()) {
2122        EVT ObjectVT = getValueType(DL, Ty);
2123        unsigned NumRegs = TLI->getNumRegisters(F->getContext(), ObjectVT);
2124        for (unsigned parti = 0; parti < NumRegs; ++parti) {
2125          InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT));
2126          ++InsIdx;
2127        }
2128        if (NumRegs > 0)
2129          --InsIdx;
2130        continue;
2131      }
2132      InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT));
2133      continue;
2134    }
2135
2136    // In the following cases, assign a node order of "idx+1" to newly
2137    // created nodes. The SDNodes for params have to appear in the same
2138    // order as they appear in the original function. "idx+1" holds that
2139    // order.
2140    if (!PAL.hasAttribute(i + 1, Attribute::ByVal)) {
2141      if (Ty->isAggregateType()) {
2142        SmallVector<EVT, 16> vtparts;
2143        SmallVector<uint64_t, 16> offsets;
2144
2145        // NOTE: Here, we lose the ability to issue vector loads for vectors
2146        // that are a part of a struct.  This should be investigated in the
2147        // future.
2148        ComputePTXValueVTs(*this, DAG.getDataLayout(), Ty, vtparts, &offsets,
2149                           0);
2150        assert(vtparts.size() > 0 && "empty aggregate type not expected");
2151        bool aggregateIsPacked = false;
2152        if (StructType *STy = llvm::dyn_cast<StructType>(Ty))
2153          aggregateIsPacked = STy->isPacked();
2154
2155        SDValue Arg = getParamSymbol(DAG, idx, PtrVT);
2156        for (unsigned parti = 0, parte = vtparts.size(); parti != parte;
2157             ++parti) {
2158          EVT partVT = vtparts[parti];
2159          Value *srcValue = Constant::getNullValue(
2160              PointerType::get(partVT.getTypeForEVT(F->getContext()),
2161                               llvm::ADDRESS_SPACE_PARAM));
2162          SDValue srcAddr =
2163              DAG.getNode(ISD::ADD, dl, PtrVT, Arg,
2164                          DAG.getConstant(offsets[parti], dl, PtrVT));
2165          unsigned partAlign = aggregateIsPacked
2166                                   ? 1
2167                                   : DL.getABITypeAlignment(
2168                                         partVT.getTypeForEVT(F->getContext()));
2169          SDValue p;
2170          if (Ins[InsIdx].VT.getSizeInBits() > partVT.getSizeInBits()) {
2171            ISD::LoadExtType ExtOp = Ins[InsIdx].Flags.isSExt() ?
2172                                     ISD::SEXTLOAD : ISD::ZEXTLOAD;
2173            p = DAG.getExtLoad(ExtOp, dl, Ins[InsIdx].VT, Root, srcAddr,
2174                               MachinePointerInfo(srcValue), partVT, false,
2175                               false, false, partAlign);
2176          } else {
2177            p = DAG.getLoad(partVT, dl, Root, srcAddr,
2178                            MachinePointerInfo(srcValue), false, false, false,
2179                            partAlign);
2180          }
2181          if (p.getNode())
2182            p.getNode()->setIROrder(idx + 1);
2183          InVals.push_back(p);
2184          ++InsIdx;
2185        }
2186        if (vtparts.size() > 0)
2187          --InsIdx;
2188        continue;
2189      }
2190      if (Ty->isVectorTy()) {
2191        EVT ObjectVT = getValueType(DL, Ty);
2192        SDValue Arg = getParamSymbol(DAG, idx, PtrVT);
2193        unsigned NumElts = ObjectVT.getVectorNumElements();
2194        assert(TLI->getNumRegisters(F->getContext(), ObjectVT) == NumElts &&
2195               "Vector was not scalarized");
2196        EVT EltVT = ObjectVT.getVectorElementType();
2197
2198        // V1 load
2199        // f32 = load ...
2200        if (NumElts == 1) {
2201          // We only have one element, so just directly load it
2202          Value *SrcValue = Constant::getNullValue(PointerType::get(
2203              EltVT.getTypeForEVT(F->getContext()), llvm::ADDRESS_SPACE_PARAM));
2204          SDValue P = DAG.getLoad(
2205              EltVT, dl, Root, Arg, MachinePointerInfo(SrcValue), false, false,
2206              true,
2207              DL.getABITypeAlignment(EltVT.getTypeForEVT(F->getContext())));
2208          if (P.getNode())
2209            P.getNode()->setIROrder(idx + 1);
2210
2211          if (Ins[InsIdx].VT.getSizeInBits() > EltVT.getSizeInBits())
2212            P = DAG.getNode(ISD::ANY_EXTEND, dl, Ins[InsIdx].VT, P);
2213          InVals.push_back(P);
2214          ++InsIdx;
2215        } else if (NumElts == 2) {
2216          // V2 load
2217          // f32,f32 = load ...
2218          EVT VecVT = EVT::getVectorVT(F->getContext(), EltVT, 2);
2219          Value *SrcValue = Constant::getNullValue(PointerType::get(
2220              VecVT.getTypeForEVT(F->getContext()), llvm::ADDRESS_SPACE_PARAM));
2221          SDValue P = DAG.getLoad(
2222              VecVT, dl, Root, Arg, MachinePointerInfo(SrcValue), false, false,
2223              true,
2224              DL.getABITypeAlignment(VecVT.getTypeForEVT(F->getContext())));
2225          if (P.getNode())
2226            P.getNode()->setIROrder(idx + 1);
2227
2228          SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, P,
2229                                     DAG.getIntPtrConstant(0, dl));
2230          SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, P,
2231                                     DAG.getIntPtrConstant(1, dl));
2232
2233          if (Ins[InsIdx].VT.getSizeInBits() > EltVT.getSizeInBits()) {
2234            Elt0 = DAG.getNode(ISD::ANY_EXTEND, dl, Ins[InsIdx].VT, Elt0);
2235            Elt1 = DAG.getNode(ISD::ANY_EXTEND, dl, Ins[InsIdx].VT, Elt1);
2236          }
2237
2238          InVals.push_back(Elt0);
2239          InVals.push_back(Elt1);
2240          InsIdx += 2;
2241        } else {
2242          // V4 loads
2243          // We have at least 4 elements (<3 x Ty> expands to 4 elements) and the
2244          // vector will be expanded to a power of 2 elements, so we know we can
2246          // always round up to the next multiple of 4 when creating the vector
2247          // loads.
2248          // e.g.  4 elem => 1 ld.v4
2249          //       6 elem => 2 ld.v4
2250          //       8 elem => 2 ld.v4
2251          //      11 elem => 3 ld.v4
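          // Illustrative sketch (not exact output; "foo_param_0" is a
          // hypothetical parameter symbol): an <8 x float> parameter would
          // roughly be loaded as:
          //   ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [foo_param_0+0];
          //   ld.param.v4.f32 {%f5, %f6, %f7, %f8}, [foo_param_0+16];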
2252          unsigned VecSize = 4;
2253          if (EltVT.getSizeInBits() == 64) {
2254            VecSize = 2;
2255          }
2256          EVT VecVT = EVT::getVectorVT(F->getContext(), EltVT, VecSize);
2257          unsigned Ofst = 0;
2258          for (unsigned i = 0; i < NumElts; i += VecSize) {
2259            Value *SrcValue = Constant::getNullValue(
2260                PointerType::get(VecVT.getTypeForEVT(F->getContext()),
2261                                 llvm::ADDRESS_SPACE_PARAM));
2262            SDValue SrcAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Arg,
2263                                          DAG.getConstant(Ofst, dl, PtrVT));
2264            SDValue P = DAG.getLoad(
2265                VecVT, dl, Root, SrcAddr, MachinePointerInfo(SrcValue), false,
2266                false, true,
2267                DL.getABITypeAlignment(VecVT.getTypeForEVT(F->getContext())));
2268            if (P.getNode())
2269              P.getNode()->setIROrder(idx + 1);
2270
2271            for (unsigned j = 0; j < VecSize; ++j) {
2272              if (i + j >= NumElts)
2273                break;
2274              SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, P,
2275                                        DAG.getIntPtrConstant(j, dl));
2276              if (Ins[InsIdx].VT.getSizeInBits() > EltVT.getSizeInBits())
2277                Elt = DAG.getNode(ISD::ANY_EXTEND, dl, Ins[InsIdx].VT, Elt);
2278              InVals.push_back(Elt);
2279            }
2280            Ofst += DL.getTypeAllocSize(VecVT.getTypeForEVT(F->getContext()));
2281          }
2282          InsIdx += NumElts;
2283        }
2284
2285        if (NumElts > 0)
2286          --InsIdx;
2287        continue;
2288      }
2289      // A plain scalar.
2290      EVT ObjectVT = getValueType(DL, Ty);
2291      // If ABI, load from the param symbol
2292      SDValue Arg = getParamSymbol(DAG, idx, PtrVT);
2293      Value *srcValue = Constant::getNullValue(PointerType::get(
2294          ObjectVT.getTypeForEVT(F->getContext()), llvm::ADDRESS_SPACE_PARAM));
2295      SDValue p;
2296      if (ObjectVT.getSizeInBits() < Ins[InsIdx].VT.getSizeInBits()) {
2297        ISD::LoadExtType ExtOp = Ins[InsIdx].Flags.isSExt() ?
2298                                       ISD::SEXTLOAD : ISD::ZEXTLOAD;
2299        p = DAG.getExtLoad(
2300            ExtOp, dl, Ins[InsIdx].VT, Root, Arg, MachinePointerInfo(srcValue),
2301            ObjectVT, false, false, false,
2302            DL.getABITypeAlignment(ObjectVT.getTypeForEVT(F->getContext())));
2303      } else {
2304        p = DAG.getLoad(
2305            Ins[InsIdx].VT, dl, Root, Arg, MachinePointerInfo(srcValue), false,
2306            false, false,
2307            DL.getABITypeAlignment(ObjectVT.getTypeForEVT(F->getContext())));
2308      }
2309      if (p.getNode())
2310        p.getNode()->setIROrder(idx + 1);
2311      InVals.push_back(p);
2312      continue;
2313    }
2314
2315    // Param has ByVal attribute
2316    // Return MoveParam(param symbol).
2317    // Ideally, the param symbol can be returned directly,
2318    // but when the SDNode builder decides to use it in a CopyToReg(),
2319    // the machine instruction fails because TargetExternalSymbol
2320    // (not lowered) is target dependent, and CopyToReg assumes
2321    // the source is lowered.
2322    EVT ObjectVT = getValueType(DL, Ty);
2323    assert(ObjectVT == Ins[InsIdx].VT &&
2324           "Ins type did not match function type");
2325    SDValue Arg = getParamSymbol(DAG, idx, PtrVT);
2326    SDValue p = DAG.getNode(NVPTXISD::MoveParam, dl, ObjectVT, Arg);
2327    if (p.getNode())
2328      p.getNode()->setIROrder(idx + 1);
2329    if (isKernel)
2330      InVals.push_back(p);
2331    else {
2332      SDValue p2 = DAG.getNode(
2333          ISD::INTRINSIC_WO_CHAIN, dl, ObjectVT,
2334          DAG.getConstant(Intrinsic::nvvm_ptr_local_to_gen, dl, MVT::i32), p);
2335      InVals.push_back(p2);
2336    }
2337  }
2338
2339  // Clang will check explicit VarArg and issue an error if any. However,
2340  // Clang will let code with an implicit vararg like f() pass. See bug
2341  // 617733. We treat this case as if the arg list is empty.
2346
2347  if (!OutChains.empty())
2348    DAG.setRoot(DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains));
2349
2350  return Chain;
2351}
2352
2354SDValue
2355NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
2356                                 bool isVarArg,
2357                                 const SmallVectorImpl<ISD::OutputArg> &Outs,
2358                                 const SmallVectorImpl<SDValue> &OutVals,
2359                                 SDLoc dl, SelectionDAG &DAG) const {
2360  MachineFunction &MF = DAG.getMachineFunction();
2361  const Function *F = MF.getFunction();
2362  Type *RetTy = F->getReturnType();
2363  const DataLayout &TD = DAG.getDataLayout();
2364
2365  bool isABI = (STI.getSmVersion() >= 20);
2366  assert(isABI && "Non-ABI compilation is not supported");
2367  if (!isABI)
2368    return Chain;
2369
2370  if (VectorType *VTy = dyn_cast<VectorType>(RetTy)) {
2371    // If we have a vector type, the OutVals array will be the scalarized
2372    // components, and we have to combine them into one or more vector stores.
2373    unsigned NumElts = VTy->getNumElements();
2374    assert(NumElts == Outs.size() && "Bad scalarization of return value");
2375
2377    EVT EltVT = getValueType(TD, RetTy).getVectorElementType();
2378    bool NeedExtend = false;
2379    if (EltVT.getSizeInBits() < 16)
2380      NeedExtend = true;
2381
2382    // V1 store
2383    if (NumElts == 1) {
2384      SDValue StoreVal = OutVals[0];
2385      // We only have one element, so just directly store it
2386      if (NeedExtend)
2387        StoreVal = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, StoreVal);
2388      SDValue Ops[] = { Chain, DAG.getConstant(0, dl, MVT::i32), StoreVal };
2389      Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreRetval, dl,
2390                                      DAG.getVTList(MVT::Other), Ops,
2391                                      EltVT, MachinePointerInfo());
2392
2393    } else if (NumElts == 2) {
2394      // V2 store
2395      SDValue StoreVal0 = OutVals[0];
2396      SDValue StoreVal1 = OutVals[1];
2397
2398      if (NeedExtend) {
2399        StoreVal0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, StoreVal0);
2400        StoreVal1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, StoreVal1);
2401      }
2402
2403      SDValue Ops[] = { Chain, DAG.getConstant(0, dl, MVT::i32), StoreVal0,
2404                        StoreVal1 };
2405      Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreRetvalV2, dl,
2406                                      DAG.getVTList(MVT::Other), Ops,
2407                                      EltVT, MachinePointerInfo());
2408    } else {
2409      // V4 stores
2410      // We have at least 4 elements (<3 x Ty> expands to 4 elements) and the
2411      // vector will be expanded to a power of 2 elements, so we know we can
2412      // always round up to the next multiple of 4 when creating the vector
2413      // stores.
2414      // e.g.  4 elem => 1 st.v4
2415      //       6 elem => 2 st.v4
2416      //       8 elem => 2 st.v4
2417      //      11 elem => 3 st.v4
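      // Illustrative sketch (not exact output): an <8 x float> return value
      // would roughly be stored as:
      //   st.param.v4.f32 [func_retval0+0],  {%f1, %f2, %f3, %f4};
      //   st.param.v4.f32 [func_retval0+16], {%f5, %f6, %f7, %f8};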
2418
2419      unsigned VecSize = 4;
2420      if (OutVals[0].getValueType().getSizeInBits() == 64)
2421        VecSize = 2;
2422
2423      unsigned Offset = 0;
2424
2425      EVT VecVT =
2426          EVT::getVectorVT(F->getContext(), EltVT, VecSize);
2427      unsigned PerStoreOffset =
2428          TD.getTypeAllocSize(VecVT.getTypeForEVT(F->getContext()));
2429
2430      for (unsigned i = 0; i < NumElts; i += VecSize) {
2431        // Get values
2432        SDValue StoreVal;
2433        SmallVector<SDValue, 8> Ops;
2434        Ops.push_back(Chain);
2435        Ops.push_back(DAG.getConstant(Offset, dl, MVT::i32));
2436        unsigned Opc = NVPTXISD::StoreRetvalV2;
2437        EVT ExtendedVT = (NeedExtend) ? MVT::i16 : OutVals[0].getValueType();
2438
2439        StoreVal = OutVals[i];
2440        if (NeedExtend)
2441          StoreVal = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtendedVT, StoreVal);
2442        Ops.push_back(StoreVal);
2443
2444        if (i + 1 < NumElts) {
2445          StoreVal = OutVals[i + 1];
2446          if (NeedExtend)
2447            StoreVal = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtendedVT, StoreVal);
2448        } else {
2449          StoreVal = DAG.getUNDEF(ExtendedVT);
2450        }
2451        Ops.push_back(StoreVal);
2452
2453        if (VecSize == 4) {
2454          Opc = NVPTXISD::StoreRetvalV4;
2455          if (i + 2 < NumElts) {
2456            StoreVal = OutVals[i + 2];
2457            if (NeedExtend)
2458              StoreVal =
2459                  DAG.getNode(ISD::ZERO_EXTEND, dl, ExtendedVT, StoreVal);
2460          } else {
2461            StoreVal = DAG.getUNDEF(ExtendedVT);
2462          }
2463          Ops.push_back(StoreVal);
2464
2465          if (i + 3 < NumElts) {
2466            StoreVal = OutVals[i + 3];
2467            if (NeedExtend)
2468              StoreVal =
2469                  DAG.getNode(ISD::ZERO_EXTEND, dl, ExtendedVT, StoreVal);
2470          } else {
2471            StoreVal = DAG.getUNDEF(ExtendedVT);
2472          }
2473          Ops.push_back(StoreVal);
2474        }
2475
2477        Chain =
2478            DAG.getMemIntrinsicNode(Opc, dl, DAG.getVTList(MVT::Other), Ops,
2479                                    EltVT, MachinePointerInfo());
2480        Offset += PerStoreOffset;
2481      }
2482    }
2483  } else {
2484    SmallVector<EVT, 16> ValVTs;
2485    SmallVector<uint64_t, 16> Offsets;
2486    ComputePTXValueVTs(*this, DAG.getDataLayout(), RetTy, ValVTs, &Offsets, 0);
2487    assert(ValVTs.size() == OutVals.size() && "Bad return value decomposition");
2488
2489    for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
2490      SDValue theVal = OutVals[i];
2491      EVT TheValType = theVal.getValueType();
2492      unsigned numElems = 1;
2493      if (TheValType.isVector())
2494        numElems = TheValType.getVectorNumElements();
2495      for (unsigned j = 0, je = numElems; j != je; ++j) {
2496        SDValue TmpVal = theVal;
2497        if (TheValType.isVector())
2498          TmpVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
2499                               TheValType.getVectorElementType(), TmpVal,
2500                               DAG.getIntPtrConstant(j, dl));
2501        EVT TheStoreType = ValVTs[i];
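        // Per the NVPTX ABI, integer return values narrower than 32 bits are
        // promoted to i32. Anything else narrower than 16 bits is widened to
        // i16, the narrowest integer register class the backend provides.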
2502        if (RetTy->isIntegerTy() && TD.getTypeAllocSizeInBits(RetTy) < 32) {
2503          // The following zero-extension is for integer types only, and
2504          // specifically not for aggregates.
2505          TmpVal = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, TmpVal);
2506          TheStoreType = MVT::i32;
2507        } else if (TmpVal.getValueType().getSizeInBits() < 16)
2509          TmpVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, TmpVal);
2510
2511        SDValue Ops[] = {
2512          Chain,
2513          DAG.getConstant(Offsets[i], dl, MVT::i32),
2514          TmpVal };
2515        Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreRetval, dl,
2516                                        DAG.getVTList(MVT::Other), Ops,
2517                                        TheStoreType,
2518                                        MachinePointerInfo());
2519      }
2520    }
2521  }
2522
2523  return DAG.getNode(NVPTXISD::RET_FLAG, dl, MVT::Other, Chain);
2524}
2525
2527void NVPTXTargetLowering::LowerAsmOperandForConstraint(
2528    SDValue Op, std::string &Constraint, std::vector<SDValue> &Ops,
2529    SelectionDAG &DAG) const {
2530  if (Constraint.length() > 1)
2531    return;
2533    TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
2534}
2535
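// Map an NVVM texture/tld4 intrinsic to the matching NVPTXISD opcode, or
// return 0 if the intrinsic is not a texture access. The opcode names encode
// the sampled element type (Float/S32/U32) followed by the coordinate type.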
2536static unsigned getOpcForTextureInstr(unsigned Intrinsic) {
2537  switch (Intrinsic) {
2538  default:
2539    return 0;
2540
2541  case Intrinsic::nvvm_tex_1d_v4f32_s32:
2542    return NVPTXISD::Tex1DFloatS32;
2543  case Intrinsic::nvvm_tex_1d_v4f32_f32:
2544    return NVPTXISD::Tex1DFloatFloat;
2545  case Intrinsic::nvvm_tex_1d_level_v4f32_f32:
2546    return NVPTXISD::Tex1DFloatFloatLevel;
2547  case Intrinsic::nvvm_tex_1d_grad_v4f32_f32:
2548    return NVPTXISD::Tex1DFloatFloatGrad;
2549  case Intrinsic::nvvm_tex_1d_v4s32_s32:
2550    return NVPTXISD::Tex1DS32S32;
2551  case Intrinsic::nvvm_tex_1d_v4s32_f32:
2552    return NVPTXISD::Tex1DS32Float;
2553  case Intrinsic::nvvm_tex_1d_level_v4s32_f32:
2554    return NVPTXISD::Tex1DS32FloatLevel;
2555  case Intrinsic::nvvm_tex_1d_grad_v4s32_f32:
2556    return NVPTXISD::Tex1DS32FloatGrad;
2557  case Intrinsic::nvvm_tex_1d_v4u32_s32:
2558    return NVPTXISD::Tex1DU32S32;
2559  case Intrinsic::nvvm_tex_1d_v4u32_f32:
2560    return NVPTXISD::Tex1DU32Float;
2561  case Intrinsic::nvvm_tex_1d_level_v4u32_f32:
2562    return NVPTXISD::Tex1DU32FloatLevel;
2563  case Intrinsic::nvvm_tex_1d_grad_v4u32_f32:
2564    return NVPTXISD::Tex1DU32FloatGrad;
2565
2566  case Intrinsic::nvvm_tex_1d_array_v4f32_s32:
2567    return NVPTXISD::Tex1DArrayFloatS32;
2568  case Intrinsic::nvvm_tex_1d_array_v4f32_f32:
2569    return NVPTXISD::Tex1DArrayFloatFloat;
2570  case Intrinsic::nvvm_tex_1d_array_level_v4f32_f32:
2571    return NVPTXISD::Tex1DArrayFloatFloatLevel;
2572  case Intrinsic::nvvm_tex_1d_array_grad_v4f32_f32:
2573    return NVPTXISD::Tex1DArrayFloatFloatGrad;
2574  case Intrinsic::nvvm_tex_1d_array_v4s32_s32:
2575    return NVPTXISD::Tex1DArrayS32S32;
2576  case Intrinsic::nvvm_tex_1d_array_v4s32_f32:
2577    return NVPTXISD::Tex1DArrayS32Float;
2578  case Intrinsic::nvvm_tex_1d_array_level_v4s32_f32:
2579    return NVPTXISD::Tex1DArrayS32FloatLevel;
2580  case Intrinsic::nvvm_tex_1d_array_grad_v4s32_f32:
2581    return NVPTXISD::Tex1DArrayS32FloatGrad;
2582  case Intrinsic::nvvm_tex_1d_array_v4u32_s32:
2583    return NVPTXISD::Tex1DArrayU32S32;
2584  case Intrinsic::nvvm_tex_1d_array_v4u32_f32:
2585    return NVPTXISD::Tex1DArrayU32Float;
2586  case Intrinsic::nvvm_tex_1d_array_level_v4u32_f32:
2587    return NVPTXISD::Tex1DArrayU32FloatLevel;
2588  case Intrinsic::nvvm_tex_1d_array_grad_v4u32_f32:
2589    return NVPTXISD::Tex1DArrayU32FloatGrad;
2590
2591  case Intrinsic::nvvm_tex_2d_v4f32_s32:
2592    return NVPTXISD::Tex2DFloatS32;
2593  case Intrinsic::nvvm_tex_2d_v4f32_f32:
2594    return NVPTXISD::Tex2DFloatFloat;
2595  case Intrinsic::nvvm_tex_2d_level_v4f32_f32:
2596    return NVPTXISD::Tex2DFloatFloatLevel;
2597  case Intrinsic::nvvm_tex_2d_grad_v4f32_f32:
2598    return NVPTXISD::Tex2DFloatFloatGrad;
2599  case Intrinsic::nvvm_tex_2d_v4s32_s32:
2600    return NVPTXISD::Tex2DS32S32;
2601  case Intrinsic::nvvm_tex_2d_v4s32_f32:
2602    return NVPTXISD::Tex2DS32Float;
2603  case Intrinsic::nvvm_tex_2d_level_v4s32_f32:
2604    return NVPTXISD::Tex2DS32FloatLevel;
2605  case Intrinsic::nvvm_tex_2d_grad_v4s32_f32:
2606    return NVPTXISD::Tex2DS32FloatGrad;
2607  case Intrinsic::nvvm_tex_2d_v4u32_s32:
2608    return NVPTXISD::Tex2DU32S32;
2609  case Intrinsic::nvvm_tex_2d_v4u32_f32:
2610    return NVPTXISD::Tex2DU32Float;
2611  case Intrinsic::nvvm_tex_2d_level_v4u32_f32:
2612    return NVPTXISD::Tex2DU32FloatLevel;
2613  case Intrinsic::nvvm_tex_2d_grad_v4u32_f32:
2614    return NVPTXISD::Tex2DU32FloatGrad;
2615
2616  case Intrinsic::nvvm_tex_2d_array_v4f32_s32:
2617    return NVPTXISD::Tex2DArrayFloatS32;
2618  case Intrinsic::nvvm_tex_2d_array_v4f32_f32:
2619    return NVPTXISD::Tex2DArrayFloatFloat;
2620  case Intrinsic::nvvm_tex_2d_array_level_v4f32_f32:
2621    return NVPTXISD::Tex2DArrayFloatFloatLevel;
2622  case Intrinsic::nvvm_tex_2d_array_grad_v4f32_f32:
2623    return NVPTXISD::Tex2DArrayFloatFloatGrad;
2624  case Intrinsic::nvvm_tex_2d_array_v4s32_s32:
2625    return NVPTXISD::Tex2DArrayS32S32;
2626  case Intrinsic::nvvm_tex_2d_array_v4s32_f32:
2627    return NVPTXISD::Tex2DArrayS32Float;
2628  case Intrinsic::nvvm_tex_2d_array_level_v4s32_f32:
2629    return NVPTXISD::Tex2DArrayS32FloatLevel;
2630  case Intrinsic::nvvm_tex_2d_array_grad_v4s32_f32:
2631    return NVPTXISD::Tex2DArrayS32FloatGrad;
2632  case Intrinsic::nvvm_tex_2d_array_v4u32_s32:
2633    return NVPTXISD::Tex2DArrayU32S32;
2634  case Intrinsic::nvvm_tex_2d_array_v4u32_f32:
2635    return NVPTXISD::Tex2DArrayU32Float;
2636  case Intrinsic::nvvm_tex_2d_array_level_v4u32_f32:
2637    return NVPTXISD::Tex2DArrayU32FloatLevel;
2638  case Intrinsic::nvvm_tex_2d_array_grad_v4u32_f32:
2639    return NVPTXISD::Tex2DArrayU32FloatGrad;
2640
2641  case Intrinsic::nvvm_tex_3d_v4f32_s32:
2642    return NVPTXISD::Tex3DFloatS32;
2643  case Intrinsic::nvvm_tex_3d_v4f32_f32:
2644    return NVPTXISD::Tex3DFloatFloat;
2645  case Intrinsic::nvvm_tex_3d_level_v4f32_f32:
2646    return NVPTXISD::Tex3DFloatFloatLevel;
2647  case Intrinsic::nvvm_tex_3d_grad_v4f32_f32:
2648    return NVPTXISD::Tex3DFloatFloatGrad;
2649  case Intrinsic::nvvm_tex_3d_v4s32_s32:
2650    return NVPTXISD::Tex3DS32S32;
2651  case Intrinsic::nvvm_tex_3d_v4s32_f32:
2652    return NVPTXISD::Tex3DS32Float;
2653  case Intrinsic::nvvm_tex_3d_level_v4s32_f32:
2654    return NVPTXISD::Tex3DS32FloatLevel;
2655  case Intrinsic::nvvm_tex_3d_grad_v4s32_f32:
2656    return NVPTXISD::Tex3DS32FloatGrad;
2657  case Intrinsic::nvvm_tex_3d_v4u32_s32:
2658    return NVPTXISD::Tex3DU32S32;
2659  case Intrinsic::nvvm_tex_3d_v4u32_f32:
2660    return NVPTXISD::Tex3DU32Float;
2661  case Intrinsic::nvvm_tex_3d_level_v4u32_f32:
2662    return NVPTXISD::Tex3DU32FloatLevel;
2663  case Intrinsic::nvvm_tex_3d_grad_v4u32_f32:
2664    return NVPTXISD::Tex3DU32FloatGrad;
2665
2666  case Intrinsic::nvvm_tex_cube_v4f32_f32:
2667    return NVPTXISD::TexCubeFloatFloat;
2668  case Intrinsic::nvvm_tex_cube_level_v4f32_f32:
2669    return NVPTXISD::TexCubeFloatFloatLevel;
2670  case Intrinsic::nvvm_tex_cube_v4s32_f32:
2671    return NVPTXISD::TexCubeS32Float;
2672  case Intrinsic::nvvm_tex_cube_level_v4s32_f32:
2673    return NVPTXISD::TexCubeS32FloatLevel;
2674  case Intrinsic::nvvm_tex_cube_v4u32_f32:
2675    return NVPTXISD::TexCubeU32Float;
2676  case Intrinsic::nvvm_tex_cube_level_v4u32_f32:
2677    return NVPTXISD::TexCubeU32FloatLevel;
2678
2679  case Intrinsic::nvvm_tex_cube_array_v4f32_f32:
2680    return NVPTXISD::TexCubeArrayFloatFloat;
2681  case Intrinsic::nvvm_tex_cube_array_level_v4f32_f32:
2682    return NVPTXISD::TexCubeArrayFloatFloatLevel;
2683  case Intrinsic::nvvm_tex_cube_array_v4s32_f32:
2684    return NVPTXISD::TexCubeArrayS32Float;
2685  case Intrinsic::nvvm_tex_cube_array_level_v4s32_f32:
2686    return NVPTXISD::TexCubeArrayS32FloatLevel;
2687  case Intrinsic::nvvm_tex_cube_array_v4u32_f32:
2688    return NVPTXISD::TexCubeArrayU32Float;
2689  case Intrinsic::nvvm_tex_cube_array_level_v4u32_f32:
2690    return NVPTXISD::TexCubeArrayU32FloatLevel;
2691
2692  case Intrinsic::nvvm_tld4_r_2d_v4f32_f32:
2693    return NVPTXISD::Tld4R2DFloatFloat;
2694  case Intrinsic::nvvm_tld4_g_2d_v4f32_f32:
2695    return NVPTXISD::Tld4G2DFloatFloat;
2696  case Intrinsic::nvvm_tld4_b_2d_v4f32_f32:
2697    return NVPTXISD::Tld4B2DFloatFloat;
2698  case Intrinsic::nvvm_tld4_a_2d_v4f32_f32:
2699    return NVPTXISD::Tld4A2DFloatFloat;
2700  case Intrinsic::nvvm_tld4_r_2d_v4s32_f32:
2701    return NVPTXISD::Tld4R2DS64Float;
2702  case Intrinsic::nvvm_tld4_g_2d_v4s32_f32:
2703    return NVPTXISD::Tld4G2DS64Float;
2704  case Intrinsic::nvvm_tld4_b_2d_v4s32_f32:
2705    return NVPTXISD::Tld4B2DS64Float;
2706  case Intrinsic::nvvm_tld4_a_2d_v4s32_f32:
2707    return NVPTXISD::Tld4A2DS64Float;
2708  case Intrinsic::nvvm_tld4_r_2d_v4u32_f32:
2709    return NVPTXISD::Tld4R2DU64Float;
2710  case Intrinsic::nvvm_tld4_g_2d_v4u32_f32:
2711    return NVPTXISD::Tld4G2DU64Float;
2712  case Intrinsic::nvvm_tld4_b_2d_v4u32_f32:
2713    return NVPTXISD::Tld4B2DU64Float;
2714  case Intrinsic::nvvm_tld4_a_2d_v4u32_f32:
2715    return NVPTXISD::Tld4A2DU64Float;
2716
2717  case Intrinsic::nvvm_tex_unified_1d_v4f32_s32:
2718    return NVPTXISD::TexUnified1DFloatS32;
2719  case Intrinsic::nvvm_tex_unified_1d_v4f32_f32:
2720    return NVPTXISD::TexUnified1DFloatFloat;
2721  case Intrinsic::nvvm_tex_unified_1d_level_v4f32_f32:
2722    return NVPTXISD::TexUnified1DFloatFloatLevel;
2723  case Intrinsic::nvvm_tex_unified_1d_grad_v4f32_f32:
2724    return NVPTXISD::TexUnified1DFloatFloatGrad;
2725  case Intrinsic::nvvm_tex_unified_1d_v4s32_s32:
2726    return NVPTXISD::TexUnified1DS32S32;
2727  case Intrinsic::nvvm_tex_unified_1d_v4s32_f32:
2728    return NVPTXISD::TexUnified1DS32Float;
2729  case Intrinsic::nvvm_tex_unified_1d_level_v4s32_f32:
2730    return NVPTXISD::TexUnified1DS32FloatLevel;
2731  case Intrinsic::nvvm_tex_unified_1d_grad_v4s32_f32:
2732    return NVPTXISD::TexUnified1DS32FloatGrad;
2733  case Intrinsic::nvvm_tex_unified_1d_v4u32_s32:
2734    return NVPTXISD::TexUnified1DU32S32;
2735  case Intrinsic::nvvm_tex_unified_1d_v4u32_f32:
2736    return NVPTXISD::TexUnified1DU32Float;
2737  case Intrinsic::nvvm_tex_unified_1d_level_v4u32_f32:
2738    return NVPTXISD::TexUnified1DU32FloatLevel;
2739  case Intrinsic::nvvm_tex_unified_1d_grad_v4u32_f32:
2740    return NVPTXISD::TexUnified1DU32FloatGrad;
2741
2742  case Intrinsic::nvvm_tex_unified_1d_array_v4f32_s32:
2743    return NVPTXISD::TexUnified1DArrayFloatS32;
2744  case Intrinsic::nvvm_tex_unified_1d_array_v4f32_f32:
2745    return NVPTXISD::TexUnified1DArrayFloatFloat;
2746  case Intrinsic::nvvm_tex_unified_1d_array_level_v4f32_f32:
2747    return NVPTXISD::TexUnified1DArrayFloatFloatLevel;
2748  case Intrinsic::nvvm_tex_unified_1d_array_grad_v4f32_f32:
2749    return NVPTXISD::TexUnified1DArrayFloatFloatGrad;
2750  case Intrinsic::nvvm_tex_unified_1d_array_v4s32_s32:
2751    return NVPTXISD::TexUnified1DArrayS32S32;
2752  case Intrinsic::nvvm_tex_unified_1d_array_v4s32_f32:
2753    return NVPTXISD::TexUnified1DArrayS32Float;
2754  case Intrinsic::nvvm_tex_unified_1d_array_level_v4s32_f32:
2755    return NVPTXISD::TexUnified1DArrayS32FloatLevel;
2756  case Intrinsic::nvvm_tex_unified_1d_array_grad_v4s32_f32:
2757    return NVPTXISD::TexUnified1DArrayS32FloatGrad;
2758  case Intrinsic::nvvm_tex_unified_1d_array_v4u32_s32:
2759    return NVPTXISD::TexUnified1DArrayU32S32;
2760  case Intrinsic::nvvm_tex_unified_1d_array_v4u32_f32:
2761    return NVPTXISD::TexUnified1DArrayU32Float;
2762  case Intrinsic::nvvm_tex_unified_1d_array_level_v4u32_f32:
2763    return NVPTXISD::TexUnified1DArrayU32FloatLevel;
2764  case Intrinsic::nvvm_tex_unified_1d_array_grad_v4u32_f32:
2765    return NVPTXISD::TexUnified1DArrayU32FloatGrad;
2766
2767  case Intrinsic::nvvm_tex_unified_2d_v4f32_s32:
2768    return NVPTXISD::TexUnified2DFloatS32;
2769  case Intrinsic::nvvm_tex_unified_2d_v4f32_f32:
2770    return NVPTXISD::TexUnified2DFloatFloat;
2771  case Intrinsic::nvvm_tex_unified_2d_level_v4f32_f32:
2772    return NVPTXISD::TexUnified2DFloatFloatLevel;
2773  case Intrinsic::nvvm_tex_unified_2d_grad_v4f32_f32:
2774    return NVPTXISD::TexUnified2DFloatFloatGrad;
2775  case Intrinsic::nvvm_tex_unified_2d_v4s32_s32:
2776    return NVPTXISD::TexUnified2DS32S32;
2777  case Intrinsic::nvvm_tex_unified_2d_v4s32_f32:
2778    return NVPTXISD::TexUnified2DS32Float;
2779  case Intrinsic::nvvm_tex_unified_2d_level_v4s32_f32:
2780    return NVPTXISD::TexUnified2DS32FloatLevel;
2781  case Intrinsic::nvvm_tex_unified_2d_grad_v4s32_f32:
2782    return NVPTXISD::TexUnified2DS32FloatGrad;
2783  case Intrinsic::nvvm_tex_unified_2d_v4u32_s32:
2784    return NVPTXISD::TexUnified2DU32S32;
2785  case Intrinsic::nvvm_tex_unified_2d_v4u32_f32:
2786    return NVPTXISD::TexUnified2DU32Float;
2787  case Intrinsic::nvvm_tex_unified_2d_level_v4u32_f32:
2788    return NVPTXISD::TexUnified2DU32FloatLevel;
2789  case Intrinsic::nvvm_tex_unified_2d_grad_v4u32_f32:
2790    return NVPTXISD::TexUnified2DU32FloatGrad;
2791
2792  case Intrinsic::nvvm_tex_unified_2d_array_v4f32_s32:
2793    return NVPTXISD::TexUnified2DArrayFloatS32;
2794  case Intrinsic::nvvm_tex_unified_2d_array_v4f32_f32:
2795    return NVPTXISD::TexUnified2DArrayFloatFloat;
2796  case Intrinsic::nvvm_tex_unified_2d_array_level_v4f32_f32:
2797    return NVPTXISD::TexUnified2DArrayFloatFloatLevel;
2798  case Intrinsic::nvvm_tex_unified_2d_array_grad_v4f32_f32:
2799    return NVPTXISD::TexUnified2DArrayFloatFloatGrad;
2800  case Intrinsic::nvvm_tex_unified_2d_array_v4s32_s32:
2801    return NVPTXISD::TexUnified2DArrayS32S32;
2802  case Intrinsic::nvvm_tex_unified_2d_array_v4s32_f32:
2803    return NVPTXISD::TexUnified2DArrayS32Float;
2804  case Intrinsic::nvvm_tex_unified_2d_array_level_v4s32_f32:
2805    return NVPTXISD::TexUnified2DArrayS32FloatLevel;
2806  case Intrinsic::nvvm_tex_unified_2d_array_grad_v4s32_f32:
2807    return NVPTXISD::TexUnified2DArrayS32FloatGrad;
2808  case Intrinsic::nvvm_tex_unified_2d_array_v4u32_s32:
2809    return NVPTXISD::TexUnified2DArrayU32S32;
2810  case Intrinsic::nvvm_tex_unified_2d_array_v4u32_f32:
2811    return NVPTXISD::TexUnified2DArrayU32Float;
2812  case Intrinsic::nvvm_tex_unified_2d_array_level_v4u32_f32:
2813    return NVPTXISD::TexUnified2DArrayU32FloatLevel;
2814  case Intrinsic::nvvm_tex_unified_2d_array_grad_v4u32_f32:
2815    return NVPTXISD::TexUnified2DArrayU32FloatGrad;
2816
2817  case Intrinsic::nvvm_tex_unified_3d_v4f32_s32:
2818    return NVPTXISD::TexUnified3DFloatS32;
2819  case Intrinsic::nvvm_tex_unified_3d_v4f32_f32:
2820    return NVPTXISD::TexUnified3DFloatFloat;
2821  case Intrinsic::nvvm_tex_unified_3d_level_v4f32_f32:
2822    return NVPTXISD::TexUnified3DFloatFloatLevel;
2823  case Intrinsic::nvvm_tex_unified_3d_grad_v4f32_f32:
2824    return NVPTXISD::TexUnified3DFloatFloatGrad;
2825  case Intrinsic::nvvm_tex_unified_3d_v4s32_s32:
2826    return NVPTXISD::TexUnified3DS32S32;
2827  case Intrinsic::nvvm_tex_unified_3d_v4s32_f32:
2828    return NVPTXISD::TexUnified3DS32Float;
2829  case Intrinsic::nvvm_tex_unified_3d_level_v4s32_f32:
2830    return NVPTXISD::TexUnified3DS32FloatLevel;
2831  case Intrinsic::nvvm_tex_unified_3d_grad_v4s32_f32:
2832    return NVPTXISD::TexUnified3DS32FloatGrad;
2833  case Intrinsic::nvvm_tex_unified_3d_v4u32_s32:
2834    return NVPTXISD::TexUnified3DU32S32;
2835  case Intrinsic::nvvm_tex_unified_3d_v4u32_f32:
2836    return NVPTXISD::TexUnified3DU32Float;
2837  case Intrinsic::nvvm_tex_unified_3d_level_v4u32_f32:
2838    return NVPTXISD::TexUnified3DU32FloatLevel;
2839  case Intrinsic::nvvm_tex_unified_3d_grad_v4u32_f32:
2840    return NVPTXISD::TexUnified3DU32FloatGrad;
2841
2842  case Intrinsic::nvvm_tex_unified_cube_v4f32_f32:
2843    return NVPTXISD::TexUnifiedCubeFloatFloat;
2844  case Intrinsic::nvvm_tex_unified_cube_level_v4f32_f32:
2845    return NVPTXISD::TexUnifiedCubeFloatFloatLevel;
2846  case Intrinsic::nvvm_tex_unified_cube_v4s32_f32:
2847    return NVPTXISD::TexUnifiedCubeS32Float;
2848  case Intrinsic::nvvm_tex_unified_cube_level_v4s32_f32:
2849    return NVPTXISD::TexUnifiedCubeS32FloatLevel;
2850  case Intrinsic::nvvm_tex_unified_cube_v4u32_f32:
2851    return NVPTXISD::TexUnifiedCubeU32Float;
2852  case Intrinsic::nvvm_tex_unified_cube_level_v4u32_f32:
2853    return NVPTXISD::TexUnifiedCubeU32FloatLevel;
2854
2855  case Intrinsic::nvvm_tex_unified_cube_array_v4f32_f32:
2856    return NVPTXISD::TexUnifiedCubeArrayFloatFloat;
2857  case Intrinsic::nvvm_tex_unified_cube_array_level_v4f32_f32:
2858    return NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel;
2859  case Intrinsic::nvvm_tex_unified_cube_array_v4s32_f32:
2860    return NVPTXISD::TexUnifiedCubeArrayS32Float;
2861  case Intrinsic::nvvm_tex_unified_cube_array_level_v4s32_f32:
2862    return NVPTXISD::TexUnifiedCubeArrayS32FloatLevel;
2863  case Intrinsic::nvvm_tex_unified_cube_array_v4u32_f32:
2864    return NVPTXISD::TexUnifiedCubeArrayU32Float;
2865  case Intrinsic::nvvm_tex_unified_cube_array_level_v4u32_f32:
2866    return NVPTXISD::TexUnifiedCubeArrayU32FloatLevel;
2867
2868  case Intrinsic::nvvm_tld4_unified_r_2d_v4f32_f32:
2869    return NVPTXISD::Tld4UnifiedR2DFloatFloat;
2870  case Intrinsic::nvvm_tld4_unified_g_2d_v4f32_f32:
2871    return NVPTXISD::Tld4UnifiedG2DFloatFloat;
2872  case Intrinsic::nvvm_tld4_unified_b_2d_v4f32_f32:
2873    return NVPTXISD::Tld4UnifiedB2DFloatFloat;
2874  case Intrinsic::nvvm_tld4_unified_a_2d_v4f32_f32:
2875    return NVPTXISD::Tld4UnifiedA2DFloatFloat;
2876  case Intrinsic::nvvm_tld4_unified_r_2d_v4s32_f32:
2877    return NVPTXISD::Tld4UnifiedR2DS64Float;
2878  case Intrinsic::nvvm_tld4_unified_g_2d_v4s32_f32:
2879    return NVPTXISD::Tld4UnifiedG2DS64Float;
2880  case Intrinsic::nvvm_tld4_unified_b_2d_v4s32_f32:
2881    return NVPTXISD::Tld4UnifiedB2DS64Float;
2882  case Intrinsic::nvvm_tld4_unified_a_2d_v4s32_f32:
2883    return NVPTXISD::Tld4UnifiedA2DS64Float;
2884  case Intrinsic::nvvm_tld4_unified_r_2d_v4u32_f32:
2885    return NVPTXISD::Tld4UnifiedR2DU64Float;
2886  case Intrinsic::nvvm_tld4_unified_g_2d_v4u32_f32:
2887    return NVPTXISD::Tld4UnifiedG2DU64Float;
2888  case Intrinsic::nvvm_tld4_unified_b_2d_v4u32_f32:
2889    return NVPTXISD::Tld4UnifiedB2DU64Float;
2890  case Intrinsic::nvvm_tld4_unified_a_2d_v4u32_f32:
2891    return NVPTXISD::Tld4UnifiedA2DU64Float;
2892  }
2893}
2894
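// Map an NVVM surface-load (suld) intrinsic to the matching NVPTXISD opcode,
// or return 0 if the intrinsic is not a surface access. Variants differ in
// geometry (1D/2D/3D, optionally array), element width, vector arity, and
// out-of-range handling (clamp/trap/zero).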
2895static unsigned getOpcForSurfaceInstr(unsigned Intrinsic) {
2896  switch (Intrinsic) {
2897  default:
2898    return 0;
2899  case Intrinsic::nvvm_suld_1d_i8_clamp:
2900    return NVPTXISD::Suld1DI8Clamp;
2901  case Intrinsic::nvvm_suld_1d_i16_clamp:
2902    return NVPTXISD::Suld1DI16Clamp;
2903  case Intrinsic::nvvm_suld_1d_i32_clamp:
2904    return NVPTXISD::Suld1DI32Clamp;
2905  case Intrinsic::nvvm_suld_1d_i64_clamp:
2906    return NVPTXISD::Suld1DI64Clamp;
2907  case Intrinsic::nvvm_suld_1d_v2i8_clamp:
2908    return NVPTXISD::Suld1DV2I8Clamp;
2909  case Intrinsic::nvvm_suld_1d_v2i16_clamp:
2910    return NVPTXISD::Suld1DV2I16Clamp;
2911  case Intrinsic::nvvm_suld_1d_v2i32_clamp:
2912    return NVPTXISD::Suld1DV2I32Clamp;
2913  case Intrinsic::nvvm_suld_1d_v2i64_clamp:
2914    return NVPTXISD::Suld1DV2I64Clamp;
2915  case Intrinsic::nvvm_suld_1d_v4i8_clamp:
2916    return NVPTXISD::Suld1DV4I8Clamp;
2917  case Intrinsic::nvvm_suld_1d_v4i16_clamp:
2918    return NVPTXISD::Suld1DV4I16Clamp;
2919  case Intrinsic::nvvm_suld_1d_v4i32_clamp:
2920    return NVPTXISD::Suld1DV4I32Clamp;
2921  case Intrinsic::nvvm_suld_1d_array_i8_clamp:
2922    return NVPTXISD::Suld1DArrayI8Clamp;
2923  case Intrinsic::nvvm_suld_1d_array_i16_clamp:
2924    return NVPTXISD::Suld1DArrayI16Clamp;
2925  case Intrinsic::nvvm_suld_1d_array_i32_clamp:
2926    return NVPTXISD::Suld1DArrayI32Clamp;
2927  case Intrinsic::nvvm_suld_1d_array_i64_clamp:
2928    return NVPTXISD::Suld1DArrayI64Clamp;
2929  case Intrinsic::nvvm_suld_1d_array_v2i8_clamp:
2930    return NVPTXISD::Suld1DArrayV2I8Clamp;
2931  case Intrinsic::nvvm_suld_1d_array_v2i16_clamp:
2932    return NVPTXISD::Suld1DArrayV2I16Clamp;
2933  case Intrinsic::nvvm_suld_1d_array_v2i32_clamp:
2934    return NVPTXISD::Suld1DArrayV2I32Clamp;
2935  case Intrinsic::nvvm_suld_1d_array_v2i64_clamp:
2936    return NVPTXISD::Suld1DArrayV2I64Clamp;
2937  case Intrinsic::nvvm_suld_1d_array_v4i8_clamp:
2938    return NVPTXISD::Suld1DArrayV4I8Clamp;
2939  case Intrinsic::nvvm_suld_1d_array_v4i16_clamp:
2940    return NVPTXISD::Suld1DArrayV4I16Clamp;
2941  case Intrinsic::nvvm_suld_1d_array_v4i32_clamp:
2942    return NVPTXISD::Suld1DArrayV4I32Clamp;
2943  case Intrinsic::nvvm_suld_2d_i8_clamp:
2944    return NVPTXISD::Suld2DI8Clamp;
2945  case Intrinsic::nvvm_suld_2d_i16_clamp:
2946    return NVPTXISD::Suld2DI16Clamp;
2947  case Intrinsic::nvvm_suld_2d_i32_clamp:
2948    return NVPTXISD::Suld2DI32Clamp;
2949  case Intrinsic::nvvm_suld_2d_i64_clamp:
2950    return NVPTXISD::Suld2DI64Clamp;
2951  case Intrinsic::nvvm_suld_2d_v2i8_clamp:
2952    return NVPTXISD::Suld2DV2I8Clamp;
2953  case Intrinsic::nvvm_suld_2d_v2i16_clamp:
2954    return NVPTXISD::Suld2DV2I16Clamp;
2955  case Intrinsic::nvvm_suld_2d_v2i32_clamp:
2956    return NVPTXISD::Suld2DV2I32Clamp;
2957  case Intrinsic::nvvm_suld_2d_v2i64_clamp:
2958    return NVPTXISD::Suld2DV2I64Clamp;
2959  case Intrinsic::nvvm_suld_2d_v4i8_clamp:
2960    return NVPTXISD::Suld2DV4I8Clamp;
2961  case Intrinsic::nvvm_suld_2d_v4i16_clamp:
2962    return NVPTXISD::Suld2DV4I16Clamp;
2963  case Intrinsic::nvvm_suld_2d_v4i32_clamp:
2964    return NVPTXISD::Suld2DV4I32Clamp;
2965  case Intrinsic::nvvm_suld_2d_array_i8_clamp:
2966    return NVPTXISD::Suld2DArrayI8Clamp;
2967  case Intrinsic::nvvm_suld_2d_array_i16_clamp:
2968    return NVPTXISD::Suld2DArrayI16Clamp;
2969  case Intrinsic::nvvm_suld_2d_array_i32_clamp:
2970    return NVPTXISD::Suld2DArrayI32Clamp;
2971  case Intrinsic::nvvm_suld_2d_array_i64_clamp:
2972    return NVPTXISD::Suld2DArrayI64Clamp;
2973  case Intrinsic::nvvm_suld_2d_array_v2i8_clamp:
2974    return NVPTXISD::Suld2DArrayV2I8Clamp;
2975  case Intrinsic::nvvm_suld_2d_array_v2i16_clamp:
2976    return NVPTXISD::Suld2DArrayV2I16Clamp;
2977  case Intrinsic::nvvm_suld_2d_array_v2i32_clamp:
2978    return NVPTXISD::Suld2DArrayV2I32Clamp;
2979  case Intrinsic::nvvm_suld_2d_array_v2i64_clamp:
2980    return NVPTXISD::Suld2DArrayV2I64Clamp;
2981  case Intrinsic::nvvm_suld_2d_array_v4i8_clamp:
2982    return NVPTXISD::Suld2DArrayV4I8Clamp;
2983  case Intrinsic::nvvm_suld_2d_array_v4i16_clamp:
2984    return NVPTXISD::Suld2DArrayV4I16Clamp;
2985  case Intrinsic::nvvm_suld_2d_array_v4i32_clamp:
2986    return NVPTXISD::Suld2DArrayV4I32Clamp;
2987  case Intrinsic::nvvm_suld_3d_i8_clamp:
2988    return NVPTXISD::Suld3DI8Clamp;
2989  case Intrinsic::nvvm_suld_3d_i16_clamp:
2990    return NVPTXISD::Suld3DI16Clamp;
2991  case Intrinsic::nvvm_suld_3d_i32_clamp:
2992    return NVPTXISD::Suld3DI32Clamp;
2993  case Intrinsic::nvvm_suld_3d_i64_clamp:
2994    return NVPTXISD::Suld3DI64Clamp;
2995  case Intrinsic::nvvm_suld_3d_v2i8_clamp:
2996    return NVPTXISD::Suld3DV2I8Clamp;
2997  case Intrinsic::nvvm_suld_3d_v2i16_clamp:
2998    return NVPTXISD::Suld3DV2I16Clamp;
2999  case Intrinsic::nvvm_suld_3d_v2i32_clamp:
3000    return NVPTXISD::Suld3DV2I32Clamp;
3001  case Intrinsic::nvvm_suld_3d_v2i64_clamp:
3002    return NVPTXISD::Suld3DV2I64Clamp;
3003  case Intrinsic::nvvm_suld_3d_v4i8_clamp:
3004    return NVPTXISD::Suld3DV4I8Clamp;
3005  case Intrinsic::nvvm_suld_3d_v4i16_clamp:
3006    return NVPTXISD::Suld3DV4I16Clamp;
3007  case Intrinsic::nvvm_suld_3d_v4i32_clamp:
3008    return NVPTXISD::Suld3DV4I32Clamp;
3009  case Intrinsic::nvvm_suld_1d_i8_trap:
3010    return NVPTXISD::Suld1DI8Trap;
3011  case Intrinsic::nvvm_suld_1d_i16_trap:
3012    return NVPTXISD::Suld1DI16Trap;
3013  case Intrinsic::nvvm_suld_1d_i32_trap:
3014    return NVPTXISD::Suld1DI32Trap;
3015  case Intrinsic::nvvm_suld_1d_i64_trap:
3016    return NVPTXISD::Suld1DI64Trap;
3017  case Intrinsic::nvvm_suld_1d_v2i8_trap:
3018    return NVPTXISD::Suld1DV2I8Trap;
3019  case Intrinsic::nvvm_suld_1d_v2i16_trap:
3020    return NVPTXISD::Suld1DV2I16Trap;
3021  case Intrinsic::nvvm_suld_1d_v2i32_trap:
3022    return NVPTXISD::Suld1DV2I32Trap;
3023  case Intrinsic::nvvm_suld_1d_v2i64_trap:
3024    return NVPTXISD::Suld1DV2I64Trap;
3025  case Intrinsic::nvvm_suld_1d_v4i8_trap:
3026    return NVPTXISD::Suld1DV4I8Trap;
3027  case Intrinsic::nvvm_suld_1d_v4i16_trap:
3028    return NVPTXISD::Suld1DV4I16Trap;
3029  case Intrinsic::nvvm_suld_1d_v4i32_trap:
3030    return NVPTXISD::Suld1DV4I32Trap;
3031  case Intrinsic::nvvm_suld_1d_array_i8_trap:
3032    return NVPTXISD::Suld1DArrayI8Trap;
3033  case Intrinsic::nvvm_suld_1d_array_i16_trap:
3034    return NVPTXISD::Suld1DArrayI16Trap;
3035  case Intrinsic::nvvm_suld_1d_array_i32_trap:
3036    return NVPTXISD::Suld1DArrayI32Trap;
3037  case Intrinsic::nvvm_suld_1d_array_i64_trap:
3038    return NVPTXISD::Suld1DArrayI64Trap;
3039  case Intrinsic::nvvm_suld_1d_array_v2i8_trap:
3040    return NVPTXISD::Suld1DArrayV2I8Trap;
3041  case Intrinsic::nvvm_suld_1d_array_v2i16_trap:
3042    return NVPTXISD::Suld1DArrayV2I16Trap;
3043  case Intrinsic::nvvm_suld_1d_array_v2i32_trap:
3044    return NVPTXISD::Suld1DArrayV2I32Trap;
3045  case Intrinsic::nvvm_suld_1d_array_v2i64_trap:
3046    return NVPTXISD::Suld1DArrayV2I64Trap;
3047  case Intrinsic::nvvm_suld_1d_array_v4i8_trap:
3048    return NVPTXISD::Suld1DArrayV4I8Trap;
3049  case Intrinsic::nvvm_suld_1d_array_v4i16_trap:
3050    return NVPTXISD::Suld1DArrayV4I16Trap;
3051  case Intrinsic::nvvm_suld_1d_array_v4i32_trap:
3052    return NVPTXISD::Suld1DArrayV4I32Trap;
3053  case Intrinsic::nvvm_suld_2d_i8_trap:
3054    return NVPTXISD::Suld2DI8Trap;
3055  case Intrinsic::nvvm_suld_2d_i16_trap:
3056    return NVPTXISD::Suld2DI16Trap;
3057  case Intrinsic::nvvm_suld_2d_i32_trap:
3058    return NVPTXISD::Suld2DI32Trap;
3059  case Intrinsic::nvvm_suld_2d_i64_trap:
3060    return NVPTXISD::Suld2DI64Trap;
3061  case Intrinsic::nvvm_suld_2d_v2i8_trap:
3062    return NVPTXISD::Suld2DV2I8Trap;
3063  case Intrinsic::nvvm_suld_2d_v2i16_trap:
3064    return NVPTXISD::Suld2DV2I16Trap;
3065  case Intrinsic::nvvm_suld_2d_v2i32_trap:
3066    return NVPTXISD::Suld2DV2I32Trap;
3067  case Intrinsic::nvvm_suld_2d_v2i64_trap:
3068    return NVPTXISD::Suld2DV2I64Trap;
3069  case Intrinsic::nvvm_suld_2d_v4i8_trap:
3070    return NVPTXISD::Suld2DV4I8Trap;
3071  case Intrinsic::nvvm_suld_2d_v4i16_trap:
3072    return NVPTXISD::Suld2DV4I16Trap;
3073  case Intrinsic::nvvm_suld_2d_v4i32_trap:
3074    return NVPTXISD::Suld2DV4I32Trap;
3075  case Intrinsic::nvvm_suld_2d_array_i8_trap:
3076    return NVPTXISD::Suld2DArrayI8Trap;
3077  case Intrinsic::nvvm_suld_2d_array_i16_trap:
3078    return NVPTXISD::Suld2DArrayI16Trap;
3079  case Intrinsic::nvvm_suld_2d_array_i32_trap:
3080    return NVPTXISD::Suld2DArrayI32Trap;
3081  case Intrinsic::nvvm_suld_2d_array_i64_trap:
3082    return NVPTXISD::Suld2DArrayI64Trap;
3083  case Intrinsic::nvvm_suld_2d_array_v2i8_trap:
3084    return NVPTXISD::Suld2DArrayV2I8Trap;
3085  case Intrinsic::nvvm_suld_2d_array_v2i16_trap:
3086    return NVPTXISD::Suld2DArrayV2I16Trap;
3087  case Intrinsic::nvvm_suld_2d_array_v2i32_trap:
3088    return NVPTXISD::Suld2DArrayV2I32Trap;
3089  case Intrinsic::nvvm_suld_2d_array_v2i64_trap:
3090    return NVPTXISD::Suld2DArrayV2I64Trap;
3091  case Intrinsic::nvvm_suld_2d_array_v4i8_trap:
3092    return NVPTXISD::Suld2DArrayV4I8Trap;
3093  case Intrinsic::nvvm_suld_2d_array_v4i16_trap:
3094    return NVPTXISD::Suld2DArrayV4I16Trap;
3095  case Intrinsic::nvvm_suld_2d_array_v4i32_trap:
3096    return NVPTXISD::Suld2DArrayV4I32Trap;
3097  case Intrinsic::nvvm_suld_3d_i8_trap:
3098    return NVPTXISD::Suld3DI8Trap;
3099  case Intrinsic::nvvm_suld_3d_i16_trap:
3100    return NVPTXISD::Suld3DI16Trap;
3101  case Intrinsic::nvvm_suld_3d_i32_trap:
3102    return NVPTXISD::Suld3DI32Trap;
3103  case Intrinsic::nvvm_suld_3d_i64_trap:
3104    return NVPTXISD::Suld3DI64Trap;
3105  case Intrinsic::nvvm_suld_3d_v2i8_trap:
3106    return NVPTXISD::Suld3DV2I8Trap;
3107  case Intrinsic::nvvm_suld_3d_v2i16_trap:
3108    return NVPTXISD::Suld3DV2I16Trap;
3109  case Intrinsic::nvvm_suld_3d_v2i32_trap:
3110    return NVPTXISD::Suld3DV2I32Trap;
3111  case Intrinsic::nvvm_suld_3d_v2i64_trap:
3112    return NVPTXISD::Suld3DV2I64Trap;
3113  case Intrinsic::nvvm_suld_3d_v4i8_trap:
3114    return NVPTXISD::Suld3DV4I8Trap;
3115  case Intrinsic::nvvm_suld_3d_v4i16_trap:
3116    return NVPTXISD::Suld3DV4I16Trap;
3117  case Intrinsic::nvvm_suld_3d_v4i32_trap:
3118    return NVPTXISD::Suld3DV4I32Trap;
3119  case Intrinsic::nvvm_suld_1d_i8_zero:
3120    return NVPTXISD::Suld1DI8Zero;
3121  case Intrinsic::nvvm_suld_1d_i16_zero:
3122    return NVPTXISD::Suld1DI16Zero;
3123  case Intrinsic::nvvm_suld_1d_i32_zero:
3124    return NVPTXISD::Suld1DI32Zero;
3125  case Intrinsic::nvvm_suld_1d_i64_zero:
3126    return NVPTXISD::Suld1DI64Zero;
3127  case Intrinsic::nvvm_suld_1d_v2i8_zero:
3128    return NVPTXISD::Suld1DV2I8Zero;
3129  case Intrinsic::nvvm_suld_1d_v2i16_zero:
3130    return NVPTXISD::Suld1DV2I16Zero;
3131  case Intrinsic::nvvm_suld_1d_v2i32_zero:
3132    return NVPTXISD::Suld1DV2I32Zero;
3133  case Intrinsic::nvvm_suld_1d_v2i64_zero:
3134    return NVPTXISD::Suld1DV2I64Zero;
3135  case Intrinsic::nvvm_suld_1d_v4i8_zero:
3136    return NVPTXISD::Suld1DV4I8Zero;
3137  case Intrinsic::nvvm_suld_1d_v4i16_zero:
3138    return NVPTXISD::Suld1DV4I16Zero;
3139  case Intrinsic::nvvm_suld_1d_v4i32_zero:
3140    return NVPTXISD::Suld1DV4I32Zero;
3141  case Intrinsic::nvvm_suld_1d_array_i8_zero:
3142    return NVPTXISD::Suld1DArrayI8Zero;
3143  case Intrinsic::nvvm_suld_1d_array_i16_zero:
3144    return NVPTXISD::Suld1DArrayI16Zero;
3145  case Intrinsic::nvvm_suld_1d_array_i32_zero:
3146    return NVPTXISD::Suld1DArrayI32Zero;
3147  case Intrinsic::nvvm_suld_1d_array_i64_zero:
3148    return NVPTXISD::Suld1DArrayI64Zero;
3149  case Intrinsic::nvvm_suld_1d_array_v2i8_zero:
3150    return NVPTXISD::Suld1DArrayV2I8Zero;
3151  case Intrinsic::nvvm_suld_1d_array_v2i16_zero:
3152    return NVPTXISD::Suld1DArrayV2I16Zero;
3153  case Intrinsic::nvvm_suld_1d_array_v2i32_zero:
3154    return NVPTXISD::Suld1DArrayV2I32Zero;
3155  case Intrinsic::nvvm_suld_1d_array_v2i64_zero:
3156    return NVPTXISD::Suld1DArrayV2I64Zero;
3157  case Intrinsic::nvvm_suld_1d_array_v4i8_zero:
3158    return NVPTXISD::Suld1DArrayV4I8Zero;
3159  case Intrinsic::nvvm_suld_1d_array_v4i16_zero:
3160    return NVPTXISD::Suld1DArrayV4I16Zero;
3161  case Intrinsic::nvvm_suld_1d_array_v4i32_zero:
3162    return NVPTXISD::Suld1DArrayV4I32Zero;
3163  case Intrinsic::nvvm_suld_2d_i8_zero:
3164    return NVPTXISD::Suld2DI8Zero;
3165  case Intrinsic::nvvm_suld_2d_i16_zero:
3166    return NVPTXISD::Suld2DI16Zero;
3167  case Intrinsic::nvvm_suld_2d_i32_zero:
3168    return NVPTXISD::Suld2DI32Zero;
3169  case Intrinsic::nvvm_suld_2d_i64_zero:
3170    return NVPTXISD::Suld2DI64Zero;
3171  case Intrinsic::nvvm_suld_2d_v2i8_zero:
3172    return NVPTXISD::Suld2DV2I8Zero;
3173  case Intrinsic::nvvm_suld_2d_v2i16_zero:
3174    return NVPTXISD::Suld2DV2I16Zero;
3175  case Intrinsic::nvvm_suld_2d_v2i32_zero:
3176    return NVPTXISD::Suld2DV2I32Zero;
3177  case Intrinsic::nvvm_suld_2d_v2i64_zero:
3178    return NVPTXISD::Suld2DV2I64Zero;
3179  case Intrinsic::nvvm_suld_2d_v4i8_zero:
3180    return NVPTXISD::Suld2DV4I8Zero;
3181  case Intrinsic::nvvm_suld_2d_v4i16_zero:
3182    return NVPTXISD::Suld2DV4I16Zero;
3183  case Intrinsic::nvvm_suld_2d_v4i32_zero:
3184    return NVPTXISD::Suld2DV4I32Zero;
3185  case Intrinsic::nvvm_suld_2d_array_i8_zero:
3186    return NVPTXISD::Suld2DArrayI8Zero;
3187  case Intrinsic::nvvm_suld_2d_array_i16_zero:
3188    return NVPTXISD::Suld2DArrayI16Zero;
3189  case Intrinsic::nvvm_suld_2d_array_i32_zero:
3190    return NVPTXISD::Suld2DArrayI32Zero;
3191  case Intrinsic::nvvm_suld_2d_array_i64_zero:
3192    return NVPTXISD::Suld2DArrayI64Zero;
3193  case Intrinsic::nvvm_suld_2d_array_v2i8_zero:
3194    return NVPTXISD::Suld2DArrayV2I8Zero;
3195  case Intrinsic::nvvm_suld_2d_array_v2i16_zero:
3196    return NVPTXISD::Suld2DArrayV2I16Zero;
3197  case Intrinsic::nvvm_suld_2d_array_v2i32_zero:
3198    return NVPTXISD::Suld2DArrayV2I32Zero;
3199  case Intrinsic::nvvm_suld_2d_array_v2i64_zero:
3200    return NVPTXISD::Suld2DArrayV2I64Zero;
3201  case Intrinsic::nvvm_suld_2d_array_v4i8_zero:
3202    return NVPTXISD::Suld2DArrayV4I8Zero;
3203  case Intrinsic::nvvm_suld_2d_array_v4i16_zero:
3204    return NVPTXISD::Suld2DArrayV4I16Zero;
3205  case Intrinsic::nvvm_suld_2d_array_v4i32_zero:
3206    return NVPTXISD::Suld2DArrayV4I32Zero;
3207  case Intrinsic::nvvm_suld_3d_i8_zero:
3208    return NVPTXISD::Suld3DI8Zero;
3209  case Intrinsic::nvvm_suld_3d_i16_zero:
3210    return NVPTXISD::Suld3DI16Zero;
3211  case Intrinsic::nvvm_suld_3d_i32_zero:
3212    return NVPTXISD::Suld3DI32Zero;
3213  case Intrinsic::nvvm_suld_3d_i64_zero:
3214    return NVPTXISD::Suld3DI64Zero;
3215  case Intrinsic::nvvm_suld_3d_v2i8_zero:
3216    return NVPTXISD::Suld3DV2I8Zero;
3217  case Intrinsic::nvvm_suld_3d_v2i16_zero:
3218    return NVPTXISD::Suld3DV2I16Zero;
3219  case Intrinsic::nvvm_suld_3d_v2i32_zero:
3220    return NVPTXISD::Suld3DV2I32Zero;
3221  case Intrinsic::nvvm_suld_3d_v2i64_zero:
3222    return NVPTXISD::Suld3DV2I64Zero;
3223  case Intrinsic::nvvm_suld_3d_v4i8_zero:
3224    return NVPTXISD::Suld3DV4I8Zero;
3225  case Intrinsic::nvvm_suld_3d_v4i16_zero:
3226    return NVPTXISD::Suld3DV4I16Zero;
3227  case Intrinsic::nvvm_suld_3d_v4i32_zero:
3228    return NVPTXISD::Suld3DV4I32Zero;
3229  }
3230}
3231
3232// llvm.ptx.memcpy.const and llvm.ptx.memmove.const need to be modeled as
3233// TgtMemIntrinsic because we need the information that is only available in
3234// the "Value" type of the destination pointer; in particular, its address
3235// space.
3237bool NVPTXTargetLowering::getTgtMemIntrinsic(
3238    IntrinsicInfo &Info, const CallInst &I, unsigned Intrinsic) const {
3239  switch (Intrinsic) {
3240  default:
3241    return false;
3242
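  // Atomic read-modify-write intrinsics both read and write memory, so they
  // are modeled with readMem and writeMem set on the accessed element type.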
3243  case Intrinsic::nvvm_atomic_load_add_f32:
3244    Info.opc = ISD::INTRINSIC_W_CHAIN;
3245    Info.memVT = MVT::f32;
3246    Info.ptrVal = I.getArgOperand(0);
3247    Info.offset = 0;
3248    Info.vol = 0;
3249    Info.readMem = true;
3250    Info.writeMem = true;
3251    Info.align = 0;
3252    return true;
3253
3254  case Intrinsic::nvvm_atomic_load_inc_32:
3255  case Intrinsic::nvvm_atomic_load_dec_32:
3256    Info.opc = ISD::INTRINSIC_W_CHAIN;
3257    Info.memVT = MVT::i32;
3258    Info.ptrVal = I.getArgOperand(0);
3259    Info.offset = 0;
3260    Info.vol = 0;
3261    Info.readMem = true;
3262    Info.writeMem = true;
3263    Info.align = 0;
3264    return true;
3265
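  // ldu/ldg are read-only loads from global memory; their alignment comes in
  // as an explicit constant operand of the intrinsic rather than from the IR
  // pointer.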
3266  case Intrinsic::nvvm_ldu_global_i:
3267  case Intrinsic::nvvm_ldu_global_f:
3268  case Intrinsic::nvvm_ldu_global_p: {
3269    auto &DL = I.getModule()->getDataLayout();
3270    Info.opc = ISD::INTRINSIC_W_CHAIN;
3271    if (Intrinsic == Intrinsic::nvvm_ldu_global_i)
3272      Info.memVT = getValueType(DL, I.getType());
3273    else if (Intrinsic == Intrinsic::nvvm_ldu_global_p)
3274      Info.memVT = getPointerTy(DL);
3275    else
3276      Info.memVT = getValueType(DL, I.getType());
3277    Info.ptrVal = I.getArgOperand(0);
3278    Info.offset = 0;
3279    Info.vol = 0;
3280    Info.readMem = true;
3281    Info.writeMem = false;
3282    Info.align = cast<ConstantInt>(I.getArgOperand(1))->getZExtValue();
3283
3284    return true;
3285  }
3286  case Intrinsic::nvvm_ldg_global_i:
3287  case Intrinsic::nvvm_ldg_global_f:
3288  case Intrinsic::nvvm_ldg_global_p: {
3289    auto &DL = I.getModule()->getDataLayout();
3290
3291    Info.opc = ISD::INTRINSIC_W_CHAIN;
3292    if (Intrinsic == Intrinsic::nvvm_ldg_global_i)
3293      Info.memVT = getValueType(DL, I.getType());
3294    else if (Intrinsic == Intrinsic::nvvm_ldg_global_p)
3295      Info.memVT = getPointerTy(DL);
3296    else
3297      Info.memVT = getValueType(DL, I.getType());
3298    Info.ptrVal = I.getArgOperand(0);
3299    Info.offset = 0;
3300    Info.vol = 0;
3301    Info.readMem = true;
3302    Info.writeMem = false;
3303    Info.align = cast<ConstantInt>(I.getArgOperand(1))->getZExtValue();
3304
3305    return true;
3306  }
3307
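  // Texture and tld4 reads return four lanes; model them as 16-byte-aligned
  // vector loads with no analyzable pointer operand.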
3308  case Intrinsic::nvvm_tex_1d_v4f32_s32:
3309  case Intrinsic::nvvm_tex_1d_v4f32_f32:
3310  case Intrinsic::nvvm_tex_1d_level_v4f32_f32:
3311  case Intrinsic::nvvm_tex_1d_grad_v4f32_f32:
3312  case Intrinsic::nvvm_tex_1d_array_v4f32_s32:
3313  case Intrinsic::nvvm_tex_1d_array_v4f32_f32:
3314  case Intrinsic::nvvm_tex_1d_array_level_v4f32_f32:
3315  case Intrinsic::nvvm_tex_1d_array_grad_v4f32_f32:
3316  case Intrinsic::nvvm_tex_2d_v4f32_s32:
3317  case Intrinsic::nvvm_tex_2d_v4f32_f32:
3318  case Intrinsic::nvvm_tex_2d_level_v4f32_f32:
3319  case Intrinsic::nvvm_tex_2d_grad_v4f32_f32:
3320  case Intrinsic::nvvm_tex_2d_array_v4f32_s32:
3321  case Intrinsic::nvvm_tex_2d_array_v4f32_f32:
3322  case Intrinsic::nvvm_tex_2d_array_level_v4f32_f32:
3323  case Intrinsic::nvvm_tex_2d_array_grad_v4f32_f32:
3324  case Intrinsic::nvvm_tex_3d_v4f32_s32:
3325  case Intrinsic::nvvm_tex_3d_v4f32_f32:
3326  case Intrinsic::nvvm_tex_3d_level_v4f32_f32:
3327  case Intrinsic::nvvm_tex_3d_grad_v4f32_f32:
3328  case Intrinsic::nvvm_tex_cube_v4f32_f32:
3329  case Intrinsic::nvvm_tex_cube_level_v4f32_f32:
3330  case Intrinsic::nvvm_tex_cube_array_v4f32_f32:
3331  case Intrinsic::nvvm_tex_cube_array_level_v4f32_f32:
3332  case Intrinsic::nvvm_tld4_r_2d_v4f32_f32:
3333  case Intrinsic::nvvm_tld4_g_2d_v4f32_f32:
3334  case Intrinsic::nvvm_tld4_b_2d_v4f32_f32:
3335  case Intrinsic::nvvm_tld4_a_2d_v4f32_f32:
3336  case Intrinsic::nvvm_tex_unified_1d_v4f32_s32:
3337  case Intrinsic::nvvm_tex_unified_1d_v4f32_f32:
3338  case Intrinsic::nvvm_tex_unified_1d_level_v4f32_f32:
3339  case Intrinsic::nvvm_tex_unified_1d_grad_v4f32_f32:
3340  case Intrinsic::nvvm_tex_unified_1d_array_v4f32_s32:
3341  case Intrinsic::nvvm_tex_unified_1d_array_v4f32_f32:
3342  case Intrinsic::nvvm_tex_unified_1d_array_level_v4f32_f32:
3343  case Intrinsic::nvvm_tex_unified_1d_array_grad_v4f32_f32:
3344  case Intrinsic::nvvm_tex_unified_2d_v4f32_s32:
3345  case Intrinsic::nvvm_tex_unified_2d_v4f32_f32:
3346  case Intrinsic::nvvm_tex_unified_2d_level_v4f32_f32:
3347  case Intrinsic::nvvm_tex_unified_2d_grad_v4f32_f32:
3348  case Intrinsic::nvvm_tex_unified_2d_array_v4f32_s32:
3349  case Intrinsic::nvvm_tex_unified_2d_array_v4f32_f32:
3350  case Intrinsic::nvvm_tex_unified_2d_array_level_v4f32_f32:
3351  case Intrinsic::nvvm_tex_unified_2d_array_grad_v4f32_f32:
3352  case Intrinsic::nvvm_tex_unified_3d_v4f32_s32:
3353  case Intrinsic::nvvm_tex_unified_3d_v4f32_f32:
3354  case Intrinsic::nvvm_tex_unified_3d_level_v4f32_f32:
3355  case Intrinsic::nvvm_tex_unified_3d_grad_v4f32_f32:
3356  case Intrinsic::nvvm_tex_unified_cube_v4f32_f32:
3357  case Intrinsic::nvvm_tex_unified_cube_level_v4f32_f32:
3358  case Intrinsic::nvvm_tex_unified_cube_array_v4f32_f32:
3359  case Intrinsic::nvvm_tex_unified_cube_array_level_v4f32_f32:
3360  case Intrinsic::nvvm_tld4_unified_r_2d_v4f32_f32:
3361  case Intrinsic::nvvm_tld4_unified_g_2d_v4f32_f32:
3362  case Intrinsic::nvvm_tld4_unified_b_2d_v4f32_f32:
3363  case Intrinsic::nvvm_tld4_unified_a_2d_v4f32_f32: {
3364    Info.opc = getOpcForTextureInstr(Intrinsic);
3365    Info.memVT = MVT::v4f32;
3366    Info.ptrVal = nullptr;
3367    Info.offset = 0;
3368    Info.vol = 0;
3369    Info.readMem = true;
3370    Info.writeMem = false;
3371    Info.align = 16;
3372    return true;
3373  }
3374  case Intrinsic::nvvm_tex_1d_v4s32_s32:
3375  case Intrinsic::nvvm_tex_1d_v4s32_f32:
3376  case Intrinsic::nvvm_tex_1d_level_v4s32_f32:
3377  case Intrinsic::nvvm_tex_1d_grad_v4s32_f32:
3378  case Intrinsic::nvvm_tex_1d_array_v4s32_s32:
3379  case Intrinsic::nvvm_tex_1d_array_v4s32_f32:
3380  case Intrinsic::nvvm_tex_1d_array_level_v4s32_f32:
3381  case Intrinsic::nvvm_tex_1d_array_grad_v4s32_f32:
3382  case Intrinsic::nvvm_tex_2d_v4s32_s32:
3383  case Intrinsic::nvvm_tex_2d_v4s32_f32:
3384  case Intrinsic::nvvm_tex_2d_level_v4s32_f32:
3385  case Intrinsic::nvvm_tex_2d_grad_v4s32_f32:
3386  case Intrinsic::nvvm_tex_2d_array_v4s32_s32:
3387  case Intrinsic::nvvm_tex_2d_array_v4s32_f32:
3388  case Intrinsic::nvvm_tex_2d_array_level_v4s32_f32:
3389  case Intrinsic::nvvm_tex_2d_array_grad_v4s32_f32:
3390  case Intrinsic::nvvm_tex_3d_v4s32_s32:
3391  case Intrinsic::nvvm_tex_3d_v4s32_f32:
3392  case Intrinsic::nvvm_tex_3d_level_v4s32_f32:
3393  case Intrinsic::nvvm_tex_3d_grad_v4s32_f32:
3394  case Intrinsic::nvvm_tex_cube_v4s32_f32:
3395  case Intrinsic::nvvm_tex_cube_level_v4s32_f32:
3396  case Intrinsic::nvvm_tex_cube_array_v4s32_f32:
3397  case Intrinsic::nvvm_tex_cube_array_level_v4s32_f32:
3398  case Intrinsic::nvvm_tex_cube_v4u32_f32:
3399  case Intrinsic::nvvm_tex_cube_level_v4u32_f32:
3400  case Intrinsic::nvvm_tex_cube_array_v4u32_f32:
3401  case Intrinsic::nvvm_tex_cube_array_level_v4u32_f32:
3402  case Intrinsic::nvvm_tex_1d_v4u32_s32:
3403  case Intrinsic::nvvm_tex_1d_v4u32_f32:
3404  case Intrinsic::nvvm_tex_1d_level_v4u32_f32:
3405  case Intrinsic::nvvm_tex_1d_grad_v4u32_f32:
3406  case Intrinsic::nvvm_tex_1d_array_v4u32_s32:
3407  case Intrinsic::nvvm_tex_1d_array_v4u32_f32:
3408  case Intrinsic::nvvm_tex_1d_array_level_v4u32_f32:
3409  case Intrinsic::nvvm_tex_1d_array_grad_v4u32_f32:
3410  case Intrinsic::nvvm_tex_2d_v4u32_s32:
3411  case Intrinsic::nvvm_tex_2d_v4u32_f32:
3412  case Intrinsic::nvvm_tex_2d_level_v4u32_f32:
3413  case Intrinsic::nvvm_tex_2d_grad_v4u32_f32:
3414  case Intrinsic::nvvm_tex_2d_array_v4u32_s32:
3415  case Intrinsic::nvvm_tex_2d_array_v4u32_f32:
3416  case Intrinsic::nvvm_tex_2d_array_level_v4u32_f32:
3417  case Intrinsic::nvvm_tex_2d_array_grad_v4u32_f32:
3418  case Intrinsic::nvvm_tex_3d_v4u32_s32:
3419  case Intrinsic::nvvm_tex_3d_v4u32_f32:
3420  case Intrinsic::nvvm_tex_3d_level_v4u32_f32:
3421  case Intrinsic::nvvm_tex_3d_grad_v4u32_f32:
3422  case Intrinsic::nvvm_tld4_r_2d_v4s32_f32:
3423  case Intrinsic::nvvm_tld4_g_2d_v4s32_f32:
3424  case Intrinsic::nvvm_tld4_b_2d_v4s32_f32:
3425  case Intrinsic::nvvm_tld4_a_2d_v4s32_f32:
3426  case Intrinsic::nvvm_tld4_r_2d_v4u32_f32:
3427  case Intrinsic::nvvm_tld4_g_2d_v4u32_f32:
3428  case Intrinsic::nvvm_tld4_b_2d_v4u32_f32:
3429  case Intrinsic::nvvm_tld4_a_2d_v4u32_f32:
3430  case Intrinsic::nvvm_tex_unified_1d_v4s32_s32:
3431  case Intrinsic::nvvm_tex_unified_1d_v4s32_f32:
3432  case Intrinsic::nvvm_tex_unified_1d_level_v4s32_f32:
3433  case Intrinsic::nvvm_tex_unified_1d_grad_v4s32_f32:
3434  case Intrinsic::nvvm_tex_unified_1d_array_v4s32_s32:
3435  case Intrinsic::nvvm_tex_unified_1d_array_v4s32_f32:
3436  case Intrinsic::nvvm_tex_unified_1d_array_level_v4s32_f32:
3437  case Intrinsic::nvvm_tex_unified_1d_array_grad_v4s32_f32:
3438  case Intrinsic::nvvm_tex_unified_2d_v4s32_s32:
3439  case Intrinsic::nvvm_tex_unified_2d_v4s32_f32:
3440  case Intrinsic::nvvm_tex_unified_2d_level_v4s32_f32:
3441  case Intrinsic::nvvm_tex_unified_2d_grad_v4s32_f32:
3442  case Intrinsic::nvvm_tex_unified_2d_array_v4s32_s32:
3443  case Intrinsic::nvvm_tex_unified_2d_array_v4s32_f32:
3444  case Intrinsic::nvvm_tex_unified_2d_array_level_v4s32_f32:
3445  case Intrinsic::nvvm_tex_unified_2d_array_grad_v4s32_f32:
3446  case Intrinsic::nvvm_tex_unified_3d_v4s32_s32:
3447  case Intrinsic::nvvm_tex_unified_3d_v4s32_f32:
3448  case Intrinsic::nvvm_tex_unified_3d_level_v4s32_f32:
3449  case Intrinsic::nvvm_tex_unified_3d_grad_v4s32_f32:
3450  case Intrinsic::nvvm_tex_unified_1d_v4u32_s32:
3451  case Intrinsic::nvvm_tex_unified_1d_v4u32_f32:
3452  case Intrinsic::nvvm_tex_unified_1d_level_v4u32_f32:
3453  case Intrinsic::nvvm_tex_unified_1d_grad_v4u32_f32:
3454  case Intrinsic::nvvm_tex_unified_1d_array_v4u32_s32:
3455  case Intrinsic::nvvm_tex_unified_1d_array_v4u32_f32:
3456  case Intrinsic::nvvm_tex_unified_1d_array_level_v4u32_f32:
3457  case Intrinsic::nvvm_tex_unified_1d_array_grad_v4u32_f32:
3458  case Intrinsic::nvvm_tex_unified_2d_v4u32_s32:
3459  case Intrinsic::nvvm_tex_unified_2d_v4u32_f32:
3460  case Intrinsic::nvvm_tex_unified_2d_level_v4u32_f32:
3461  case Intrinsic::nvvm_tex_unified_2d_grad_v4u32_f32:
3462  case Intrinsic::nvvm_tex_unified_2d_array_v4u32_s32:
3463  case Intrinsic::nvvm_tex_unified_2d_array_v4u32_f32:
3464  case Intrinsic::nvvm_tex_unified_2d_array_level_v4u32_f32:
3465  case Intrinsic::nvvm_tex_unified_2d_array_grad_v4u32_f32:
3466  case Intrinsic::nvvm_tex_unified_3d_v4u32_s32:
3467  case Intrinsic::nvvm_tex_unified_3d_v4u32_f32:
3468  case Intrinsic::nvvm_tex_unified_3d_level_v4u32_f32:
3469  case Intrinsic::nvvm_tex_unified_3d_grad_v4u32_f32:
3470  case Intrinsic::nvvm_tex_unified_cube_v4s32_f32:
3471  case Intrinsic::nvvm_tex_unified_cube_level_v4s32_f32:
3472  case Intrinsic::nvvm_tex_unified_cube_array_v4s32_f32:
3473  case Intrinsic::nvvm_tex_unified_cube_array_level_v4s32_f32:
3474  case Intrinsic::nvvm_tex_unified_cube_v4u32_f32:
3475  case Intrinsic::nvvm_tex_unified_cube_level_v4u32_f32:
3476  case Intrinsic::nvvm_tex_unified_cube_array_v4u32_f32:
3477  case Intrinsic::nvvm_tex_unified_cube_array_level_v4u32_f32:
3478  case Intrinsic::nvvm_tld4_unified_r_2d_v4s32_f32:
3479  case Intrinsic::nvvm_tld4_unified_g_2d_v4s32_f32:
3480  case Intrinsic::nvvm_tld4_unified_b_2d_v4s32_f32:
3481  case Intrinsic::nvvm_tld4_unified_a_2d_v4s32_f32:
3482  case Intrinsic::nvvm_tld4_unified_r_2d_v4u32_f32:
3483  case Intrinsic::nvvm_tld4_unified_g_2d_v4u32_f32:
3484  case Intrinsic::nvvm_tld4_unified_b_2d_v4u32_f32:
3485  case Intrinsic::nvvm_tld4_unified_a_2d_v4u32_f32: {
3486    Info.opc = getOpcForTextureInstr(Intrinsic);
3487    Info.memVT = MVT::v4i32;
3488    Info.ptrVal = nullptr;
3489    Info.offset = 0;
3490    Info.vol = 0;
3491    Info.readMem = true;
3492    Info.writeMem = false;
3493    Info.align = 16;
3494    return true;
3495  }
3496  case Intrinsic::nvvm_suld_1d_i8_clamp:
3497  case Intrinsic::nvvm_suld_1d_v2i8_clamp:
3498  case Intrinsic::nvvm_suld_1d_v4i8_clamp:
3499  case Intrinsic::nvvm_suld_1d_array_i8_clamp:
3500  case Intrinsic::nvvm_suld_1d_array_v2i8_clamp:
3501  case Intrinsic::nvvm_suld_1d_array_v4i8_clamp:
3502  case Intrinsic::nvvm_suld_2d_i8_clamp:
3503  case Intrinsic::nvvm_suld_2d_v2i8_clamp:
3504  case Intrinsic::nvvm_suld_2d_v4i8_clamp:
3505  case Intrinsic::nvvm_suld_2d_array_i8_clamp:
3506  case Intrinsic::nvvm_suld_2d_array_v2i8_clamp:
3507  case Intrinsic::nvvm_suld_2d_array_v4i8_clamp:
3508  case Intrinsic::nvvm_suld_3d_i8_clamp:
3509  case Intrinsic::nvvm_suld_3d_v2i8_clamp:
3510  case Intrinsic::nvvm_suld_3d_v4i8_clamp:
3511  case Intrinsic::nvvm_suld_1d_i8_trap:
3512  case Intrinsic::nvvm_suld_1d_v2i8_trap:
3513  case Intrinsic::nvvm_suld_1d_v4i8_trap:
3514  case Intrinsic::nvvm_suld_1d_array_i8_trap:
3515  case Intrinsic::nvvm_suld_1d_array_v2i8_trap:
3516  case Intrinsic::nvvm_suld_1d_array_v4i8_trap:
3517  case Intrinsic::nvvm_suld_2d_i8_trap:
3518  case Intrinsic::nvvm_suld_2d_v2i8_trap:
3519  case Intrinsic::nvvm_suld_2d_v4i8_trap:
3520  case Intrinsic::nvvm_suld_2d_array_i8_trap:
3521  case Intrinsic::nvvm_suld_2d_array_v2i8_trap:
3522  case Intrinsic::nvvm_suld_2d_array_v4i8_trap:
3523  case Intrinsic::nvvm_suld_3d_i8_trap:
3524  case Intrinsic::nvvm_suld_3d_v2i8_trap:
3525  case Intrinsic::nvvm_suld_3d_v4i8_trap:
3526  case Intrinsic::nvvm_suld_1d_i8_zero:
3527  case Intrinsic::nvvm_suld_1d_v2i8_zero:
3528  case Intrinsic::nvvm_suld_1d_v4i8_zero:
3529  case Intrinsic::nvvm_suld_1d_array_i8_zero:
3530  case Intrinsic::nvvm_suld_1d_array_v2i8_zero:
3531  case Intrinsic::nvvm_suld_1d_array_v4i8_zero:
3532  case Intrinsic::nvvm_suld_2d_i8_zero:
3533  case Intrinsic::nvvm_suld_2d_v2i8_zero:
3534  case Intrinsic::nvvm_suld_2d_v4i8_zero:
3535  case Intrinsic::nvvm_suld_2d_array_i8_zero:
3536  case Intrinsic::nvvm_suld_2d_array_v2i8_zero:
3537  case Intrinsic::nvvm_suld_2d_array_v4i8_zero:
3538  case Intrinsic::nvvm_suld_3d_i8_zero:
3539  case Intrinsic::nvvm_suld_3d_v2i8_zero:
3540  case Intrinsic::nvvm_suld_3d_v4i8_zero: {
3541    Info.opc = getOpcForSurfaceInstr(Intrinsic);
3542    Info.memVT = MVT::i8;
3543    Info.ptrVal = nullptr;
3544    Info.offset = 0;
3545    Info.vol = 0;
3546    Info.readMem = true;
3547    Info.writeMem = false;
3548    Info.align = 16;
3549    return true;
3550  }
3551  case Intrinsic::nvvm_suld_1d_i16_clamp:
3552  case Intrinsic::nvvm_suld_1d_v2i16_clamp:
3553  case Intrinsic::nvvm_suld_1d_v4i16_clamp:
3554  case Intrinsic::nvvm_suld_1d_array_i16_clamp:
3555  case Intrinsic::nvvm_suld_1d_array_v2i16_clamp:
3556  case Intrinsic::nvvm_suld_1d_array_v4i16_clamp:
3557  case Intrinsic::nvvm_suld_2d_i16_clamp:
3558  case Intrinsic::nvvm_suld_2d_v2i16_clamp:
3559  case Intrinsic::nvvm_suld_2d_v4i16_clamp:
3560  case Intrinsic::nvvm_suld_2d_array_i16_clamp:
3561  case Intrinsic::nvvm_suld_2d_array_v2i16_clamp:
3562  case Intrinsic::nvvm_suld_2d_array_v4i16_clamp:
3563  case Intrinsic::nvvm_suld_3d_i16_clamp:
3564  case Intrinsic::nvvm_suld_3d_v2i16_clamp:
3565  case Intrinsic::nvvm_suld_3d_v4i16_clamp:
3566  case Intrinsic::nvvm_suld_1d_i16_trap:
3567  case Intrinsic::nvvm_suld_1d_v2i16_trap:
3568  case Intrinsic::nvvm_suld_1d_v4i16_trap:
3569  case Intrinsic::nvvm_suld_1d_array_i16_trap:
3570  case Intrinsic::nvvm_suld_1d_array_v2i16_trap:
3571  case Intrinsic::nvvm_suld_1d_array_v4i16_trap:
3572  case Intrinsic::nvvm_suld_2d_i16_trap:
3573  case Intrinsic::nvvm_suld_2d_v2i16_trap:
3574  case Intrinsic::nvvm_suld_2d_v4i16_trap:
3575  case Intrinsic::nvvm_suld_2d_array_i16_trap:
3576  case Intrinsic::nvvm_suld_2d_array_v2i16_trap:
3577  case Intrinsic::nvvm_suld_2d_array_v4i16_trap:
3578  case Intrinsic::nvvm_suld_3d_i16_trap:
3579  case Intrinsic::nvvm_suld_3d_v2i16_trap:
3580  case Intrinsic::nvvm_suld_3d_v4i16_trap:
3581  case Intrinsic::nvvm_suld_1d_i16_zero:
3582  case Intrinsic::nvvm_suld_1d_v2i16_zero:
3583  case Intrinsic::nvvm_suld_1d_v4i16_zero:
3584  case Intrinsic::nvvm_suld_1d_array_i16_zero:
3585  case Intrinsic::nvvm_suld_1d_array_v2i16_zero:
3586  case Intrinsic::nvvm_suld_1d_array_v4i16_zero:
3587  case Intrinsic::nvvm_suld_2d_i16_zero:
3588  case Intrinsic::nvvm_suld_2d_v2i16_zero:
3589  case Intrinsic::nvvm_suld_2d_v4i16_zero:
3590  case Intrinsic::nvvm_suld_2d_array_i16_zero:
3591  case Intrinsic::nvvm_suld_2d_array_v2i16_zero:
3592  case Intrinsic::nvvm_suld_2d_array_v4i16_zero:
3593  case Intrinsic::nvvm_suld_3d_i16_zero:
3594  case Intrinsic::nvvm_suld_3d_v2i16_zero:
3595  case Intrinsic::nvvm_suld_3d_v4i16_zero: {
3596    Info.opc = getOpcForSurfaceInstr(Intrinsic);
3597    Info.memVT = MVT::i16;
3598    Info.ptrVal = nullptr;
3599    Info.offset = 0;
3600    Info.vol = 0;
3601    Info.readMem = true;
3602    Info.writeMem = false;
3603    Info.align = 16;
3604    return true;
3605  }
3606  case Intrinsic::nvvm_suld_1d_i32_clamp:
3607  case Intrinsic::nvvm_suld_1d_v2i32_clamp:
3608  case Intrinsic::nvvm_suld_1d_v4i32_clamp:
3609  case Intrinsic::nvvm_suld_1d_array_i32_clamp:
3610  case Intrinsic::nvvm_suld_1d_array_v2i32_clamp:
3611  case Intrinsic::nvvm_suld_1d_array_v4i32_clamp:
3612  case Intrinsic::nvvm_suld_2d_i32_clamp:
3613  case Intrinsic::nvvm_suld_2d_v2i32_clamp:
3614  case Intrinsic::nvvm_suld_2d_v4i32_clamp:
3615  case Intrinsic::nvvm_suld_2d_array_i32_clamp:
3616  case Intrinsic::nvvm_suld_2d_array_v2i32_clamp:
3617  case Intrinsic::nvvm_suld_2d_array_v4i32_clamp:
3618  case Intrinsic::nvvm_suld_3d_i32_clamp:
3619  case Intrinsic::nvvm_suld_3d_v2i32_clamp:
3620  case Intrinsic::nvvm_suld_3d_v4i32_clamp:
3621  case Intrinsic::nvvm_suld_1d_i32_trap:
3622  case Intrinsic::nvvm_suld_1d_v2i32_trap:
3623  case Intrinsic::nvvm_suld_1d_v4i32_trap:
3624  case Intrinsic::nvvm_suld_1d_array_i32_trap:
3625  case Intrinsic::nvvm_suld_1d_array_v2i32_trap:
3626  case Intrinsic::nvvm_suld_1d_array_v4i32_trap:
3627  case Intrinsic::nvvm_suld_2d_i32_trap:
3628  case Intrinsic::nvvm_suld_2d_v2i32_trap:
3629  case Intrinsic::nvvm_suld_2d_v4i32_trap:
3630  case Intrinsic::nvvm_suld_2d_array_i32_trap:
3631  case Intrinsic::nvvm_suld_2d_array_v2i32_trap:
3632  case Intrinsic::nvvm_suld_2d_array_v4i32_trap:
3633  case Intrinsic::nvvm_suld_3d_i32_trap:
3634  case Intrinsic::nvvm_suld_3d_v2i32_trap:
3635  case Intrinsic::nvvm_suld_3d_v4i32_trap:
3636  case Intrinsic::nvvm_suld_1d_i32_zero:
3637  case Intrinsic::nvvm_suld_1d_v2i32_zero:
3638  case Intrinsic::nvvm_suld_1d_v4i32_zero:
3639  case Intrinsic::nvvm_suld_1d_array_i32_zero:
3640  case Intrinsic::nvvm_suld_1d_array_v2i32_zero:
3641  case Intrinsic::nvvm_suld_1d_array_v4i32_zero:
3642  case Intrinsic::nvvm_suld_2d_i32_zero:
3643  case Intrinsic::nvvm_suld_2d_v2i32_zero:
3644  case Intrinsic::nvvm_suld_2d_v4i32_zero:
3645  case Intrinsic::nvvm_suld_2d_array_i32_zero:
3646  case Intrinsic::nvvm_suld_2d_array_v2i32_zero:
3647  case Intrinsic::nvvm_suld_2d_array_v4i32_zero:
3648  case Intrinsic::nvvm_suld_3d_i32_zero:
3649  case Intrinsic::nvvm_suld_3d_v2i32_zero:
3650  case Intrinsic::nvvm_suld_3d_v4i32_zero: {
3651    Info.opc = getOpcForSurfaceInstr(Intrinsic);
3652    Info.memVT = MVT::i32;
3653    Info.ptrVal = nullptr;
3654    Info.offset = 0;
3655    Info.vol = 0;
3656    Info.readMem = true;
3657    Info.writeMem = false;
3658    Info.align = 16;
3659    return true;
3660  }
3661  case Intrinsic::nvvm_suld_1d_i64_clamp:
3662  case Intrinsic::nvvm_suld_1d_v2i64_clamp:
3663  case Intrinsic::nvvm_suld_1d_array_i64_clamp:
3664  case Intrinsic::nvvm_suld_1d_array_v2i64_clamp:
3665  case Intrinsic::nvvm_suld_2d_i64_clamp:
3666  case Intrinsic::nvvm_suld_2d_v2i64_clamp:
3667  case Intrinsic::nvvm_suld_2d_array_i64_clamp:
3668  case Intrinsic::nvvm_suld_2d_array_v2i64_clamp:
3669  case Intrinsic::nvvm_suld_3d_i64_clamp:
3670  case Intrinsic::nvvm_suld_3d_v2i64_clamp:
3671  case Intrinsic::nvvm_suld_1d_i64_trap:
3672  case Intrinsic::nvvm_suld_1d_v2i64_trap:
3673  case Intrinsic::nvvm_suld_1d_array_i64_trap:
3674  case Intrinsic::nvvm_suld_1d_array_v2i64_trap:
3675  case Intrinsic::nvvm_suld_2d_i64_trap:
3676  case Intrinsic::nvvm_suld_2d_v2i64_trap:
3677  case Intrinsic::nvvm_suld_2d_array_i64_trap:
3678  case Intrinsic::nvvm_suld_2d_array_v2i64_trap:
3679  case Intrinsic::nvvm_suld_3d_i64_trap:
3680  case Intrinsic::nvvm_suld_3d_v2i64_trap:
3681  case Intrinsic::nvvm_suld_1d_i64_zero:
3682  case Intrinsic::nvvm_suld_1d_v2i64_zero:
3683  case Intrinsic::nvvm_suld_1d_array_i64_zero:
3684  case Intrinsic::nvvm_suld_1d_array_v2i64_zero:
3685  case Intrinsic::nvvm_suld_2d_i64_zero:
3686  case Intrinsic::nvvm_suld_2d_v2i64_zero:
3687  case Intrinsic::nvvm_suld_2d_array_i64_zero:
3688  case Intrinsic::nvvm_suld_2d_array_v2i64_zero:
3689  case Intrinsic::nvvm_suld_3d_i64_zero:
3690  case Intrinsic::nvvm_suld_3d_v2i64_zero: {
3691    Info.opc = getOpcForSurfaceInstr(Intrinsic);
3692    Info.memVT = MVT::i64;
3693    Info.ptrVal = nullptr;
3694    Info.offset = 0;
3695    Info.vol = 0;
3696    Info.readMem = true;
3697    Info.writeMem = false;
3698    Info.align = 16;
3699    return true;
3700  }
3701  }
3702  return false;
3703}
3704
3705/// isLegalAddressingMode - Return true if the addressing mode represented
3706/// by AM is legal for this target, for a load/store of the specified type.
3707/// Used to guide target specific optimizations, like loop strength reduction
3708/// (LoopStrengthReduce.cpp) and memory optimization for address mode
3709/// (CodeGenPrepare.cpp)
3710bool NVPTXTargetLowering::isLegalAddressingMode(const DataLayout &DL,
3711                                                const AddrMode &AM, Type *Ty,
3712                                                unsigned AS) const {
3713
3714  // AddrMode - This represents an addressing mode of:
3715  //    BaseGV + BaseOffs + BaseReg + Scale*ScaleReg
3716  //
3717  // The legal address modes are
3718  // - [avar]
3719  // - [areg]
3720  // - [areg+immoff]
3721  // - [immAddr]
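  //
  // In AddrMode terms: a lone BaseGV is [avar]; BaseReg plus an optional
  // BaseOffs is [areg] or [areg+immoff]; a lone BaseOffs is [immAddr]. PTX
  // has no scaled-index form, so only Scale == 0, or Scale == 1 standing in
  // for the base register, is accepted below.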
3722
3723  if (AM.BaseGV) {
3724    return !AM.BaseOffs && !AM.HasBaseReg && !AM.Scale;
3725  }
3726
3727  switch (AM.Scale) {
3728  case 0: // "r", "r+i" or "i" is allowed
3729    break;
3730  case 1:
3731    if (AM.HasBaseReg) // "r+r+i" or "r+r" is not allowed.
3732      return false;
3733    // Otherwise we have r+i.
3734    break;
3735  default:
3736    // No scale > 1 is allowed
3737    return false;
3738  }
3739  return true;
3740}
3741
3742//===----------------------------------------------------------------------===//
3743//                         NVPTX Inline Assembly Support
3744//===----------------------------------------------------------------------===//
3745
3746/// getConstraintType - Given a constraint letter, return the type of
3747/// constraint it is for this target.
3748NVPTXTargetLowering::ConstraintType
3749NVPTXTargetLowering::getConstraintType(StringRef Constraint) const {
3750  if (Constraint.size() == 1) {
3751    switch (Constraint[0]) {
3752    default:
3753      break;
3754    case 'b':
3755    case 'r':
3756    case 'h':
3757    case 'c':
3758    case 'l':
3759    case 'f':
3760    case 'd':
3761    case '0':
3762    case 'N':
3763      return C_RegisterClass;
3764    }
3765  }
3766  return TargetLowering::getConstraintType(Constraint);
3767}
3768
3769std::pair<unsigned, const TargetRegisterClass *>
3770NVPTXTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
3771                                                  StringRef Constraint,
3772                                                  MVT VT) const {
3773  if (Constraint.size() == 1) {
3774    switch (Constraint[0]) {
3775    case 'b':
3776      return std::make_pair(0U, &NVPTX::Int1RegsRegClass);
3777    case 'c':
3778      return std::make_pair(0U, &NVPTX::Int16RegsRegClass);
3779    case 'h':
3780      return std::make_pair(0U, &NVPTX::Int16RegsRegClass);
3781    case 'r':
3782      return std::make_pair(0U, &NVPTX::Int32RegsRegClass);
3783    case 'l':
3784    case 'N':
3785      return std::make_pair(0U, &NVPTX::Int64RegsRegClass);
3786    case 'f':
3787      return std::make_pair(0U, &NVPTX::Float32RegsRegClass);
3788    case 'd':
3789      return std::make_pair(0U, &NVPTX::Float64RegsRegClass);
3790    }
3791  }
3792  return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
3793}
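
// For example, in CUDA-style inline PTX (an illustrative snippet, not code
// from this file), the letters above pick the register class for each value:
//
//   int res;
//   asm("add.s32 %0, %1, %2;" : "=r"(res) : "r"(a), "r"(b));
//
// Here 'r' selects Int32Regs; 'l' would select Int64Regs, 'f' Float32Regs,
// and 'd' Float64Regs, matching getRegForInlineAsmConstraint above.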
3794
3795//===----------------------------------------------------------------------===//
3796//                         NVPTX DAG Combining
3797//===----------------------------------------------------------------------===//
3798
3799bool NVPTXTargetLowering::allowFMA(MachineFunction &MF,
3800                                   CodeGenOpt::Level OptLevel) const {
3801  const Function *F = MF.getFunction();
3802  const TargetOptions &TO = MF.getTarget().Options;
3803
3804  // Always honor command-line argument
3805  if (FMAContractLevelOpt.getNumOccurrences() > 0) {
3806    return FMAContractLevelOpt > 0;
3807  } else if (OptLevel == 0) {
3808    // Do not contract if we're not optimizing the code
3809    return false;
3810  } else if (TO.AllowFPOpFusion == FPOpFusion::Fast || TO.UnsafeFPMath) {
3811    // Honor TargetOptions flags that explicitly say fusion is okay
3812    return true;
3813  } else if (F->hasFnAttribute("unsafe-fp-math")) {
3814    // Check for unsafe-fp-math=true coming from Clang
3815    Attribute Attr = F->getFnAttribute("unsafe-fp-math");
3816    StringRef Val = Attr.getValueAsString();
3817    if (Val == "true")
3818      return true;
3819  }
3820
3821  // We did not have a clear indication that fusion is allowed, so assume not
3822  return false;
3823}
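
// For example (a sketch of the precedence implemented above): passing
// '-nvptx-fma-level=0' to llc disables contraction even when TargetOptions
// has FPOpFusion::Fast, while with no explicit flag an optimized build with
// '-fp-contract=fast' (or the "unsafe-fp-math" function attribute) allows it.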
3824
3825/// PerformADDCombineWithOperands - Try DAG combinations for an ADD with
3826/// operands N0 and N1.  This is a helper for PerformADDCombine that is
3827/// called with the default operands, and if that fails, with commuted
3828/// operands.
3829static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
3830                                           TargetLowering::DAGCombinerInfo &DCI,
3831                                             const NVPTXSubtarget &Subtarget,
3832                                             CodeGenOpt::Level OptLevel) {
3833  SelectionDAG &DAG = DCI.DAG;
3834  // This combine handles scalar values only; skip vectors.
3835  EVT VT = N0.getValueType();
3836  if (VT.isVector())
3837    return SDValue();
3838
3839  // fold (add (mul a, b), c) -> (mad a, b, c)
3840  //
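  // For example (a sketch, i32 with a single-use mul):
  //   t = mul i32 %a, %b
  //   r = add i32 t, %c
  // becomes (NVPTXISD::IMAD %a, %b, %c), which is expected to select to a
  // PTX 'mad.lo' instruction later in isel.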
3841  if (N0.getOpcode() == ISD::MUL) {
3842    assert(VT.isInteger());
3843    // For integer:
3844    // Since an integer multiply-add costs the same as an integer multiply
3845    // but more than an integer add, do the fusion only when the add is
3846    // the mul's sole use.
3847    if (OptLevel == CodeGenOpt::None || VT != MVT::i32 ||
3848        !N0.getNode()->hasOneUse())
3849      return SDValue();
3850
3851    // Do the folding
3852    return DAG.getNode(NVPTXISD::IMAD, SDLoc(N), VT,
3853                       N0.getOperand(0), N0.getOperand(1), N1);
3854  }
3855  else if (N0.getOpcode() == ISD::FMUL) {
3856    if (VT == MVT::f32 || VT == MVT::f64) {
3857      const auto *TLI = static_cast<const NVPTXTargetLowering *>(
3858          &DAG.getTargetLoweringInfo());
3859      if (!TLI->allowFMA(DAG.getMachineFunction(), OptLevel))
3860        return SDValue();
3861
3862      // For floating point:
3863      // Do the fusion only when the mul has fewer than 5 uses and all of
3864      // them are adds.
3865      // The heuristic is that if a use is not an add, that use cannot be
3866      // fused into an fma, so the mul is still needed anyway.
3867      // If there are more than 4 uses, even if they are all adds, fusing
3868      // them will increase register pressure.
3869      //
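      // For example (a sketch): if t = fmul %a, %b feeds three fadds and one
      // fdiv, the fdiv cannot absorb the multiply, so fusing the adds would
      // create FMAs while still keeping the original fmul alive.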
3870      int numUses = 0;
3871      int nonAddCount = 0;
3872      for (SDNode::use_iterator UI = N0.getNode()->use_begin(),
3873           UE = N0.getNode()->use_end();
3874           UI != UE; ++UI) {
3875        numUses++;
3876        SDNode *User = *UI;
3877        if (User->getOpcode() != ISD::FADD)
3878          ++nonAddCount;
3879      }
3880      if (numUses >= 5)
3881        return SDValue();
3882      if (nonAddCount) {
3883        int orderNo = N->getIROrder();
3884        int orderNo2 = N0.getNode()->getIROrder();
3885        // Simple heuristic for estimating register pressure: the IR-order
3886        // difference approximates the distance between the def and this
3887        // use, and the longer that distance, the more likely it is to
3888        // increase register pressure.
3889        if (orderNo - orderNo2 < 500)
3890          return SDValue();
3891
3892        // Now, check if at least one of the FMUL's operands is live beyond
3893        // node N, which guarantees the FMA will not raise pressure at N.
3894        bool opIsLive = false;
3895        const SDNode *left = N0.getOperand(0).getNode();
3896        const SDNode *right = N0.getOperand(1).getNode();
3897
3898        if (isa<ConstantSDNode>(left) || isa<ConstantSDNode>(right))
3899          opIsLive = true; // A constant operand never adds register pressure.
3900
3901        if (!opIsLive)
3902          for (const SDNode *User : left->uses()) {
3903            // A use ordered after N keeps this operand live past N.
3904            int orderNo3 = User->getIROrder();
3905            if (orderNo3 > orderNo) {
3906              opIsLive = true;
3907              break;
3908            }
3909          }
3910
3911        if (!opIsLive)
3912          for (const SDNode *User : right->uses()) {
3913            // Same liveness check for the right-hand operand.
3914            int orderNo3 = User->getIROrder();
3915            if (orderNo3 > orderNo) {
3916              opIsLive = true;
3917              break;
3918            }
3919          }
3920
3921        if (!opIsLive)
3922          return SDValue();
3923      }
3924
3925      return DAG.getNode(ISD::FMA, SDLoc(N), VT,
3926                         N0.getOperand(0), N0.getOperand(1), N1);
3927    }
3928  }
3929
3930  return SDValue();
3931}
3932
3933/// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
3934///
3935static SDValue PerformADDCombine(SDNode *N,
3936                                 TargetLowering::DAGCombinerInfo &DCI,
3937                                 const NVPTXSubtarget &Subtarget,
3938                                 CodeGenOpt::Level OptLevel) {
3939  SDValue N0 = N->getOperand(0);
3940  SDValue N1 = N->getOperand(1);
3941
3942  // First try with the default operand order.
3943  SDValue Result = PerformADDCombineWithOperands(N, N0, N1, DCI, Subtarget,
3944                                                 OptLevel);
3945  if (Result.getNode())
3946    return Result;
3947
3948  // If that didn't work, try again with the operands commuted.
3949  return PerformADDCombineWithOperands(N, N1, N0, DCI, Subtarget, OptLevel);
3950}
3951
3952static SDValue PerformANDCombine(SDNode *N,
3953                                 TargetLowering::DAGCombinerInfo &DCI) {
3954  // The type legalizer turns a vector load of i8 values into a zextload to i16
3955  // registers, optionally ANY_EXTENDs it (if target type is integer),
3956  // and ANDs off the high 8 bits. Since we turn this load into a
3957  // target-specific DAG node, the DAG combiner fails to eliminate these AND
3958  // nodes. Do that here.
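  //
  // For example (a sketch of the pattern being cleaned up):
  //   (and (any_extend (NVPTXISD::LoadV2 ... <2 x i8>)), 0xff)
  // The LoadV2 already zero-extends each i8 lane into its i16 result, so the
  // mask is redundant and the AND can be replaced with the load itself.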
3959  SDValue Val = N->getOperand(0);
3960  SDValue Mask = N->getOperand(1);
3961
3962  if (isa<ConstantSDNode>(Val)) {
3963    std::swap(Val, Mask);
3964  }
3965
3966  SDValue AExt;
3967  // Generally, we will see zextload -> IMOV16rr -> ANY_EXTEND -> and
3968  if (Val.getOpcode() == ISD::ANY_EXTEND) {
3969    AExt = Val;
3970    Val = Val->getOperand(0);
3971  }
3972
3973  if (Val->isMachineOpcode() && Val->getMachineOpcode() == NVPTX::IMOV16rr) {
3974    Val = Val->getOperand(0);
3975  }
3976
3977  if (Val->getOpcode() == NVPTXISD::LoadV2 ||
3978      Val->getOpcode() == NVPTXISD::LoadV4) {
3979    ConstantSDNode *MaskCnst = dyn_cast<ConstantSDNode>(Mask);
3980    if (!MaskCnst) {
3981      // Not an AND with a constant
3982      return SDValue();
3983    }
3984
3985    uint64_t MaskVal = MaskCnst->getZExtValue();
3986    if (MaskVal != 0xff) {
3987      // Not an AND that chops off top 8 bits
3988      return SDValue();
3989    }
3990
3991    MemSDNode *Mem = dyn_cast<MemSDNode>(Val);
3992    if (!Mem) {
3993      // Not a MemSDNode?!?
3994      return SDValue();
3995    }
3996
3997    EVT MemVT = Mem->getMemoryVT();
3998    if (MemVT != MVT::v2i8 && MemVT != MVT::v4i8) {
3999      // We only handle the i8 case
4000      return SDValue();
4001    }
4002
4003    unsigned ExtType =
4004      cast<ConstantSDNode>(Val->getOperand(Val->getNumOperands()-1))->
4005        getZExtValue();
4006    if (ExtType == ISD::SEXTLOAD) {
4007      // If for some reason the load is a sextload, the and is needed to zero
4008      // out the high 8 bits
4009      return SDValue();
4010    }
4011
4012    bool AddTo = false;
4013    if (AExt.getNode() != nullptr) {
4014      // Re-insert the ext as a zext.
4015      Val = DCI.DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N),
4016                            AExt.getValueType(), Val);
4017      AddTo = true;
4018    }
4019
4020    // If we get here, the AND is unnecessary.  Just replace it with the load
4021    DCI.CombineTo(N, Val, AddTo);
4022  }
4023
4024  return SDValue();
4025}
4026
4027static SDValue PerformSELECTCombine(SDNode *N,
4028                                    TargetLowering::DAGCombinerInfo &DCI) {
4029  // Currently this detects patterns for integer min and max and
4030  // lowers them to PTX-specific intrinsics that enable hardware
4031  // support.
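  //
  // For example (a sketch):
  //   (select (setcc %a, %b, setlt), %a, %b)  -> signed min(%a, %b)
  //   (select (setcc %a, %b, setugt), %a, %b) -> unsigned max(%a, %b)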
4032
4033  const SDValue Cond = N->getOperand(0);
4034  if (Cond.getOpcode() != ISD::SETCC) return SDValue();
4035
4036  const SDValue LHS = Cond.getOperand(0);
4037  const SDValue RHS = Cond.getOperand(1);
4038  const SDValue True = N->getOperand(1);
4039  const SDValue False = N->getOperand(2);
4040  if (!(LHS == True && RHS == False) && !(LHS == False && RHS == True))
4041    return SDValue();
4042
4043  const EVT VT = N->getValueType(0);
4044  if (VT != MVT::i32 && VT != MVT::i64) return SDValue();
4045
4046  const ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
4047  SDValue Larger;  // The larger of LHS and RHS when condition is true.
4048  switch (CC) {
4049    case ISD::SETULT:
4050    case ISD::SETULE:
4051    case ISD::SETLT:
4052    case ISD::SETLE:
4053      Larger = RHS;
4054      break;
4055
4056    case ISD::SETGT:
4057    case ISD::SETGE:
4058    case ISD::SETUGT:
4059    case ISD::SETUGE:
4060      Larger = LHS;
4061      break;
4062
4063    default:
4064      return SDValue();
4065  }
4066  const bool IsMax = (Larger == True);
4067  const bool IsSigned = ISD::isSignedIntSetCC(CC);
4068
4069  unsigned IntrinsicId;
4070  if (VT == MVT::i32) {
4071    if (IsSigned)
4072      IntrinsicId = IsMax ? Intrinsic::nvvm_max_i : Intrinsic::nvvm_min_i;
4073    else
4074      IntrinsicId = IsMax ? Intrinsic::nvvm_max_ui : Intrinsic::nvvm_min_ui;
4075  } else {
4076    assert(VT == MVT::i64);
4077    if (IsSigned)
4078      IntrinsicId = IsMax ? Intrinsic::nvvm_max_ll : Intrinsic::nvvm_min_ll;
4079    else
4080      IntrinsicId = IsMax ? Intrinsic::nvvm_max_ull : Intrinsic::nvvm_min_ull;
4081  }
4082
4083  SDLoc DL(N);
4084  return DCI.DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
4085                         DCI.DAG.getConstant(IntrinsicId, DL, VT), LHS, RHS);
4086}
4087
4088enum OperandSignedness {
4089  Signed = 0,
4090  Unsigned,
4091  Unknown
4092};
4093
4094/// IsMulWideOperandDemotable - Checks if the provided DAG node is an operand
4095/// that can be demoted to \p OptSize bits without loss of information. The
4096/// signedness of the operand, if determinable, is placed in \p S.
4097static bool IsMulWideOperandDemotable(SDValue Op,
4098                                      unsigned OptSize,
4099                                      OperandSignedness &S) {
4100  S = Unknown;
4101
4102  if (Op.getOpcode() == ISD::SIGN_EXTEND ||
4103      Op.getOpcode() == ISD::SIGN_EXTEND_INREG) {
4104    EVT OrigVT = Op.getOperand(0).getValueType();
4105    if (OrigVT.getSizeInBits() <= OptSize) {
4106      S = Signed;
4107      return true;
4108    }
4109  } else if (Op.getOpcode() == ISD::ZERO_EXTEND) {
4110    EVT OrigVT = Op.getOperand(0).getValueType();
4111    if (OrigVT.getSizeInBits() <= OptSize) {
4112      S = Unsigned;
4113      return true;
4114    }
4115  }
4116
4117  return false;
4118}
4119
4120/// AreMulWideOperandsDemotable - Checks if the given LHS and RHS operands can
4121/// be demoted to \p OptSize bits without loss of information. If the operands
4122/// contain a constant, it should appear as the RHS operand. The signedness of
4123/// the operands is placed in \p IsSigned.
4124static bool AreMulWideOperandsDemotable(SDValue LHS, SDValue RHS,
4125                                        unsigned OptSize,
4126                                        bool &IsSigned) {
4127
4128  OperandSignedness LHSSign;
4129
4130  // The LHS operand must be a demotable op
4131  if (!IsMulWideOperandDemotable(LHS, OptSize, LHSSign))
4132    return false;
4133
4134  // We should have been able to determine the signedness from the LHS
4135  if (LHSSign == Unknown)
4136    return false;
4137
4138  IsSigned = (LHSSign == Signed);
4139
4140  // The RHS can be a demotable op or a constant
4141  if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(RHS)) {
4142    APInt Val = CI->getAPIntValue();
4143    if (LHSSign == Unsigned) {
4144      return Val.isIntN(OptSize);
4145    } else {
4146      return Val.isSignedIntN(OptSize);
4147    }
4148  } else {
4149    OperandSignedness RHSSign;
4150    if (!IsMulWideOperandDemotable(RHS, OptSize, RHSSign))
4151      return false;
4152
4153    return LHSSign == RHSSign;
4154  }
4155}
4156
4157/// TryMULWIDECombine - Attempt to replace a multiply of M bits with a multiply
4158/// of M/2 bits that produces an M-bit result (i.e. mul.wide). This transform
4159/// works on both multiply DAG nodes and SHL DAG nodes with a constant shift
4160/// amount.
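///
/// For example (a sketch):
///   (mul i32 (sext i16 %a), (sext i16 %b)) -> (MUL_WIDE_SIGNED %a, %b):i32
///   (shl i32 (zext i16 %a), 4)             -> (MUL_WIDE_UNSIGNED %a, 16):i32
/// i.e. PTX's 'mul.wide.s16' / 'mul.wide.u16' forms.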
4161static SDValue TryMULWIDECombine(SDNode *N,
4162                                 TargetLowering::DAGCombinerInfo &DCI) {
4163  EVT MulType = N->getValueType(0);
4164  if (MulType != MVT::i32 && MulType != MVT::i64) {
4165    return SDValue();
4166  }
4167
4168  SDLoc DL(N);
4169  unsigned OptSize = MulType.getSizeInBits() >> 1;
4170  SDValue LHS = N->getOperand(0);
4171  SDValue RHS = N->getOperand(1);
4172
4173  // Canonicalize the multiply so the constant (if any) is on the right
4174  if (N->getOpcode() == ISD::MUL) {
4175    if (isa<ConstantSDNode>(LHS)) {
4176      std::swap(LHS, RHS);
4177    }
4178  }
4179
4180  // If we have a SHL, determine the actual multiply amount
4181  if (N->getOpcode() == ISD::SHL) {
4182    ConstantSDNode *ShlRHS = dyn_cast<ConstantSDNode>(RHS);
4183    if (!ShlRHS) {
4184      return SDValue();
4185    }
4186
4187    APInt ShiftAmt = ShlRHS->getAPIntValue();
4188    unsigned BitWidth = MulType.getSizeInBits();
4189    if (ShiftAmt.sge(0) && ShiftAmt.slt(BitWidth)) {
4190      APInt MulVal = APInt(BitWidth, 1) << ShiftAmt;
4191      RHS = DCI.DAG.getConstant(MulVal, DL, MulType);
4192    } else {
4193      return SDValue();
4194    }
4195  }
4196
4197  bool Signed;
4198  // Verify that our operands are demotable
4199  if (!AreMulWideOperandsDemotable(LHS, RHS, OptSize, Signed)) {
4200    return SDValue();
4201  }
4202
4203  EVT DemotedVT;
4204  if (MulType == MVT::i32) {
4205    DemotedVT = MVT::i16;
4206  } else {
4207    DemotedVT = MVT::i32;
4208  }
4209
4210  // Truncate the operands to the correct size. Note that these are just for
4211  // type consistency and will (likely) be eliminated in later phases.
4212  SDValue TruncLHS =
4213    DCI.DAG.getNode(ISD::TRUNCATE, DL, DemotedVT, LHS);
4214  SDValue TruncRHS =
4215    DCI.DAG.getNode(ISD::TRUNCATE, DL, DemotedVT, RHS);
4216
4217  unsigned Opc;
4218  if (Signed) {
4219    Opc = NVPTXISD::MUL_WIDE_SIGNED;
4220  } else {
4221    Opc = NVPTXISD::MUL_WIDE_UNSIGNED;
4222  }
4223
4224  return DCI.DAG.getNode(Opc, DL, MulType, TruncLHS, TruncRHS);
4225}
4226
4227/// PerformMULCombine - Runs PTX-specific DAG combine patterns on MUL nodes.
4228static SDValue PerformMULCombine(SDNode *N,
4229                                 TargetLowering::DAGCombinerInfo &DCI,
4230                                 CodeGenOpt::Level OptLevel) {
4231  if (OptLevel > 0) {
4232    // Try mul.wide combining at OptLevel > 0
4233    SDValue Ret = TryMULWIDECombine(N, DCI);
4234    if (Ret.getNode())
4235      return Ret;
4236  }
4237
4238  return SDValue();
4239}
4240
4241/// PerformSHLCombine - Runs PTX-specific DAG combine patterns on SHL nodes.
4242static SDValue PerformSHLCombine(SDNode *N,
4243                                 TargetLowering::DAGCombinerInfo &DCI,
4244                                 CodeGenOpt::Level OptLevel) {
4245  if (OptLevel > 0) {
4246    // Try mul.wide combining at OptLevel > 0
4247    SDValue Ret = TryMULWIDECombine(N, DCI);
4248    if (Ret.getNode())
4249      return Ret;
4250  }
4251
4252  return SDValue();
4253}
4254
4255SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
4256                                               DAGCombinerInfo &DCI) const {
4257  CodeGenOpt::Level OptLevel = getTargetMachine().getOptLevel();
4258  switch (N->getOpcode()) {
4259    default: break;
4260    case ISD::ADD:
4261    case ISD::FADD:
4262      return PerformADDCombine(N, DCI, STI, OptLevel);
4263    case ISD::MUL:
4264      return PerformMULCombine(N, DCI, OptLevel);
4265    case ISD::SHL:
4266      return PerformSHLCombine(N, DCI, OptLevel);
4267    case ISD::AND:
4268      return PerformANDCombine(N, DCI);
4269    case ISD::SELECT:
4270      return PerformSELECTCombine(N, DCI);
4271  }
4272  return SDValue();
4273}
4274
4275/// ReplaceLoadVector - Convert vector loads into multi-output scalar loads.
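/// For example, a sufficiently aligned load of <4 x float> becomes a single
/// NVPTXISD::LoadV4 producing four f32 values plus a chain (a sketch; it is
/// expected to select to a 'ld.v4.f32' instruction later in isel).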
4276static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG,
4277                              SmallVectorImpl<SDValue> &Results) {
4278  EVT ResVT = N->getValueType(0);
4279  SDLoc DL(N);
4280
4281  assert(ResVT.isVector() && "Vector load must have vector type");
4282
4283  // We only handle "native" vector sizes for now, e.g. <4 x double> is not
4284  // legal.  We can (and should) split that into 2 loads of <2 x double> here
4285  // but I'm leaving that as a TODO for now.
4286  assert(ResVT.isSimple() && "Can only handle simple types");
4287  switch (ResVT.getSimpleVT().SimpleTy) {
4288  default:
4289    return;
4290  case MVT::v2i8:
4291  case MVT::v2i16:
4292  case MVT::v2i32:
4293  case MVT::v2i64:
4294  case MVT::v2f32:
4295  case MVT::v2f64:
4296  case MVT::v4i8:
4297  case MVT::v4i16:
4298  case MVT::v4i32:
4299  case MVT::v4f32:
4300    // This is a "native" vector type
4301    break;
4302  }
4303
4304  LoadSDNode *LD = cast<LoadSDNode>(N);
4305
4306  unsigned Align = LD->getAlignment();
4307  auto &TD = DAG.getDataLayout();
4308  unsigned PrefAlign =
4309      TD.getPrefTypeAlignment(ResVT.getTypeForEVT(*DAG.getContext()));
4310  if (Align < PrefAlign) {
4311    // This load is not sufficiently aligned, so bail out and let this vector
4312    // load be scalarized.  Note that we may still be able to emit smaller
4313    // vector loads.  For example, if we are loading a <4 x float> with an
4314    // alignment of 8, this check will fail but the legalizer will try again
4315    // with 2 x <2 x float>, which will succeed with an alignment of 8.
4316    return;
4317  }
4318
4319  EVT EltVT = ResVT.getVectorElementType();
4320  unsigned NumElts = ResVT.getVectorNumElements();
4321
4322  // Since LoadV2 is a target node, we cannot rely on DAG type legalization.
4323  // Therefore, we must ensure the type is legal.  For i1 and i8, we set the
4324  // loaded type to i16 and propagate the "real" type as the memory type.
4325  bool NeedTrunc = false;
4326  if (EltVT.getSizeInBits() < 16) {
4327    EltVT = MVT::i16;
4328    NeedTrunc = true;
4329  }
4330
4331  unsigned Opcode = 0;
4332  SDVTList LdResVTs;
4333
4334  switch (NumElts) {
4335  default:
4336    return;
4337  case 2:
4338    Opcode = NVPTXISD::LoadV2;
4339    LdResVTs = DAG.getVTList(EltVT, EltVT, MVT::Other);
4340    break;
4341  case 4: {
4342    Opcode = NVPTXISD::LoadV4;
4343    EVT ListVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other };
4344    LdResVTs = DAG.getVTList(ListVTs);
4345    break;
4346  }
4347  }
4348
4349  // Copy regular operands
4350  SmallVector<SDValue, 8> OtherOps(N->op_begin(), N->op_end());
4351
4352  // The select routine does not have access to the LoadSDNode instance, so
4353  // pass along the extension information
4354  OtherOps.push_back(DAG.getIntPtrConstant(LD->getExtensionType(), DL));
4355
4356  SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, OtherOps,
4357                                          LD->getMemoryVT(),
4358                                          LD->getMemOperand());
4359
4360  SmallVector<SDValue, 4> ScalarRes;
4361
4362  for (unsigned i = 0; i < NumElts; ++i) {
4363    SDValue Res = NewLD.getValue(i);
4364    if (NeedTrunc)
4365      Res = DAG.getNode(ISD::TRUNCATE, DL, ResVT.getVectorElementType(), Res);
4366    ScalarRes.push_back(Res);
4367  }
4368
4369  SDValue LoadChain = NewLD.getValue(NumElts);
4370
4371  SDValue BuildVec = DAG.getNode(ISD::BUILD_VECTOR, DL, ResVT, ScalarRes);
4372
4373  Results.push_back(BuildVec);
4374  Results.push_back(LoadChain);
4375}
4376
4377static void ReplaceINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG,
4378                                     SmallVectorImpl<SDValue> &Results) {
4379  SDValue Chain = N->getOperand(0);
4380  SDValue Intrin = N->getOperand(1);
4381  SDLoc DL(N);
4382
4383  // Get the intrinsic ID
4384  unsigned IntrinNo = cast<ConstantSDNode>(Intrin.getNode())->getZExtValue();
4385  switch (IntrinNo) {
4386  default:
4387    return;
4388  case Intrinsic::nvvm_ldg_global_i:
4389  case Intrinsic::nvvm_ldg_global_f:
4390  case Intrinsic::nvvm_ldg_global_p:
4391  case Intrinsic::nvvm_ldu_global_i:
4392  case Intrinsic::nvvm_ldu_global_f:
4393  case Intrinsic::nvvm_ldu_global_p: {
4394    EVT ResVT = N->getValueType(0);
4395
4396    if (ResVT.isVector()) {
4397      // Vector LDG/LDU
4398
4399      unsigned NumElts = ResVT.getVectorNumElements();
4400      EVT EltVT = ResVT.getVectorElementType();
4401
4402      // Since LDU/LDG are target nodes, we cannot rely on DAG type
4403      // legalization.
4404      // Therefore, we must ensure the type is legal.  For i1 and i8, we set the
4405      // loaded type to i16 and propagate the "real" type as the memory type.
4406      bool NeedTrunc = false;
4407      if (EltVT.getSizeInBits() < 16) {
4408        EltVT = MVT::i16;
4409        NeedTrunc = true;
4410      }
4411
4412      unsigned Opcode = 0;
4413      SDVTList LdResVTs;
4414
4415      switch (NumElts) {
4416      default:
4417        return;
4418      case 2:
4419        switch (IntrinNo) {
4420        default:
4421          return;
4422        case Intrinsic::nvvm_ldg_global_i:
4423        case Intrinsic::nvvm_ldg_global_f:
4424        case Intrinsic::nvvm_ldg_global_p:
4425          Opcode = NVPTXISD::LDGV2;
4426          break;
4427        case Intrinsic::nvvm_ldu_global_i:
4428        case Intrinsic::nvvm_ldu_global_f:
4429        case Intrinsic::nvvm_ldu_global_p:
4430          Opcode = NVPTXISD::LDUV2;
4431          break;
4432        }
4433        LdResVTs = DAG.getVTList(EltVT, EltVT, MVT::Other);
4434        break;
4435      case 4: {
4436        switch (IntrinNo) {
4437        default:
4438          return;
4439        case Intrinsic::nvvm_ldg_global_i:
4440        case Intrinsic::nvvm_ldg_global_f:
4441        case Intrinsic::nvvm_ldg_global_p:
4442          Opcode = NVPTXISD::LDGV4;
4443          break;
4444        case Intrinsic::nvvm_ldu_global_i:
4445        case Intrinsic::nvvm_ldu_global_f:
4446        case Intrinsic::nvvm_ldu_global_p:
4447          Opcode = NVPTXISD::LDUV4;
4448          break;
4449        }
4450        EVT ListVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other };
4451        LdResVTs = DAG.getVTList(ListVTs);
4452        break;
4453      }
4454      }
4455
4456      SmallVector<SDValue, 8> OtherOps;
4457
4458      // Copy regular operands, skipping operand 1 (the intrinsic ID).
4459
4460      // The chain comes first.
4461      OtherOps.push_back(Chain);
4462      // Then everything after the intrinsic ID.
4463      OtherOps.append(N->op_begin() + 2, N->op_end());
4464
4465      MemIntrinsicSDNode *MemSD = cast<MemIntrinsicSDNode>(N);
4466
4467      SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, OtherOps,
4468                                              MemSD->getMemoryVT(),
4469                                              MemSD->getMemOperand());
4470
4471      SmallVector<SDValue, 4> ScalarRes;
4472
4473      for (unsigned i = 0; i < NumElts; ++i) {
4474        SDValue Res = NewLD.getValue(i);
4475        if (NeedTrunc)
4476          Res =
4477              DAG.getNode(ISD::TRUNCATE, DL, ResVT.getVectorElementType(), Res);
4478        ScalarRes.push_back(Res);
4479      }
4480
4481      SDValue LoadChain = NewLD.getValue(NumElts);
4482
4483      SDValue BuildVec =
4484          DAG.getNode(ISD::BUILD_VECTOR, DL, ResVT, ScalarRes);
4485
4486      Results.push_back(BuildVec);
4487      Results.push_back(LoadChain);
4488    } else {
4489      // i8 LDG/LDU
4490      assert(ResVT.isSimple() && ResVT.getSimpleVT().SimpleTy == MVT::i8 &&
4491             "Custom handling of non-i8 ldu/ldg?");
4492
4493      // Just copy all operands as-is
4494      SmallVector<SDValue, 4> Ops(N->op_begin(), N->op_end());
4495
4496      // Force output to i16
4497      SDVTList LdResVTs = DAG.getVTList(MVT::i16, MVT::Other);
4498
4499      MemIntrinsicSDNode *MemSD = cast<MemIntrinsicSDNode>(N);
4500
4501      // We make sure the memory type is i8, which will be used during isel
4502      // to select the proper instruction.
4503      SDValue NewLD =
4504          DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, DL, LdResVTs, Ops,
4505                                  MVT::i8, MemSD->getMemOperand());
4506
4507      Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
4508                                    NewLD.getValue(0)));
4509      Results.push_back(NewLD.getValue(1));
4510    }
4511  }
4512  }
4513}
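
// For example (a sketch of the i8 case above): an i8 'ldg'/'ldu' result is
// produced in an i16 register (i8 is not a legal register type here), with
// MVT::i8 kept as the memory type, and then truncated back to i8 for the
// original users.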
4514
4515void NVPTXTargetLowering::ReplaceNodeResults(
4516    SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
4517  switch (N->getOpcode()) {
4518  default:
4519    report_fatal_error("Unhandled custom legalization");
4520  case ISD::LOAD:
4521    ReplaceLoadVector(N, DAG, Results);
4522    return;
4523  case ISD::INTRINSIC_W_CHAIN:
4524    ReplaceINTRINSIC_W_CHAIN(N, DAG, Results);
4525    return;
4526  }
4527}
4528
4529// Pin NVPTXSection's and NVPTXTargetObjectFile's vtables to this file.
4530void NVPTXSection::anchor() {}
4531
4532NVPTXTargetObjectFile::~NVPTXTargetObjectFile() {
4533  delete static_cast<NVPTXSection *>(TextSection);
4534  delete static_cast<NVPTXSection *>(DataSection);
4535  delete static_cast<NVPTXSection *>(BSSSection);
4536  delete static_cast<NVPTXSection *>(ReadOnlySection);
4537
4538  delete static_cast<NVPTXSection *>(StaticCtorSection);
4539  delete static_cast<NVPTXSection *>(StaticDtorSection);
4540  delete static_cast<NVPTXSection *>(LSDASection);
4541  delete static_cast<NVPTXSection *>(EHFrameSection);
4542  delete static_cast<NVPTXSection *>(DwarfAbbrevSection);
4543  delete static_cast<NVPTXSection *>(DwarfInfoSection);
4544  delete static_cast<NVPTXSection *>(DwarfLineSection);
4545  delete static_cast<NVPTXSection *>(DwarfFrameSection);
4546  delete static_cast<NVPTXSection *>(DwarfPubTypesSection);
4547  delete static_cast<const NVPTXSection *>(DwarfDebugInlineSection);
4548  delete static_cast<NVPTXSection *>(DwarfStrSection);
4549  delete static_cast<NVPTXSection *>(DwarfLocSection);
4550  delete static_cast<NVPTXSection *>(DwarfARangesSection);
4551  delete static_cast<NVPTXSection *>(DwarfRangesSection);
4552  delete static_cast<NVPTXSection *>(DwarfMacinfoSection);
4553}
4554
4555MCSection *
4556NVPTXTargetObjectFile::SelectSectionForGlobal(const GlobalValue *GV,
4557                                              SectionKind Kind, Mangler &Mang,
4558                                              const TargetMachine &TM) const {
4559  return getDataSection();
4560}
4561