1//===-- NVPTXISelLowering.cpp - NVPTX DAG Lowering Implementation ---------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines the interfaces that NVPTX uses to lower LLVM code into a
10// selection DAG.
11//
12//===----------------------------------------------------------------------===//
13
14#include "NVPTXISelLowering.h"
15#include "MCTargetDesc/NVPTXBaseInfo.h"
16#include "NVPTX.h"
17#include "NVPTXSubtarget.h"
18#include "NVPTXTargetMachine.h"
19#include "NVPTXTargetObjectFile.h"
20#include "NVPTXUtilities.h"
21#include "llvm/ADT/APInt.h"
22#include "llvm/ADT/SmallVector.h"
23#include "llvm/ADT/StringRef.h"
24#include "llvm/CodeGen/Analysis.h"
25#include "llvm/CodeGen/MachineFunction.h"
26#include "llvm/CodeGen/MachineMemOperand.h"
27#include "llvm/CodeGen/SelectionDAG.h"
28#include "llvm/CodeGen/SelectionDAGNodes.h"
29#include "llvm/CodeGen/TargetCallingConv.h"
30#include "llvm/CodeGen/TargetLowering.h"
31#include "llvm/CodeGen/ValueTypes.h"
32#include "llvm/IR/Argument.h"
33#include "llvm/IR/Attributes.h"
34#include "llvm/IR/CallSite.h"
35#include "llvm/IR/Constants.h"
36#include "llvm/IR/DataLayout.h"
37#include "llvm/IR/DerivedTypes.h"
38#include "llvm/IR/Function.h"
39#include "llvm/IR/GlobalValue.h"
40#include "llvm/IR/Instruction.h"
41#include "llvm/IR/Instructions.h"
42#include "llvm/IR/IntrinsicsNVPTX.h"
43#include "llvm/IR/Module.h"
44#include "llvm/IR/Type.h"
45#include "llvm/IR/Value.h"
46#include "llvm/Support/Casting.h"
47#include "llvm/Support/CodeGen.h"
48#include "llvm/Support/CommandLine.h"
49#include "llvm/Support/ErrorHandling.h"
50#include "llvm/Support/MachineValueType.h"
51#include "llvm/Support/MathExtras.h"
52#include "llvm/Support/raw_ostream.h"
53#include "llvm/Target/TargetMachine.h"
54#include "llvm/Target/TargetOptions.h"
55#include <algorithm>
56#include <cassert>
57#include <cstdint>
58#include <iterator>
59#include <sstream>
60#include <string>
61#include <utility>
62#include <vector>
63
64#define DEBUG_TYPE "nvptx-lower"
65
66using namespace llvm;
67
68static unsigned int uniqueCallSite = 0;
69
70static cl::opt<bool> sched4reg(
71    "nvptx-sched4reg",
72    cl::desc("NVPTX Specific: schedule for register pressue"), cl::init(false));
73
74static cl::opt<unsigned>
75FMAContractLevelOpt("nvptx-fma-level", cl::ZeroOrMore, cl::Hidden,
76                    cl::desc("NVPTX Specific: FMA contraction (0: don't do it"
77                             " 1: do it  2: do it aggressively"),
78                    cl::init(2));
79
80static cl::opt<int> UsePrecDivF32(
81    "nvptx-prec-divf32", cl::ZeroOrMore, cl::Hidden,
82    cl::desc("NVPTX Specifies: 0 use div.approx, 1 use div.full, 2 use"
83             " IEEE Compliant F32 div.rnd if available."),
84    cl::init(2));
85
86static cl::opt<bool> UsePrecSqrtF32(
87    "nvptx-prec-sqrtf32", cl::Hidden,
88    cl::desc("NVPTX Specific: 0 use sqrt.approx, 1 use sqrt.rn."),
89    cl::init(true));
90
91static cl::opt<bool> FtzEnabled(
92    "nvptx-f32ftz", cl::ZeroOrMore, cl::Hidden,
93    cl::desc("NVPTX Specific: Flush f32 subnormals to sign-preserving zero."),
94    cl::init(false));
95
96int NVPTXTargetLowering::getDivF32Level() const {
97  if (UsePrecDivF32.getNumOccurrences() > 0) {
98    // If nvptx-prec-div32=N is used on the command-line, always honor it
99    return UsePrecDivF32;
100  } else {
101    // Otherwise, use div.approx if fast math is enabled
102    if (getTargetMachine().Options.UnsafeFPMath)
103      return 0;
104    else
105      return 2;
106  }
107}
108
109bool NVPTXTargetLowering::usePrecSqrtF32() const {
110  if (UsePrecSqrtF32.getNumOccurrences() > 0) {
111    // If nvptx-prec-sqrtf32 is used on the command-line, always honor it
112    return UsePrecSqrtF32;
113  } else {
114    // Otherwise, use sqrt.approx if fast math is enabled
115    return !getTargetMachine().Options.UnsafeFPMath;
116  }
117}
118
119bool NVPTXTargetLowering::useF32FTZ(const MachineFunction &MF) const {
120  // TODO: Get rid of this flag; there can be only one way to do this.
121  if (FtzEnabled.getNumOccurrences() > 0) {
122    // If nvptx-f32ftz is used on the command-line, always honor it
123    return FtzEnabled;
124  } else {
125    const Function &F = MF.getFunction();
126    // Otherwise, check for an nvptx-f32ftz attribute on the function
127    if (F.hasFnAttribute("nvptx-f32ftz"))
128      return F.getFnAttribute("nvptx-f32ftz").getValueAsString() == "true";
129    else
130      return false;
131  }
132}
133
134static bool IsPTXVectorType(MVT VT) {
135  switch (VT.SimpleTy) {
136  default:
137    return false;
138  case MVT::v2i1:
139  case MVT::v4i1:
140  case MVT::v2i8:
141  case MVT::v4i8:
142  case MVT::v2i16:
143  case MVT::v4i16:
144  case MVT::v2i32:
145  case MVT::v4i32:
146  case MVT::v2i64:
147  case MVT::v2f16:
148  case MVT::v4f16:
149  case MVT::v8f16: // <4 x f16x2>
150  case MVT::v2f32:
151  case MVT::v4f32:
152  case MVT::v2f64:
153    return true;
154  }
155}
156
157/// ComputePTXValueVTs - For the given Type \p Ty, returns the set of primitive
158/// EVTs that compose it.  Unlike ComputeValueVTs, this will break apart vectors
159/// into their primitive components.
160/// NOTE: This is a band-aid for code that expects ComputeValueVTs to return the
161/// same number of types as the Ins/Outs arrays in LowerFormalArguments,
162/// LowerCall, and LowerReturn.
163static void ComputePTXValueVTs(const TargetLowering &TLI, const DataLayout &DL,
164                               Type *Ty, SmallVectorImpl<EVT> &ValueVTs,
165                               SmallVectorImpl<uint64_t> *Offsets = nullptr,
166                               uint64_t StartingOffset = 0) {
167  SmallVector<EVT, 16> TempVTs;
168  SmallVector<uint64_t, 16> TempOffsets;
169
170  // Special case for i128 - decompose to (i64, i64)
171  if (Ty->isIntegerTy(128)) {
172    ValueVTs.push_back(EVT(MVT::i64));
173    ValueVTs.push_back(EVT(MVT::i64));
174
175    if (Offsets) {
176      Offsets->push_back(StartingOffset + 0);
177      Offsets->push_back(StartingOffset + 8);
178    }
179
180    return;
181  }
182
183  // Given a struct type, recursively traverse the elements with custom ComputePTXValueVTs.
184  if (StructType *STy = dyn_cast<StructType>(Ty)) {
185    auto const *SL = DL.getStructLayout(STy);
186    auto ElementNum = 0;
187    for(auto *EI : STy->elements()) {
188      ComputePTXValueVTs(TLI, DL, EI, ValueVTs, Offsets,
189                         StartingOffset + SL->getElementOffset(ElementNum));
190      ++ElementNum;
191    }
192    return;
193  }
194
195  ComputeValueVTs(TLI, DL, Ty, TempVTs, &TempOffsets, StartingOffset);
196  for (unsigned i = 0, e = TempVTs.size(); i != e; ++i) {
197    EVT VT = TempVTs[i];
198    uint64_t Off = TempOffsets[i];
199    // Split vectors into individual elements, except for v2f16, which
200    // we will pass as a single scalar.
201    if (VT.isVector()) {
202      unsigned NumElts = VT.getVectorNumElements();
203      EVT EltVT = VT.getVectorElementType();
204      // Vectors with an even number of f16 elements will be passed to
205      // us as an array of v2f16 elements. We must match this so we
206      // stay in sync with Ins/Outs.
207      if (EltVT == MVT::f16 && NumElts % 2 == 0) {
208        EltVT = MVT::v2f16;
209        NumElts /= 2;
210      }
211      for (unsigned j = 0; j != NumElts; ++j) {
212        ValueVTs.push_back(EltVT);
213        if (Offsets)
214          Offsets->push_back(Off + j * EltVT.getStoreSize());
215      }
216    } else {
217      ValueVTs.push_back(VT);
218      if (Offsets)
219        Offsets->push_back(Off);
220    }
221  }
222}
223
224// Check whether we can merge loads/stores of some of the pieces of a
225// flattened function parameter or return value into a single vector
226// load/store.
227//
228// The flattened parameter is represented as a list of EVTs and
229// offsets, and the whole structure is aligned to ParamAlignment. This
230// function determines whether we can load/store pieces of the
231// parameter starting at index Idx using a single vectorized op of
232// size AccessSize. If so, it returns the number of param pieces
233// covered by the vector op. Otherwise, it returns 1.
234static unsigned CanMergeParamLoadStoresStartingAt(
235    unsigned Idx, uint32_t AccessSize, const SmallVectorImpl<EVT> &ValueVTs,
236    const SmallVectorImpl<uint64_t> &Offsets, unsigned ParamAlignment) {
237  assert(isPowerOf2_32(AccessSize) && "must be a power of 2!");
238
239  // Can't vectorize if param alignment is not sufficient.
240  if (AccessSize > ParamAlignment)
241    return 1;
242  // Can't vectorize if offset is not aligned.
243  if (Offsets[Idx] & (AccessSize - 1))
244    return 1;
245
246  EVT EltVT = ValueVTs[Idx];
247  unsigned EltSize = EltVT.getStoreSize();
248
249  // Element is too large to vectorize.
250  if (EltSize >= AccessSize)
251    return 1;
252
253  unsigned NumElts = AccessSize / EltSize;
254  // Can't vectorize if AccessBytes if not a multiple of EltSize.
255  if (AccessSize != EltSize * NumElts)
256    return 1;
257
258  // We don't have enough elements to vectorize.
259  if (Idx + NumElts > ValueVTs.size())
260    return 1;
261
262  // PTX ISA can only deal with 2- and 4-element vector ops.
263  if (NumElts != 4 && NumElts != 2)
264    return 1;
265
266  for (unsigned j = Idx + 1; j < Idx + NumElts; ++j) {
267    // Types do not match.
268    if (ValueVTs[j] != EltVT)
269      return 1;
270
271    // Elements are not contiguous.
272    if (Offsets[j] - Offsets[j - 1] != EltSize)
273      return 1;
274  }
275  // OK. We can vectorize ValueVTs[i..i+NumElts)
276  return NumElts;
277}
278
279// Flags for tracking per-element vectorization state of loads/stores
280// of a flattened function parameter or return value.
281enum ParamVectorizationFlags {
282  PVF_INNER = 0x0, // Middle elements of a vector.
283  PVF_FIRST = 0x1, // First element of the vector.
284  PVF_LAST = 0x2,  // Last element of the vector.
285  // Scalar is effectively a 1-element vector.
286  PVF_SCALAR = PVF_FIRST | PVF_LAST
287};
288
289// Computes whether and how we can vectorize the loads/stores of a
290// flattened function parameter or return value.
291//
292// The flattened parameter is represented as the list of ValueVTs and
293// Offsets, and is aligned to ParamAlignment bytes. We return a vector
294// of the same size as ValueVTs indicating how each piece should be
295// loaded/stored (i.e. as a scalar, or as part of a vector
296// load/store).
297static SmallVector<ParamVectorizationFlags, 16>
298VectorizePTXValueVTs(const SmallVectorImpl<EVT> &ValueVTs,
299                     const SmallVectorImpl<uint64_t> &Offsets,
300                     unsigned ParamAlignment) {
301  // Set vector size to match ValueVTs and mark all elements as
302  // scalars by default.
303  SmallVector<ParamVectorizationFlags, 16> VectorInfo;
304  VectorInfo.assign(ValueVTs.size(), PVF_SCALAR);
305
306  // Check what we can vectorize using 128/64/32-bit accesses.
307  for (int I = 0, E = ValueVTs.size(); I != E; ++I) {
308    // Skip elements we've already processed.
309    assert(VectorInfo[I] == PVF_SCALAR && "Unexpected vector info state.");
310    for (unsigned AccessSize : {16, 8, 4, 2}) {
311      unsigned NumElts = CanMergeParamLoadStoresStartingAt(
312          I, AccessSize, ValueVTs, Offsets, ParamAlignment);
313      // Mark vectorized elements.
314      switch (NumElts) {
315      default:
316        llvm_unreachable("Unexpected return value");
317      case 1:
318        // Can't vectorize using this size, try next smaller size.
319        continue;
320      case 2:
321        assert(I + 1 < E && "Not enough elements.");
322        VectorInfo[I] = PVF_FIRST;
323        VectorInfo[I + 1] = PVF_LAST;
324        I += 1;
325        break;
326      case 4:
327        assert(I + 3 < E && "Not enough elements.");
328        VectorInfo[I] = PVF_FIRST;
329        VectorInfo[I + 1] = PVF_INNER;
330        VectorInfo[I + 2] = PVF_INNER;
331        VectorInfo[I + 3] = PVF_LAST;
332        I += 3;
333        break;
334      }
335      // Break out of the inner loop because we've already succeeded
336      // using largest possible AccessSize.
337      break;
338    }
339  }
340  return VectorInfo;
341}
342
343// NVPTXTargetLowering Constructor.
344NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
345                                         const NVPTXSubtarget &STI)
346    : TargetLowering(TM), nvTM(&TM), STI(STI) {
347  // always lower memset, memcpy, and memmove intrinsics to load/store
348  // instructions, rather
349  // then generating calls to memset, mempcy or memmove.
350  MaxStoresPerMemset = (unsigned) 0xFFFFFFFF;
351  MaxStoresPerMemcpy = (unsigned) 0xFFFFFFFF;
352  MaxStoresPerMemmove = (unsigned) 0xFFFFFFFF;
353
354  setBooleanContents(ZeroOrNegativeOneBooleanContent);
355  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
356
357  // Jump is Expensive. Don't create extra control flow for 'and', 'or'
358  // condition branches.
359  setJumpIsExpensive(true);
360
361  // Wide divides are _very_ slow. Try to reduce the width of the divide if
362  // possible.
363  addBypassSlowDiv(64, 32);
364
365  // By default, use the Source scheduling
366  if (sched4reg)
367    setSchedulingPreference(Sched::RegPressure);
368  else
369    setSchedulingPreference(Sched::Source);
370
371  auto setFP16OperationAction = [&](unsigned Op, MVT VT, LegalizeAction Action,
372                                    LegalizeAction NoF16Action) {
373    setOperationAction(Op, VT, STI.allowFP16Math() ? Action : NoF16Action);
374  };
375
376  addRegisterClass(MVT::i1, &NVPTX::Int1RegsRegClass);
377  addRegisterClass(MVT::i16, &NVPTX::Int16RegsRegClass);
378  addRegisterClass(MVT::i32, &NVPTX::Int32RegsRegClass);
379  addRegisterClass(MVT::i64, &NVPTX::Int64RegsRegClass);
380  addRegisterClass(MVT::f32, &NVPTX::Float32RegsRegClass);
381  addRegisterClass(MVT::f64, &NVPTX::Float64RegsRegClass);
382  addRegisterClass(MVT::f16, &NVPTX::Float16RegsRegClass);
383  addRegisterClass(MVT::v2f16, &NVPTX::Float16x2RegsRegClass);
384
385  // Conversion to/from FP16/FP16x2 is always legal.
386  setOperationAction(ISD::SINT_TO_FP, MVT::f16, Legal);
387  setOperationAction(ISD::FP_TO_SINT, MVT::f16, Legal);
388  setOperationAction(ISD::BUILD_VECTOR, MVT::v2f16, Custom);
389  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom);
390  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f16, Expand);
391  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f16, Expand);
392
393  setFP16OperationAction(ISD::SETCC, MVT::f16, Legal, Promote);
394  setFP16OperationAction(ISD::SETCC, MVT::v2f16, Legal, Expand);
395
396  // Operations not directly supported by NVPTX.
397  for (MVT VT : {MVT::f16, MVT::v2f16, MVT::f32, MVT::f64, MVT::i1, MVT::i8,
398                 MVT::i16, MVT::i32, MVT::i64}) {
399    setOperationAction(ISD::SELECT_CC, VT, Expand);
400    setOperationAction(ISD::BR_CC, VT, Expand);
401  }
402
403  // Some SIGN_EXTEND_INREG can be done using cvt instruction.
404  // For others we will expand to a SHL/SRA pair.
405  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i64, Legal);
406  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
407  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Legal);
408  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal);
409  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
410
411  setOperationAction(ISD::SHL_PARTS, MVT::i32  , Custom);
412  setOperationAction(ISD::SRA_PARTS, MVT::i32  , Custom);
413  setOperationAction(ISD::SRL_PARTS, MVT::i32  , Custom);
414  setOperationAction(ISD::SHL_PARTS, MVT::i64  , Custom);
415  setOperationAction(ISD::SRA_PARTS, MVT::i64  , Custom);
416  setOperationAction(ISD::SRL_PARTS, MVT::i64  , Custom);
417
418  setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);
419  setOperationAction(ISD::BITREVERSE, MVT::i64, Legal);
420
421  // TODO: we may consider expanding ROTL/ROTR on older GPUs.  Currently on GPUs
422  // that don't have h/w rotation we lower them to multi-instruction assembly.
423  // See ROT*_sw in NVPTXIntrInfo.td
424  setOperationAction(ISD::ROTL, MVT::i64, Legal);
425  setOperationAction(ISD::ROTR, MVT::i64, Legal);
426  setOperationAction(ISD::ROTL, MVT::i32, Legal);
427  setOperationAction(ISD::ROTR, MVT::i32, Legal);
428
429  setOperationAction(ISD::ROTL, MVT::i16, Expand);
430  setOperationAction(ISD::ROTR, MVT::i16, Expand);
431  setOperationAction(ISD::ROTL, MVT::i8, Expand);
432  setOperationAction(ISD::ROTR, MVT::i8, Expand);
433  setOperationAction(ISD::BSWAP, MVT::i16, Expand);
434  setOperationAction(ISD::BSWAP, MVT::i32, Expand);
435  setOperationAction(ISD::BSWAP, MVT::i64, Expand);
436
437  // Indirect branch is not supported.
438  // This also disables Jump Table creation.
439  setOperationAction(ISD::BR_JT, MVT::Other, Expand);
440  setOperationAction(ISD::BRIND, MVT::Other, Expand);
441
442  setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
443  setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
444
445  // We want to legalize constant related memmove and memcopy
446  // intrinsics.
447  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
448
449  // Turn FP extload into load/fpextend
450  setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
451  setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
452  setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
453  setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand);
454  setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
455  setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand);
456  setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand);
457  setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand);
458  setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand);
459  // Turn FP truncstore into trunc + store.
460  // FIXME: vector types should also be expanded
461  setTruncStoreAction(MVT::f32, MVT::f16, Expand);
462  setTruncStoreAction(MVT::f64, MVT::f16, Expand);
463  setTruncStoreAction(MVT::f64, MVT::f32, Expand);
464
465  // PTX does not support load / store predicate registers
466  setOperationAction(ISD::LOAD, MVT::i1, Custom);
467  setOperationAction(ISD::STORE, MVT::i1, Custom);
468
469  for (MVT VT : MVT::integer_valuetypes()) {
470    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
471    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
472    setTruncStoreAction(VT, MVT::i1, Expand);
473  }
474
475  // This is legal in NVPTX
476  setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
477  setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
478  setOperationAction(ISD::ConstantFP, MVT::f16, Legal);
479
480  // TRAP can be lowered to PTX trap
481  setOperationAction(ISD::TRAP, MVT::Other, Legal);
482
483  // Register custom handling for vector loads/stores
484  for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
485    if (IsPTXVectorType(VT)) {
486      setOperationAction(ISD::LOAD, VT, Custom);
487      setOperationAction(ISD::STORE, VT, Custom);
488      setOperationAction(ISD::INTRINSIC_W_CHAIN, VT, Custom);
489    }
490  }
491
492  // Custom handling for i8 intrinsics
493  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i8, Custom);
494
495  for (const auto& Ty : {MVT::i16, MVT::i32, MVT::i64}) {
496    setOperationAction(ISD::ABS,  Ty, Legal);
497    setOperationAction(ISD::SMIN, Ty, Legal);
498    setOperationAction(ISD::SMAX, Ty, Legal);
499    setOperationAction(ISD::UMIN, Ty, Legal);
500    setOperationAction(ISD::UMAX, Ty, Legal);
501
502    setOperationAction(ISD::CTPOP, Ty, Legal);
503    setOperationAction(ISD::CTLZ, Ty, Legal);
504  }
505
506  setOperationAction(ISD::CTTZ, MVT::i16, Expand);
507  setOperationAction(ISD::CTTZ, MVT::i32, Expand);
508  setOperationAction(ISD::CTTZ, MVT::i64, Expand);
509
510  // PTX does not directly support SELP of i1, so promote to i32 first
511  setOperationAction(ISD::SELECT, MVT::i1, Custom);
512
513  // PTX cannot multiply two i64s in a single instruction.
514  setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
515  setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
516
517  // We have some custom DAG combine patterns for these nodes
518  setTargetDAGCombine(ISD::ADD);
519  setTargetDAGCombine(ISD::AND);
520  setTargetDAGCombine(ISD::FADD);
521  setTargetDAGCombine(ISD::MUL);
522  setTargetDAGCombine(ISD::SHL);
523  setTargetDAGCombine(ISD::SREM);
524  setTargetDAGCombine(ISD::UREM);
525
526  // setcc for f16x2 needs special handling to prevent legalizer's
527  // attempt to scalarize it due to v2i1 not being legal.
528  if (STI.allowFP16Math())
529    setTargetDAGCombine(ISD::SETCC);
530
531  // Promote fp16 arithmetic if fp16 hardware isn't available or the
532  // user passed --nvptx-no-fp16-math. The flag is useful because,
533  // although sm_53+ GPUs have some sort of FP16 support in
534  // hardware, only sm_53 and sm_60 have full implementation. Others
535  // only have token amount of hardware and are likely to run faster
536  // by using fp32 units instead.
537  for (const auto &Op : {ISD::FADD, ISD::FMUL, ISD::FSUB, ISD::FMA}) {
538    setFP16OperationAction(Op, MVT::f16, Legal, Promote);
539    setFP16OperationAction(Op, MVT::v2f16, Legal, Expand);
540  }
541
542  // There's no neg.f16 instruction. Expand to (0-x).
543  setOperationAction(ISD::FNEG, MVT::f16, Expand);
544  setOperationAction(ISD::FNEG, MVT::v2f16, Expand);
545
546  // (would be) Library functions.
547
548  // These map to conversion instructions for scalar FP types.
549  for (const auto &Op : {ISD::FCEIL, ISD::FFLOOR, ISD::FNEARBYINT, ISD::FRINT,
550                         ISD::FTRUNC}) {
551    setOperationAction(Op, MVT::f16, Legal);
552    setOperationAction(Op, MVT::f32, Legal);
553    setOperationAction(Op, MVT::f64, Legal);
554    setOperationAction(Op, MVT::v2f16, Expand);
555  }
556
557  setOperationAction(ISD::FROUND, MVT::f16, Promote);
558  setOperationAction(ISD::FROUND, MVT::v2f16, Expand);
559  setOperationAction(ISD::FROUND, MVT::f32, Custom);
560  setOperationAction(ISD::FROUND, MVT::f64, Custom);
561
562
563  // 'Expand' implements FCOPYSIGN without calling an external library.
564  setOperationAction(ISD::FCOPYSIGN, MVT::f16, Expand);
565  setOperationAction(ISD::FCOPYSIGN, MVT::v2f16, Expand);
566  setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
567  setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
568
569  // These map to corresponding instructions for f32/f64. f16 must be
570  // promoted to f32. v2f16 is expanded to f16, which is then promoted
571  // to f32.
572  for (const auto &Op : {ISD::FDIV, ISD::FREM, ISD::FSQRT, ISD::FSIN, ISD::FCOS,
573                         ISD::FABS, ISD::FMINNUM, ISD::FMAXNUM}) {
574    setOperationAction(Op, MVT::f16, Promote);
575    setOperationAction(Op, MVT::f32, Legal);
576    setOperationAction(Op, MVT::f64, Legal);
577    setOperationAction(Op, MVT::v2f16, Expand);
578  }
579  setOperationAction(ISD::FMINNUM, MVT::f16, Promote);
580  setOperationAction(ISD::FMAXNUM, MVT::f16, Promote);
581  setOperationAction(ISD::FMINIMUM, MVT::f16, Promote);
582  setOperationAction(ISD::FMAXIMUM, MVT::f16, Promote);
583
584  // No FEXP2, FLOG2.  The PTX ex2 and log2 functions are always approximate.
585  // No FPOW or FREM in PTX.
586
587  // Now deduce the information based on the above mentioned
588  // actions
589  computeRegisterProperties(STI.getRegisterInfo());
590}
591
592const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const {
593  switch ((NVPTXISD::NodeType)Opcode) {
594  case NVPTXISD::FIRST_NUMBER:
595    break;
596  case NVPTXISD::CALL:
597    return "NVPTXISD::CALL";
598  case NVPTXISD::RET_FLAG:
599    return "NVPTXISD::RET_FLAG";
600  case NVPTXISD::LOAD_PARAM:
601    return "NVPTXISD::LOAD_PARAM";
602  case NVPTXISD::Wrapper:
603    return "NVPTXISD::Wrapper";
604  case NVPTXISD::DeclareParam:
605    return "NVPTXISD::DeclareParam";
606  case NVPTXISD::DeclareScalarParam:
607    return "NVPTXISD::DeclareScalarParam";
608  case NVPTXISD::DeclareRet:
609    return "NVPTXISD::DeclareRet";
610  case NVPTXISD::DeclareScalarRet:
611    return "NVPTXISD::DeclareScalarRet";
612  case NVPTXISD::DeclareRetParam:
613    return "NVPTXISD::DeclareRetParam";
614  case NVPTXISD::PrintCall:
615    return "NVPTXISD::PrintCall";
616  case NVPTXISD::PrintConvergentCall:
617    return "NVPTXISD::PrintConvergentCall";
618  case NVPTXISD::PrintCallUni:
619    return "NVPTXISD::PrintCallUni";
620  case NVPTXISD::PrintConvergentCallUni:
621    return "NVPTXISD::PrintConvergentCallUni";
622  case NVPTXISD::LoadParam:
623    return "NVPTXISD::LoadParam";
624  case NVPTXISD::LoadParamV2:
625    return "NVPTXISD::LoadParamV2";
626  case NVPTXISD::LoadParamV4:
627    return "NVPTXISD::LoadParamV4";
628  case NVPTXISD::StoreParam:
629    return "NVPTXISD::StoreParam";
630  case NVPTXISD::StoreParamV2:
631    return "NVPTXISD::StoreParamV2";
632  case NVPTXISD::StoreParamV4:
633    return "NVPTXISD::StoreParamV4";
634  case NVPTXISD::StoreParamS32:
635    return "NVPTXISD::StoreParamS32";
636  case NVPTXISD::StoreParamU32:
637    return "NVPTXISD::StoreParamU32";
638  case NVPTXISD::CallArgBegin:
639    return "NVPTXISD::CallArgBegin";
640  case NVPTXISD::CallArg:
641    return "NVPTXISD::CallArg";
642  case NVPTXISD::LastCallArg:
643    return "NVPTXISD::LastCallArg";
644  case NVPTXISD::CallArgEnd:
645    return "NVPTXISD::CallArgEnd";
646  case NVPTXISD::CallVoid:
647    return "NVPTXISD::CallVoid";
648  case NVPTXISD::CallVal:
649    return "NVPTXISD::CallVal";
650  case NVPTXISD::CallSymbol:
651    return "NVPTXISD::CallSymbol";
652  case NVPTXISD::Prototype:
653    return "NVPTXISD::Prototype";
654  case NVPTXISD::MoveParam:
655    return "NVPTXISD::MoveParam";
656  case NVPTXISD::StoreRetval:
657    return "NVPTXISD::StoreRetval";
658  case NVPTXISD::StoreRetvalV2:
659    return "NVPTXISD::StoreRetvalV2";
660  case NVPTXISD::StoreRetvalV4:
661    return "NVPTXISD::StoreRetvalV4";
662  case NVPTXISD::PseudoUseParam:
663    return "NVPTXISD::PseudoUseParam";
664  case NVPTXISD::RETURN:
665    return "NVPTXISD::RETURN";
666  case NVPTXISD::CallSeqBegin:
667    return "NVPTXISD::CallSeqBegin";
668  case NVPTXISD::CallSeqEnd:
669    return "NVPTXISD::CallSeqEnd";
670  case NVPTXISD::CallPrototype:
671    return "NVPTXISD::CallPrototype";
672  case NVPTXISD::ProxyReg:
673    return "NVPTXISD::ProxyReg";
674  case NVPTXISD::LoadV2:
675    return "NVPTXISD::LoadV2";
676  case NVPTXISD::LoadV4:
677    return "NVPTXISD::LoadV4";
678  case NVPTXISD::LDGV2:
679    return "NVPTXISD::LDGV2";
680  case NVPTXISD::LDGV4:
681    return "NVPTXISD::LDGV4";
682  case NVPTXISD::LDUV2:
683    return "NVPTXISD::LDUV2";
684  case NVPTXISD::LDUV4:
685    return "NVPTXISD::LDUV4";
686  case NVPTXISD::StoreV2:
687    return "NVPTXISD::StoreV2";
688  case NVPTXISD::StoreV4:
689    return "NVPTXISD::StoreV4";
690  case NVPTXISD::FUN_SHFL_CLAMP:
691    return "NVPTXISD::FUN_SHFL_CLAMP";
692  case NVPTXISD::FUN_SHFR_CLAMP:
693    return "NVPTXISD::FUN_SHFR_CLAMP";
694  case NVPTXISD::IMAD:
695    return "NVPTXISD::IMAD";
696  case NVPTXISD::SETP_F16X2:
697    return "NVPTXISD::SETP_F16X2";
698  case NVPTXISD::Dummy:
699    return "NVPTXISD::Dummy";
700  case NVPTXISD::MUL_WIDE_SIGNED:
701    return "NVPTXISD::MUL_WIDE_SIGNED";
702  case NVPTXISD::MUL_WIDE_UNSIGNED:
703    return "NVPTXISD::MUL_WIDE_UNSIGNED";
704  case NVPTXISD::Tex1DFloatS32:        return "NVPTXISD::Tex1DFloatS32";
705  case NVPTXISD::Tex1DFloatFloat:      return "NVPTXISD::Tex1DFloatFloat";
706  case NVPTXISD::Tex1DFloatFloatLevel:
707    return "NVPTXISD::Tex1DFloatFloatLevel";
708  case NVPTXISD::Tex1DFloatFloatGrad:
709    return "NVPTXISD::Tex1DFloatFloatGrad";
710  case NVPTXISD::Tex1DS32S32:          return "NVPTXISD::Tex1DS32S32";
711  case NVPTXISD::Tex1DS32Float:        return "NVPTXISD::Tex1DS32Float";
712  case NVPTXISD::Tex1DS32FloatLevel:
713    return "NVPTXISD::Tex1DS32FloatLevel";
714  case NVPTXISD::Tex1DS32FloatGrad:
715    return "NVPTXISD::Tex1DS32FloatGrad";
716  case NVPTXISD::Tex1DU32S32:          return "NVPTXISD::Tex1DU32S32";
717  case NVPTXISD::Tex1DU32Float:        return "NVPTXISD::Tex1DU32Float";
718  case NVPTXISD::Tex1DU32FloatLevel:
719    return "NVPTXISD::Tex1DU32FloatLevel";
720  case NVPTXISD::Tex1DU32FloatGrad:
721    return "NVPTXISD::Tex1DU32FloatGrad";
722  case NVPTXISD::Tex1DArrayFloatS32:   return "NVPTXISD::Tex1DArrayFloatS32";
723  case NVPTXISD::Tex1DArrayFloatFloat: return "NVPTXISD::Tex1DArrayFloatFloat";
724  case NVPTXISD::Tex1DArrayFloatFloatLevel:
725    return "NVPTXISD::Tex1DArrayFloatFloatLevel";
726  case NVPTXISD::Tex1DArrayFloatFloatGrad:
727    return "NVPTXISD::Tex1DArrayFloatFloatGrad";
728  case NVPTXISD::Tex1DArrayS32S32:     return "NVPTXISD::Tex1DArrayS32S32";
729  case NVPTXISD::Tex1DArrayS32Float:   return "NVPTXISD::Tex1DArrayS32Float";
730  case NVPTXISD::Tex1DArrayS32FloatLevel:
731    return "NVPTXISD::Tex1DArrayS32FloatLevel";
732  case NVPTXISD::Tex1DArrayS32FloatGrad:
733    return "NVPTXISD::Tex1DArrayS32FloatGrad";
734  case NVPTXISD::Tex1DArrayU32S32:     return "NVPTXISD::Tex1DArrayU32S32";
735  case NVPTXISD::Tex1DArrayU32Float:   return "NVPTXISD::Tex1DArrayU32Float";
736  case NVPTXISD::Tex1DArrayU32FloatLevel:
737    return "NVPTXISD::Tex1DArrayU32FloatLevel";
738  case NVPTXISD::Tex1DArrayU32FloatGrad:
739    return "NVPTXISD::Tex1DArrayU32FloatGrad";
740  case NVPTXISD::Tex2DFloatS32:        return "NVPTXISD::Tex2DFloatS32";
741  case NVPTXISD::Tex2DFloatFloat:      return "NVPTXISD::Tex2DFloatFloat";
742  case NVPTXISD::Tex2DFloatFloatLevel:
743    return "NVPTXISD::Tex2DFloatFloatLevel";
744  case NVPTXISD::Tex2DFloatFloatGrad:
745    return "NVPTXISD::Tex2DFloatFloatGrad";
746  case NVPTXISD::Tex2DS32S32:          return "NVPTXISD::Tex2DS32S32";
747  case NVPTXISD::Tex2DS32Float:        return "NVPTXISD::Tex2DS32Float";
748  case NVPTXISD::Tex2DS32FloatLevel:
749    return "NVPTXISD::Tex2DS32FloatLevel";
750  case NVPTXISD::Tex2DS32FloatGrad:
751    return "NVPTXISD::Tex2DS32FloatGrad";
752  case NVPTXISD::Tex2DU32S32:          return "NVPTXISD::Tex2DU32S32";
753  case NVPTXISD::Tex2DU32Float:        return "NVPTXISD::Tex2DU32Float";
754  case NVPTXISD::Tex2DU32FloatLevel:
755    return "NVPTXISD::Tex2DU32FloatLevel";
756  case NVPTXISD::Tex2DU32FloatGrad:
757    return "NVPTXISD::Tex2DU32FloatGrad";
758  case NVPTXISD::Tex2DArrayFloatS32:   return "NVPTXISD::Tex2DArrayFloatS32";
759  case NVPTXISD::Tex2DArrayFloatFloat: return "NVPTXISD::Tex2DArrayFloatFloat";
760  case NVPTXISD::Tex2DArrayFloatFloatLevel:
761    return "NVPTXISD::Tex2DArrayFloatFloatLevel";
762  case NVPTXISD::Tex2DArrayFloatFloatGrad:
763    return "NVPTXISD::Tex2DArrayFloatFloatGrad";
764  case NVPTXISD::Tex2DArrayS32S32:     return "NVPTXISD::Tex2DArrayS32S32";
765  case NVPTXISD::Tex2DArrayS32Float:   return "NVPTXISD::Tex2DArrayS32Float";
766  case NVPTXISD::Tex2DArrayS32FloatLevel:
767    return "NVPTXISD::Tex2DArrayS32FloatLevel";
768  case NVPTXISD::Tex2DArrayS32FloatGrad:
769    return "NVPTXISD::Tex2DArrayS32FloatGrad";
770  case NVPTXISD::Tex2DArrayU32S32:     return "NVPTXISD::Tex2DArrayU32S32";
771  case NVPTXISD::Tex2DArrayU32Float:   return "NVPTXISD::Tex2DArrayU32Float";
772  case NVPTXISD::Tex2DArrayU32FloatLevel:
773    return "NVPTXISD::Tex2DArrayU32FloatLevel";
774  case NVPTXISD::Tex2DArrayU32FloatGrad:
775    return "NVPTXISD::Tex2DArrayU32FloatGrad";
776  case NVPTXISD::Tex3DFloatS32:        return "NVPTXISD::Tex3DFloatS32";
777  case NVPTXISD::Tex3DFloatFloat:      return "NVPTXISD::Tex3DFloatFloat";
778  case NVPTXISD::Tex3DFloatFloatLevel:
779    return "NVPTXISD::Tex3DFloatFloatLevel";
780  case NVPTXISD::Tex3DFloatFloatGrad:
781    return "NVPTXISD::Tex3DFloatFloatGrad";
782  case NVPTXISD::Tex3DS32S32:          return "NVPTXISD::Tex3DS32S32";
783  case NVPTXISD::Tex3DS32Float:        return "NVPTXISD::Tex3DS32Float";
784  case NVPTXISD::Tex3DS32FloatLevel:
785    return "NVPTXISD::Tex3DS32FloatLevel";
786  case NVPTXISD::Tex3DS32FloatGrad:
787    return "NVPTXISD::Tex3DS32FloatGrad";
788  case NVPTXISD::Tex3DU32S32:          return "NVPTXISD::Tex3DU32S32";
789  case NVPTXISD::Tex3DU32Float:        return "NVPTXISD::Tex3DU32Float";
790  case NVPTXISD::Tex3DU32FloatLevel:
791    return "NVPTXISD::Tex3DU32FloatLevel";
792  case NVPTXISD::Tex3DU32FloatGrad:
793    return "NVPTXISD::Tex3DU32FloatGrad";
794  case NVPTXISD::TexCubeFloatFloat:      return "NVPTXISD::TexCubeFloatFloat";
795  case NVPTXISD::TexCubeFloatFloatLevel:
796    return "NVPTXISD::TexCubeFloatFloatLevel";
797  case NVPTXISD::TexCubeS32Float:        return "NVPTXISD::TexCubeS32Float";
798  case NVPTXISD::TexCubeS32FloatLevel:
799    return "NVPTXISD::TexCubeS32FloatLevel";
800  case NVPTXISD::TexCubeU32Float:        return "NVPTXISD::TexCubeU32Float";
801  case NVPTXISD::TexCubeU32FloatLevel:
802    return "NVPTXISD::TexCubeU32FloatLevel";
803  case NVPTXISD::TexCubeArrayFloatFloat:
804    return "NVPTXISD::TexCubeArrayFloatFloat";
805  case NVPTXISD::TexCubeArrayFloatFloatLevel:
806    return "NVPTXISD::TexCubeArrayFloatFloatLevel";
807  case NVPTXISD::TexCubeArrayS32Float:
808    return "NVPTXISD::TexCubeArrayS32Float";
809  case NVPTXISD::TexCubeArrayS32FloatLevel:
810    return "NVPTXISD::TexCubeArrayS32FloatLevel";
811  case NVPTXISD::TexCubeArrayU32Float:
812    return "NVPTXISD::TexCubeArrayU32Float";
813  case NVPTXISD::TexCubeArrayU32FloatLevel:
814    return "NVPTXISD::TexCubeArrayU32FloatLevel";
815  case NVPTXISD::Tld4R2DFloatFloat:
816    return "NVPTXISD::Tld4R2DFloatFloat";
817  case NVPTXISD::Tld4G2DFloatFloat:
818    return "NVPTXISD::Tld4G2DFloatFloat";
819  case NVPTXISD::Tld4B2DFloatFloat:
820    return "NVPTXISD::Tld4B2DFloatFloat";
821  case NVPTXISD::Tld4A2DFloatFloat:
822    return "NVPTXISD::Tld4A2DFloatFloat";
823  case NVPTXISD::Tld4R2DS64Float:
824    return "NVPTXISD::Tld4R2DS64Float";
825  case NVPTXISD::Tld4G2DS64Float:
826    return "NVPTXISD::Tld4G2DS64Float";
827  case NVPTXISD::Tld4B2DS64Float:
828    return "NVPTXISD::Tld4B2DS64Float";
829  case NVPTXISD::Tld4A2DS64Float:
830    return "NVPTXISD::Tld4A2DS64Float";
831  case NVPTXISD::Tld4R2DU64Float:
832    return "NVPTXISD::Tld4R2DU64Float";
833  case NVPTXISD::Tld4G2DU64Float:
834    return "NVPTXISD::Tld4G2DU64Float";
835  case NVPTXISD::Tld4B2DU64Float:
836    return "NVPTXISD::Tld4B2DU64Float";
837  case NVPTXISD::Tld4A2DU64Float:
838    return "NVPTXISD::Tld4A2DU64Float";
839
840  case NVPTXISD::TexUnified1DFloatS32:
841    return "NVPTXISD::TexUnified1DFloatS32";
842  case NVPTXISD::TexUnified1DFloatFloat:
843    return "NVPTXISD::TexUnified1DFloatFloat";
844  case NVPTXISD::TexUnified1DFloatFloatLevel:
845    return "NVPTXISD::TexUnified1DFloatFloatLevel";
846  case NVPTXISD::TexUnified1DFloatFloatGrad:
847    return "NVPTXISD::TexUnified1DFloatFloatGrad";
848  case NVPTXISD::TexUnified1DS32S32:
849    return "NVPTXISD::TexUnified1DS32S32";
850  case NVPTXISD::TexUnified1DS32Float:
851    return "NVPTXISD::TexUnified1DS32Float";
852  case NVPTXISD::TexUnified1DS32FloatLevel:
853    return "NVPTXISD::TexUnified1DS32FloatLevel";
854  case NVPTXISD::TexUnified1DS32FloatGrad:
855    return "NVPTXISD::TexUnified1DS32FloatGrad";
856  case NVPTXISD::TexUnified1DU32S32:
857    return "NVPTXISD::TexUnified1DU32S32";
858  case NVPTXISD::TexUnified1DU32Float:
859    return "NVPTXISD::TexUnified1DU32Float";
860  case NVPTXISD::TexUnified1DU32FloatLevel:
861    return "NVPTXISD::TexUnified1DU32FloatLevel";
862  case NVPTXISD::TexUnified1DU32FloatGrad:
863    return "NVPTXISD::TexUnified1DU32FloatGrad";
864  case NVPTXISD::TexUnified1DArrayFloatS32:
865    return "NVPTXISD::TexUnified1DArrayFloatS32";
866  case NVPTXISD::TexUnified1DArrayFloatFloat:
867    return "NVPTXISD::TexUnified1DArrayFloatFloat";
868  case NVPTXISD::TexUnified1DArrayFloatFloatLevel:
869    return "NVPTXISD::TexUnified1DArrayFloatFloatLevel";
870  case NVPTXISD::TexUnified1DArrayFloatFloatGrad:
871    return "NVPTXISD::TexUnified1DArrayFloatFloatGrad";
872  case NVPTXISD::TexUnified1DArrayS32S32:
873    return "NVPTXISD::TexUnified1DArrayS32S32";
874  case NVPTXISD::TexUnified1DArrayS32Float:
875    return "NVPTXISD::TexUnified1DArrayS32Float";
876  case NVPTXISD::TexUnified1DArrayS32FloatLevel:
877    return "NVPTXISD::TexUnified1DArrayS32FloatLevel";
878  case NVPTXISD::TexUnified1DArrayS32FloatGrad:
879    return "NVPTXISD::TexUnified1DArrayS32FloatGrad";
880  case NVPTXISD::TexUnified1DArrayU32S32:
881    return "NVPTXISD::TexUnified1DArrayU32S32";
882  case NVPTXISD::TexUnified1DArrayU32Float:
883    return "NVPTXISD::TexUnified1DArrayU32Float";
884  case NVPTXISD::TexUnified1DArrayU32FloatLevel:
885    return "NVPTXISD::TexUnified1DArrayU32FloatLevel";
886  case NVPTXISD::TexUnified1DArrayU32FloatGrad:
887    return "NVPTXISD::TexUnified1DArrayU32FloatGrad";
888  case NVPTXISD::TexUnified2DFloatS32:
889    return "NVPTXISD::TexUnified2DFloatS32";
890  case NVPTXISD::TexUnified2DFloatFloat:
891    return "NVPTXISD::TexUnified2DFloatFloat";
892  case NVPTXISD::TexUnified2DFloatFloatLevel:
893    return "NVPTXISD::TexUnified2DFloatFloatLevel";
894  case NVPTXISD::TexUnified2DFloatFloatGrad:
895    return "NVPTXISD::TexUnified2DFloatFloatGrad";
896  case NVPTXISD::TexUnified2DS32S32:
897    return "NVPTXISD::TexUnified2DS32S32";
898  case NVPTXISD::TexUnified2DS32Float:
899    return "NVPTXISD::TexUnified2DS32Float";
900  case NVPTXISD::TexUnified2DS32FloatLevel:
901    return "NVPTXISD::TexUnified2DS32FloatLevel";
902  case NVPTXISD::TexUnified2DS32FloatGrad:
903    return "NVPTXISD::TexUnified2DS32FloatGrad";
904  case NVPTXISD::TexUnified2DU32S32:
905    return "NVPTXISD::TexUnified2DU32S32";
906  case NVPTXISD::TexUnified2DU32Float:
907    return "NVPTXISD::TexUnified2DU32Float";
908  case NVPTXISD::TexUnified2DU32FloatLevel:
909    return "NVPTXISD::TexUnified2DU32FloatLevel";
910  case NVPTXISD::TexUnified2DU32FloatGrad:
911    return "NVPTXISD::TexUnified2DU32FloatGrad";
912  case NVPTXISD::TexUnified2DArrayFloatS32:
913    return "NVPTXISD::TexUnified2DArrayFloatS32";
914  case NVPTXISD::TexUnified2DArrayFloatFloat:
915    return "NVPTXISD::TexUnified2DArrayFloatFloat";
916  case NVPTXISD::TexUnified2DArrayFloatFloatLevel:
917    return "NVPTXISD::TexUnified2DArrayFloatFloatLevel";
918  case NVPTXISD::TexUnified2DArrayFloatFloatGrad:
919    return "NVPTXISD::TexUnified2DArrayFloatFloatGrad";
920  case NVPTXISD::TexUnified2DArrayS32S32:
921    return "NVPTXISD::TexUnified2DArrayS32S32";
922  case NVPTXISD::TexUnified2DArrayS32Float:
923    return "NVPTXISD::TexUnified2DArrayS32Float";
924  case NVPTXISD::TexUnified2DArrayS32FloatLevel:
925    return "NVPTXISD::TexUnified2DArrayS32FloatLevel";
926  case NVPTXISD::TexUnified2DArrayS32FloatGrad:
927    return "NVPTXISD::TexUnified2DArrayS32FloatGrad";
928  case NVPTXISD::TexUnified2DArrayU32S32:
929    return "NVPTXISD::TexUnified2DArrayU32S32";
930  case NVPTXISD::TexUnified2DArrayU32Float:
931    return "NVPTXISD::TexUnified2DArrayU32Float";
932  case NVPTXISD::TexUnified2DArrayU32FloatLevel:
933    return "NVPTXISD::TexUnified2DArrayU32FloatLevel";
934  case NVPTXISD::TexUnified2DArrayU32FloatGrad:
935    return "NVPTXISD::TexUnified2DArrayU32FloatGrad";
936  case NVPTXISD::TexUnified3DFloatS32:
937    return "NVPTXISD::TexUnified3DFloatS32";
938  case NVPTXISD::TexUnified3DFloatFloat:
939    return "NVPTXISD::TexUnified3DFloatFloat";
940  case NVPTXISD::TexUnified3DFloatFloatLevel:
941    return "NVPTXISD::TexUnified3DFloatFloatLevel";
942  case NVPTXISD::TexUnified3DFloatFloatGrad:
943    return "NVPTXISD::TexUnified3DFloatFloatGrad";
944  case NVPTXISD::TexUnified3DS32S32:
945    return "NVPTXISD::TexUnified3DS32S32";
946  case NVPTXISD::TexUnified3DS32Float:
947    return "NVPTXISD::TexUnified3DS32Float";
948  case NVPTXISD::TexUnified3DS32FloatLevel:
949    return "NVPTXISD::TexUnified3DS32FloatLevel";
950  case NVPTXISD::TexUnified3DS32FloatGrad:
951    return "NVPTXISD::TexUnified3DS32FloatGrad";
952  case NVPTXISD::TexUnified3DU32S32:
953    return "NVPTXISD::TexUnified3DU32S32";
954  case NVPTXISD::TexUnified3DU32Float:
955    return "NVPTXISD::TexUnified3DU32Float";
956  case NVPTXISD::TexUnified3DU32FloatLevel:
957    return "NVPTXISD::TexUnified3DU32FloatLevel";
958  case NVPTXISD::TexUnified3DU32FloatGrad:
959    return "NVPTXISD::TexUnified3DU32FloatGrad";
960  case NVPTXISD::TexUnifiedCubeFloatFloat:
961    return "NVPTXISD::TexUnifiedCubeFloatFloat";
962  case NVPTXISD::TexUnifiedCubeFloatFloatLevel:
963    return "NVPTXISD::TexUnifiedCubeFloatFloatLevel";
964  case NVPTXISD::TexUnifiedCubeS32Float:
965    return "NVPTXISD::TexUnifiedCubeS32Float";
966  case NVPTXISD::TexUnifiedCubeS32FloatLevel:
967    return "NVPTXISD::TexUnifiedCubeS32FloatLevel";
968  case NVPTXISD::TexUnifiedCubeU32Float:
969    return "NVPTXISD::TexUnifiedCubeU32Float";
970  case NVPTXISD::TexUnifiedCubeU32FloatLevel:
971    return "NVPTXISD::TexUnifiedCubeU32FloatLevel";
972  case NVPTXISD::TexUnifiedCubeArrayFloatFloat:
973    return "NVPTXISD::TexUnifiedCubeArrayFloatFloat";
974  case NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel:
975    return "NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel";
976  case NVPTXISD::TexUnifiedCubeArrayS32Float:
977    return "NVPTXISD::TexUnifiedCubeArrayS32Float";
978  case NVPTXISD::TexUnifiedCubeArrayS32FloatLevel:
979    return "NVPTXISD::TexUnifiedCubeArrayS32FloatLevel";
980  case NVPTXISD::TexUnifiedCubeArrayU32Float:
981    return "NVPTXISD::TexUnifiedCubeArrayU32Float";
982  case NVPTXISD::TexUnifiedCubeArrayU32FloatLevel:
983    return "NVPTXISD::TexUnifiedCubeArrayU32FloatLevel";
984  case NVPTXISD::Tld4UnifiedR2DFloatFloat:
985    return "NVPTXISD::Tld4UnifiedR2DFloatFloat";
986  case NVPTXISD::Tld4UnifiedG2DFloatFloat:
987    return "NVPTXISD::Tld4UnifiedG2DFloatFloat";
988  case NVPTXISD::Tld4UnifiedB2DFloatFloat:
989    return "NVPTXISD::Tld4UnifiedB2DFloatFloat";
990  case NVPTXISD::Tld4UnifiedA2DFloatFloat:
991    return "NVPTXISD::Tld4UnifiedA2DFloatFloat";
992  case NVPTXISD::Tld4UnifiedR2DS64Float:
993    return "NVPTXISD::Tld4UnifiedR2DS64Float";
994  case NVPTXISD::Tld4UnifiedG2DS64Float:
995    return "NVPTXISD::Tld4UnifiedG2DS64Float";
996  case NVPTXISD::Tld4UnifiedB2DS64Float:
997    return "NVPTXISD::Tld4UnifiedB2DS64Float";
998  case NVPTXISD::Tld4UnifiedA2DS64Float:
999    return "NVPTXISD::Tld4UnifiedA2DS64Float";
1000  case NVPTXISD::Tld4UnifiedR2DU64Float:
1001    return "NVPTXISD::Tld4UnifiedR2DU64Float";
1002  case NVPTXISD::Tld4UnifiedG2DU64Float:
1003    return "NVPTXISD::Tld4UnifiedG2DU64Float";
1004  case NVPTXISD::Tld4UnifiedB2DU64Float:
1005    return "NVPTXISD::Tld4UnifiedB2DU64Float";
1006  case NVPTXISD::Tld4UnifiedA2DU64Float:
1007    return "NVPTXISD::Tld4UnifiedA2DU64Float";
1008
1009  case NVPTXISD::Suld1DI8Clamp:          return "NVPTXISD::Suld1DI8Clamp";
1010  case NVPTXISD::Suld1DI16Clamp:         return "NVPTXISD::Suld1DI16Clamp";
1011  case NVPTXISD::Suld1DI32Clamp:         return "NVPTXISD::Suld1DI32Clamp";
1012  case NVPTXISD::Suld1DI64Clamp:         return "NVPTXISD::Suld1DI64Clamp";
1013  case NVPTXISD::Suld1DV2I8Clamp:        return "NVPTXISD::Suld1DV2I8Clamp";
1014  case NVPTXISD::Suld1DV2I16Clamp:       return "NVPTXISD::Suld1DV2I16Clamp";
1015  case NVPTXISD::Suld1DV2I32Clamp:       return "NVPTXISD::Suld1DV2I32Clamp";
1016  case NVPTXISD::Suld1DV2I64Clamp:       return "NVPTXISD::Suld1DV2I64Clamp";
1017  case NVPTXISD::Suld1DV4I8Clamp:        return "NVPTXISD::Suld1DV4I8Clamp";
1018  case NVPTXISD::Suld1DV4I16Clamp:       return "NVPTXISD::Suld1DV4I16Clamp";
1019  case NVPTXISD::Suld1DV4I32Clamp:       return "NVPTXISD::Suld1DV4I32Clamp";
1020
1021  case NVPTXISD::Suld1DArrayI8Clamp:   return "NVPTXISD::Suld1DArrayI8Clamp";
1022  case NVPTXISD::Suld1DArrayI16Clamp:  return "NVPTXISD::Suld1DArrayI16Clamp";
1023  case NVPTXISD::Suld1DArrayI32Clamp:  return "NVPTXISD::Suld1DArrayI32Clamp";
1024  case NVPTXISD::Suld1DArrayI64Clamp:  return "NVPTXISD::Suld1DArrayI64Clamp";
1025  case NVPTXISD::Suld1DArrayV2I8Clamp: return "NVPTXISD::Suld1DArrayV2I8Clamp";
1026  case NVPTXISD::Suld1DArrayV2I16Clamp:return "NVPTXISD::Suld1DArrayV2I16Clamp";
1027  case NVPTXISD::Suld1DArrayV2I32Clamp:return "NVPTXISD::Suld1DArrayV2I32Clamp";
1028  case NVPTXISD::Suld1DArrayV2I64Clamp:return "NVPTXISD::Suld1DArrayV2I64Clamp";
1029  case NVPTXISD::Suld1DArrayV4I8Clamp: return "NVPTXISD::Suld1DArrayV4I8Clamp";
1030  case NVPTXISD::Suld1DArrayV4I16Clamp:return "NVPTXISD::Suld1DArrayV4I16Clamp";
1031  case NVPTXISD::Suld1DArrayV4I32Clamp:return "NVPTXISD::Suld1DArrayV4I32Clamp";
1032
1033  case NVPTXISD::Suld2DI8Clamp:          return "NVPTXISD::Suld2DI8Clamp";
1034  case NVPTXISD::Suld2DI16Clamp:         return "NVPTXISD::Suld2DI16Clamp";
1035  case NVPTXISD::Suld2DI32Clamp:         return "NVPTXISD::Suld2DI32Clamp";
1036  case NVPTXISD::Suld2DI64Clamp:         return "NVPTXISD::Suld2DI64Clamp";
1037  case NVPTXISD::Suld2DV2I8Clamp:        return "NVPTXISD::Suld2DV2I8Clamp";
1038  case NVPTXISD::Suld2DV2I16Clamp:       return "NVPTXISD::Suld2DV2I16Clamp";
1039  case NVPTXISD::Suld2DV2I32Clamp:       return "NVPTXISD::Suld2DV2I32Clamp";
1040  case NVPTXISD::Suld2DV2I64Clamp:       return "NVPTXISD::Suld2DV2I64Clamp";
1041  case NVPTXISD::Suld2DV4I8Clamp:        return "NVPTXISD::Suld2DV4I8Clamp";
1042  case NVPTXISD::Suld2DV4I16Clamp:       return "NVPTXISD::Suld2DV4I16Clamp";
1043  case NVPTXISD::Suld2DV4I32Clamp:       return "NVPTXISD::Suld2DV4I32Clamp";
1044
1045  case NVPTXISD::Suld2DArrayI8Clamp:   return "NVPTXISD::Suld2DArrayI8Clamp";
1046  case NVPTXISD::Suld2DArrayI16Clamp:  return "NVPTXISD::Suld2DArrayI16Clamp";
1047  case NVPTXISD::Suld2DArrayI32Clamp:  return "NVPTXISD::Suld2DArrayI32Clamp";
1048  case NVPTXISD::Suld2DArrayI64Clamp:  return "NVPTXISD::Suld2DArrayI64Clamp";
1049  case NVPTXISD::Suld2DArrayV2I8Clamp: return "NVPTXISD::Suld2DArrayV2I8Clamp";
1050  case NVPTXISD::Suld2DArrayV2I16Clamp:return "NVPTXISD::Suld2DArrayV2I16Clamp";
1051  case NVPTXISD::Suld2DArrayV2I32Clamp:return "NVPTXISD::Suld2DArrayV2I32Clamp";
1052  case NVPTXISD::Suld2DArrayV2I64Clamp:return "NVPTXISD::Suld2DArrayV2I64Clamp";
1053  case NVPTXISD::Suld2DArrayV4I8Clamp: return "NVPTXISD::Suld2DArrayV4I8Clamp";
1054  case NVPTXISD::Suld2DArrayV4I16Clamp:return "NVPTXISD::Suld2DArrayV4I16Clamp";
1055  case NVPTXISD::Suld2DArrayV4I32Clamp:return "NVPTXISD::Suld2DArrayV4I32Clamp";
1056
1057  case NVPTXISD::Suld3DI8Clamp:          return "NVPTXISD::Suld3DI8Clamp";
1058  case NVPTXISD::Suld3DI16Clamp:         return "NVPTXISD::Suld3DI16Clamp";
1059  case NVPTXISD::Suld3DI32Clamp:         return "NVPTXISD::Suld3DI32Clamp";
1060  case NVPTXISD::Suld3DI64Clamp:         return "NVPTXISD::Suld3DI64Clamp";
1061  case NVPTXISD::Suld3DV2I8Clamp:        return "NVPTXISD::Suld3DV2I8Clamp";
1062  case NVPTXISD::Suld3DV2I16Clamp:       return "NVPTXISD::Suld3DV2I16Clamp";
1063  case NVPTXISD::Suld3DV2I32Clamp:       return "NVPTXISD::Suld3DV2I32Clamp";
1064  case NVPTXISD::Suld3DV2I64Clamp:       return "NVPTXISD::Suld3DV2I64Clamp";
1065  case NVPTXISD::Suld3DV4I8Clamp:        return "NVPTXISD::Suld3DV4I8Clamp";
1066  case NVPTXISD::Suld3DV4I16Clamp:       return "NVPTXISD::Suld3DV4I16Clamp";
1067  case NVPTXISD::Suld3DV4I32Clamp:       return "NVPTXISD::Suld3DV4I32Clamp";
1068
1069  case NVPTXISD::Suld1DI8Trap:          return "NVPTXISD::Suld1DI8Trap";
1070  case NVPTXISD::Suld1DI16Trap:         return "NVPTXISD::Suld1DI16Trap";
1071  case NVPTXISD::Suld1DI32Trap:         return "NVPTXISD::Suld1DI32Trap";
1072  case NVPTXISD::Suld1DI64Trap:         return "NVPTXISD::Suld1DI64Trap";
1073  case NVPTXISD::Suld1DV2I8Trap:        return "NVPTXISD::Suld1DV2I8Trap";
1074  case NVPTXISD::Suld1DV2I16Trap:       return "NVPTXISD::Suld1DV2I16Trap";
1075  case NVPTXISD::Suld1DV2I32Trap:       return "NVPTXISD::Suld1DV2I32Trap";
1076  case NVPTXISD::Suld1DV2I64Trap:       return "NVPTXISD::Suld1DV2I64Trap";
1077  case NVPTXISD::Suld1DV4I8Trap:        return "NVPTXISD::Suld1DV4I8Trap";
1078  case NVPTXISD::Suld1DV4I16Trap:       return "NVPTXISD::Suld1DV4I16Trap";
1079  case NVPTXISD::Suld1DV4I32Trap:       return "NVPTXISD::Suld1DV4I32Trap";
1080
1081  case NVPTXISD::Suld1DArrayI8Trap:     return "NVPTXISD::Suld1DArrayI8Trap";
1082  case NVPTXISD::Suld1DArrayI16Trap:    return "NVPTXISD::Suld1DArrayI16Trap";
1083  case NVPTXISD::Suld1DArrayI32Trap:    return "NVPTXISD::Suld1DArrayI32Trap";
1084  case NVPTXISD::Suld1DArrayI64Trap:    return "NVPTXISD::Suld1DArrayI64Trap";
1085  case NVPTXISD::Suld1DArrayV2I8Trap:   return "NVPTXISD::Suld1DArrayV2I8Trap";
1086  case NVPTXISD::Suld1DArrayV2I16Trap:  return "NVPTXISD::Suld1DArrayV2I16Trap";
1087  case NVPTXISD::Suld1DArrayV2I32Trap:  return "NVPTXISD::Suld1DArrayV2I32Trap";
1088  case NVPTXISD::Suld1DArrayV2I64Trap:  return "NVPTXISD::Suld1DArrayV2I64Trap";
1089  case NVPTXISD::Suld1DArrayV4I8Trap:   return "NVPTXISD::Suld1DArrayV4I8Trap";
1090  case NVPTXISD::Suld1DArrayV4I16Trap:  return "NVPTXISD::Suld1DArrayV4I16Trap";
1091  case NVPTXISD::Suld1DArrayV4I32Trap:  return "NVPTXISD::Suld1DArrayV4I32Trap";
1092
1093  case NVPTXISD::Suld2DI8Trap:          return "NVPTXISD::Suld2DI8Trap";
1094  case NVPTXISD::Suld2DI16Trap:         return "NVPTXISD::Suld2DI16Trap";
1095  case NVPTXISD::Suld2DI32Trap:         return "NVPTXISD::Suld2DI32Trap";
1096  case NVPTXISD::Suld2DI64Trap:         return "NVPTXISD::Suld2DI64Trap";
1097  case NVPTXISD::Suld2DV2I8Trap:        return "NVPTXISD::Suld2DV2I8Trap";
1098  case NVPTXISD::Suld2DV2I16Trap:       return "NVPTXISD::Suld2DV2I16Trap";
1099  case NVPTXISD::Suld2DV2I32Trap:       return "NVPTXISD::Suld2DV2I32Trap";
1100  case NVPTXISD::Suld2DV2I64Trap:       return "NVPTXISD::Suld2DV2I64Trap";
1101  case NVPTXISD::Suld2DV4I8Trap:        return "NVPTXISD::Suld2DV4I8Trap";
1102  case NVPTXISD::Suld2DV4I16Trap:       return "NVPTXISD::Suld2DV4I16Trap";
1103  case NVPTXISD::Suld2DV4I32Trap:       return "NVPTXISD::Suld2DV4I32Trap";
1104
1105  case NVPTXISD::Suld2DArrayI8Trap:     return "NVPTXISD::Suld2DArrayI8Trap";
1106  case NVPTXISD::Suld2DArrayI16Trap:    return "NVPTXISD::Suld2DArrayI16Trap";
1107  case NVPTXISD::Suld2DArrayI32Trap:    return "NVPTXISD::Suld2DArrayI32Trap";
1108  case NVPTXISD::Suld2DArrayI64Trap:    return "NVPTXISD::Suld2DArrayI64Trap";
1109  case NVPTXISD::Suld2DArrayV2I8Trap:   return "NVPTXISD::Suld2DArrayV2I8Trap";
1110  case NVPTXISD::Suld2DArrayV2I16Trap:  return "NVPTXISD::Suld2DArrayV2I16Trap";
1111  case NVPTXISD::Suld2DArrayV2I32Trap:  return "NVPTXISD::Suld2DArrayV2I32Trap";
1112  case NVPTXISD::Suld2DArrayV2I64Trap:  return "NVPTXISD::Suld2DArrayV2I64Trap";
1113  case NVPTXISD::Suld2DArrayV4I8Trap:   return "NVPTXISD::Suld2DArrayV4I8Trap";
1114  case NVPTXISD::Suld2DArrayV4I16Trap:  return "NVPTXISD::Suld2DArrayV4I16Trap";
1115  case NVPTXISD::Suld2DArrayV4I32Trap:  return "NVPTXISD::Suld2DArrayV4I32Trap";
1116
1117  case NVPTXISD::Suld3DI8Trap:          return "NVPTXISD::Suld3DI8Trap";
1118  case NVPTXISD::Suld3DI16Trap:         return "NVPTXISD::Suld3DI16Trap";
1119  case NVPTXISD::Suld3DI32Trap:         return "NVPTXISD::Suld3DI32Trap";
1120  case NVPTXISD::Suld3DI64Trap:         return "NVPTXISD::Suld3DI64Trap";
1121  case NVPTXISD::Suld3DV2I8Trap:        return "NVPTXISD::Suld3DV2I8Trap";
1122  case NVPTXISD::Suld3DV2I16Trap:       return "NVPTXISD::Suld3DV2I16Trap";
1123  case NVPTXISD::Suld3DV2I32Trap:       return "NVPTXISD::Suld3DV2I32Trap";
1124  case NVPTXISD::Suld3DV2I64Trap:       return "NVPTXISD::Suld3DV2I64Trap";
1125  case NVPTXISD::Suld3DV4I8Trap:        return "NVPTXISD::Suld3DV4I8Trap";
1126  case NVPTXISD::Suld3DV4I16Trap:       return "NVPTXISD::Suld3DV4I16Trap";
1127  case NVPTXISD::Suld3DV4I32Trap:       return "NVPTXISD::Suld3DV4I32Trap";
1128
1129  case NVPTXISD::Suld1DI8Zero:          return "NVPTXISD::Suld1DI8Zero";
1130  case NVPTXISD::Suld1DI16Zero:         return "NVPTXISD::Suld1DI16Zero";
1131  case NVPTXISD::Suld1DI32Zero:         return "NVPTXISD::Suld1DI32Zero";
1132  case NVPTXISD::Suld1DI64Zero:         return "NVPTXISD::Suld1DI64Zero";
1133  case NVPTXISD::Suld1DV2I8Zero:        return "NVPTXISD::Suld1DV2I8Zero";
1134  case NVPTXISD::Suld1DV2I16Zero:       return "NVPTXISD::Suld1DV2I16Zero";
1135  case NVPTXISD::Suld1DV2I32Zero:       return "NVPTXISD::Suld1DV2I32Zero";
1136  case NVPTXISD::Suld1DV2I64Zero:       return "NVPTXISD::Suld1DV2I64Zero";
1137  case NVPTXISD::Suld1DV4I8Zero:        return "NVPTXISD::Suld1DV4I8Zero";
1138  case NVPTXISD::Suld1DV4I16Zero:       return "NVPTXISD::Suld1DV4I16Zero";
1139  case NVPTXISD::Suld1DV4I32Zero:       return "NVPTXISD::Suld1DV4I32Zero";
1140
1141  case NVPTXISD::Suld1DArrayI8Zero:     return "NVPTXISD::Suld1DArrayI8Zero";
1142  case NVPTXISD::Suld1DArrayI16Zero:    return "NVPTXISD::Suld1DArrayI16Zero";
1143  case NVPTXISD::Suld1DArrayI32Zero:    return "NVPTXISD::Suld1DArrayI32Zero";
1144  case NVPTXISD::Suld1DArrayI64Zero:    return "NVPTXISD::Suld1DArrayI64Zero";
1145  case NVPTXISD::Suld1DArrayV2I8Zero:   return "NVPTXISD::Suld1DArrayV2I8Zero";
1146  case NVPTXISD::Suld1DArrayV2I16Zero:  return "NVPTXISD::Suld1DArrayV2I16Zero";
1147  case NVPTXISD::Suld1DArrayV2I32Zero:  return "NVPTXISD::Suld1DArrayV2I32Zero";
1148  case NVPTXISD::Suld1DArrayV2I64Zero:  return "NVPTXISD::Suld1DArrayV2I64Zero";
1149  case NVPTXISD::Suld1DArrayV4I8Zero:   return "NVPTXISD::Suld1DArrayV4I8Zero";
1150  case NVPTXISD::Suld1DArrayV4I16Zero:  return "NVPTXISD::Suld1DArrayV4I16Zero";
1151  case NVPTXISD::Suld1DArrayV4I32Zero:  return "NVPTXISD::Suld1DArrayV4I32Zero";
1152
1153  case NVPTXISD::Suld2DI8Zero:          return "NVPTXISD::Suld2DI8Zero";
1154  case NVPTXISD::Suld2DI16Zero:         return "NVPTXISD::Suld2DI16Zero";
1155  case NVPTXISD::Suld2DI32Zero:         return "NVPTXISD::Suld2DI32Zero";
1156  case NVPTXISD::Suld2DI64Zero:         return "NVPTXISD::Suld2DI64Zero";
1157  case NVPTXISD::Suld2DV2I8Zero:        return "NVPTXISD::Suld2DV2I8Zero";
1158  case NVPTXISD::Suld2DV2I16Zero:       return "NVPTXISD::Suld2DV2I16Zero";
1159  case NVPTXISD::Suld2DV2I32Zero:       return "NVPTXISD::Suld2DV2I32Zero";
1160  case NVPTXISD::Suld2DV2I64Zero:       return "NVPTXISD::Suld2DV2I64Zero";
1161  case NVPTXISD::Suld2DV4I8Zero:        return "NVPTXISD::Suld2DV4I8Zero";
1162  case NVPTXISD::Suld2DV4I16Zero:       return "NVPTXISD::Suld2DV4I16Zero";
1163  case NVPTXISD::Suld2DV4I32Zero:       return "NVPTXISD::Suld2DV4I32Zero";
1164
1165  case NVPTXISD::Suld2DArrayI8Zero:     return "NVPTXISD::Suld2DArrayI8Zero";
1166  case NVPTXISD::Suld2DArrayI16Zero:    return "NVPTXISD::Suld2DArrayI16Zero";
1167  case NVPTXISD::Suld2DArrayI32Zero:    return "NVPTXISD::Suld2DArrayI32Zero";
1168  case NVPTXISD::Suld2DArrayI64Zero:    return "NVPTXISD::Suld2DArrayI64Zero";
1169  case NVPTXISD::Suld2DArrayV2I8Zero:   return "NVPTXISD::Suld2DArrayV2I8Zero";
1170  case NVPTXISD::Suld2DArrayV2I16Zero:  return "NVPTXISD::Suld2DArrayV2I16Zero";
1171  case NVPTXISD::Suld2DArrayV2I32Zero:  return "NVPTXISD::Suld2DArrayV2I32Zero";
1172  case NVPTXISD::Suld2DArrayV2I64Zero:  return "NVPTXISD::Suld2DArrayV2I64Zero";
1173  case NVPTXISD::Suld2DArrayV4I8Zero:   return "NVPTXISD::Suld2DArrayV4I8Zero";
1174  case NVPTXISD::Suld2DArrayV4I16Zero:  return "NVPTXISD::Suld2DArrayV4I16Zero";
1175  case NVPTXISD::Suld2DArrayV4I32Zero:  return "NVPTXISD::Suld2DArrayV4I32Zero";
1176
1177  case NVPTXISD::Suld3DI8Zero:          return "NVPTXISD::Suld3DI8Zero";
1178  case NVPTXISD::Suld3DI16Zero:         return "NVPTXISD::Suld3DI16Zero";
1179  case NVPTXISD::Suld3DI32Zero:         return "NVPTXISD::Suld3DI32Zero";
1180  case NVPTXISD::Suld3DI64Zero:         return "NVPTXISD::Suld3DI64Zero";
1181  case NVPTXISD::Suld3DV2I8Zero:        return "NVPTXISD::Suld3DV2I8Zero";
1182  case NVPTXISD::Suld3DV2I16Zero:       return "NVPTXISD::Suld3DV2I16Zero";
1183  case NVPTXISD::Suld3DV2I32Zero:       return "NVPTXISD::Suld3DV2I32Zero";
1184  case NVPTXISD::Suld3DV2I64Zero:       return "NVPTXISD::Suld3DV2I64Zero";
1185  case NVPTXISD::Suld3DV4I8Zero:        return "NVPTXISD::Suld3DV4I8Zero";
1186  case NVPTXISD::Suld3DV4I16Zero:       return "NVPTXISD::Suld3DV4I16Zero";
1187  case NVPTXISD::Suld3DV4I32Zero:       return "NVPTXISD::Suld3DV4I32Zero";
1188  }
1189  return nullptr;
1190}
1191
1192TargetLoweringBase::LegalizeTypeAction
1193NVPTXTargetLowering::getPreferredVectorAction(MVT VT) const {
1194  if (VT.getVectorNumElements() != 1 && VT.getScalarType() == MVT::i1)
1195    return TypeSplitVector;
1196  if (VT == MVT::v2f16)
1197    return TypeLegal;
1198  return TargetLoweringBase::getPreferredVectorAction(VT);
1199}
1200
1201SDValue NVPTXTargetLowering::getSqrtEstimate(SDValue Operand, SelectionDAG &DAG,
1202                                             int Enabled, int &ExtraSteps,
1203                                             bool &UseOneConst,
1204                                             bool Reciprocal) const {
1205  if (!(Enabled == ReciprocalEstimate::Enabled ||
1206        (Enabled == ReciprocalEstimate::Unspecified && !usePrecSqrtF32())))
1207    return SDValue();
1208
1209  if (ExtraSteps == ReciprocalEstimate::Unspecified)
1210    ExtraSteps = 0;
1211
1212  SDLoc DL(Operand);
1213  EVT VT = Operand.getValueType();
1214  bool Ftz = useF32FTZ(DAG.getMachineFunction());
1215
1216  auto MakeIntrinsicCall = [&](Intrinsic::ID IID) {
1217    return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
1218                       DAG.getConstant(IID, DL, MVT::i32), Operand);
1219  };
1220
1221  // The sqrt and rsqrt refinement processes assume we always start out with an
1222  // approximation of the rsqrt.  Therefore, if we're going to do any refinement
1223  // (i.e. ExtraSteps > 0), we must return an rsqrt.  But if we're *not* doing
1224  // any refinement, we must return a regular sqrt.
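  // Any refinement steps themselves (Newton-Raphson iterations, as requested
  // via ExtraSteps) are applied by the generic DAG combiner to the estimate
  // returned here.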
1225  if (Reciprocal || ExtraSteps > 0) {
1226    if (VT == MVT::f32)
1227      return MakeIntrinsicCall(Ftz ? Intrinsic::nvvm_rsqrt_approx_ftz_f
1228                                   : Intrinsic::nvvm_rsqrt_approx_f);
1229    else if (VT == MVT::f64)
1230      return MakeIntrinsicCall(Intrinsic::nvvm_rsqrt_approx_d);
1231    else
1232      return SDValue();
1233  } else {
1234    if (VT == MVT::f32)
1235      return MakeIntrinsicCall(Ftz ? Intrinsic::nvvm_sqrt_approx_ftz_f
1236                                   : Intrinsic::nvvm_sqrt_approx_f);
1237    else {
1238      // There's no sqrt.approx.f64 instruction, so we emit
1239      // reciprocal(rsqrt(x)).  This is faster than
1240      // select(x == 0, 0, x * rsqrt(x)).  (In fact, it's faster than plain
1241      // x * rsqrt(x).)
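      // The emitted PTX is then roughly rsqrt.approx.f64 followed by
      // rcp.approx.ftz.f64.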
1242      return DAG.getNode(
1243          ISD::INTRINSIC_WO_CHAIN, DL, VT,
1244          DAG.getConstant(Intrinsic::nvvm_rcp_approx_ftz_d, DL, MVT::i32),
1245          MakeIntrinsicCall(Intrinsic::nvvm_rsqrt_approx_d));
1246    }
1247  }
1248}
1249
1250SDValue
1251NVPTXTargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
1252  SDLoc dl(Op);
1253  const GlobalAddressSDNode *GAN = cast<GlobalAddressSDNode>(Op);
1254  auto PtrVT = getPointerTy(DAG.getDataLayout(), GAN->getAddressSpace());
1255  Op = DAG.getTargetGlobalAddress(GAN->getGlobal(), dl, PtrVT);
1256  return DAG.getNode(NVPTXISD::Wrapper, dl, PtrVT, Op);
1257}
1258
1259std::string NVPTXTargetLowering::getPrototype(
1260    const DataLayout &DL, Type *retTy, const ArgListTy &Args,
1261    const SmallVectorImpl<ISD::OutputArg> &Outs, unsigned retAlignment,
1262    ImmutableCallSite CS) const {
1263  auto PtrVT = getPointerTy(DL);
1264
1265  bool isABI = (STI.getSmVersion() >= 20);
1266  assert(isABI && "Non-ABI compilation is not supported");
1267  if (!isABI)
1268    return "";
1269
1270  std::stringstream O;
1271  O << "prototype_" << uniqueCallSite << " : .callprototype ";
1272
1273  if (retTy->getTypeID() == Type::VoidTyID) {
1274    O << "()";
1275  } else {
1276    O << "(";
1277    if (retTy->isFloatingPointTy() || (retTy->isIntegerTy() && !retTy->isIntegerTy(128))) {
1278      unsigned size = 0;
1279      if (auto *ITy = dyn_cast<IntegerType>(retTy)) {
1280        size = ITy->getBitWidth();
1281      } else {
1282        assert(retTy->isFloatingPointTy() &&
1283               "Floating point type expected here");
1284        size = retTy->getPrimitiveSizeInBits();
1285      }
1286      // PTX ABI requires all scalar return values to be at least 32
1287      // bits in size.  fp16 normally uses .b16 as its storage type in
1288      // PTX, so its size must be adjusted here, too.
1289      if (size < 32)
1290        size = 32;
1291
1292      O << ".param .b" << size << " _";
1293    } else if (isa<PointerType>(retTy)) {
1294      O << ".param .b" << PtrVT.getSizeInBits() << " _";
1295    } else if (retTy->isAggregateType() || retTy->isVectorTy() ||
1296               retTy->isIntegerTy(128)) {
1297      O << ".param .align " << retAlignment << " .b8 _["
1298        << DL.getTypeAllocSize(retTy) << "]";
1299    } else {
1300      llvm_unreachable("Unknown return type");
1301    }
1302    O << ") ";
1303  }
1304  O << "_ (";
1305
1306  bool first = true;
1307
1308  unsigned OIdx = 0;
1309  for (unsigned i = 0, e = Args.size(); i != e; ++i, ++OIdx) {
1310    Type *Ty = Args[i].Ty;
1311    if (!first) {
1312      O << ", ";
1313    }
1314    first = false;
1315
1316    if (!Outs[OIdx].Flags.isByVal()) {
1317      if (Ty->isAggregateType() || Ty->isVectorTy() || Ty->isIntegerTy(128)) {
1318        unsigned align = 0;
1319        const CallInst *CallI = cast<CallInst>(CS.getInstruction());
1320        // +1 because index 0 is reserved for return type alignment
1321        if (!getAlign(*CallI, i + 1, align))
1322          align = DL.getABITypeAlignment(Ty);
1323        unsigned sz = DL.getTypeAllocSize(Ty);
1324        O << ".param .align " << align << " .b8 ";
1325        O << "_";
1326        O << "[" << sz << "]";
1327        // update the index for Outs
1328        SmallVector<EVT, 16> vtparts;
1329        ComputeValueVTs(*this, DL, Ty, vtparts);
1330        if (unsigned len = vtparts.size())
1331          OIdx += len - 1;
1332        continue;
1333      }
1334      // i8 types in IR will be i16 types in SDAG
1335      assert((getValueType(DL, Ty) == Outs[OIdx].VT ||
1336              (getValueType(DL, Ty) == MVT::i8 && Outs[OIdx].VT == MVT::i16)) &&
1337             "type mismatch between callee prototype and arguments");
1338      // scalar type
1339      unsigned sz = 0;
1340      if (isa<IntegerType>(Ty)) {
1341        sz = cast<IntegerType>(Ty)->getBitWidth();
1342        if (sz < 32)
1343          sz = 32;
1344      } else if (isa<PointerType>(Ty)) {
1345        sz = PtrVT.getSizeInBits();
1346      } else if (Ty->isHalfTy())
1347        // PTX ABI requires all scalar parameters to be at least 32
1348        // bits in size.  fp16 normally uses .b16 as its storage type
1349        // in PTX, so its size must be adjusted here, too.
1350        sz = 32;
1351      else
1352        sz = Ty->getPrimitiveSizeInBits();
1353      O << ".param .b" << sz << " ";
1354      O << "_";
1355      continue;
1356    }
1357    auto *PTy = dyn_cast<PointerType>(Ty);
1358    assert(PTy && "Param with byval attribute should be a pointer type");
1359    Type *ETy = PTy->getElementType();
1360
1361    unsigned align = Outs[OIdx].Flags.getByValAlign();
1362    unsigned sz = DL.getTypeAllocSize(ETy);
1363    O << ".param .align " << align << " .b8 ";
1364    O << "_";
1365    O << "[" << sz << "]";
1366  }
1367  O << ");";
1368  return O.str();
1369}
1370
1371unsigned NVPTXTargetLowering::getArgumentAlignment(SDValue Callee,
1372                                                   ImmutableCallSite CS,
1373                                                   Type *Ty, unsigned Idx,
1374                                                   const DataLayout &DL) const {
1375  if (!CS) {
1376    // CallSite is zero, fall back to ABI type alignment
1377    return DL.getABITypeAlignment(Ty);
1378  }
1379
1380  unsigned Align = 0;
1381  const Value *DirectCallee = CS.getCalledFunction();
1382
1383  if (!DirectCallee) {
1384    // We don't have a direct function symbol, but that may be because of
1385    // constant cast instructions in the call.
1386    const Instruction *CalleeI = CS.getInstruction();
1387    assert(CalleeI && "Call target is not a function or derived value?");
1388
1389    // With bitcast'd call targets, the instruction will be the call
1390    if (isa<CallInst>(CalleeI)) {
1391      // Check if we have call alignment metadata
1392      if (getAlign(*cast<CallInst>(CalleeI), Idx, Align))
1393        return Align;
1394
1395      const Value *CalleeV = cast<CallInst>(CalleeI)->getCalledValue();
1396      // Ignore any bitcast instructions
1397      while (isa<ConstantExpr>(CalleeV)) {
1398        const ConstantExpr *CE = cast<ConstantExpr>(CalleeV);
1399        if (!CE->isCast())
1400          break;
1401        // Look through the bitcast
1402        CalleeV = cast<ConstantExpr>(CalleeV)->getOperand(0);
1403      }
1404
1405      // We have now looked past all of the bitcasts.  Do we finally have a
1406      // Function?
1407      if (isa<Function>(CalleeV))
1408        DirectCallee = CalleeV;
1409    }
1410  }
1411
1412  // Check for function alignment information if we found that the
1413  // ultimate target is a Function
1414  if (DirectCallee)
1415    if (getAlign(*cast<Function>(DirectCallee), Idx, Align))
1416      return Align;
1417
1418  // Call is indirect or alignment information is not available, fall back to
1419  // the ABI type alignment
1420  return DL.getABITypeAlignment(Ty);
1421}
1422
1423SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
1424                                       SmallVectorImpl<SDValue> &InVals) const {
1425  SelectionDAG &DAG = CLI.DAG;
1426  SDLoc dl = CLI.DL;
1427  SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
1428  SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
1429  SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
1430  SDValue Chain = CLI.Chain;
1431  SDValue Callee = CLI.Callee;
1432  bool &isTailCall = CLI.IsTailCall;
1433  ArgListTy &Args = CLI.getArgs();
1434  Type *RetTy = CLI.RetTy;
1435  ImmutableCallSite CS = CLI.CS;
1436  const DataLayout &DL = DAG.getDataLayout();
1437
1438  bool isABI = (STI.getSmVersion() >= 20);
1439  assert(isABI && "Non-ABI compilation is not supported");
1440  if (!isABI)
1441    return Chain;
1442
1443  SDValue tempChain = Chain;
1444  Chain = DAG.getCALLSEQ_START(Chain, uniqueCallSite, 0, dl);
1445  SDValue InFlag = Chain.getValue(1);
1446
1447  unsigned paramCount = 0;
1448  // Args.size() and Outs.size() need not match.
1449  // Outs.size() will be larger
1450  //   * if there is an aggregate argument with multiple fields (each field
1451  //     showing up separately in Outs)
1452  //   * if there is a vector argument with more than typical vector-length
1453  //     elements (generally if more than 4) where each vector element is
1454  //     individually present in Outs.
1455  // So a different index should be used for indexing into Outs/OutVals.
1456  // See similar issue in LowerFormalArguments.
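  // For example, an aggregate argument of type {i32, float} occupies a single
  // Args entry but two consecutive Outs/OutVals entries.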
1457  unsigned OIdx = 0;
1458  // Declare the .params or .reg needed to pass values
1459  // to the function
1460  for (unsigned i = 0, e = Args.size(); i != e; ++i, ++OIdx) {
1461    EVT VT = Outs[OIdx].VT;
1462    Type *Ty = Args[i].Ty;
1463
1464    if (!Outs[OIdx].Flags.isByVal()) {
1465      SmallVector<EVT, 16> VTs;
1466      SmallVector<uint64_t, 16> Offsets;
1467      ComputePTXValueVTs(*this, DL, Ty, VTs, &Offsets);
1468      unsigned ArgAlign =
1469          getArgumentAlignment(Callee, CS, Ty, paramCount + 1, DL);
1470      unsigned AllocSize = DL.getTypeAllocSize(Ty);
1471      SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1472      bool NeedAlign; // Does argument declaration specify alignment?
1473      if (Ty->isAggregateType() || Ty->isVectorTy() || Ty->isIntegerTy(128)) {
1474        // declare .param .align <align> .b8 .param<n>[<size>];
1475        SDValue DeclareParamOps[] = {
1476            Chain, DAG.getConstant(ArgAlign, dl, MVT::i32),
1477            DAG.getConstant(paramCount, dl, MVT::i32),
1478            DAG.getConstant(AllocSize, dl, MVT::i32), InFlag};
1479        Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs,
1480                            DeclareParamOps);
1481        NeedAlign = true;
1482      } else {
1483        // declare .param .b<size> .param<n>;
1484        if ((VT.isInteger() || VT.isFloatingPoint()) && AllocSize < 4) {
1485          // PTX ABI requires integral types to be at least 32 bits in
1486          // size. FP16 is loaded/stored using i16, so it's handled
1487          // here as well.
1488          AllocSize = 4;
1489        }
1490        SDValue DeclareScalarParamOps[] = {
1491            Chain, DAG.getConstant(paramCount, dl, MVT::i32),
1492            DAG.getConstant(AllocSize * 8, dl, MVT::i32),
1493            DAG.getConstant(0, dl, MVT::i32), InFlag};
1494        Chain = DAG.getNode(NVPTXISD::DeclareScalarParam, dl, DeclareParamVTs,
1495                            DeclareScalarParamOps);
1496        NeedAlign = false;
1497      }
1498      InFlag = Chain.getValue(1);
1499
1500      // PTX Interoperability Guide 3.3(A): [Integer] Values shorter
1501      // than 32-bits are sign extended or zero extended, depending on
1502      // whether they are signed or unsigned types. This case applies
1503      // only to scalar parameters and not to aggregate values.
1504      bool ExtendIntegerParam =
1505          Ty->isIntegerTy() && DL.getTypeAllocSizeInBits(Ty) < 32;
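      // For example, an i8 argument is declared above as .param .b32 and the
      // value is sign- or zero-extended to 32 bits before being stored below.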
1506
1507      auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, ArgAlign);
1508      SmallVector<SDValue, 6> StoreOperands;
1509      for (unsigned j = 0, je = VTs.size(); j != je; ++j) {
1510        // New store.
1511        if (VectorInfo[j] & PVF_FIRST) {
1512          assert(StoreOperands.empty() && "Unfinished preceding store.");
1513          StoreOperands.push_back(Chain);
1514          StoreOperands.push_back(DAG.getConstant(paramCount, dl, MVT::i32));
1515          StoreOperands.push_back(DAG.getConstant(Offsets[j], dl, MVT::i32));
1516        }
1517
1518        EVT EltVT = VTs[j];
1519        SDValue StVal = OutVals[OIdx];
1520        if (ExtendIntegerParam) {
1521          assert(VTs.size() == 1 && "Scalar can't have multiple parts.");
1522          // zext/sext to i32
1523          StVal = DAG.getNode(Outs[OIdx].Flags.isSExt() ? ISD::SIGN_EXTEND
1524                                                        : ISD::ZERO_EXTEND,
1525                              dl, MVT::i32, StVal);
1526        } else if (EltVT.getSizeInBits() < 16) {
1527          // Use 16-bit registers for small stores as it's the
1528          // smallest general purpose register size supported by NVPTX.
1529          StVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, StVal);
1530        }
1531
1532        // Record the value to store.
1533        StoreOperands.push_back(StVal);
1534
1535        if (VectorInfo[j] & PVF_LAST) {
1536          unsigned NumElts = StoreOperands.size() - 3;
1537          NVPTXISD::NodeType Op;
1538          switch (NumElts) {
1539          case 1:
1540            Op = NVPTXISD::StoreParam;
1541            break;
1542          case 2:
1543            Op = NVPTXISD::StoreParamV2;
1544            break;
1545          case 4:
1546            Op = NVPTXISD::StoreParamV4;
1547            break;
1548          default:
1549            llvm_unreachable("Invalid vector info.");
1550          }
1551
1552          StoreOperands.push_back(InFlag);
1553
1554          // Adjust type of the store op if we've extended the scalar
1555          // parameter value.
1556          EVT TheStoreType = ExtendIntegerParam ? MVT::i32 : VTs[j];
1557          unsigned EltAlign =
1558              NeedAlign ? GreatestCommonDivisor64(ArgAlign, Offsets[j]) : 0;
1559
1560          Chain = DAG.getMemIntrinsicNode(
1561              Op, dl, DAG.getVTList(MVT::Other, MVT::Glue), StoreOperands,
1562              TheStoreType, MachinePointerInfo(), EltAlign,
1563              MachineMemOperand::MOStore);
1564          InFlag = Chain.getValue(1);
1565
1566          // Cleanup.
1567          StoreOperands.clear();
1568        }
1569        ++OIdx;
1570      }
1571      assert(StoreOperands.empty() && "Unfinished parameter store.");
1572      if (VTs.size() > 0)
1573        --OIdx;
1574      ++paramCount;
1575      continue;
1576    }
1577
1578    // ByVal arguments
1579    SmallVector<EVT, 16> VTs;
1580    SmallVector<uint64_t, 16> Offsets;
1581    auto *PTy = dyn_cast<PointerType>(Args[i].Ty);
1582    assert(PTy && "Type of a byval parameter should be pointer");
1583    ComputePTXValueVTs(*this, DL, PTy->getElementType(), VTs, &Offsets, 0);
1584
1585    // declare .param .align <align> .b8 .param<n>[<size>];
1586    unsigned sz = Outs[OIdx].Flags.getByValSize();
1587    SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1588    unsigned ArgAlign = Outs[OIdx].Flags.getByValAlign();
1589    // The ByValAlign in the Outs[OIdx].Flags is always set at this point,
1590    // so we don't need to worry about natural alignment or not.
1591    // See TargetLowering::LowerCallTo().
1592
1593    // Enforce minimum alignment of 4 to work around ptxas miscompile
1594    // for sm_50+. See corresponding alignment adjustment in
1595    // emitFunctionParamList() for details.
1596    if (ArgAlign < 4)
1597      ArgAlign = 4;
1598    SDValue DeclareParamOps[] = {Chain, DAG.getConstant(ArgAlign, dl, MVT::i32),
1599                                 DAG.getConstant(paramCount, dl, MVT::i32),
1600                                 DAG.getConstant(sz, dl, MVT::i32), InFlag};
1601    Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs,
1602                        DeclareParamOps);
1603    InFlag = Chain.getValue(1);
1604    for (unsigned j = 0, je = VTs.size(); j != je; ++j) {
1605      EVT elemtype = VTs[j];
1606      int curOffset = Offsets[j];
1607      unsigned PartAlign = GreatestCommonDivisor64(ArgAlign, curOffset);
1608      auto PtrVT = getPointerTy(DL);
1609      SDValue srcAddr = DAG.getNode(ISD::ADD, dl, PtrVT, OutVals[OIdx],
1610                                    DAG.getConstant(curOffset, dl, PtrVT));
1611      SDValue theVal = DAG.getLoad(elemtype, dl, tempChain, srcAddr,
1612                                   MachinePointerInfo(), PartAlign);
1613      if (elemtype.getSizeInBits() < 16) {
1614        theVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, theVal);
1615      }
1616      SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1617      SDValue CopyParamOps[] = { Chain,
1618                                 DAG.getConstant(paramCount, dl, MVT::i32),
1619                                 DAG.getConstant(curOffset, dl, MVT::i32),
1620                                 theVal, InFlag };
1621      Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreParam, dl, CopyParamVTs,
1622                                      CopyParamOps, elemtype,
1623                                      MachinePointerInfo(), /* Align */ 0,
1624                                      MachineMemOperand::MOStore);
1625
1626      InFlag = Chain.getValue(1);
1627    }
1628    ++paramCount;
1629  }
1630
1631  GlobalAddressSDNode *Func = dyn_cast<GlobalAddressSDNode>(Callee.getNode());
1632  unsigned retAlignment = 0;
1633
1634  // Handle Result
1635  if (Ins.size() > 0) {
1636    SmallVector<EVT, 16> resvtparts;
1637    ComputeValueVTs(*this, DL, RetTy, resvtparts);
1638
1639    // Declare
1640    //  .param .align 16 .b8 retval0[<size-in-bytes>], or
1641    //  .param .b<size-in-bits> retval0
1642    unsigned resultsz = DL.getTypeAllocSizeInBits(RetTy);
1643    // Emit ".param .b<size-in-bits> retval0" instead of byte arrays only for
1644    // these three types to match the logic in
1645    // NVPTXAsmPrinter::printReturnValStr and NVPTXTargetLowering::getPrototype.
1646    // Plus, this behavior is consistent with nvcc's.
1647    if (RetTy->isFloatingPointTy() || RetTy->isPointerTy() ||
1648        (RetTy->isIntegerTy() && !RetTy->isIntegerTy(128))) {
1649      // Scalar needs to be at least 32 bits wide
1650      if (resultsz < 32)
1651        resultsz = 32;
1652      SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1653      SDValue DeclareRetOps[] = { Chain, DAG.getConstant(1, dl, MVT::i32),
1654                                  DAG.getConstant(resultsz, dl, MVT::i32),
1655                                  DAG.getConstant(0, dl, MVT::i32), InFlag };
1656      Chain = DAG.getNode(NVPTXISD::DeclareRet, dl, DeclareRetVTs,
1657                          DeclareRetOps);
1658      InFlag = Chain.getValue(1);
1659    } else {
1660      retAlignment = getArgumentAlignment(Callee, CS, RetTy, 0, DL);
1661      SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1662      SDValue DeclareRetOps[] = { Chain,
1663                                  DAG.getConstant(retAlignment, dl, MVT::i32),
1664                                  DAG.getConstant(resultsz / 8, dl, MVT::i32),
1665                                  DAG.getConstant(0, dl, MVT::i32), InFlag };
1666      Chain = DAG.getNode(NVPTXISD::DeclareRetParam, dl, DeclareRetVTs,
1667                          DeclareRetOps);
1668      InFlag = Chain.getValue(1);
1669    }
1670  }
1671
1672  // Both indirect calls and libcalls have nullptr Func. In order to distinguish
1673  // between them we must rely on the call site value which is valid for
1674  // indirect calls but is always null for libcalls.
1675  bool isIndirectCall = !Func && CS;
1676
1677  if (isa<ExternalSymbolSDNode>(Callee)) {
1678    Function* CalleeFunc = nullptr;
1679
1680    // Try to find the callee in the current module.
1681    Callee = DAG.getSymbolFunctionGlobalAddress(Callee, &CalleeFunc);
1682    assert(CalleeFunc != nullptr && "Libcall callee must be set.");
1683
1684    // Set the "libcall callee" attribute to indicate that the function
1685    // must always have a declaration.
1686    CalleeFunc->addFnAttr("nvptx-libcall-callee", "true");
1687  }
1688
1689  if (isIndirectCall) {
1690    // This is the indirect function call case: PTX requires a prototype of the
1691    // form
1692    // proto_0 : .callprototype(.param .b32 _) _ (.param .b32 _);
1693    // to be emitted, and the label has to be used as the last arg of the call
1694    // instruction.
1695    // The prototype is embedded in a string and put as the operand for a
1696    // CallPrototype SDNode which will print out to the value of the string.
1697    SDVTList ProtoVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1698    std::string Proto = getPrototype(DL, RetTy, Args, Outs, retAlignment, CS);
1699    const char *ProtoStr =
1700      nvTM->getManagedStrPool()->getManagedString(Proto.c_str())->c_str();
1701    SDValue ProtoOps[] = {
1702      Chain, DAG.getTargetExternalSymbol(ProtoStr, MVT::i32), InFlag,
1703    };
1704    Chain = DAG.getNode(NVPTXISD::CallPrototype, dl, ProtoVTs, ProtoOps);
1705    InFlag = Chain.getValue(1);
1706  }
1707  // Op to just print "call"
1708  SDVTList PrintCallVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1709  SDValue PrintCallOps[] = {
1710    Chain, DAG.getConstant((Ins.size() == 0) ? 0 : 1, dl, MVT::i32), InFlag
1711  };
1712  // We model convergent calls as separate opcodes.
1713  unsigned Opcode = isIndirectCall ? NVPTXISD::PrintCall : NVPTXISD::PrintCallUni;
1714  if (CLI.IsConvergent)
1715    Opcode = Opcode == NVPTXISD::PrintCallUni ? NVPTXISD::PrintConvergentCallUni
1716                                              : NVPTXISD::PrintConvergentCall;
1717  Chain = DAG.getNode(Opcode, dl, PrintCallVTs, PrintCallOps);
1718  InFlag = Chain.getValue(1);
1719
1720  // Ops to print out the function name
1721  SDVTList CallVoidVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1722  SDValue CallVoidOps[] = { Chain, Callee, InFlag };
1723  Chain = DAG.getNode(NVPTXISD::CallVoid, dl, CallVoidVTs, CallVoidOps);
1724  InFlag = Chain.getValue(1);
1725
1726  // Ops to print out the param list
1727  SDVTList CallArgBeginVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1728  SDValue CallArgBeginOps[] = { Chain, InFlag };
1729  Chain = DAG.getNode(NVPTXISD::CallArgBegin, dl, CallArgBeginVTs,
1730                      CallArgBeginOps);
1731  InFlag = Chain.getValue(1);
1732
1733  for (unsigned i = 0, e = paramCount; i != e; ++i) {
1734    unsigned opcode;
1735    if (i == (e - 1))
1736      opcode = NVPTXISD::LastCallArg;
1737    else
1738      opcode = NVPTXISD::CallArg;
1739    SDVTList CallArgVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1740    SDValue CallArgOps[] = { Chain, DAG.getConstant(1, dl, MVT::i32),
1741                             DAG.getConstant(i, dl, MVT::i32), InFlag };
1742    Chain = DAG.getNode(opcode, dl, CallArgVTs, CallArgOps);
1743    InFlag = Chain.getValue(1);
1744  }
1745  SDVTList CallArgEndVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1746  SDValue CallArgEndOps[] = { Chain,
1747                              DAG.getConstant(isIndirectCall ? 0 : 1, dl, MVT::i32),
1748                              InFlag };
1749  Chain = DAG.getNode(NVPTXISD::CallArgEnd, dl, CallArgEndVTs, CallArgEndOps);
1750  InFlag = Chain.getValue(1);
1751
1752  if (isIndirectCall) {
1753    SDVTList PrototypeVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1754    SDValue PrototypeOps[] = { Chain,
1755                               DAG.getConstant(uniqueCallSite, dl, MVT::i32),
1756                               InFlag };
1757    Chain = DAG.getNode(NVPTXISD::Prototype, dl, PrototypeVTs, PrototypeOps);
1758    InFlag = Chain.getValue(1);
1759  }
1760
1761  SmallVector<SDValue, 16> ProxyRegOps;
1762  SmallVector<Optional<MVT>, 16> ProxyRegTruncates;
1763
1764  // Generate loads from param memory/moves from registers for result
1765  if (Ins.size() > 0) {
1766    SmallVector<EVT, 16> VTs;
1767    SmallVector<uint64_t, 16> Offsets;
1768    ComputePTXValueVTs(*this, DL, RetTy, VTs, &Offsets, 0);
1769    assert(VTs.size() == Ins.size() && "Bad value decomposition");
1770
1771    unsigned RetAlign = getArgumentAlignment(Callee, CS, RetTy, 0, DL);
1772    auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, RetAlign);
1773
1774    SmallVector<EVT, 6> LoadVTs;
1775    int VecIdx = -1; // Index of the first element of the vector.
1776
1777    // PTX Interoperability Guide 3.3(A): [Integer] Values shorter than
1778    // 32-bits are sign extended or zero extended, depending on whether
1779    // they are signed or unsigned types.
1780    bool ExtendIntegerRetVal =
1781        RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(RetTy) < 32;
1782
1783    for (unsigned i = 0, e = VTs.size(); i != e; ++i) {
1784      bool needTruncate = false;
1785      EVT TheLoadType = VTs[i];
1786      EVT EltType = Ins[i].VT;
1787      unsigned EltAlign = GreatestCommonDivisor64(RetAlign, Offsets[i]);
1788      if (ExtendIntegerRetVal) {
1789        TheLoadType = MVT::i32;
1790        EltType = MVT::i32;
1791        needTruncate = true;
1792      } else if (TheLoadType.getSizeInBits() < 16) {
1793        if (VTs[i].isInteger())
1794          needTruncate = true;
1795        EltType = MVT::i16;
1796      }
1797
1798      // Record index of the very first element of the vector.
1799      if (VectorInfo[i] & PVF_FIRST) {
1800        assert(VecIdx == -1 && LoadVTs.empty() && "Orphaned operand list.");
1801        VecIdx = i;
1802      }
1803
1804      LoadVTs.push_back(EltType);
1805
1806      if (VectorInfo[i] & PVF_LAST) {
1807        unsigned NumElts = LoadVTs.size();
1808        LoadVTs.push_back(MVT::Other);
1809        LoadVTs.push_back(MVT::Glue);
1810        NVPTXISD::NodeType Op;
1811        switch (NumElts) {
1812        case 1:
1813          Op = NVPTXISD::LoadParam;
1814          break;
1815        case 2:
1816          Op = NVPTXISD::LoadParamV2;
1817          break;
1818        case 4:
1819          Op = NVPTXISD::LoadParamV4;
1820          break;
1821        default:
1822          llvm_unreachable("Invalid vector info.");
1823        }
1824
1825        SDValue LoadOperands[] = {
1826            Chain, DAG.getConstant(1, dl, MVT::i32),
1827            DAG.getConstant(Offsets[VecIdx], dl, MVT::i32), InFlag};
1828        SDValue RetVal = DAG.getMemIntrinsicNode(
1829            Op, dl, DAG.getVTList(LoadVTs), LoadOperands, TheLoadType,
1830            MachinePointerInfo(), EltAlign,
1831            MachineMemOperand::MOLoad);
1832
1833        for (unsigned j = 0; j < NumElts; ++j) {
1834          ProxyRegOps.push_back(RetVal.getValue(j));
1835
1836          if (needTruncate)
1837            ProxyRegTruncates.push_back(Optional<MVT>(Ins[VecIdx + j].VT));
1838          else
1839            ProxyRegTruncates.push_back(Optional<MVT>());
1840        }
1841
1842        Chain = RetVal.getValue(NumElts);
1843        InFlag = RetVal.getValue(NumElts + 1);
1844
1845        // Cleanup
1846        VecIdx = -1;
1847        LoadVTs.clear();
1848      }
1849    }
1850  }
1851
1852  Chain = DAG.getCALLSEQ_END(Chain,
1853                             DAG.getIntPtrConstant(uniqueCallSite, dl, true),
1854                             DAG.getIntPtrConstant(uniqueCallSite + 1, dl,
1855                                                   true),
1856                             InFlag, dl);
1857  InFlag = Chain.getValue(1);
1858  uniqueCallSite++;
1859
1860  // Append ProxyReg instructions to the chain to make sure that `callseq_end`
1861  // will not get lost. Otherwise, during libcall expansion, the nodes can become
1862  // dangling.
1863  for (unsigned i = 0; i < ProxyRegOps.size(); ++i) {
1864    SDValue Ret = DAG.getNode(
1865      NVPTXISD::ProxyReg, dl,
1866      DAG.getVTList(ProxyRegOps[i].getSimpleValueType(), MVT::Other, MVT::Glue),
1867      { Chain, ProxyRegOps[i], InFlag }
1868    );
1869
1870    Chain = Ret.getValue(1);
1871    InFlag = Ret.getValue(2);
1872
1873    if (ProxyRegTruncates[i].hasValue()) {
1874      Ret = DAG.getNode(ISD::TRUNCATE, dl, ProxyRegTruncates[i].getValue(), Ret);
1875    }
1876
1877    InVals.push_back(Ret);
1878  }
1879
1880  // set isTailCall to false for now, until we figure out how to express
1881  // tail call optimization in PTX
1882  isTailCall = false;
1883  return Chain;
1884}
1885
1886// By default CONCAT_VECTORS is lowered by ExpandVectorBuildThroughStack()
1887// (see LegalizeDAG.cpp). This is slow and uses local memory.
1888// We use extract/insert/build vector just as LegalizeOp() does in llvm 2.5
1889SDValue
1890NVPTXTargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const {
1891  SDNode *Node = Op.getNode();
1892  SDLoc dl(Node);
1893  SmallVector<SDValue, 8> Ops;
1894  unsigned NumOperands = Node->getNumOperands();
1895  for (unsigned i = 0; i < NumOperands; ++i) {
1896    SDValue SubOp = Node->getOperand(i);
1897    EVT VVT = SubOp.getNode()->getValueType(0);
1898    EVT EltVT = VVT.getVectorElementType();
1899    unsigned NumSubElem = VVT.getVectorNumElements();
1900    for (unsigned j = 0; j < NumSubElem; ++j) {
1901      Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, SubOp,
1902                                DAG.getIntPtrConstant(j, dl)));
1903    }
1904  }
1905  return DAG.getBuildVector(Node->getValueType(0), dl, Ops);
1906}
1907
1908// We can init constant f16x2 with a single .b32 move.  Normally it
1909// would get lowered as two constant loads and vector-packing move.
1910//        mov.b16         %h1, 0x4000;
1911//        mov.b16         %h2, 0x3C00;
1912//        mov.b32         %hh2, {%h2, %h1};
1913// Instead we want just a constant move:
1914//        mov.b32         %hh2, 0x40003C00
1915//
1916// This results in better SASS code with CUDA 7.x. Ptxas in CUDA 8.0
1917// generates good SASS in both cases.
1918SDValue NVPTXTargetLowering::LowerBUILD_VECTOR(SDValue Op,
1919                                               SelectionDAG &DAG) const {
1921  if (!(Op->getValueType(0) == MVT::v2f16 &&
1922        isa<ConstantFPSDNode>(Op->getOperand(0)) &&
1923        isa<ConstantFPSDNode>(Op->getOperand(1))))
1924    return Op;
1925
1926  APInt E0 =
1927      cast<ConstantFPSDNode>(Op->getOperand(0))->getValueAPF().bitcastToAPInt();
1928  APInt E1 =
1929      cast<ConstantFPSDNode>(Op->getOperand(1))->getValueAPF().bitcastToAPInt();
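  // Pack the two f16 bit patterns into a single i32, with operand 1 in the
  // high 16 bits and operand 0 in the low 16 bits, so <1.0, 2.0> becomes
  // 0x40003C00 as in the example above.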
1930  SDValue Const =
1931      DAG.getConstant(E1.zext(32).shl(16) | E0.zext(32), SDLoc(Op), MVT::i32);
1932  return DAG.getNode(ISD::BITCAST, SDLoc(Op), MVT::v2f16, Const);
1933}
1934
1935SDValue NVPTXTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
1936                                                     SelectionDAG &DAG) const {
1937  SDValue Index = Op->getOperand(1);
1938  // Constant index will be matched by tablegen.
1939  if (isa<ConstantSDNode>(Index.getNode()))
1940    return Op;
1941
1942  // Extract individual elements and select one of them.
1943  SDValue Vector = Op->getOperand(0);
1944  EVT VectorVT = Vector.getValueType();
1945  assert(VectorVT == MVT::v2f16 && "Unexpected vector type.");
1946  EVT EltVT = VectorVT.getVectorElementType();
1947
1948  SDLoc dl(Op.getNode());
1949  SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Vector,
1950                           DAG.getIntPtrConstant(0, dl));
1951  SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Vector,
1952                           DAG.getIntPtrConstant(1, dl));
1953  return DAG.getSelectCC(dl, Index, DAG.getIntPtrConstant(0, dl), E0, E1,
1954                         ISD::CondCode::SETEQ);
1955}
1956
1957/// LowerShiftRightParts - Lower SRL_PARTS, SRA_PARTS, which
1958/// 1) returns two i32 values and takes a 2 x i32 value to shift plus a shift
1959///    amount, or
1960/// 2) returns two i64 values and takes a 2 x i64 value to shift plus a shift
1961///    amount.
1962SDValue NVPTXTargetLowering::LowerShiftRightParts(SDValue Op,
1963                                                  SelectionDAG &DAG) const {
1964  assert(Op.getNumOperands() == 3 && "Not a double-shift!");
1965  assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS);
1966
1967  EVT VT = Op.getValueType();
1968  unsigned VTBits = VT.getSizeInBits();
1969  SDLoc dl(Op);
1970  SDValue ShOpLo = Op.getOperand(0);
1971  SDValue ShOpHi = Op.getOperand(1);
1972  SDValue ShAmt  = Op.getOperand(2);
1973  unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL;
1974
1975  if (VTBits == 32 && STI.getSmVersion() >= 35) {
1976    // For 32bit and sm35, we can use the funnel shift 'shf' instruction.
1977    // {dHi, dLo} = {aHi, aLo} >> Amt
1978    //   dHi = aHi >> Amt
1979    //   dLo = shf.r.clamp aLo, aHi, Amt
1980
1981    SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
1982    SDValue Lo = DAG.getNode(NVPTXISD::FUN_SHFR_CLAMP, dl, VT, ShOpLo, ShOpHi,
1983                             ShAmt);
1984
1985    SDValue Ops[2] = { Lo, Hi };
1986    return DAG.getMergeValues(Ops, dl);
1987  }
1988  else {
1989    // {dHi, dLo} = {aHi, aLo} >> Amt
1990    // - if (Amt>=size) then
1991    //      dLo = aHi >> (Amt-size)
1992    //      dHi = aHi >> Amt (this is either all 0 or all 1)
1993    //   else
1994    //      dLo = (aLo >>logic Amt) | (aHi << (size-Amt))
1995    //      dHi = aHi >> Amt
1996
1997    SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
1998                                   DAG.getConstant(VTBits, dl, MVT::i32),
1999                                   ShAmt);
2000    SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt);
2001    SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
2002                                     DAG.getConstant(VTBits, dl, MVT::i32));
2003    SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt);
2004    SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
2005    SDValue TrueVal = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt);
2006
2007    SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt,
2008                               DAG.getConstant(VTBits, dl, MVT::i32),
2009                               ISD::SETGE);
2010    SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
2011    SDValue Lo = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal);
2012
2013    SDValue Ops[2] = { Lo, Hi };
2014    return DAG.getMergeValues(Ops, dl);
2015  }
2016}
2017
2018/// LowerShiftLeftParts - Lower SHL_PARTS, which
2019/// 1) returns two i32 values and takes a 2 x i32 value to shift plus a shift
2020///    amount, or
2021/// 2) returns two i64 values and takes a 2 x i64 value to shift plus a shift
2022///    amount.
2023SDValue NVPTXTargetLowering::LowerShiftLeftParts(SDValue Op,
2024                                                 SelectionDAG &DAG) const {
2025  assert(Op.getNumOperands() == 3 && "Not a double-shift!");
2026  assert(Op.getOpcode() == ISD::SHL_PARTS);
2027
2028  EVT VT = Op.getValueType();
2029  unsigned VTBits = VT.getSizeInBits();
2030  SDLoc dl(Op);
2031  SDValue ShOpLo = Op.getOperand(0);
2032  SDValue ShOpHi = Op.getOperand(1);
2033  SDValue ShAmt  = Op.getOperand(2);
2034
2035  if (VTBits == 32 && STI.getSmVersion() >= 35) {
2036    // For 32bit and sm35, we can use the funnel shift 'shf' instruction.
2037    // {dHi, dLo} = {aHi, aLo} << Amt
2038    //   dHi = shf.l.clamp aLo, aHi, Amt
2039    //   dLo = aLo << Amt
2040
2041    SDValue Hi = DAG.getNode(NVPTXISD::FUN_SHFL_CLAMP, dl, VT, ShOpLo, ShOpHi,
2042                             ShAmt);
2043    SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
2044
2045    SDValue Ops[2] = { Lo, Hi };
2046    return DAG.getMergeValues(Ops, dl);
2047  }
2048  else {
2049    // {dHi, dLo} = {aHi, aLo} << Amt
2050    // - if (Amt>=size) then
2051    //      dLo = aLo << Amt (all 0)
2052    //      dHi = aLo << (Amt-size)
2053    //   else
2054    //      dLo = aLo << Amt
2055    //      dHi = (aHi << Amt) | (aLo >> (size-Amt))
2056
2057    SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
2058                                   DAG.getConstant(VTBits, dl, MVT::i32),
2059                                   ShAmt);
2060    SDValue Tmp1 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt);
2061    SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
2062                                     DAG.getConstant(VTBits, dl, MVT::i32));
2063    SDValue Tmp2 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt);
2064    SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
2065    SDValue TrueVal = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt);
2066
2067    SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt,
2068                               DAG.getConstant(VTBits, dl, MVT::i32),
2069                               ISD::SETGE);
2070    SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
2071    SDValue Hi = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal);
2072
2073    SDValue Ops[2] = { Lo, Hi };
2074    return DAG.getMergeValues(Ops, dl);
2075  }
2076}
2077
2078SDValue NVPTXTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const {
2079  EVT VT = Op.getValueType();
2080
2081  if (VT == MVT::f32)
2082    return LowerFROUND32(Op, DAG);
2083
2084  if (VT == MVT::f64)
2085    return LowerFROUND64(Op, DAG);
2086
2087  llvm_unreachable("unhandled type");
2088}
2089
2090// This is the rounding method used in CUDA libdevice, in C-like code:
2091// float roundf(float A)
2092// {
2093//   float RoundedA = (float) (int) ( A > 0 ? (A + 0.5f) : (A - 0.5f));
2094//   RoundedA = abs(A) > 0x1.0p23 ? A : RoundedA;
2095//   return abs(A) < 0.5 ? (float)(int)A : RoundedA;
2096// }
2097SDValue NVPTXTargetLowering::LowerFROUND32(SDValue Op,
2098                                           SelectionDAG &DAG) const {
2099  SDLoc SL(Op);
2100  SDValue A = Op.getOperand(0);
2101  EVT VT = Op.getValueType();
2102
2103  SDValue AbsA = DAG.getNode(ISD::FABS, SL, VT, A);
2104
2105  // RoundedA = (float) (int) ( A > 0 ? (A + 0.5f) : (A - 0.5f))
2106  SDValue Bitcast  = DAG.getNode(ISD::BITCAST, SL, MVT::i32, A);
2107  const int SignBitMask = 0x80000000;
2108  SDValue Sign = DAG.getNode(ISD::AND, SL, MVT::i32, Bitcast,
2109                             DAG.getConstant(SignBitMask, SL, MVT::i32));
2110  const int PointFiveInBits = 0x3F000000;
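  // 0x3F000000 is the IEEE-754 bit pattern of +0.5f.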
2111  SDValue PointFiveWithSignRaw =
2112      DAG.getNode(ISD::OR, SL, MVT::i32, Sign,
2113                  DAG.getConstant(PointFiveInBits, SL, MVT::i32));
2114  SDValue PointFiveWithSign =
2115      DAG.getNode(ISD::BITCAST, SL, VT, PointFiveWithSignRaw);
2116  SDValue AdjustedA = DAG.getNode(ISD::FADD, SL, VT, A, PointFiveWithSign);
2117  SDValue RoundedA = DAG.getNode(ISD::FTRUNC, SL, VT, AdjustedA);
2118
2119  // RoundedA = abs(A) > 0x1.0p23 ? A : RoundedA;
2120  EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2121  SDValue IsLarge =
2122      DAG.getSetCC(SL, SetCCVT, AbsA, DAG.getConstantFP(pow(2.0, 23.0), SL, VT),
2123                   ISD::SETOGT);
2124  RoundedA = DAG.getNode(ISD::SELECT, SL, VT, IsLarge, A, RoundedA);
2125
2126  // return abs(A) < 0.5 ? (float)(int)A : RoundedA;
2127  SDValue IsSmall = DAG.getSetCC(SL, SetCCVT, AbsA,
2128                                 DAG.getConstantFP(0.5, SL, VT), ISD::SETOLT);
2129  SDValue RoundedAForSmallA = DAG.getNode(ISD::FTRUNC, SL, VT, A);
2130  return DAG.getNode(ISD::SELECT, SL, VT, IsSmall, RoundedAForSmallA, RoundedA);
2131}
2132
2133// The implementation of round(double) is similar to that of round(float) in
2134// that they both separate the value range into three regions and use a method
2135// specific to the region to round the values. However, round(double) first
2136// calculates the round of the absolute value and then adds the sign back while
2137// round(float) directly rounds the value with sign.
2138SDValue NVPTXTargetLowering::LowerFROUND64(SDValue Op,
2139                                           SelectionDAG &DAG) const {
2140  SDLoc SL(Op);
2141  SDValue A = Op.getOperand(0);
2142  EVT VT = Op.getValueType();
2143
2144  SDValue AbsA = DAG.getNode(ISD::FABS, SL, VT, A);
2145
2146  // double RoundedA = (double) (int) (abs(A) + 0.5f);
2147  SDValue AdjustedA = DAG.getNode(ISD::FADD, SL, VT, AbsA,
2148                                  DAG.getConstantFP(0.5, SL, VT));
2149  SDValue RoundedA = DAG.getNode(ISD::FTRUNC, SL, VT, AdjustedA);
2150
2151  // RoundedA = abs(A) < 0.5 ? (double)0 : RoundedA;
2152  EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2153  SDValue IsSmall = DAG.getSetCC(SL, SetCCVT, AbsA,
2154                                 DAG.getConstantFP(0.5, SL, VT), ISD::SETOLT);
2155  RoundedA = DAG.getNode(ISD::SELECT, SL, VT, IsSmall,
2156                         DAG.getConstantFP(0, SL, VT),
2157                         RoundedA);
2158
2159  // Add sign to rounded_A
2160  RoundedA = DAG.getNode(ISD::FCOPYSIGN, SL, VT, RoundedA, A);
2162
2163  // RoundedA = abs(A) > 0x1.0p52 ? A : RoundedA;
2164  SDValue IsLarge =
2165      DAG.getSetCC(SL, SetCCVT, AbsA, DAG.getConstantFP(pow(2.0, 52.0), SL, VT),
2166                   ISD::SETOGT);
2167  return DAG.getNode(ISD::SELECT, SL, VT, IsLarge, A, RoundedA);
2168}
2169
2172SDValue
2173NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
2174  switch (Op.getOpcode()) {
2175  case ISD::RETURNADDR:
2176    return SDValue();
2177  case ISD::FRAMEADDR:
2178    return SDValue();
2179  case ISD::GlobalAddress:
2180    return LowerGlobalAddress(Op, DAG);
2181  case ISD::INTRINSIC_W_CHAIN:
2182    return Op;
2183  case ISD::BUILD_VECTOR:
2184    return LowerBUILD_VECTOR(Op, DAG);
2185  case ISD::EXTRACT_SUBVECTOR:
2186    return Op;
2187  case ISD::EXTRACT_VECTOR_ELT:
2188    return LowerEXTRACT_VECTOR_ELT(Op, DAG);
2189  case ISD::CONCAT_VECTORS:
2190    return LowerCONCAT_VECTORS(Op, DAG);
2191  case ISD::STORE:
2192    return LowerSTORE(Op, DAG);
2193  case ISD::LOAD:
2194    return LowerLOAD(Op, DAG);
2195  case ISD::SHL_PARTS:
2196    return LowerShiftLeftParts(Op, DAG);
2197  case ISD::SRA_PARTS:
2198  case ISD::SRL_PARTS:
2199    return LowerShiftRightParts(Op, DAG);
2200  case ISD::SELECT:
2201    return LowerSelect(Op, DAG);
2202  case ISD::FROUND:
2203    return LowerFROUND(Op, DAG);
2204  default:
2205    llvm_unreachable("Custom lowering not defined for operation");
2206  }
2207}
2208
2209SDValue NVPTXTargetLowering::LowerSelect(SDValue Op, SelectionDAG &DAG) const {
2210  SDValue Op0 = Op->getOperand(0);
2211  SDValue Op1 = Op->getOperand(1);
2212  SDValue Op2 = Op->getOperand(2);
2213  SDLoc DL(Op.getNode());
2214
2215  assert(Op.getValueType() == MVT::i1 && "Custom lowering enabled only for i1");
2216
2217  Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1);
2218  Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2);
2219  SDValue Select = DAG.getNode(ISD::SELECT, DL, MVT::i32, Op0, Op1, Op2);
2220  SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Select);
2221
2222  return Trunc;
2223}
2224
2225SDValue NVPTXTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
2226  if (Op.getValueType() == MVT::i1)
2227    return LowerLOADi1(Op, DAG);
2228
2229  // v2f16 is legal, so we can't rely on the legalizer to handle unaligned
2230  // loads and have to handle them here.
2231  if (Op.getValueType() == MVT::v2f16) {
2232    LoadSDNode *Load = cast<LoadSDNode>(Op);
2233    EVT MemVT = Load->getMemoryVT();
2234    if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
2235                                        MemVT, *Load->getMemOperand())) {
2236      SDValue Ops[2];
2237      std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG);
2238      return DAG.getMergeValues(Ops, SDLoc(Op));
2239    }
2240  }
2241
2242  return SDValue();
2243}
2244
2245// v = ld i1* addr
2246//   =>
2247// v1 = ld i8* addr (-> i16)
2248// v = trunc i16 to i1
2249SDValue NVPTXTargetLowering::LowerLOADi1(SDValue Op, SelectionDAG &DAG) const {
2250  SDNode *Node = Op.getNode();
2251  LoadSDNode *LD = cast<LoadSDNode>(Node);
2252  SDLoc dl(Node);
2253  assert(LD->getExtensionType() == ISD::NON_EXTLOAD);
2254  assert(Node->getValueType(0) == MVT::i1 &&
2255         "Custom lowering for i1 load only");
2256  SDValue newLD = DAG.getLoad(MVT::i16, dl, LD->getChain(), LD->getBasePtr(),
2257                              LD->getPointerInfo(), LD->getAlignment(),
2258                              LD->getMemOperand()->getFlags());
2259  SDValue result = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, newLD);
2260  // The legalizer (the caller) is expecting two values from the legalized
2261  // load, so we build a MergeValues node for it. See ExpandUnalignedLoad()
2262  // in LegalizeDAG.cpp which also uses MergeValues.
2263  SDValue Ops[] = { result, LD->getChain() };
2264  return DAG.getMergeValues(Ops, dl);
2265}
2266
2267SDValue NVPTXTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
2268  StoreSDNode *Store = cast<StoreSDNode>(Op);
2269  EVT VT = Store->getMemoryVT();
2270
2271  if (VT == MVT::i1)
2272    return LowerSTOREi1(Op, DAG);
2273
2274  // v2f16 is legal, so we can't rely on the legalizer to handle unaligned
2275  // stores and have to handle them here.
2276  if (VT == MVT::v2f16 &&
2277      !allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
2278                                      VT, *Store->getMemOperand()))
2279    return expandUnalignedStore(Store, DAG);
2280
2281  if (VT.isVector())
2282    return LowerSTOREVector(Op, DAG);
2283
2284  return SDValue();
2285}
2286
2287SDValue
2288NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const {
2289  SDNode *N = Op.getNode();
2290  SDValue Val = N->getOperand(1);
2291  SDLoc DL(N);
2292  EVT ValVT = Val.getValueType();
2293
2294  if (ValVT.isVector()) {
2295    // We only handle "native" vector sizes for now, e.g. <4 x double> is not
2296    // legal.  We can (and should) split that into 2 stores of <2 x double> here
2297    // but I'm leaving that as a TODO for now.
2298    if (!ValVT.isSimple())
2299      return SDValue();
2300    switch (ValVT.getSimpleVT().SimpleTy) {
2301    default:
2302      return SDValue();
2303    case MVT::v2i8:
2304    case MVT::v2i16:
2305    case MVT::v2i32:
2306    case MVT::v2i64:
2307    case MVT::v2f16:
2308    case MVT::v2f32:
2309    case MVT::v2f64:
2310    case MVT::v4i8:
2311    case MVT::v4i16:
2312    case MVT::v4i32:
2313    case MVT::v4f16:
2314    case MVT::v4f32:
2315    case MVT::v8f16: // <4 x f16x2>
2316      // This is a "native" vector type
2317      break;
2318    }
2319
2320    MemSDNode *MemSD = cast<MemSDNode>(N);
2321    const DataLayout &TD = DAG.getDataLayout();
2322
2323    unsigned Align = MemSD->getAlignment();
2324    unsigned PrefAlign =
2325        TD.getPrefTypeAlignment(ValVT.getTypeForEVT(*DAG.getContext()));
2326    if (Align < PrefAlign) {
2327      // This store is not sufficiently aligned, so bail out and let this vector
2328      // store be scalarized.  Note that we may still be able to emit smaller
2329      // vector stores.  For example, if we are storing a <4 x float> with an
2330      // alignment of 8, this check will fail but the legalizer will try again
2331      // with 2 x <2 x float>, which will succeed with an alignment of 8.
2332      return SDValue();
2333    }
2334
2335    unsigned Opcode = 0;
2336    EVT EltVT = ValVT.getVectorElementType();
2337    unsigned NumElts = ValVT.getVectorNumElements();
2338
2339    // Since StoreV2 is a target node, we cannot rely on DAG type legalization.
2340    // Therefore, we must ensure the type is legal.  For i1 and i8, we set the
2341    // stored type to i16 and propagate the "real" type as the memory type.
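    // For example, a v4i8 store becomes a StoreV4 of four i16 values while
    // v4i8 is kept as the memory VT of the resulting node.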
2342    bool NeedExt = false;
2343    if (EltVT.getSizeInBits() < 16)
2344      NeedExt = true;
2345
2346    bool StoreF16x2 = false;
2347    switch (NumElts) {
2348    default:
2349      return SDValue();
2350    case 2:
2351      Opcode = NVPTXISD::StoreV2;
2352      break;
2353    case 4:
2354      Opcode = NVPTXISD::StoreV4;
2355      break;
2356    case 8:
2357      // v8f16 is a special case. PTX doesn't have st.v8.f16
2358      // instruction. Instead, we split the vector into v2f16 chunks and
2359      // store them with st.v4.b32.
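      // The result is roughly: st.v4.b32 [addr], {a, b, c, d}; where each
      // 32-bit operand holds a packed pair of f16 elements.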
2360      assert(EltVT == MVT::f16 && "Wrong type for the vector.");
2361      Opcode = NVPTXISD::StoreV4;
2362      StoreF16x2 = true;
2363      break;
2364    }
2365
2366    SmallVector<SDValue, 8> Ops;
2367
2368    // First is the chain
2369    Ops.push_back(N->getOperand(0));
2370
2371    if (StoreF16x2) {
2372      // Combine f16,f16 -> v2f16
2373      NumElts /= 2;
2374      for (unsigned i = 0; i < NumElts; ++i) {
2375        SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16, Val,
2376                                 DAG.getIntPtrConstant(i * 2, DL));
2377        SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16, Val,
2378                                 DAG.getIntPtrConstant(i * 2 + 1, DL));
2379        SDValue V2 = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f16, E0, E1);
2380        Ops.push_back(V2);
2381      }
2382    } else {
2383      // Then the split values
2384      for (unsigned i = 0; i < NumElts; ++i) {
2385        SDValue ExtVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Val,
2386                                     DAG.getIntPtrConstant(i, DL));
2387        if (NeedExt)
2388          ExtVal = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i16, ExtVal);
2389        Ops.push_back(ExtVal);
2390      }
2391    }
2392
2393    // Then any remaining arguments
2394    Ops.append(N->op_begin() + 2, N->op_end());
2395
2396    SDValue NewSt =
2397        DAG.getMemIntrinsicNode(Opcode, DL, DAG.getVTList(MVT::Other), Ops,
2398                                MemSD->getMemoryVT(), MemSD->getMemOperand());
2399
2401    return NewSt;
2402  }
2403
2404  return SDValue();
2405}
2406
2407// st i1 v, addr
2408//    =>
2409// v1 = zxt v to i16
2410// st.u8 i16, addr
2411SDValue NVPTXTargetLowering::LowerSTOREi1(SDValue Op, SelectionDAG &DAG) const {
2412  SDNode *Node = Op.getNode();
2413  SDLoc dl(Node);
2414  StoreSDNode *ST = cast<StoreSDNode>(Node);
2415  SDValue Tmp1 = ST->getChain();
2416  SDValue Tmp2 = ST->getBasePtr();
2417  SDValue Tmp3 = ST->getValue();
2418  assert(Tmp3.getValueType() == MVT::i1 && "Custom lowering for i1 store only");
2419  Tmp3 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Tmp3);
2420  SDValue Result =
2421      DAG.getTruncStore(Tmp1, dl, Tmp3, Tmp2, ST->getPointerInfo(), MVT::i8,
2422                        ST->getAlignment(), ST->getMemOperand()->getFlags());
2423  return Result;
2424}
2425
2426SDValue
2427NVPTXTargetLowering::getParamSymbol(SelectionDAG &DAG, int idx, EVT v) const {
2428  std::string ParamSym;
2429  raw_string_ostream ParamStr(ParamSym);
2430
2431  ParamStr << DAG.getMachineFunction().getName() << "_param_" << idx;
2432  ParamStr.flush();
2433
2434  std::string *SavedStr =
2435    nvTM->getManagedStrPool()->getManagedString(ParamSym.c_str());
2436  return DAG.getTargetExternalSymbol(SavedStr->c_str(), v);
2437}
2438
2439// Check to see if the kernel argument is image*_t or sampler_t
2440
2441static bool isImageOrSamplerVal(const Value *arg, const Module *context) {
2442  static const char *const specialTypes[] = { "struct._image2d_t",
2443                                              "struct._image3d_t",
2444                                              "struct._sampler_t" };
2445
2446  Type *Ty = arg->getType();
2447  auto *PTy = dyn_cast<PointerType>(Ty);
2448
2449  if (!PTy)
2450    return false;
2451
2452  if (!context)
2453    return false;
2454
2455  auto *STy = dyn_cast<StructType>(PTy->getElementType());
2456  if (!STy || STy->isLiteral())
2457    return false;
2458
2459  return std::find(std::begin(specialTypes), std::end(specialTypes),
2460                   STy->getName()) != std::end(specialTypes);
2461}
2462
2463SDValue NVPTXTargetLowering::LowerFormalArguments(
2464    SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
2465    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
2466    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
2467  MachineFunction &MF = DAG.getMachineFunction();
2468  const DataLayout &DL = DAG.getDataLayout();
2469  auto PtrVT = getPointerTy(DAG.getDataLayout());
2470
2471  const Function *F = &MF.getFunction();
2472  const AttributeList &PAL = F->getAttributes();
2473  const TargetLowering *TLI = STI.getTargetLowering();
2474
2475  SDValue Root = DAG.getRoot();
2476  std::vector<SDValue> OutChains;
2477
2478  bool isABI = (STI.getSmVersion() >= 20);
2479  assert(isABI && "Non-ABI compilation is not supported");
2480  if (!isABI)
2481    return Chain;
2482
2483  std::vector<Type *> argTypes;
2484  std::vector<const Argument *> theArgs;
2485  for (const Argument &I : F->args()) {
2486    theArgs.push_back(&I);
2487    argTypes.push_back(I.getType());
2488  }
2489  // argTypes.size() (or theArgs.size()) and Ins.size() need not match.
2490  // Ins.size() will be larger
2491  //   * if there is an aggregate argument with multiple fields (each field
2492  //     showing up separately in Ins)
2493  //   * if there is a vector argument with more than typical vector-length
2494  //     elements (generally if more than 4) where each vector element is
2495  //     individually present in Ins.
2496  // So a different index should be used for indexing into Ins.
2497  // See similar issue in LowerCall.
2498  unsigned InsIdx = 0;
2499
2500  int idx = 0;
2501  for (unsigned i = 0, e = theArgs.size(); i != e; ++i, ++idx, ++InsIdx) {
2502    Type *Ty = argTypes[i];
2503
2504    // If the kernel argument is image*_t or sampler_t, convert it to
2505    // an i32 constant holding the parameter position. This can later be
2506    // matched in the AsmPrinter to output the correct mangled name.
2507    if (isImageOrSamplerVal(
2508            theArgs[i],
2509            (theArgs[i]->getParent() ? theArgs[i]->getParent()->getParent()
2510                                     : nullptr))) {
2511      assert(isKernelFunction(*F) &&
2512             "Only kernels can have image/sampler params");
2513      InVals.push_back(DAG.getConstant(i + 1, dl, MVT::i32));
2514      continue;
2515    }
2516
2517    if (theArgs[i]->use_empty()) {
2518      // argument is dead
2519      if (Ty->isAggregateType() || Ty->isIntegerTy(128)) {
2520        SmallVector<EVT, 16> vtparts;
2521
2522        ComputePTXValueVTs(*this, DAG.getDataLayout(), Ty, vtparts);
2523        assert(vtparts.size() > 0 && "empty aggregate type not expected");
2524        for (unsigned parti = 0, parte = vtparts.size(); parti != parte;
2525             ++parti) {
2526          InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT));
2527          ++InsIdx;
2528        }
2529        if (vtparts.size() > 0)
2530          --InsIdx;
2531        continue;
2532      }
2533      if (Ty->isVectorTy()) {
2534        EVT ObjectVT = getValueType(DL, Ty);
2535        unsigned NumRegs = TLI->getNumRegisters(F->getContext(), ObjectVT);
2536        for (unsigned parti = 0; parti < NumRegs; ++parti) {
2537          InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT));
2538          ++InsIdx;
2539        }
2540        if (NumRegs > 0)
2541          --InsIdx;
2542        continue;
2543      }
2544      InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT));
2545      continue;
2546    }
2547
2548    // In the following cases, assign a node order of "idx+1"
2549    // to newly created nodes. The SDNodes for params have to
2550    // appear in the same order as the params appear in the
2551    // original function. "idx+1" holds that order.
2552    if (!PAL.hasParamAttribute(i, Attribute::ByVal)) {
2553      bool aggregateIsPacked = false;
2554      if (StructType *STy = dyn_cast<StructType>(Ty))
2555        aggregateIsPacked = STy->isPacked();
2556
2557      SmallVector<EVT, 16> VTs;
2558      SmallVector<uint64_t, 16> Offsets;
2559      ComputePTXValueVTs(*this, DL, Ty, VTs, &Offsets, 0);
2560      assert(VTs.size() > 0 && "Unexpected empty type.");
2561      auto VectorInfo =
2562          VectorizePTXValueVTs(VTs, Offsets, DL.getABITypeAlignment(Ty));
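      // A rough sketch of what to expect (assuming the usual PVF flags): for a
      // suitably aligned <4 x float> argument, element 0 is tagged PVF_FIRST
      // and element 3 PVF_LAST, so the loop below emits one v4f32 load that
      // covers all four elements.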
2563
2564      SDValue Arg = getParamSymbol(DAG, idx, PtrVT);
2565      int VecIdx = -1; // Index of the first element of the current vector.
2566      for (unsigned parti = 0, parte = VTs.size(); parti != parte; ++parti) {
2567        if (VectorInfo[parti] & PVF_FIRST) {
2568          assert(VecIdx == -1 && "Orphaned vector.");
2569          VecIdx = parti;
2570        }
2571
2572        // That's the last element of this load op.
2573        if (VectorInfo[parti] & PVF_LAST) {
2574          unsigned NumElts = parti - VecIdx + 1;
2575          EVT EltVT = VTs[parti];
2576          // i1 is loaded/stored as i8.
2577          EVT LoadVT = EltVT;
2578          if (EltVT == MVT::i1)
2579            LoadVT = MVT::i8;
2580          else if (EltVT == MVT::v2f16)
2581            // getLoad needs a vector type, but it can't handle
2582            // vectors which contain v2f16 elements. So we must load
2583            // using i32 here and then bitcast back.
2584            LoadVT = MVT::i32;
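          // E.g. two v2f16 elements grouped into one load are fetched as a
          // v2i32 vector below, and each i32 lane is bitcast back to v2f16
          // after extraction.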
2585
2586          EVT VecVT = EVT::getVectorVT(F->getContext(), LoadVT, NumElts);
2587          SDValue VecAddr =
2588              DAG.getNode(ISD::ADD, dl, PtrVT, Arg,
2589                          DAG.getConstant(Offsets[VecIdx], dl, PtrVT));
2590          Value *srcValue = Constant::getNullValue(PointerType::get(
2591              EltVT.getTypeForEVT(F->getContext()), ADDRESS_SPACE_PARAM));
2592          SDValue P =
2593              DAG.getLoad(VecVT, dl, Root, VecAddr,
2594                          MachinePointerInfo(srcValue), aggregateIsPacked,
2595                          MachineMemOperand::MODereferenceable |
2596                              MachineMemOperand::MOInvariant);
2597          if (P.getNode())
2598            P.getNode()->setIROrder(idx + 1);
2599          for (unsigned j = 0; j < NumElts; ++j) {
2600            SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, LoadVT, P,
2601                                      DAG.getIntPtrConstant(j, dl));
2602            // We've loaded i1 as an i8 and now must truncate it back to i1
2603            if (EltVT == MVT::i1)
2604              Elt = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Elt);
2605            // v2f16 was loaded as an i32. Now we must bitcast it back.
2606            else if (EltVT == MVT::v2f16)
2607              Elt = DAG.getNode(ISD::BITCAST, dl, MVT::v2f16, Elt);
2608            // Extend the element if necessary (e.g. an i8 is loaded
2609            // into an i16 register)
2610            if (Ins[InsIdx].VT.isInteger() &&
2611                Ins[InsIdx].VT.getSizeInBits() > LoadVT.getSizeInBits()) {
2612              unsigned Extend = Ins[InsIdx].Flags.isSExt() ? ISD::SIGN_EXTEND
2613                                                           : ISD::ZERO_EXTEND;
2614              Elt = DAG.getNode(Extend, dl, Ins[InsIdx].VT, Elt);
2615            }
2616            InVals.push_back(Elt);
2617          }
2618
2619          // Reset vector tracking state.
2620          VecIdx = -1;
2621        }
2622        ++InsIdx;
2623      }
2624      if (VTs.size() > 0)
2625        --InsIdx;
2626      continue;
2627    }
2628
2629    // Param has the ByVal attribute.
2630    // Return MoveParam(param symbol).
2631    // Ideally, the param symbol could be returned directly,
2632    // but when the SDNode builder decides to use it in a CopyToReg(),
2633    // the machine instruction fails because TargetExternalSymbol
2634    // (not lowered) is target dependent, and CopyToReg assumes
2635    // the source is already lowered.
2636    EVT ObjectVT = getValueType(DL, Ty);
2637    assert(ObjectVT == Ins[InsIdx].VT &&
2638           "Ins type did not match function type");
2639    SDValue Arg = getParamSymbol(DAG, idx, PtrVT);
2640    SDValue p = DAG.getNode(NVPTXISD::MoveParam, dl, ObjectVT, Arg);
2641    if (p.getNode())
2642      p.getNode()->setIROrder(idx + 1);
2643    InVals.push_back(p);
2644  }
2645
2646  // Clang will check an explicit vararg and issue an error if one is present.
2647  // However, Clang will let code with an implicit vararg prototype such as
2648  // f() pass. See bug 617733.
2649  // We treat this case as if the arg list is empty.
2650  // if (F.isVarArg()) {
2651  //   assert(0 && "VarArg not supported yet!");
2652  // }
2653
2654  if (!OutChains.empty())
2655    DAG.setRoot(DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains));
2656
2657  return Chain;
2658}
2659
2660SDValue
2661NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
2662                                 bool isVarArg,
2663                                 const SmallVectorImpl<ISD::OutputArg> &Outs,
2664                                 const SmallVectorImpl<SDValue> &OutVals,
2665                                 const SDLoc &dl, SelectionDAG &DAG) const {
2666  MachineFunction &MF = DAG.getMachineFunction();
2667  Type *RetTy = MF.getFunction().getReturnType();
2668
2669  bool isABI = (STI.getSmVersion() >= 20);
2670  assert(isABI && "Non-ABI compilation is not supported");
2671  if (!isABI)
2672    return Chain;
2673
2674  const DataLayout &DL = DAG.getDataLayout();
2675  SmallVector<EVT, 16> VTs;
2676  SmallVector<uint64_t, 16> Offsets;
2677  ComputePTXValueVTs(*this, DL, RetTy, VTs, &Offsets);
2678  assert(VTs.size() == OutVals.size() && "Bad return value decomposition");
2679
2680  auto VectorInfo = VectorizePTXValueVTs(
2681      VTs, Offsets, RetTy->isSized() ? DL.getABITypeAlignment(RetTy) : 1);
2682
2683  // PTX Interoperability Guide 3.3(A): [Integer] Values shorter than
2684  // 32 bits are sign extended or zero extended, depending on whether
2685  // they are signed or unsigned types.
2686  bool ExtendIntegerRetVal =
2687      RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(RetTy) < 32;
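  // For example, an i8 or i16 return value is widened to i32 here before being
  // stored back to the return parameter, per the rule quoted above.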
2688
2689  SmallVector<SDValue, 6> StoreOperands;
2690  for (unsigned i = 0, e = VTs.size(); i != e; ++i) {
2691    // New load/store. Record chain and offset operands.
2692    if (VectorInfo[i] & PVF_FIRST) {
2693      assert(StoreOperands.empty() && "Orphaned operand list.");
2694      StoreOperands.push_back(Chain);
2695      StoreOperands.push_back(DAG.getConstant(Offsets[i], dl, MVT::i32));
2696    }
2697
2698    SDValue RetVal = OutVals[i];
2699    if (ExtendIntegerRetVal) {
2700      RetVal = DAG.getNode(Outs[i].Flags.isSExt() ? ISD::SIGN_EXTEND
2701                                                  : ISD::ZERO_EXTEND,
2702                           dl, MVT::i32, RetVal);
2703    } else if (RetVal.getValueSizeInBits() < 16) {
2704      // Use 16-bit registers for small load-stores as it's the
2705      // smallest general purpose register size supported by NVPTX.
2706      RetVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, RetVal);
2707    }
2708
2709    // Record the value to return.
2710    StoreOperands.push_back(RetVal);
2711
2712    // That's the last element of this store op.
2713    if (VectorInfo[i] & PVF_LAST) {
2714      NVPTXISD::NodeType Op;
2715      unsigned NumElts = StoreOperands.size() - 2;
2716      switch (NumElts) {
2717      case 1:
2718        Op = NVPTXISD::StoreRetval;
2719        break;
2720      case 2:
2721        Op = NVPTXISD::StoreRetvalV2;
2722        break;
2723      case 4:
2724        Op = NVPTXISD::StoreRetvalV4;
2725        break;
2726      default:
2727        llvm_unreachable("Invalid vector info.");
2728      }
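      // For instance, a return value whose parts vectorize into a group of
      // four (e.g. four f32 fields) is written back with one StoreRetvalV4.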
2729
2730      // Adjust type of load/store op if we've extended the scalar
2731      // return value.
2732      EVT TheStoreType = ExtendIntegerRetVal ? MVT::i32 : VTs[i];
2733      Chain = DAG.getMemIntrinsicNode(Op, dl, DAG.getVTList(MVT::Other),
2734                                      StoreOperands, TheStoreType,
2735                                      MachinePointerInfo(), /* Align */ 1,
2736                                      MachineMemOperand::MOStore);
2737      // Cleanup vector state.
2738      StoreOperands.clear();
2739    }
2740  }
2741
2742  return DAG.getNode(NVPTXISD::RET_FLAG, dl, MVT::Other, Chain);
2743}
2744
2745void NVPTXTargetLowering::LowerAsmOperandForConstraint(
2746    SDValue Op, std::string &Constraint, std::vector<SDValue> &Ops,
2747    SelectionDAG &DAG) const {
2748  if (Constraint.length() > 1)
2749    return;
2750
2751  TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
2752}
2753
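// Map a texture/tld4 intrinsic ID to the corresponding NVPTXISD opcode.
// Returns 0 for intrinsics that are not texture reads.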
2754static unsigned getOpcForTextureInstr(unsigned Intrinsic) {
2755  switch (Intrinsic) {
2756  default:
2757    return 0;
2758
2759  case Intrinsic::nvvm_tex_1d_v4f32_s32:
2760    return NVPTXISD::Tex1DFloatS32;
2761  case Intrinsic::nvvm_tex_1d_v4f32_f32:
2762    return NVPTXISD::Tex1DFloatFloat;
2763  case Intrinsic::nvvm_tex_1d_level_v4f32_f32:
2764    return NVPTXISD::Tex1DFloatFloatLevel;
2765  case Intrinsic::nvvm_tex_1d_grad_v4f32_f32:
2766    return NVPTXISD::Tex1DFloatFloatGrad;
2767  case Intrinsic::nvvm_tex_1d_v4s32_s32:
2768    return NVPTXISD::Tex1DS32S32;
2769  case Intrinsic::nvvm_tex_1d_v4s32_f32:
2770    return NVPTXISD::Tex1DS32Float;
2771  case Intrinsic::nvvm_tex_1d_level_v4s32_f32:
2772    return NVPTXISD::Tex1DS32FloatLevel;
2773  case Intrinsic::nvvm_tex_1d_grad_v4s32_f32:
2774    return NVPTXISD::Tex1DS32FloatGrad;
2775  case Intrinsic::nvvm_tex_1d_v4u32_s32:
2776    return NVPTXISD::Tex1DU32S32;
2777  case Intrinsic::nvvm_tex_1d_v4u32_f32:
2778    return NVPTXISD::Tex1DU32Float;
2779  case Intrinsic::nvvm_tex_1d_level_v4u32_f32:
2780    return NVPTXISD::Tex1DU32FloatLevel;
2781  case Intrinsic::nvvm_tex_1d_grad_v4u32_f32:
2782    return NVPTXISD::Tex1DU32FloatGrad;
2783
2784  case Intrinsic::nvvm_tex_1d_array_v4f32_s32:
2785    return NVPTXISD::Tex1DArrayFloatS32;
2786  case Intrinsic::nvvm_tex_1d_array_v4f32_f32:
2787    return NVPTXISD::Tex1DArrayFloatFloat;
2788  case Intrinsic::nvvm_tex_1d_array_level_v4f32_f32:
2789    return NVPTXISD::Tex1DArrayFloatFloatLevel;
2790  case Intrinsic::nvvm_tex_1d_array_grad_v4f32_f32:
2791    return NVPTXISD::Tex1DArrayFloatFloatGrad;
2792  case Intrinsic::nvvm_tex_1d_array_v4s32_s32:
2793    return NVPTXISD::Tex1DArrayS32S32;
2794  case Intrinsic::nvvm_tex_1d_array_v4s32_f32:
2795    return NVPTXISD::Tex1DArrayS32Float;
2796  case Intrinsic::nvvm_tex_1d_array_level_v4s32_f32:
2797    return NVPTXISD::Tex1DArrayS32FloatLevel;
2798  case Intrinsic::nvvm_tex_1d_array_grad_v4s32_f32:
2799    return NVPTXISD::Tex1DArrayS32FloatGrad;
2800  case Intrinsic::nvvm_tex_1d_array_v4u32_s32:
2801    return NVPTXISD::Tex1DArrayU32S32;
2802  case Intrinsic::nvvm_tex_1d_array_v4u32_f32:
2803    return NVPTXISD::Tex1DArrayU32Float;
2804  case Intrinsic::nvvm_tex_1d_array_level_v4u32_f32:
2805    return NVPTXISD::Tex1DArrayU32FloatLevel;
2806  case Intrinsic::nvvm_tex_1d_array_grad_v4u32_f32:
2807    return NVPTXISD::Tex1DArrayU32FloatGrad;
2808
2809  case Intrinsic::nvvm_tex_2d_v4f32_s32:
2810    return NVPTXISD::Tex2DFloatS32;
2811  case Intrinsic::nvvm_tex_2d_v4f32_f32:
2812    return NVPTXISD::Tex2DFloatFloat;
2813  case Intrinsic::nvvm_tex_2d_level_v4f32_f32:
2814    return NVPTXISD::Tex2DFloatFloatLevel;
2815  case Intrinsic::nvvm_tex_2d_grad_v4f32_f32:
2816    return NVPTXISD::Tex2DFloatFloatGrad;
2817  case Intrinsic::nvvm_tex_2d_v4s32_s32:
2818    return NVPTXISD::Tex2DS32S32;
2819  case Intrinsic::nvvm_tex_2d_v4s32_f32:
2820    return NVPTXISD::Tex2DS32Float;
2821  case Intrinsic::nvvm_tex_2d_level_v4s32_f32:
2822    return NVPTXISD::Tex2DS32FloatLevel;
2823  case Intrinsic::nvvm_tex_2d_grad_v4s32_f32:
2824    return NVPTXISD::Tex2DS32FloatGrad;
2825  case Intrinsic::nvvm_tex_2d_v4u32_s32:
2826    return NVPTXISD::Tex2DU32S32;
2827  case Intrinsic::nvvm_tex_2d_v4u32_f32:
2828    return NVPTXISD::Tex2DU32Float;
2829  case Intrinsic::nvvm_tex_2d_level_v4u32_f32:
2830    return NVPTXISD::Tex2DU32FloatLevel;
2831  case Intrinsic::nvvm_tex_2d_grad_v4u32_f32:
2832    return NVPTXISD::Tex2DU32FloatGrad;
2833
2834  case Intrinsic::nvvm_tex_2d_array_v4f32_s32:
2835    return NVPTXISD::Tex2DArrayFloatS32;
2836  case Intrinsic::nvvm_tex_2d_array_v4f32_f32:
2837    return NVPTXISD::Tex2DArrayFloatFloat;
2838  case Intrinsic::nvvm_tex_2d_array_level_v4f32_f32:
2839    return NVPTXISD::Tex2DArrayFloatFloatLevel;
2840  case Intrinsic::nvvm_tex_2d_array_grad_v4f32_f32:
2841    return NVPTXISD::Tex2DArrayFloatFloatGrad;
2842  case Intrinsic::nvvm_tex_2d_array_v4s32_s32:
2843    return NVPTXISD::Tex2DArrayS32S32;
2844  case Intrinsic::nvvm_tex_2d_array_v4s32_f32:
2845    return NVPTXISD::Tex2DArrayS32Float;
2846  case Intrinsic::nvvm_tex_2d_array_level_v4s32_f32:
2847    return NVPTXISD::Tex2DArrayS32FloatLevel;
2848  case Intrinsic::nvvm_tex_2d_array_grad_v4s32_f32:
2849    return NVPTXISD::Tex2DArrayS32FloatGrad;
2850  case Intrinsic::nvvm_tex_2d_array_v4u32_s32:
2851    return NVPTXISD::Tex2DArrayU32S32;
2852  case Intrinsic::nvvm_tex_2d_array_v4u32_f32:
2853    return NVPTXISD::Tex2DArrayU32Float;
2854  case Intrinsic::nvvm_tex_2d_array_level_v4u32_f32:
2855    return NVPTXISD::Tex2DArrayU32FloatLevel;
2856  case Intrinsic::nvvm_tex_2d_array_grad_v4u32_f32:
2857    return NVPTXISD::Tex2DArrayU32FloatGrad;
2858
2859  case Intrinsic::nvvm_tex_3d_v4f32_s32:
2860    return NVPTXISD::Tex3DFloatS32;
2861  case Intrinsic::nvvm_tex_3d_v4f32_f32:
2862    return NVPTXISD::Tex3DFloatFloat;
2863  case Intrinsic::nvvm_tex_3d_level_v4f32_f32:
2864    return NVPTXISD::Tex3DFloatFloatLevel;
2865  case Intrinsic::nvvm_tex_3d_grad_v4f32_f32:
2866    return NVPTXISD::Tex3DFloatFloatGrad;
2867  case Intrinsic::nvvm_tex_3d_v4s32_s32:
2868    return NVPTXISD::Tex3DS32S32;
2869  case Intrinsic::nvvm_tex_3d_v4s32_f32:
2870    return NVPTXISD::Tex3DS32Float;
2871  case Intrinsic::nvvm_tex_3d_level_v4s32_f32:
2872    return NVPTXISD::Tex3DS32FloatLevel;
2873  case Intrinsic::nvvm_tex_3d_grad_v4s32_f32:
2874    return NVPTXISD::Tex3DS32FloatGrad;
2875  case Intrinsic::nvvm_tex_3d_v4u32_s32:
2876    return NVPTXISD::Tex3DU32S32;
2877  case Intrinsic::nvvm_tex_3d_v4u32_f32:
2878    return NVPTXISD::Tex3DU32Float;
2879  case Intrinsic::nvvm_tex_3d_level_v4u32_f32:
2880    return NVPTXISD::Tex3DU32FloatLevel;
2881  case Intrinsic::nvvm_tex_3d_grad_v4u32_f32:
2882    return NVPTXISD::Tex3DU32FloatGrad;
2883
2884  case Intrinsic::nvvm_tex_cube_v4f32_f32:
2885    return NVPTXISD::TexCubeFloatFloat;
2886  case Intrinsic::nvvm_tex_cube_level_v4f32_f32:
2887    return NVPTXISD::TexCubeFloatFloatLevel;
2888  case Intrinsic::nvvm_tex_cube_v4s32_f32:
2889    return NVPTXISD::TexCubeS32Float;
2890  case Intrinsic::nvvm_tex_cube_level_v4s32_f32:
2891    return NVPTXISD::TexCubeS32FloatLevel;
2892  case Intrinsic::nvvm_tex_cube_v4u32_f32:
2893    return NVPTXISD::TexCubeU32Float;
2894  case Intrinsic::nvvm_tex_cube_level_v4u32_f32:
2895    return NVPTXISD::TexCubeU32FloatLevel;
2896
2897  case Intrinsic::nvvm_tex_cube_array_v4f32_f32:
2898    return NVPTXISD::TexCubeArrayFloatFloat;
2899  case Intrinsic::nvvm_tex_cube_array_level_v4f32_f32:
2900    return NVPTXISD::TexCubeArrayFloatFloatLevel;
2901  case Intrinsic::nvvm_tex_cube_array_v4s32_f32:
2902    return NVPTXISD::TexCubeArrayS32Float;
2903  case Intrinsic::nvvm_tex_cube_array_level_v4s32_f32:
2904    return NVPTXISD::TexCubeArrayS32FloatLevel;
2905  case Intrinsic::nvvm_tex_cube_array_v4u32_f32:
2906    return NVPTXISD::TexCubeArrayU32Float;
2907  case Intrinsic::nvvm_tex_cube_array_level_v4u32_f32:
2908    return NVPTXISD::TexCubeArrayU32FloatLevel;
2909
2910  case Intrinsic::nvvm_tld4_r_2d_v4f32_f32:
2911    return NVPTXISD::Tld4R2DFloatFloat;
2912  case Intrinsic::nvvm_tld4_g_2d_v4f32_f32:
2913    return NVPTXISD::Tld4G2DFloatFloat;
2914  case Intrinsic::nvvm_tld4_b_2d_v4f32_f32:
2915    return NVPTXISD::Tld4B2DFloatFloat;
2916  case Intrinsic::nvvm_tld4_a_2d_v4f32_f32:
2917    return NVPTXISD::Tld4A2DFloatFloat;
2918  case Intrinsic::nvvm_tld4_r_2d_v4s32_f32:
2919    return NVPTXISD::Tld4R2DS64Float;
2920  case Intrinsic::nvvm_tld4_g_2d_v4s32_f32:
2921    return NVPTXISD::Tld4G2DS64Float;
2922  case Intrinsic::nvvm_tld4_b_2d_v4s32_f32:
2923    return NVPTXISD::Tld4B2DS64Float;
2924  case Intrinsic::nvvm_tld4_a_2d_v4s32_f32:
2925    return NVPTXISD::Tld4A2DS64Float;
2926  case Intrinsic::nvvm_tld4_r_2d_v4u32_f32:
2927    return NVPTXISD::Tld4R2DU64Float;
2928  case Intrinsic::nvvm_tld4_g_2d_v4u32_f32:
2929    return NVPTXISD::Tld4G2DU64Float;
2930  case Intrinsic::nvvm_tld4_b_2d_v4u32_f32:
2931    return NVPTXISD::Tld4B2DU64Float;
2932  case Intrinsic::nvvm_tld4_a_2d_v4u32_f32:
2933    return NVPTXISD::Tld4A2DU64Float;
2934
2935  case Intrinsic::nvvm_tex_unified_1d_v4f32_s32:
2936    return NVPTXISD::TexUnified1DFloatS32;
2937  case Intrinsic::nvvm_tex_unified_1d_v4f32_f32:
2938    return NVPTXISD::TexUnified1DFloatFloat;
2939  case Intrinsic::nvvm_tex_unified_1d_level_v4f32_f32:
2940    return NVPTXISD::TexUnified1DFloatFloatLevel;
2941  case Intrinsic::nvvm_tex_unified_1d_grad_v4f32_f32:
2942    return NVPTXISD::TexUnified1DFloatFloatGrad;
2943  case Intrinsic::nvvm_tex_unified_1d_v4s32_s32:
2944    return NVPTXISD::TexUnified1DS32S32;
2945  case Intrinsic::nvvm_tex_unified_1d_v4s32_f32:
2946    return NVPTXISD::TexUnified1DS32Float;
2947  case Intrinsic::nvvm_tex_unified_1d_level_v4s32_f32:
2948    return NVPTXISD::TexUnified1DS32FloatLevel;
2949  case Intrinsic::nvvm_tex_unified_1d_grad_v4s32_f32:
2950    return NVPTXISD::TexUnified1DS32FloatGrad;
2951  case Intrinsic::nvvm_tex_unified_1d_v4u32_s32:
2952    return NVPTXISD::TexUnified1DU32S32;
2953  case Intrinsic::nvvm_tex_unified_1d_v4u32_f32:
2954    return NVPTXISD::TexUnified1DU32Float;
2955  case Intrinsic::nvvm_tex_unified_1d_level_v4u32_f32:
2956    return NVPTXISD::TexUnified1DU32FloatLevel;
2957  case Intrinsic::nvvm_tex_unified_1d_grad_v4u32_f32:
2958    return NVPTXISD::TexUnified1DU32FloatGrad;
2959
2960  case Intrinsic::nvvm_tex_unified_1d_array_v4f32_s32:
2961    return NVPTXISD::TexUnified1DArrayFloatS32;
2962  case Intrinsic::nvvm_tex_unified_1d_array_v4f32_f32:
2963    return NVPTXISD::TexUnified1DArrayFloatFloat;
2964  case Intrinsic::nvvm_tex_unified_1d_array_level_v4f32_f32:
2965    return NVPTXISD::TexUnified1DArrayFloatFloatLevel;
2966  case Intrinsic::nvvm_tex_unified_1d_array_grad_v4f32_f32:
2967    return NVPTXISD::TexUnified1DArrayFloatFloatGrad;
2968  case Intrinsic::nvvm_tex_unified_1d_array_v4s32_s32:
2969    return NVPTXISD::TexUnified1DArrayS32S32;
2970  case Intrinsic::nvvm_tex_unified_1d_array_v4s32_f32:
2971    return NVPTXISD::TexUnified1DArrayS32Float;
2972  case Intrinsic::nvvm_tex_unified_1d_array_level_v4s32_f32:
2973    return NVPTXISD::TexUnified1DArrayS32FloatLevel;
2974  case Intrinsic::nvvm_tex_unified_1d_array_grad_v4s32_f32:
2975    return NVPTXISD::TexUnified1DArrayS32FloatGrad;
2976  case Intrinsic::nvvm_tex_unified_1d_array_v4u32_s32:
2977    return NVPTXISD::TexUnified1DArrayU32S32;
2978  case Intrinsic::nvvm_tex_unified_1d_array_v4u32_f32:
2979    return NVPTXISD::TexUnified1DArrayU32Float;
2980  case Intrinsic::nvvm_tex_unified_1d_array_level_v4u32_f32:
2981    return NVPTXISD::TexUnified1DArrayU32FloatLevel;
2982  case Intrinsic::nvvm_tex_unified_1d_array_grad_v4u32_f32:
2983    return NVPTXISD::TexUnified1DArrayU32FloatGrad;
2984
2985  case Intrinsic::nvvm_tex_unified_2d_v4f32_s32:
2986    return NVPTXISD::TexUnified2DFloatS32;
2987  case Intrinsic::nvvm_tex_unified_2d_v4f32_f32:
2988    return NVPTXISD::TexUnified2DFloatFloat;
2989  case Intrinsic::nvvm_tex_unified_2d_level_v4f32_f32:
2990    return NVPTXISD::TexUnified2DFloatFloatLevel;
2991  case Intrinsic::nvvm_tex_unified_2d_grad_v4f32_f32:
2992    return NVPTXISD::TexUnified2DFloatFloatGrad;
2993  case Intrinsic::nvvm_tex_unified_2d_v4s32_s32:
2994    return NVPTXISD::TexUnified2DS32S32;
2995  case Intrinsic::nvvm_tex_unified_2d_v4s32_f32:
2996    return NVPTXISD::TexUnified2DS32Float;
2997  case Intrinsic::nvvm_tex_unified_2d_level_v4s32_f32:
2998    return NVPTXISD::TexUnified2DS32FloatLevel;
2999  case Intrinsic::nvvm_tex_unified_2d_grad_v4s32_f32:
3000    return NVPTXISD::TexUnified2DS32FloatGrad;
3001  case Intrinsic::nvvm_tex_unified_2d_v4u32_s32:
3002    return NVPTXISD::TexUnified2DU32S32;
3003  case Intrinsic::nvvm_tex_unified_2d_v4u32_f32:
3004    return NVPTXISD::TexUnified2DU32Float;
3005  case Intrinsic::nvvm_tex_unified_2d_level_v4u32_f32:
3006    return NVPTXISD::TexUnified2DU32FloatLevel;
3007  case Intrinsic::nvvm_tex_unified_2d_grad_v4u32_f32:
3008    return NVPTXISD::TexUnified2DU32FloatGrad;
3009
3010  case Intrinsic::nvvm_tex_unified_2d_array_v4f32_s32:
3011    return NVPTXISD::TexUnified2DArrayFloatS32;
3012  case Intrinsic::nvvm_tex_unified_2d_array_v4f32_f32:
3013    return NVPTXISD::TexUnified2DArrayFloatFloat;
3014  case Intrinsic::nvvm_tex_unified_2d_array_level_v4f32_f32:
3015    return NVPTXISD::TexUnified2DArrayFloatFloatLevel;
3016  case Intrinsic::nvvm_tex_unified_2d_array_grad_v4f32_f32:
3017    return NVPTXISD::TexUnified2DArrayFloatFloatGrad;
3018  case Intrinsic::nvvm_tex_unified_2d_array_v4s32_s32:
3019    return NVPTXISD::TexUnified2DArrayS32S32;
3020  case Intrinsic::nvvm_tex_unified_2d_array_v4s32_f32:
3021    return NVPTXISD::TexUnified2DArrayS32Float;
3022  case Intrinsic::nvvm_tex_unified_2d_array_level_v4s32_f32:
3023    return NVPTXISD::TexUnified2DArrayS32FloatLevel;
3024  case Intrinsic::nvvm_tex_unified_2d_array_grad_v4s32_f32:
3025    return NVPTXISD::TexUnified2DArrayS32FloatGrad;
3026  case Intrinsic::nvvm_tex_unified_2d_array_v4u32_s32:
3027    return NVPTXISD::TexUnified2DArrayU32S32;
3028  case Intrinsic::nvvm_tex_unified_2d_array_v4u32_f32:
3029    return NVPTXISD::TexUnified2DArrayU32Float;
3030  case Intrinsic::nvvm_tex_unified_2d_array_level_v4u32_f32:
3031    return NVPTXISD::TexUnified2DArrayU32FloatLevel;
3032  case Intrinsic::nvvm_tex_unified_2d_array_grad_v4u32_f32:
3033    return NVPTXISD::TexUnified2DArrayU32FloatGrad;
3034
3035  case Intrinsic::nvvm_tex_unified_3d_v4f32_s32:
3036    return NVPTXISD::TexUnified3DFloatS32;
3037  case Intrinsic::nvvm_tex_unified_3d_v4f32_f32:
3038    return NVPTXISD::TexUnified3DFloatFloat;
3039  case Intrinsic::nvvm_tex_unified_3d_level_v4f32_f32:
3040    return NVPTXISD::TexUnified3DFloatFloatLevel;
3041  case Intrinsic::nvvm_tex_unified_3d_grad_v4f32_f32:
3042    return NVPTXISD::TexUnified3DFloatFloatGrad;
3043  case Intrinsic::nvvm_tex_unified_3d_v4s32_s32:
3044    return NVPTXISD::TexUnified3DS32S32;
3045  case Intrinsic::nvvm_tex_unified_3d_v4s32_f32:
3046    return NVPTXISD::TexUnified3DS32Float;
3047  case Intrinsic::nvvm_tex_unified_3d_level_v4s32_f32:
3048    return NVPTXISD::TexUnified3DS32FloatLevel;
3049  case Intrinsic::nvvm_tex_unified_3d_grad_v4s32_f32:
3050    return NVPTXISD::TexUnified3DS32FloatGrad;
3051  case Intrinsic::nvvm_tex_unified_3d_v4u32_s32:
3052    return NVPTXISD::TexUnified3DU32S32;
3053  case Intrinsic::nvvm_tex_unified_3d_v4u32_f32:
3054    return NVPTXISD::TexUnified3DU32Float;
3055  case Intrinsic::nvvm_tex_unified_3d_level_v4u32_f32:
3056    return NVPTXISD::TexUnified3DU32FloatLevel;
3057  case Intrinsic::nvvm_tex_unified_3d_grad_v4u32_f32:
3058    return NVPTXISD::TexUnified3DU32FloatGrad;
3059
3060  case Intrinsic::nvvm_tex_unified_cube_v4f32_f32:
3061    return NVPTXISD::TexUnifiedCubeFloatFloat;
3062  case Intrinsic::nvvm_tex_unified_cube_level_v4f32_f32:
3063    return NVPTXISD::TexUnifiedCubeFloatFloatLevel;
3064  case Intrinsic::nvvm_tex_unified_cube_v4s32_f32:
3065    return NVPTXISD::TexUnifiedCubeS32Float;
3066  case Intrinsic::nvvm_tex_unified_cube_level_v4s32_f32:
3067    return NVPTXISD::TexUnifiedCubeS32FloatLevel;
3068  case Intrinsic::nvvm_tex_unified_cube_v4u32_f32:
3069    return NVPTXISD::TexUnifiedCubeU32Float;
3070  case Intrinsic::nvvm_tex_unified_cube_level_v4u32_f32:
3071    return NVPTXISD::TexUnifiedCubeU32FloatLevel;
3072
3073  case Intrinsic::nvvm_tex_unified_cube_array_v4f32_f32:
3074    return NVPTXISD::TexUnifiedCubeArrayFloatFloat;
3075  case Intrinsic::nvvm_tex_unified_cube_array_level_v4f32_f32:
3076    return NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel;
3077  case Intrinsic::nvvm_tex_unified_cube_array_v4s32_f32:
3078    return NVPTXISD::TexUnifiedCubeArrayS32Float;
3079  case Intrinsic::nvvm_tex_unified_cube_array_level_v4s32_f32:
3080    return NVPTXISD::TexUnifiedCubeArrayS32FloatLevel;
3081  case Intrinsic::nvvm_tex_unified_cube_array_v4u32_f32:
3082    return NVPTXISD::TexUnifiedCubeArrayU32Float;
3083  case Intrinsic::nvvm_tex_unified_cube_array_level_v4u32_f32:
3084    return NVPTXISD::TexUnifiedCubeArrayU32FloatLevel;
3085
3086  case Intrinsic::nvvm_tld4_unified_r_2d_v4f32_f32:
3087    return NVPTXISD::Tld4UnifiedR2DFloatFloat;
3088  case Intrinsic::nvvm_tld4_unified_g_2d_v4f32_f32:
3089    return NVPTXISD::Tld4UnifiedG2DFloatFloat;
3090  case Intrinsic::nvvm_tld4_unified_b_2d_v4f32_f32:
3091    return NVPTXISD::Tld4UnifiedB2DFloatFloat;
3092  case Intrinsic::nvvm_tld4_unified_a_2d_v4f32_f32:
3093    return NVPTXISD::Tld4UnifiedA2DFloatFloat;
3094  case Intrinsic::nvvm_tld4_unified_r_2d_v4s32_f32:
3095    return NVPTXISD::Tld4UnifiedR2DS64Float;
3096  case Intrinsic::nvvm_tld4_unified_g_2d_v4s32_f32:
3097    return NVPTXISD::Tld4UnifiedG2DS64Float;
3098  case Intrinsic::nvvm_tld4_unified_b_2d_v4s32_f32:
3099    return NVPTXISD::Tld4UnifiedB2DS64Float;
3100  case Intrinsic::nvvm_tld4_unified_a_2d_v4s32_f32:
3101    return NVPTXISD::Tld4UnifiedA2DS64Float;
3102  case Intrinsic::nvvm_tld4_unified_r_2d_v4u32_f32:
3103    return NVPTXISD::Tld4UnifiedR2DU64Float;
3104  case Intrinsic::nvvm_tld4_unified_g_2d_v4u32_f32:
3105    return NVPTXISD::Tld4UnifiedG2DU64Float;
3106  case Intrinsic::nvvm_tld4_unified_b_2d_v4u32_f32:
3107    return NVPTXISD::Tld4UnifiedB2DU64Float;
3108  case Intrinsic::nvvm_tld4_unified_a_2d_v4u32_f32:
3109    return NVPTXISD::Tld4UnifiedA2DU64Float;
3110  }
3111}
3112
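// Map a surface-load (suld) intrinsic ID to the corresponding NVPTXISD opcode.
// Returns 0 for intrinsics that are not surface loads.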
3113static unsigned getOpcForSurfaceInstr(unsigned Intrinsic) {
3114  switch (Intrinsic) {
3115  default:
3116    return 0;
3117  case Intrinsic::nvvm_suld_1d_i8_clamp:
3118    return NVPTXISD::Suld1DI8Clamp;
3119  case Intrinsic::nvvm_suld_1d_i16_clamp:
3120    return NVPTXISD::Suld1DI16Clamp;
3121  case Intrinsic::nvvm_suld_1d_i32_clamp:
3122    return NVPTXISD::Suld1DI32Clamp;
3123  case Intrinsic::nvvm_suld_1d_i64_clamp:
3124    return NVPTXISD::Suld1DI64Clamp;
3125  case Intrinsic::nvvm_suld_1d_v2i8_clamp:
3126    return NVPTXISD::Suld1DV2I8Clamp;
3127  case Intrinsic::nvvm_suld_1d_v2i16_clamp:
3128    return NVPTXISD::Suld1DV2I16Clamp;
3129  case Intrinsic::nvvm_suld_1d_v2i32_clamp:
3130    return NVPTXISD::Suld1DV2I32Clamp;
3131  case Intrinsic::nvvm_suld_1d_v2i64_clamp:
3132    return NVPTXISD::Suld1DV2I64Clamp;
3133  case Intrinsic::nvvm_suld_1d_v4i8_clamp:
3134    return NVPTXISD::Suld1DV4I8Clamp;
3135  case Intrinsic::nvvm_suld_1d_v4i16_clamp:
3136    return NVPTXISD::Suld1DV4I16Clamp;
3137  case Intrinsic::nvvm_suld_1d_v4i32_clamp:
3138    return NVPTXISD::Suld1DV4I32Clamp;
3139  case Intrinsic::nvvm_suld_1d_array_i8_clamp:
3140    return NVPTXISD::Suld1DArrayI8Clamp;
3141  case Intrinsic::nvvm_suld_1d_array_i16_clamp:
3142    return NVPTXISD::Suld1DArrayI16Clamp;
3143  case Intrinsic::nvvm_suld_1d_array_i32_clamp:
3144    return NVPTXISD::Suld1DArrayI32Clamp;
3145  case Intrinsic::nvvm_suld_1d_array_i64_clamp:
3146    return NVPTXISD::Suld1DArrayI64Clamp;
3147  case Intrinsic::nvvm_suld_1d_array_v2i8_clamp:
3148    return NVPTXISD::Suld1DArrayV2I8Clamp;
3149  case Intrinsic::nvvm_suld_1d_array_v2i16_clamp:
3150    return NVPTXISD::Suld1DArrayV2I16Clamp;
3151  case Intrinsic::nvvm_suld_1d_array_v2i32_clamp:
3152    return NVPTXISD::Suld1DArrayV2I32Clamp;
3153  case Intrinsic::nvvm_suld_1d_array_v2i64_clamp:
3154    return NVPTXISD::Suld1DArrayV2I64Clamp;
3155  case Intrinsic::nvvm_suld_1d_array_v4i8_clamp:
3156    return NVPTXISD::Suld1DArrayV4I8Clamp;
3157  case Intrinsic::nvvm_suld_1d_array_v4i16_clamp:
3158    return NVPTXISD::Suld1DArrayV4I16Clamp;
3159  case Intrinsic::nvvm_suld_1d_array_v4i32_clamp:
3160    return NVPTXISD::Suld1DArrayV4I32Clamp;
3161  case Intrinsic::nvvm_suld_2d_i8_clamp:
3162    return NVPTXISD::Suld2DI8Clamp;
3163  case Intrinsic::nvvm_suld_2d_i16_clamp:
3164    return NVPTXISD::Suld2DI16Clamp;
3165  case Intrinsic::nvvm_suld_2d_i32_clamp:
3166    return NVPTXISD::Suld2DI32Clamp;
3167  case Intrinsic::nvvm_suld_2d_i64_clamp:
3168    return NVPTXISD::Suld2DI64Clamp;
3169  case Intrinsic::nvvm_suld_2d_v2i8_clamp:
3170    return NVPTXISD::Suld2DV2I8Clamp;
3171  case Intrinsic::nvvm_suld_2d_v2i16_clamp:
3172    return NVPTXISD::Suld2DV2I16Clamp;
3173  case Intrinsic::nvvm_suld_2d_v2i32_clamp:
3174    return NVPTXISD::Suld2DV2I32Clamp;
3175  case Intrinsic::nvvm_suld_2d_v2i64_clamp:
3176    return NVPTXISD::Suld2DV2I64Clamp;
3177  case Intrinsic::nvvm_suld_2d_v4i8_clamp:
3178    return NVPTXISD::Suld2DV4I8Clamp;
3179  case Intrinsic::nvvm_suld_2d_v4i16_clamp:
3180    return NVPTXISD::Suld2DV4I16Clamp;
3181  case Intrinsic::nvvm_suld_2d_v4i32_clamp:
3182    return NVPTXISD::Suld2DV4I32Clamp;
3183  case Intrinsic::nvvm_suld_2d_array_i8_clamp:
3184    return NVPTXISD::Suld2DArrayI8Clamp;
3185  case Intrinsic::nvvm_suld_2d_array_i16_clamp:
3186    return NVPTXISD::Suld2DArrayI16Clamp;
3187  case Intrinsic::nvvm_suld_2d_array_i32_clamp:
3188    return NVPTXISD::Suld2DArrayI32Clamp;
3189  case Intrinsic::nvvm_suld_2d_array_i64_clamp:
3190    return NVPTXISD::Suld2DArrayI64Clamp;
3191  case Intrinsic::nvvm_suld_2d_array_v2i8_clamp:
3192    return NVPTXISD::Suld2DArrayV2I8Clamp;
3193  case Intrinsic::nvvm_suld_2d_array_v2i16_clamp:
3194    return NVPTXISD::Suld2DArrayV2I16Clamp;
3195  case Intrinsic::nvvm_suld_2d_array_v2i32_clamp:
3196    return NVPTXISD::Suld2DArrayV2I32Clamp;
3197  case Intrinsic::nvvm_suld_2d_array_v2i64_clamp:
3198    return NVPTXISD::Suld2DArrayV2I64Clamp;
3199  case Intrinsic::nvvm_suld_2d_array_v4i8_clamp:
3200    return NVPTXISD::Suld2DArrayV4I8Clamp;
3201  case Intrinsic::nvvm_suld_2d_array_v4i16_clamp:
3202    return NVPTXISD::Suld2DArrayV4I16Clamp;
3203  case Intrinsic::nvvm_suld_2d_array_v4i32_clamp:
3204    return NVPTXISD::Suld2DArrayV4I32Clamp;
3205  case Intrinsic::nvvm_suld_3d_i8_clamp:
3206    return NVPTXISD::Suld3DI8Clamp;
3207  case Intrinsic::nvvm_suld_3d_i16_clamp:
3208    return NVPTXISD::Suld3DI16Clamp;
3209  case Intrinsic::nvvm_suld_3d_i32_clamp:
3210    return NVPTXISD::Suld3DI32Clamp;
3211  case Intrinsic::nvvm_suld_3d_i64_clamp:
3212    return NVPTXISD::Suld3DI64Clamp;
3213  case Intrinsic::nvvm_suld_3d_v2i8_clamp:
3214    return NVPTXISD::Suld3DV2I8Clamp;
3215  case Intrinsic::nvvm_suld_3d_v2i16_clamp:
3216    return NVPTXISD::Suld3DV2I16Clamp;
3217  case Intrinsic::nvvm_suld_3d_v2i32_clamp:
3218    return NVPTXISD::Suld3DV2I32Clamp;
3219  case Intrinsic::nvvm_suld_3d_v2i64_clamp:
3220    return NVPTXISD::Suld3DV2I64Clamp;
3221  case Intrinsic::nvvm_suld_3d_v4i8_clamp:
3222    return NVPTXISD::Suld3DV4I8Clamp;
3223  case Intrinsic::nvvm_suld_3d_v4i16_clamp:
3224    return NVPTXISD::Suld3DV4I16Clamp;
3225  case Intrinsic::nvvm_suld_3d_v4i32_clamp:
3226    return NVPTXISD::Suld3DV4I32Clamp;
3227  case Intrinsic::nvvm_suld_1d_i8_trap:
3228    return NVPTXISD::Suld1DI8Trap;
3229  case Intrinsic::nvvm_suld_1d_i16_trap:
3230    return NVPTXISD::Suld1DI16Trap;
3231  case Intrinsic::nvvm_suld_1d_i32_trap:
3232    return NVPTXISD::Suld1DI32Trap;
3233  case Intrinsic::nvvm_suld_1d_i64_trap:
3234    return NVPTXISD::Suld1DI64Trap;
3235  case Intrinsic::nvvm_suld_1d_v2i8_trap:
3236    return NVPTXISD::Suld1DV2I8Trap;
3237  case Intrinsic::nvvm_suld_1d_v2i16_trap:
3238    return NVPTXISD::Suld1DV2I16Trap;
3239  case Intrinsic::nvvm_suld_1d_v2i32_trap:
3240    return NVPTXISD::Suld1DV2I32Trap;
3241  case Intrinsic::nvvm_suld_1d_v2i64_trap:
3242    return NVPTXISD::Suld1DV2I64Trap;
3243  case Intrinsic::nvvm_suld_1d_v4i8_trap:
3244    return NVPTXISD::Suld1DV4I8Trap;
3245  case Intrinsic::nvvm_suld_1d_v4i16_trap:
3246    return NVPTXISD::Suld1DV4I16Trap;
3247  case Intrinsic::nvvm_suld_1d_v4i32_trap:
3248    return NVPTXISD::Suld1DV4I32Trap;
3249  case Intrinsic::nvvm_suld_1d_array_i8_trap:
3250    return NVPTXISD::Suld1DArrayI8Trap;
3251  case Intrinsic::nvvm_suld_1d_array_i16_trap:
3252    return NVPTXISD::Suld1DArrayI16Trap;
3253  case Intrinsic::nvvm_suld_1d_array_i32_trap:
3254    return NVPTXISD::Suld1DArrayI32Trap;
3255  case Intrinsic::nvvm_suld_1d_array_i64_trap:
3256    return NVPTXISD::Suld1DArrayI64Trap;
3257  case Intrinsic::nvvm_suld_1d_array_v2i8_trap:
3258    return NVPTXISD::Suld1DArrayV2I8Trap;
3259  case Intrinsic::nvvm_suld_1d_array_v2i16_trap:
3260    return NVPTXISD::Suld1DArrayV2I16Trap;
3261  case Intrinsic::nvvm_suld_1d_array_v2i32_trap:
3262    return NVPTXISD::Suld1DArrayV2I32Trap;
3263  case Intrinsic::nvvm_suld_1d_array_v2i64_trap:
3264    return NVPTXISD::Suld1DArrayV2I64Trap;
3265  case Intrinsic::nvvm_suld_1d_array_v4i8_trap:
3266    return NVPTXISD::Suld1DArrayV4I8Trap;
3267  case Intrinsic::nvvm_suld_1d_array_v4i16_trap:
3268    return NVPTXISD::Suld1DArrayV4I16Trap;
3269  case Intrinsic::nvvm_suld_1d_array_v4i32_trap:
3270    return NVPTXISD::Suld1DArrayV4I32Trap;
3271  case Intrinsic::nvvm_suld_2d_i8_trap:
3272    return NVPTXISD::Suld2DI8Trap;
3273  case Intrinsic::nvvm_suld_2d_i16_trap:
3274    return NVPTXISD::Suld2DI16Trap;
3275  case Intrinsic::nvvm_suld_2d_i32_trap:
3276    return NVPTXISD::Suld2DI32Trap;
3277  case Intrinsic::nvvm_suld_2d_i64_trap:
3278    return NVPTXISD::Suld2DI64Trap;
3279  case Intrinsic::nvvm_suld_2d_v2i8_trap:
3280    return NVPTXISD::Suld2DV2I8Trap;
3281  case Intrinsic::nvvm_suld_2d_v2i16_trap:
3282    return NVPTXISD::Suld2DV2I16Trap;
3283  case Intrinsic::nvvm_suld_2d_v2i32_trap:
3284    return NVPTXISD::Suld2DV2I32Trap;
3285  case Intrinsic::nvvm_suld_2d_v2i64_trap:
3286    return NVPTXISD::Suld2DV2I64Trap;
3287  case Intrinsic::nvvm_suld_2d_v4i8_trap:
3288    return NVPTXISD::Suld2DV4I8Trap;
3289  case Intrinsic::nvvm_suld_2d_v4i16_trap:
3290    return NVPTXISD::Suld2DV4I16Trap;
3291  case Intrinsic::nvvm_suld_2d_v4i32_trap:
3292    return NVPTXISD::Suld2DV4I32Trap;
3293  case Intrinsic::nvvm_suld_2d_array_i8_trap:
3294    return NVPTXISD::Suld2DArrayI8Trap;
3295  case Intrinsic::nvvm_suld_2d_array_i16_trap:
3296    return NVPTXISD::Suld2DArrayI16Trap;
3297  case Intrinsic::nvvm_suld_2d_array_i32_trap:
3298    return NVPTXISD::Suld2DArrayI32Trap;
3299  case Intrinsic::nvvm_suld_2d_array_i64_trap:
3300    return NVPTXISD::Suld2DArrayI64Trap;
3301  case Intrinsic::nvvm_suld_2d_array_v2i8_trap:
3302    return NVPTXISD::Suld2DArrayV2I8Trap;
3303  case Intrinsic::nvvm_suld_2d_array_v2i16_trap:
3304    return NVPTXISD::Suld2DArrayV2I16Trap;
3305  case Intrinsic::nvvm_suld_2d_array_v2i32_trap:
3306    return NVPTXISD::Suld2DArrayV2I32Trap;
3307  case Intrinsic::nvvm_suld_2d_array_v2i64_trap:
3308    return NVPTXISD::Suld2DArrayV2I64Trap;
3309  case Intrinsic::nvvm_suld_2d_array_v4i8_trap:
3310    return NVPTXISD::Suld2DArrayV4I8Trap;
3311  case Intrinsic::nvvm_suld_2d_array_v4i16_trap:
3312    return NVPTXISD::Suld2DArrayV4I16Trap;
3313  case Intrinsic::nvvm_suld_2d_array_v4i32_trap:
3314    return NVPTXISD::Suld2DArrayV4I32Trap;
3315  case Intrinsic::nvvm_suld_3d_i8_trap:
3316    return NVPTXISD::Suld3DI8Trap;
3317  case Intrinsic::nvvm_suld_3d_i16_trap:
3318    return NVPTXISD::Suld3DI16Trap;
3319  case Intrinsic::nvvm_suld_3d_i32_trap:
3320    return NVPTXISD::Suld3DI32Trap;
3321  case Intrinsic::nvvm_suld_3d_i64_trap:
3322    return NVPTXISD::Suld3DI64Trap;
3323  case Intrinsic::nvvm_suld_3d_v2i8_trap:
3324    return NVPTXISD::Suld3DV2I8Trap;
3325  case Intrinsic::nvvm_suld_3d_v2i16_trap:
3326    return NVPTXISD::Suld3DV2I16Trap;
3327  case Intrinsic::nvvm_suld_3d_v2i32_trap:
3328    return NVPTXISD::Suld3DV2I32Trap;
3329  case Intrinsic::nvvm_suld_3d_v2i64_trap:
3330    return NVPTXISD::Suld3DV2I64Trap;
3331  case Intrinsic::nvvm_suld_3d_v4i8_trap:
3332    return NVPTXISD::Suld3DV4I8Trap;
3333  case Intrinsic::nvvm_suld_3d_v4i16_trap:
3334    return NVPTXISD::Suld3DV4I16Trap;
3335  case Intrinsic::nvvm_suld_3d_v4i32_trap:
3336    return NVPTXISD::Suld3DV4I32Trap;
3337  case Intrinsic::nvvm_suld_1d_i8_zero:
3338    return NVPTXISD::Suld1DI8Zero;
3339  case Intrinsic::nvvm_suld_1d_i16_zero:
3340    return NVPTXISD::Suld1DI16Zero;
3341  case Intrinsic::nvvm_suld_1d_i32_zero:
3342    return NVPTXISD::Suld1DI32Zero;
3343  case Intrinsic::nvvm_suld_1d_i64_zero:
3344    return NVPTXISD::Suld1DI64Zero;
3345  case Intrinsic::nvvm_suld_1d_v2i8_zero:
3346    return NVPTXISD::Suld1DV2I8Zero;
3347  case Intrinsic::nvvm_suld_1d_v2i16_zero:
3348    return NVPTXISD::Suld1DV2I16Zero;
3349  case Intrinsic::nvvm_suld_1d_v2i32_zero:
3350    return NVPTXISD::Suld1DV2I32Zero;
3351  case Intrinsic::nvvm_suld_1d_v2i64_zero:
3352    return NVPTXISD::Suld1DV2I64Zero;
3353  case Intrinsic::nvvm_suld_1d_v4i8_zero:
3354    return NVPTXISD::Suld1DV4I8Zero;
3355  case Intrinsic::nvvm_suld_1d_v4i16_zero:
3356    return NVPTXISD::Suld1DV4I16Zero;
3357  case Intrinsic::nvvm_suld_1d_v4i32_zero:
3358    return NVPTXISD::Suld1DV4I32Zero;
3359  case Intrinsic::nvvm_suld_1d_array_i8_zero:
3360    return NVPTXISD::Suld1DArrayI8Zero;
3361  case Intrinsic::nvvm_suld_1d_array_i16_zero:
3362    return NVPTXISD::Suld1DArrayI16Zero;
3363  case Intrinsic::nvvm_suld_1d_array_i32_zero:
3364    return NVPTXISD::Suld1DArrayI32Zero;
3365  case Intrinsic::nvvm_suld_1d_array_i64_zero:
3366    return NVPTXISD::Suld1DArrayI64Zero;
3367  case Intrinsic::nvvm_suld_1d_array_v2i8_zero:
3368    return NVPTXISD::Suld1DArrayV2I8Zero;
3369  case Intrinsic::nvvm_suld_1d_array_v2i16_zero:
3370    return NVPTXISD::Suld1DArrayV2I16Zero;
3371  case Intrinsic::nvvm_suld_1d_array_v2i32_zero:
3372    return NVPTXISD::Suld1DArrayV2I32Zero;
3373  case Intrinsic::nvvm_suld_1d_array_v2i64_zero:
3374    return NVPTXISD::Suld1DArrayV2I64Zero;
3375  case Intrinsic::nvvm_suld_1d_array_v4i8_zero:
3376    return NVPTXISD::Suld1DArrayV4I8Zero;
3377  case Intrinsic::nvvm_suld_1d_array_v4i16_zero:
3378    return NVPTXISD::Suld1DArrayV4I16Zero;
3379  case Intrinsic::nvvm_suld_1d_array_v4i32_zero:
3380    return NVPTXISD::Suld1DArrayV4I32Zero;
3381  case Intrinsic::nvvm_suld_2d_i8_zero:
3382    return NVPTXISD::Suld2DI8Zero;
3383  case Intrinsic::nvvm_suld_2d_i16_zero:
3384    return NVPTXISD::Suld2DI16Zero;
3385  case Intrinsic::nvvm_suld_2d_i32_zero:
3386    return NVPTXISD::Suld2DI32Zero;
3387  case Intrinsic::nvvm_suld_2d_i64_zero:
3388    return NVPTXISD::Suld2DI64Zero;
3389  case Intrinsic::nvvm_suld_2d_v2i8_zero:
3390    return NVPTXISD::Suld2DV2I8Zero;
3391  case Intrinsic::nvvm_suld_2d_v2i16_zero:
3392    return NVPTXISD::Suld2DV2I16Zero;
3393  case Intrinsic::nvvm_suld_2d_v2i32_zero:
3394    return NVPTXISD::Suld2DV2I32Zero;
3395  case Intrinsic::nvvm_suld_2d_v2i64_zero:
3396    return NVPTXISD::Suld2DV2I64Zero;
3397  case Intrinsic::nvvm_suld_2d_v4i8_zero:
3398    return NVPTXISD::Suld2DV4I8Zero;
3399  case Intrinsic::nvvm_suld_2d_v4i16_zero:
3400    return NVPTXISD::Suld2DV4I16Zero;
3401  case Intrinsic::nvvm_suld_2d_v4i32_zero:
3402    return NVPTXISD::Suld2DV4I32Zero;
3403  case Intrinsic::nvvm_suld_2d_array_i8_zero:
3404    return NVPTXISD::Suld2DArrayI8Zero;
3405  case Intrinsic::nvvm_suld_2d_array_i16_zero:
3406    return NVPTXISD::Suld2DArrayI16Zero;
3407  case Intrinsic::nvvm_suld_2d_array_i32_zero:
3408    return NVPTXISD::Suld2DArrayI32Zero;
3409  case Intrinsic::nvvm_suld_2d_array_i64_zero:
3410    return NVPTXISD::Suld2DArrayI64Zero;
3411  case Intrinsic::nvvm_suld_2d_array_v2i8_zero:
3412    return NVPTXISD::Suld2DArrayV2I8Zero;
3413  case Intrinsic::nvvm_suld_2d_array_v2i16_zero:
3414    return NVPTXISD::Suld2DArrayV2I16Zero;
3415  case Intrinsic::nvvm_suld_2d_array_v2i32_zero:
3416    return NVPTXISD::Suld2DArrayV2I32Zero;
3417  case Intrinsic::nvvm_suld_2d_array_v2i64_zero:
3418    return NVPTXISD::Suld2DArrayV2I64Zero;
3419  case Intrinsic::nvvm_suld_2d_array_v4i8_zero:
3420    return NVPTXISD::Suld2DArrayV4I8Zero;
3421  case Intrinsic::nvvm_suld_2d_array_v4i16_zero:
3422    return NVPTXISD::Suld2DArrayV4I16Zero;
3423  case Intrinsic::nvvm_suld_2d_array_v4i32_zero:
3424    return NVPTXISD::Suld2DArrayV4I32Zero;
3425  case Intrinsic::nvvm_suld_3d_i8_zero:
3426    return NVPTXISD::Suld3DI8Zero;
3427  case Intrinsic::nvvm_suld_3d_i16_zero:
3428    return NVPTXISD::Suld3DI16Zero;
3429  case Intrinsic::nvvm_suld_3d_i32_zero:
3430    return NVPTXISD::Suld3DI32Zero;
3431  case Intrinsic::nvvm_suld_3d_i64_zero:
3432    return NVPTXISD::Suld3DI64Zero;
3433  case Intrinsic::nvvm_suld_3d_v2i8_zero:
3434    return NVPTXISD::Suld3DV2I8Zero;
3435  case Intrinsic::nvvm_suld_3d_v2i16_zero:
3436    return NVPTXISD::Suld3DV2I16Zero;
3437  case Intrinsic::nvvm_suld_3d_v2i32_zero:
3438    return NVPTXISD::Suld3DV2I32Zero;
3439  case Intrinsic::nvvm_suld_3d_v2i64_zero:
3440    return NVPTXISD::Suld3DV2I64Zero;
3441  case Intrinsic::nvvm_suld_3d_v4i8_zero:
3442    return NVPTXISD::Suld3DV4I8Zero;
3443  case Intrinsic::nvvm_suld_3d_v4i16_zero:
3444    return NVPTXISD::Suld3DV4I16Zero;
3445  case Intrinsic::nvvm_suld_3d_v4i32_zero:
3446    return NVPTXISD::Suld3DV4I32Zero;
3447  }
3448}
3449
3450// llvm.ptx.memcpy.const and llvm.ptx.memmove.const need to be modeled as
3451// TgtMemIntrinsic because we need the information that is only available in
3452// the "Value" type of the destination pointer. In particular, the address
3453// space information.
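//
// For each intrinsic handled below, this hook records the type actually read
// or written (Info.memVT), the pointer operand, and the alignment so that the
// DAG builder can attach an accurate MachineMemOperand. For example, the
// nvvm_ldg_global_* cases take their alignment from the intrinsic's second
// argument.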
3455bool NVPTXTargetLowering::getTgtMemIntrinsic(
3456    IntrinsicInfo &Info, const CallInst &I,
3457    MachineFunction &MF, unsigned Intrinsic) const {
3458  switch (Intrinsic) {
3459  default:
3460    return false;
3461  case Intrinsic::nvvm_match_all_sync_i32p:
3462  case Intrinsic::nvvm_match_all_sync_i64p:
3463    Info.opc = ISD::INTRINSIC_W_CHAIN;
3464    // memVT is bogus. These intrinsics have the IntrInaccessibleMemOnly
3465    // attribute in order to model data exchange with other threads, but they
3466    // perform no real memory accesses.
3467    Info.memVT = MVT::i1;
3468
3469    // Our result depends on both our own and the other threads' arguments.
3470    Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
3471    return true;
3472  case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_col:
3473  case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_row:
3474  case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_col_stride:
3475  case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_row_stride:
3476  case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_col:
3477  case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_row:
3478  case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_col_stride:
3479  case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_row_stride:
3480  case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_col:
3481  case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_row:
3482  case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_col_stride:
3483  case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_row_stride:
3484  case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_col:
3485  case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_row:
3486  case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_col_stride:
3487  case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_row_stride:
3488  case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_col:
3489  case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_row:
3490  case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_col_stride:
3491  case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_row_stride:
3492  case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_col:
3493  case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_row:
3494  case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_col_stride:
3495  case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_row_stride: {
3496    Info.opc = ISD::INTRINSIC_W_CHAIN;
3497    Info.memVT = MVT::v8f16;
3498    Info.ptrVal = I.getArgOperand(0);
3499    Info.offset = 0;
3500    Info.flags = MachineMemOperand::MOLoad;
3501    Info.align = Align(16);
3502    return true;
3503  }
3504  case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_col:
3505  case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_col_stride:
3506  case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_col_stride:
3507  case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_col:
3508  case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_row:
3509  case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_row_stride:
3510  case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_row_stride:
3511  case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_row:
3512  case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_col:
3513  case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_col_stride:
3514  case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_col_stride:
3515  case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_col:
3516  case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_row:
3517  case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_row_stride:
3518  case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_row_stride:
3519  case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_row: {
3520    Info.opc = ISD::INTRINSIC_W_CHAIN;
3521    Info.memVT = MVT::v2i32;
3522    Info.ptrVal = I.getArgOperand(0);
3523    Info.offset = 0;
3524    Info.flags = MachineMemOperand::MOLoad;
3525    Info.align = Align(8);
3526    return true;
3527  }
3528
3529  case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_col:
3530  case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_col_stride:
3531  case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_col_stride:
3532  case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_col:
3533  case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_row:
3534  case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_row_stride:
3535  case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_row_stride:
3536  case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_row:
3537
3538  case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_col:
3539  case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_col_stride:
3540  case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_col_stride:
3541  case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_col:
3542  case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_row:
3543  case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_row_stride:
3544  case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_row_stride:
3545  case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_row: {
3546    Info.opc = ISD::INTRINSIC_W_CHAIN;
3547    Info.memVT = MVT::v4i32;
3548    Info.ptrVal = I.getArgOperand(0);
3549    Info.offset = 0;
3550    Info.flags = MachineMemOperand::MOLoad;
3551    Info.align = Align(16);
3552    return true;
3553  }
3554
3555  case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_col:
3556  case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_col_stride:
3557  case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_col_stride:
3558  case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_col:
3559  case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_row:
3560  case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_row_stride:
3561  case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_row_stride:
3562  case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_row:
3563
3564  case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_col:
3565  case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_col_stride:
3566  case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_col_stride:
3567  case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_col:
3568  case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_row:
3569  case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_row_stride:
3570  case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_row_stride:
3571  case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_row:
3572  case Intrinsic::nvvm_wmma_m8n8k128_load_a_b1_row:
3573  case Intrinsic::nvvm_wmma_m8n8k128_load_a_b1_row_stride:
3574  case Intrinsic::nvvm_wmma_m8n8k128_load_b_b1_col:
3575  case Intrinsic::nvvm_wmma_m8n8k128_load_b_b1_col_stride:
3576  case Intrinsic::nvvm_wmma_m8n8k32_load_a_s4_row:
3577  case Intrinsic::nvvm_wmma_m8n8k32_load_a_s4_row_stride:
3578  case Intrinsic::nvvm_wmma_m8n8k32_load_a_u4_row_stride:
3579  case Intrinsic::nvvm_wmma_m8n8k32_load_a_u4_row:
3580  case Intrinsic::nvvm_wmma_m8n8k32_load_b_s4_col:
3581  case Intrinsic::nvvm_wmma_m8n8k32_load_b_s4_col_stride:
3582  case Intrinsic::nvvm_wmma_m8n8k32_load_b_u4_col_stride:
3583  case Intrinsic::nvvm_wmma_m8n8k32_load_b_u4_col: {
3584    Info.opc = ISD::INTRINSIC_W_CHAIN;
3585    Info.memVT = MVT::i32;
3586    Info.ptrVal = I.getArgOperand(0);
3587    Info.offset = 0;
3588    Info.flags = MachineMemOperand::MOLoad;
3589    Info.align = Align(4);
3590    return true;
3591  }
3592
3593  case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_col:
3594  case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_row:
3595  case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_col_stride:
3596  case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_row_stride:
3597  case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_col:
3598  case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_row:
3599  case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_col_stride:
3600  case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_row_stride:
3601  case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_col:
3602  case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_row:
3603  case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_col_stride:
3604  case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_row_stride: {
3605    Info.opc = ISD::INTRINSIC_W_CHAIN;
3606    Info.memVT = MVT::v4f16;
3607    Info.ptrVal = I.getArgOperand(0);
3608    Info.offset = 0;
3609    Info.flags = MachineMemOperand::MOLoad;
3610    Info.align = Align(16);
3611    return true;
3612  }
3613
3614  case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_col:
3615  case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_row:
3616  case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_col_stride:
3617  case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_row_stride:
3618  case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_col:
3619  case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_row:
3620  case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_col_stride:
3621  case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_row_stride:
3622  case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_col:
3623  case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_row:
3624  case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_col_stride:
3625  case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_row_stride: {
3626    Info.opc = ISD::INTRINSIC_W_CHAIN;
3627    Info.memVT = MVT::v8f32;
3628    Info.ptrVal = I.getArgOperand(0);
3629    Info.offset = 0;
3630    Info.flags = MachineMemOperand::MOLoad;
3631    Info.align = Align(16);
3632    return true;
3633  }
3634
3635  case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_col:
3636  case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_col_stride:
3637  case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_row:
3638  case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_row_stride:
3639  case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_col:
3640  case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_col_stride:
3641  case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_row:
3642  case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_row_stride:
3643  case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_col:
3644  case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_col_stride:
3645  case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_row:
3646  case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_row_stride: {
3647    Info.opc = ISD::INTRINSIC_W_CHAIN;
3648    Info.memVT = MVT::v8i32;
3649    Info.ptrVal = I.getArgOperand(0);
3650    Info.offset = 0;
3651    Info.flags = MachineMemOperand::MOLoad;
3652    Info.align = Align(16);
3653    return true;
3654  }
3655
3656  case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_col:
3657  case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_col_stride:
3658  case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_row:
3659  case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_row_stride:
3660  case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_col:
3661  case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_col_stride:
3662  case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_row:
3663  case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_row_stride: {
3664    Info.opc = ISD::INTRINSIC_W_CHAIN;
3665    Info.memVT = MVT::v2i32;
3666    Info.ptrVal = I.getArgOperand(0);
3667    Info.offset = 0;
3668    Info.flags = MachineMemOperand::MOLoad;
3669    Info.align = Align(8);
3670    return true;
3671  }
3672
3673  case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_col:
3674  case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_row:
3675  case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_col_stride:
3676  case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_row_stride:
3677  case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_col:
3678  case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_row:
3679  case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_col_stride:
3680  case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_row_stride:
3681  case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_col:
3682  case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_row:
3683  case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_col_stride:
3684  case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_row_stride: {
3685    Info.opc = ISD::INTRINSIC_VOID;
3686    Info.memVT = MVT::v4f16;
3687    Info.ptrVal = I.getArgOperand(0);
3688    Info.offset = 0;
3689    Info.flags = MachineMemOperand::MOStore;
3690    Info.align = Align(16);
3691    return true;
3692  }
3693
3694  case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_col:
3695  case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_row:
3696  case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_col_stride:
3697  case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_row_stride:
3698  case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_col:
3699  case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_row:
3700  case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_col_stride:
3701  case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_row_stride:
3702  case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_col:
3703  case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_row:
3704  case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_col_stride:
3705  case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_row_stride: {
3706    Info.opc = ISD::INTRINSIC_VOID;
3707    Info.memVT = MVT::v8f32;
3708    Info.ptrVal = I.getArgOperand(0);
3709    Info.offset = 0;
3710    Info.flags = MachineMemOperand::MOStore;
3711    Info.align = Align(16);
3712    return true;
3713  }
3714
3715  case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_col:
3716  case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_col_stride:
3717  case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_row:
3718  case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_row_stride:
3719  case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_col:
3720  case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_col_stride:
3721  case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_row:
3722  case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_row_stride:
3723  case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_col:
3724  case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_col_stride:
3725  case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_row:
3726  case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_row_stride: {
3727    Info.opc = ISD::INTRINSIC_VOID;
3728    Info.memVT = MVT::v8i32;
3729    Info.ptrVal = I.getArgOperand(0);
3730    Info.offset = 0;
3731    Info.flags = MachineMemOperand::MOStore;
3732    Info.align = Align(16);
3733    return true;
3734  }
3735
3736  case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_col:
3737  case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_col_stride:
3738  case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_row:
3739  case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_row_stride:
3740  case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_col:
3741  case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_col_stride:
3742  case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_row:
3743  case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_row_stride: {
3744    Info.opc = ISD::INTRINSIC_VOID;
3745    Info.memVT = MVT::v2i32;
3746    Info.ptrVal = I.getArgOperand(0);
3747    Info.offset = 0;
3748    Info.flags = MachineMemOperand::MOStore;
3749    Info.align = Align(8);
3750    return true;
3751  }
3752
3753  case Intrinsic::nvvm_atomic_load_inc_32:
3754  case Intrinsic::nvvm_atomic_load_dec_32:
3755
3756  case Intrinsic::nvvm_atomic_add_gen_f_cta:
3757  case Intrinsic::nvvm_atomic_add_gen_f_sys:
3758  case Intrinsic::nvvm_atomic_add_gen_i_cta:
3759  case Intrinsic::nvvm_atomic_add_gen_i_sys:
3760  case Intrinsic::nvvm_atomic_and_gen_i_cta:
3761  case Intrinsic::nvvm_atomic_and_gen_i_sys:
3762  case Intrinsic::nvvm_atomic_cas_gen_i_cta:
3763  case Intrinsic::nvvm_atomic_cas_gen_i_sys:
3764  case Intrinsic::nvvm_atomic_dec_gen_i_cta:
3765  case Intrinsic::nvvm_atomic_dec_gen_i_sys:
3766  case Intrinsic::nvvm_atomic_inc_gen_i_cta:
3767  case Intrinsic::nvvm_atomic_inc_gen_i_sys:
3768  case Intrinsic::nvvm_atomic_max_gen_i_cta:
3769  case Intrinsic::nvvm_atomic_max_gen_i_sys:
3770  case Intrinsic::nvvm_atomic_min_gen_i_cta:
3771  case Intrinsic::nvvm_atomic_min_gen_i_sys:
3772  case Intrinsic::nvvm_atomic_or_gen_i_cta:
3773  case Intrinsic::nvvm_atomic_or_gen_i_sys:
3774  case Intrinsic::nvvm_atomic_exch_gen_i_cta:
3775  case Intrinsic::nvvm_atomic_exch_gen_i_sys:
3776  case Intrinsic::nvvm_atomic_xor_gen_i_cta:
3777  case Intrinsic::nvvm_atomic_xor_gen_i_sys: {
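    // These generic-address-space atomics are read-modify-write operations:
    // they both load and store through their pointer operand, so the memory
    // operand is marked as MOLoad | MOStore and no particular alignment is
    // assumed.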
3778    auto &DL = I.getModule()->getDataLayout();
3779    Info.opc = ISD::INTRINSIC_W_CHAIN;
3780    Info.memVT = getValueType(DL, I.getType());
3781    Info.ptrVal = I.getArgOperand(0);
3782    Info.offset = 0;
3783    Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
3784    Info.align.reset();
3785    return true;
3786  }
3787
3788  case Intrinsic::nvvm_ldu_global_i:
3789  case Intrinsic::nvvm_ldu_global_f:
3790  case Intrinsic::nvvm_ldu_global_p: {
3791    auto &DL = I.getModule()->getDataLayout();
3792    Info.opc = ISD::INTRINSIC_W_CHAIN;
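    // The memory VT is normally the intrinsic's result type; for the _p
    // variant the result is a pointer, so the target's pointer type is used
    // instead. The alignment is carried in operand 1 of the intrinsic call.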
3793    if (Intrinsic == Intrinsic::nvvm_ldu_global_i)
3794      Info.memVT = getValueType(DL, I.getType());
    else if (Intrinsic == Intrinsic::nvvm_ldu_global_p)
3796      Info.memVT = getPointerTy(DL);
3797    else
3798      Info.memVT = getValueType(DL, I.getType());
3799    Info.ptrVal = I.getArgOperand(0);
3800    Info.offset = 0;
3801    Info.flags = MachineMemOperand::MOLoad;
3802    Info.align =
3803        MaybeAlign(cast<ConstantInt>(I.getArgOperand(1))->getZExtValue());
3804
3805    return true;
3806  }
3807  case Intrinsic::nvvm_ldg_global_i:
3808  case Intrinsic::nvvm_ldg_global_f:
3809  case Intrinsic::nvvm_ldg_global_p: {
3810    auto &DL = I.getModule()->getDataLayout();
3811
3812    Info.opc = ISD::INTRINSIC_W_CHAIN;
3813    if (Intrinsic == Intrinsic::nvvm_ldg_global_i)
3814      Info.memVT = getValueType(DL, I.getType());
    else if (Intrinsic == Intrinsic::nvvm_ldg_global_p)
3816      Info.memVT = getPointerTy(DL);
3817    else
3818      Info.memVT = getValueType(DL, I.getType());
3819    Info.ptrVal = I.getArgOperand(0);
3820    Info.offset = 0;
3821    Info.flags = MachineMemOperand::MOLoad;
3822    Info.align =
3823        MaybeAlign(cast<ConstantInt>(I.getArgOperand(1))->getZExtValue());
3824
3825    return true;
3826  }
3827
3828  case Intrinsic::nvvm_tex_1d_v4f32_s32:
3829  case Intrinsic::nvvm_tex_1d_v4f32_f32:
3830  case Intrinsic::nvvm_tex_1d_level_v4f32_f32:
3831  case Intrinsic::nvvm_tex_1d_grad_v4f32_f32:
3832  case Intrinsic::nvvm_tex_1d_array_v4f32_s32:
3833  case Intrinsic::nvvm_tex_1d_array_v4f32_f32:
3834  case Intrinsic::nvvm_tex_1d_array_level_v4f32_f32:
3835  case Intrinsic::nvvm_tex_1d_array_grad_v4f32_f32:
3836  case Intrinsic::nvvm_tex_2d_v4f32_s32:
3837  case Intrinsic::nvvm_tex_2d_v4f32_f32:
3838  case Intrinsic::nvvm_tex_2d_level_v4f32_f32:
3839  case Intrinsic::nvvm_tex_2d_grad_v4f32_f32:
3840  case Intrinsic::nvvm_tex_2d_array_v4f32_s32:
3841  case Intrinsic::nvvm_tex_2d_array_v4f32_f32:
3842  case Intrinsic::nvvm_tex_2d_array_level_v4f32_f32:
3843  case Intrinsic::nvvm_tex_2d_array_grad_v4f32_f32:
3844  case Intrinsic::nvvm_tex_3d_v4f32_s32:
3845  case Intrinsic::nvvm_tex_3d_v4f32_f32:
3846  case Intrinsic::nvvm_tex_3d_level_v4f32_f32:
3847  case Intrinsic::nvvm_tex_3d_grad_v4f32_f32:
3848  case Intrinsic::nvvm_tex_cube_v4f32_f32:
3849  case Intrinsic::nvvm_tex_cube_level_v4f32_f32:
3850  case Intrinsic::nvvm_tex_cube_array_v4f32_f32:
3851  case Intrinsic::nvvm_tex_cube_array_level_v4f32_f32:
3852  case Intrinsic::nvvm_tld4_r_2d_v4f32_f32:
3853  case Intrinsic::nvvm_tld4_g_2d_v4f32_f32:
3854  case Intrinsic::nvvm_tld4_b_2d_v4f32_f32:
3855  case Intrinsic::nvvm_tld4_a_2d_v4f32_f32:
3856  case Intrinsic::nvvm_tex_unified_1d_v4f32_s32:
3857  case Intrinsic::nvvm_tex_unified_1d_v4f32_f32:
3858  case Intrinsic::nvvm_tex_unified_1d_level_v4f32_f32:
3859  case Intrinsic::nvvm_tex_unified_1d_grad_v4f32_f32:
3860  case Intrinsic::nvvm_tex_unified_1d_array_v4f32_s32:
3861  case Intrinsic::nvvm_tex_unified_1d_array_v4f32_f32:
3862  case Intrinsic::nvvm_tex_unified_1d_array_level_v4f32_f32:
3863  case Intrinsic::nvvm_tex_unified_1d_array_grad_v4f32_f32:
3864  case Intrinsic::nvvm_tex_unified_2d_v4f32_s32:
3865  case Intrinsic::nvvm_tex_unified_2d_v4f32_f32:
3866  case Intrinsic::nvvm_tex_unified_2d_level_v4f32_f32:
3867  case Intrinsic::nvvm_tex_unified_2d_grad_v4f32_f32:
3868  case Intrinsic::nvvm_tex_unified_2d_array_v4f32_s32:
3869  case Intrinsic::nvvm_tex_unified_2d_array_v4f32_f32:
3870  case Intrinsic::nvvm_tex_unified_2d_array_level_v4f32_f32:
3871  case Intrinsic::nvvm_tex_unified_2d_array_grad_v4f32_f32:
3872  case Intrinsic::nvvm_tex_unified_3d_v4f32_s32:
3873  case Intrinsic::nvvm_tex_unified_3d_v4f32_f32:
3874  case Intrinsic::nvvm_tex_unified_3d_level_v4f32_f32:
3875  case Intrinsic::nvvm_tex_unified_3d_grad_v4f32_f32:
3876  case Intrinsic::nvvm_tex_unified_cube_v4f32_f32:
3877  case Intrinsic::nvvm_tex_unified_cube_level_v4f32_f32:
3878  case Intrinsic::nvvm_tex_unified_cube_array_v4f32_f32:
3879  case Intrinsic::nvvm_tex_unified_cube_array_level_v4f32_f32:
3880  case Intrinsic::nvvm_tld4_unified_r_2d_v4f32_f32:
3881  case Intrinsic::nvvm_tld4_unified_g_2d_v4f32_f32:
3882  case Intrinsic::nvvm_tld4_unified_b_2d_v4f32_f32:
3883  case Intrinsic::nvvm_tld4_unified_a_2d_v4f32_f32:
3884    Info.opc = getOpcForTextureInstr(Intrinsic);
3885    Info.memVT = MVT::v4f32;
3886    Info.ptrVal = nullptr;
3887    Info.offset = 0;
3888    Info.flags = MachineMemOperand::MOLoad;
3889    Info.align = Align(16);
3890    return true;
3891
3892  case Intrinsic::nvvm_tex_1d_v4s32_s32:
3893  case Intrinsic::nvvm_tex_1d_v4s32_f32:
3894  case Intrinsic::nvvm_tex_1d_level_v4s32_f32:
3895  case Intrinsic::nvvm_tex_1d_grad_v4s32_f32:
3896  case Intrinsic::nvvm_tex_1d_array_v4s32_s32:
3897  case Intrinsic::nvvm_tex_1d_array_v4s32_f32:
3898  case Intrinsic::nvvm_tex_1d_array_level_v4s32_f32:
3899  case Intrinsic::nvvm_tex_1d_array_grad_v4s32_f32:
3900  case Intrinsic::nvvm_tex_2d_v4s32_s32:
3901  case Intrinsic::nvvm_tex_2d_v4s32_f32:
3902  case Intrinsic::nvvm_tex_2d_level_v4s32_f32:
3903  case Intrinsic::nvvm_tex_2d_grad_v4s32_f32:
3904  case Intrinsic::nvvm_tex_2d_array_v4s32_s32:
3905  case Intrinsic::nvvm_tex_2d_array_v4s32_f32:
3906  case Intrinsic::nvvm_tex_2d_array_level_v4s32_f32:
3907  case Intrinsic::nvvm_tex_2d_array_grad_v4s32_f32:
3908  case Intrinsic::nvvm_tex_3d_v4s32_s32:
3909  case Intrinsic::nvvm_tex_3d_v4s32_f32:
3910  case Intrinsic::nvvm_tex_3d_level_v4s32_f32:
3911  case Intrinsic::nvvm_tex_3d_grad_v4s32_f32:
3912  case Intrinsic::nvvm_tex_cube_v4s32_f32:
3913  case Intrinsic::nvvm_tex_cube_level_v4s32_f32:
3914  case Intrinsic::nvvm_tex_cube_array_v4s32_f32:
3915  case Intrinsic::nvvm_tex_cube_array_level_v4s32_f32:
3916  case Intrinsic::nvvm_tex_cube_v4u32_f32:
3917  case Intrinsic::nvvm_tex_cube_level_v4u32_f32:
3918  case Intrinsic::nvvm_tex_cube_array_v4u32_f32:
3919  case Intrinsic::nvvm_tex_cube_array_level_v4u32_f32:
3920  case Intrinsic::nvvm_tex_1d_v4u32_s32:
3921  case Intrinsic::nvvm_tex_1d_v4u32_f32:
3922  case Intrinsic::nvvm_tex_1d_level_v4u32_f32:
3923  case Intrinsic::nvvm_tex_1d_grad_v4u32_f32:
3924  case Intrinsic::nvvm_tex_1d_array_v4u32_s32:
3925  case Intrinsic::nvvm_tex_1d_array_v4u32_f32:
3926  case Intrinsic::nvvm_tex_1d_array_level_v4u32_f32:
3927  case Intrinsic::nvvm_tex_1d_array_grad_v4u32_f32:
3928  case Intrinsic::nvvm_tex_2d_v4u32_s32:
3929  case Intrinsic::nvvm_tex_2d_v4u32_f32:
3930  case Intrinsic::nvvm_tex_2d_level_v4u32_f32:
3931  case Intrinsic::nvvm_tex_2d_grad_v4u32_f32:
3932  case Intrinsic::nvvm_tex_2d_array_v4u32_s32:
3933  case Intrinsic::nvvm_tex_2d_array_v4u32_f32:
3934  case Intrinsic::nvvm_tex_2d_array_level_v4u32_f32:
3935  case Intrinsic::nvvm_tex_2d_array_grad_v4u32_f32:
3936  case Intrinsic::nvvm_tex_3d_v4u32_s32:
3937  case Intrinsic::nvvm_tex_3d_v4u32_f32:
3938  case Intrinsic::nvvm_tex_3d_level_v4u32_f32:
3939  case Intrinsic::nvvm_tex_3d_grad_v4u32_f32:
3940  case Intrinsic::nvvm_tld4_r_2d_v4s32_f32:
3941  case Intrinsic::nvvm_tld4_g_2d_v4s32_f32:
3942  case Intrinsic::nvvm_tld4_b_2d_v4s32_f32:
3943  case Intrinsic::nvvm_tld4_a_2d_v4s32_f32:
3944  case Intrinsic::nvvm_tld4_r_2d_v4u32_f32:
3945  case Intrinsic::nvvm_tld4_g_2d_v4u32_f32:
3946  case Intrinsic::nvvm_tld4_b_2d_v4u32_f32:
3947  case Intrinsic::nvvm_tld4_a_2d_v4u32_f32:
3948  case Intrinsic::nvvm_tex_unified_1d_v4s32_s32:
3949  case Intrinsic::nvvm_tex_unified_1d_v4s32_f32:
3950  case Intrinsic::nvvm_tex_unified_1d_level_v4s32_f32:
3951  case Intrinsic::nvvm_tex_unified_1d_grad_v4s32_f32:
3952  case Intrinsic::nvvm_tex_unified_1d_array_v4s32_s32:
3953  case Intrinsic::nvvm_tex_unified_1d_array_v4s32_f32:
3954  case Intrinsic::nvvm_tex_unified_1d_array_level_v4s32_f32:
3955  case Intrinsic::nvvm_tex_unified_1d_array_grad_v4s32_f32:
3956  case Intrinsic::nvvm_tex_unified_2d_v4s32_s32:
3957  case Intrinsic::nvvm_tex_unified_2d_v4s32_f32:
3958  case Intrinsic::nvvm_tex_unified_2d_level_v4s32_f32:
3959  case Intrinsic::nvvm_tex_unified_2d_grad_v4s32_f32:
3960  case Intrinsic::nvvm_tex_unified_2d_array_v4s32_s32:
3961  case Intrinsic::nvvm_tex_unified_2d_array_v4s32_f32:
3962  case Intrinsic::nvvm_tex_unified_2d_array_level_v4s32_f32:
3963  case Intrinsic::nvvm_tex_unified_2d_array_grad_v4s32_f32:
3964  case Intrinsic::nvvm_tex_unified_3d_v4s32_s32:
3965  case Intrinsic::nvvm_tex_unified_3d_v4s32_f32:
3966  case Intrinsic::nvvm_tex_unified_3d_level_v4s32_f32:
3967  case Intrinsic::nvvm_tex_unified_3d_grad_v4s32_f32:
3968  case Intrinsic::nvvm_tex_unified_1d_v4u32_s32:
3969  case Intrinsic::nvvm_tex_unified_1d_v4u32_f32:
3970  case Intrinsic::nvvm_tex_unified_1d_level_v4u32_f32:
3971  case Intrinsic::nvvm_tex_unified_1d_grad_v4u32_f32:
3972  case Intrinsic::nvvm_tex_unified_1d_array_v4u32_s32:
3973  case Intrinsic::nvvm_tex_unified_1d_array_v4u32_f32:
3974  case Intrinsic::nvvm_tex_unified_1d_array_level_v4u32_f32:
3975  case Intrinsic::nvvm_tex_unified_1d_array_grad_v4u32_f32:
3976  case Intrinsic::nvvm_tex_unified_2d_v4u32_s32:
3977  case Intrinsic::nvvm_tex_unified_2d_v4u32_f32:
3978  case Intrinsic::nvvm_tex_unified_2d_level_v4u32_f32:
3979  case Intrinsic::nvvm_tex_unified_2d_grad_v4u32_f32:
3980  case Intrinsic::nvvm_tex_unified_2d_array_v4u32_s32:
3981  case Intrinsic::nvvm_tex_unified_2d_array_v4u32_f32:
3982  case Intrinsic::nvvm_tex_unified_2d_array_level_v4u32_f32:
3983  case Intrinsic::nvvm_tex_unified_2d_array_grad_v4u32_f32:
3984  case Intrinsic::nvvm_tex_unified_3d_v4u32_s32:
3985  case Intrinsic::nvvm_tex_unified_3d_v4u32_f32:
3986  case Intrinsic::nvvm_tex_unified_3d_level_v4u32_f32:
3987  case Intrinsic::nvvm_tex_unified_3d_grad_v4u32_f32:
3988  case Intrinsic::nvvm_tex_unified_cube_v4s32_f32:
3989  case Intrinsic::nvvm_tex_unified_cube_level_v4s32_f32:
3990  case Intrinsic::nvvm_tex_unified_cube_array_v4s32_f32:
3991  case Intrinsic::nvvm_tex_unified_cube_array_level_v4s32_f32:
3992  case Intrinsic::nvvm_tex_unified_cube_v4u32_f32:
3993  case Intrinsic::nvvm_tex_unified_cube_level_v4u32_f32:
3994  case Intrinsic::nvvm_tex_unified_cube_array_v4u32_f32:
3995  case Intrinsic::nvvm_tex_unified_cube_array_level_v4u32_f32:
3996  case Intrinsic::nvvm_tld4_unified_r_2d_v4s32_f32:
3997  case Intrinsic::nvvm_tld4_unified_g_2d_v4s32_f32:
3998  case Intrinsic::nvvm_tld4_unified_b_2d_v4s32_f32:
3999  case Intrinsic::nvvm_tld4_unified_a_2d_v4s32_f32:
4000  case Intrinsic::nvvm_tld4_unified_r_2d_v4u32_f32:
4001  case Intrinsic::nvvm_tld4_unified_g_2d_v4u32_f32:
4002  case Intrinsic::nvvm_tld4_unified_b_2d_v4u32_f32:
4003  case Intrinsic::nvvm_tld4_unified_a_2d_v4u32_f32:
4004    Info.opc = getOpcForTextureInstr(Intrinsic);
4005    Info.memVT = MVT::v4i32;
4006    Info.ptrVal = nullptr;
4007    Info.offset = 0;
4008    Info.flags = MachineMemOperand::MOLoad;
4009    Info.align = Align(16);
4010    return true;
4011
4012  case Intrinsic::nvvm_suld_1d_i8_clamp:
4013  case Intrinsic::nvvm_suld_1d_v2i8_clamp:
4014  case Intrinsic::nvvm_suld_1d_v4i8_clamp:
4015  case Intrinsic::nvvm_suld_1d_array_i8_clamp:
4016  case Intrinsic::nvvm_suld_1d_array_v2i8_clamp:
4017  case Intrinsic::nvvm_suld_1d_array_v4i8_clamp:
4018  case Intrinsic::nvvm_suld_2d_i8_clamp:
4019  case Intrinsic::nvvm_suld_2d_v2i8_clamp:
4020  case Intrinsic::nvvm_suld_2d_v4i8_clamp:
4021  case Intrinsic::nvvm_suld_2d_array_i8_clamp:
4022  case Intrinsic::nvvm_suld_2d_array_v2i8_clamp:
4023  case Intrinsic::nvvm_suld_2d_array_v4i8_clamp:
4024  case Intrinsic::nvvm_suld_3d_i8_clamp:
4025  case Intrinsic::nvvm_suld_3d_v2i8_clamp:
4026  case Intrinsic::nvvm_suld_3d_v4i8_clamp:
4027  case Intrinsic::nvvm_suld_1d_i8_trap:
4028  case Intrinsic::nvvm_suld_1d_v2i8_trap:
4029  case Intrinsic::nvvm_suld_1d_v4i8_trap:
4030  case Intrinsic::nvvm_suld_1d_array_i8_trap:
4031  case Intrinsic::nvvm_suld_1d_array_v2i8_trap:
4032  case Intrinsic::nvvm_suld_1d_array_v4i8_trap:
4033  case Intrinsic::nvvm_suld_2d_i8_trap:
4034  case Intrinsic::nvvm_suld_2d_v2i8_trap:
4035  case Intrinsic::nvvm_suld_2d_v4i8_trap:
4036  case Intrinsic::nvvm_suld_2d_array_i8_trap:
4037  case Intrinsic::nvvm_suld_2d_array_v2i8_trap:
4038  case Intrinsic::nvvm_suld_2d_array_v4i8_trap:
4039  case Intrinsic::nvvm_suld_3d_i8_trap:
4040  case Intrinsic::nvvm_suld_3d_v2i8_trap:
4041  case Intrinsic::nvvm_suld_3d_v4i8_trap:
4042  case Intrinsic::nvvm_suld_1d_i8_zero:
4043  case Intrinsic::nvvm_suld_1d_v2i8_zero:
4044  case Intrinsic::nvvm_suld_1d_v4i8_zero:
4045  case Intrinsic::nvvm_suld_1d_array_i8_zero:
4046  case Intrinsic::nvvm_suld_1d_array_v2i8_zero:
4047  case Intrinsic::nvvm_suld_1d_array_v4i8_zero:
4048  case Intrinsic::nvvm_suld_2d_i8_zero:
4049  case Intrinsic::nvvm_suld_2d_v2i8_zero:
4050  case Intrinsic::nvvm_suld_2d_v4i8_zero:
4051  case Intrinsic::nvvm_suld_2d_array_i8_zero:
4052  case Intrinsic::nvvm_suld_2d_array_v2i8_zero:
4053  case Intrinsic::nvvm_suld_2d_array_v4i8_zero:
4054  case Intrinsic::nvvm_suld_3d_i8_zero:
4055  case Intrinsic::nvvm_suld_3d_v2i8_zero:
4056  case Intrinsic::nvvm_suld_3d_v4i8_zero:
4057    Info.opc = getOpcForSurfaceInstr(Intrinsic);
4058    Info.memVT = MVT::i8;
4059    Info.ptrVal = nullptr;
4060    Info.offset = 0;
4061    Info.flags = MachineMemOperand::MOLoad;
4062    Info.align = Align(16);
4063    return true;
4064
4065  case Intrinsic::nvvm_suld_1d_i16_clamp:
4066  case Intrinsic::nvvm_suld_1d_v2i16_clamp:
4067  case Intrinsic::nvvm_suld_1d_v4i16_clamp:
4068  case Intrinsic::nvvm_suld_1d_array_i16_clamp:
4069  case Intrinsic::nvvm_suld_1d_array_v2i16_clamp:
4070  case Intrinsic::nvvm_suld_1d_array_v4i16_clamp:
4071  case Intrinsic::nvvm_suld_2d_i16_clamp:
4072  case Intrinsic::nvvm_suld_2d_v2i16_clamp:
4073  case Intrinsic::nvvm_suld_2d_v4i16_clamp:
4074  case Intrinsic::nvvm_suld_2d_array_i16_clamp:
4075  case Intrinsic::nvvm_suld_2d_array_v2i16_clamp:
4076  case Intrinsic::nvvm_suld_2d_array_v4i16_clamp:
4077  case Intrinsic::nvvm_suld_3d_i16_clamp:
4078  case Intrinsic::nvvm_suld_3d_v2i16_clamp:
4079  case Intrinsic::nvvm_suld_3d_v4i16_clamp:
4080  case Intrinsic::nvvm_suld_1d_i16_trap:
4081  case Intrinsic::nvvm_suld_1d_v2i16_trap:
4082  case Intrinsic::nvvm_suld_1d_v4i16_trap:
4083  case Intrinsic::nvvm_suld_1d_array_i16_trap:
4084  case Intrinsic::nvvm_suld_1d_array_v2i16_trap:
4085  case Intrinsic::nvvm_suld_1d_array_v4i16_trap:
4086  case Intrinsic::nvvm_suld_2d_i16_trap:
4087  case Intrinsic::nvvm_suld_2d_v2i16_trap:
4088  case Intrinsic::nvvm_suld_2d_v4i16_trap:
4089  case Intrinsic::nvvm_suld_2d_array_i16_trap:
4090  case Intrinsic::nvvm_suld_2d_array_v2i16_trap:
4091  case Intrinsic::nvvm_suld_2d_array_v4i16_trap:
4092  case Intrinsic::nvvm_suld_3d_i16_trap:
4093  case Intrinsic::nvvm_suld_3d_v2i16_trap:
4094  case Intrinsic::nvvm_suld_3d_v4i16_trap:
4095  case Intrinsic::nvvm_suld_1d_i16_zero:
4096  case Intrinsic::nvvm_suld_1d_v2i16_zero:
4097  case Intrinsic::nvvm_suld_1d_v4i16_zero:
4098  case Intrinsic::nvvm_suld_1d_array_i16_zero:
4099  case Intrinsic::nvvm_suld_1d_array_v2i16_zero:
4100  case Intrinsic::nvvm_suld_1d_array_v4i16_zero:
4101  case Intrinsic::nvvm_suld_2d_i16_zero:
4102  case Intrinsic::nvvm_suld_2d_v2i16_zero:
4103  case Intrinsic::nvvm_suld_2d_v4i16_zero:
4104  case Intrinsic::nvvm_suld_2d_array_i16_zero:
4105  case Intrinsic::nvvm_suld_2d_array_v2i16_zero:
4106  case Intrinsic::nvvm_suld_2d_array_v4i16_zero:
4107  case Intrinsic::nvvm_suld_3d_i16_zero:
4108  case Intrinsic::nvvm_suld_3d_v2i16_zero:
4109  case Intrinsic::nvvm_suld_3d_v4i16_zero:
4110    Info.opc = getOpcForSurfaceInstr(Intrinsic);
4111    Info.memVT = MVT::i16;
4112    Info.ptrVal = nullptr;
4113    Info.offset = 0;
4114    Info.flags = MachineMemOperand::MOLoad;
4115    Info.align = Align(16);
4116    return true;
4117
4118  case Intrinsic::nvvm_suld_1d_i32_clamp:
4119  case Intrinsic::nvvm_suld_1d_v2i32_clamp:
4120  case Intrinsic::nvvm_suld_1d_v4i32_clamp:
4121  case Intrinsic::nvvm_suld_1d_array_i32_clamp:
4122  case Intrinsic::nvvm_suld_1d_array_v2i32_clamp:
4123  case Intrinsic::nvvm_suld_1d_array_v4i32_clamp:
4124  case Intrinsic::nvvm_suld_2d_i32_clamp:
4125  case Intrinsic::nvvm_suld_2d_v2i32_clamp:
4126  case Intrinsic::nvvm_suld_2d_v4i32_clamp:
4127  case Intrinsic::nvvm_suld_2d_array_i32_clamp:
4128  case Intrinsic::nvvm_suld_2d_array_v2i32_clamp:
4129  case Intrinsic::nvvm_suld_2d_array_v4i32_clamp:
4130  case Intrinsic::nvvm_suld_3d_i32_clamp:
4131  case Intrinsic::nvvm_suld_3d_v2i32_clamp:
4132  case Intrinsic::nvvm_suld_3d_v4i32_clamp:
4133  case Intrinsic::nvvm_suld_1d_i32_trap:
4134  case Intrinsic::nvvm_suld_1d_v2i32_trap:
4135  case Intrinsic::nvvm_suld_1d_v4i32_trap:
4136  case Intrinsic::nvvm_suld_1d_array_i32_trap:
4137  case Intrinsic::nvvm_suld_1d_array_v2i32_trap:
4138  case Intrinsic::nvvm_suld_1d_array_v4i32_trap:
4139  case Intrinsic::nvvm_suld_2d_i32_trap:
4140  case Intrinsic::nvvm_suld_2d_v2i32_trap:
4141  case Intrinsic::nvvm_suld_2d_v4i32_trap:
4142  case Intrinsic::nvvm_suld_2d_array_i32_trap:
4143  case Intrinsic::nvvm_suld_2d_array_v2i32_trap:
4144  case Intrinsic::nvvm_suld_2d_array_v4i32_trap:
4145  case Intrinsic::nvvm_suld_3d_i32_trap:
4146  case Intrinsic::nvvm_suld_3d_v2i32_trap:
4147  case Intrinsic::nvvm_suld_3d_v4i32_trap:
4148  case Intrinsic::nvvm_suld_1d_i32_zero:
4149  case Intrinsic::nvvm_suld_1d_v2i32_zero:
4150  case Intrinsic::nvvm_suld_1d_v4i32_zero:
4151  case Intrinsic::nvvm_suld_1d_array_i32_zero:
4152  case Intrinsic::nvvm_suld_1d_array_v2i32_zero:
4153  case Intrinsic::nvvm_suld_1d_array_v4i32_zero:
4154  case Intrinsic::nvvm_suld_2d_i32_zero:
4155  case Intrinsic::nvvm_suld_2d_v2i32_zero:
4156  case Intrinsic::nvvm_suld_2d_v4i32_zero:
4157  case Intrinsic::nvvm_suld_2d_array_i32_zero:
4158  case Intrinsic::nvvm_suld_2d_array_v2i32_zero:
4159  case Intrinsic::nvvm_suld_2d_array_v4i32_zero:
4160  case Intrinsic::nvvm_suld_3d_i32_zero:
4161  case Intrinsic::nvvm_suld_3d_v2i32_zero:
4162  case Intrinsic::nvvm_suld_3d_v4i32_zero:
4163    Info.opc = getOpcForSurfaceInstr(Intrinsic);
4164    Info.memVT = MVT::i32;
4165    Info.ptrVal = nullptr;
4166    Info.offset = 0;
4167    Info.flags = MachineMemOperand::MOLoad;
4168    Info.align = Align(16);
4169    return true;
4170
4171  case Intrinsic::nvvm_suld_1d_i64_clamp:
4172  case Intrinsic::nvvm_suld_1d_v2i64_clamp:
4173  case Intrinsic::nvvm_suld_1d_array_i64_clamp:
4174  case Intrinsic::nvvm_suld_1d_array_v2i64_clamp:
4175  case Intrinsic::nvvm_suld_2d_i64_clamp:
4176  case Intrinsic::nvvm_suld_2d_v2i64_clamp:
4177  case Intrinsic::nvvm_suld_2d_array_i64_clamp:
4178  case Intrinsic::nvvm_suld_2d_array_v2i64_clamp:
4179  case Intrinsic::nvvm_suld_3d_i64_clamp:
4180  case Intrinsic::nvvm_suld_3d_v2i64_clamp:
4181  case Intrinsic::nvvm_suld_1d_i64_trap:
4182  case Intrinsic::nvvm_suld_1d_v2i64_trap:
4183  case Intrinsic::nvvm_suld_1d_array_i64_trap:
4184  case Intrinsic::nvvm_suld_1d_array_v2i64_trap:
4185  case Intrinsic::nvvm_suld_2d_i64_trap:
4186  case Intrinsic::nvvm_suld_2d_v2i64_trap:
4187  case Intrinsic::nvvm_suld_2d_array_i64_trap:
4188  case Intrinsic::nvvm_suld_2d_array_v2i64_trap:
4189  case Intrinsic::nvvm_suld_3d_i64_trap:
4190  case Intrinsic::nvvm_suld_3d_v2i64_trap:
4191  case Intrinsic::nvvm_suld_1d_i64_zero:
4192  case Intrinsic::nvvm_suld_1d_v2i64_zero:
4193  case Intrinsic::nvvm_suld_1d_array_i64_zero:
4194  case Intrinsic::nvvm_suld_1d_array_v2i64_zero:
4195  case Intrinsic::nvvm_suld_2d_i64_zero:
4196  case Intrinsic::nvvm_suld_2d_v2i64_zero:
4197  case Intrinsic::nvvm_suld_2d_array_i64_zero:
4198  case Intrinsic::nvvm_suld_2d_array_v2i64_zero:
4199  case Intrinsic::nvvm_suld_3d_i64_zero:
4200  case Intrinsic::nvvm_suld_3d_v2i64_zero:
4201    Info.opc = getOpcForSurfaceInstr(Intrinsic);
4202    Info.memVT = MVT::i64;
4203    Info.ptrVal = nullptr;
4204    Info.offset = 0;
4205    Info.flags = MachineMemOperand::MOLoad;
4206    Info.align = Align(16);
4207    return true;
4208  }
4209  return false;
4210}
4211
4212/// isLegalAddressingMode - Return true if the addressing mode represented
4213/// by AM is legal for this target, for a load/store of the specified type.
4214/// Used to guide target specific optimizations, like loop strength reduction
4215/// (LoopStrengthReduce.cpp) and memory optimization for address mode
4216/// (CodeGenPrepare.cpp)
4217bool NVPTXTargetLowering::isLegalAddressingMode(const DataLayout &DL,
4218                                                const AddrMode &AM, Type *Ty,
                                                unsigned AS,
                                                Instruction *I) const {
4220  // AddrMode - This represents an addressing mode of:
4221  //    BaseGV + BaseOffs + BaseReg + Scale*ScaleReg
4222  //
4223  // The legal address modes are
4224  // - [avar]
4225  // - [areg]
4226  // - [areg+immoff]
4227  // - [immAddr]
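  //
  // For example (illustrative PTX), "ld.global.f32 %f1, [%rd4+16];" uses the
  // [areg+immoff] form and "ld.shared.u32 %r1, [sharedVar];" uses [avar];
  // scaled-index forms such as [%r1+%r2*4] have no PTX equivalent.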
4228
4229  if (AM.BaseGV) {
4230    return !AM.BaseOffs && !AM.HasBaseReg && !AM.Scale;
4231  }
4232
4233  switch (AM.Scale) {
4234  case 0: // "r", "r+i" or "i" is allowed
4235    break;
4236  case 1:
4237    if (AM.HasBaseReg) // "r+r+i" or "r+r" is not allowed.
4238      return false;
4239    // Otherwise we have r+i.
4240    break;
4241  default:
4242    // No scale > 1 is allowed
4243    return false;
4244  }
4245  return true;
4246}
4247
4248//===----------------------------------------------------------------------===//
4249//                         NVPTX Inline Assembly Support
4250//===----------------------------------------------------------------------===//
4251
4252/// getConstraintType - Given a constraint letter, return the type of
4253/// constraint it is for this target.
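/// For example (illustrative), asm("mov.u32 %0, 42;" : "=r"(x)) uses the 'r'
/// constraint; it is classified here as a register-class constraint and later
/// mapped to the 32-bit integer register class in
/// getRegForInlineAsmConstraint.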
4254NVPTXTargetLowering::ConstraintType
4255NVPTXTargetLowering::getConstraintType(StringRef Constraint) const {
4256  if (Constraint.size() == 1) {
4257    switch (Constraint[0]) {
4258    default:
4259      break;
4260    case 'b':
4261    case 'r':
4262    case 'h':
4263    case 'c':
4264    case 'l':
4265    case 'f':
4266    case 'd':
4267    case '0':
4268    case 'N':
4269      return C_RegisterClass;
4270    }
4271  }
4272  return TargetLowering::getConstraintType(Constraint);
4273}
4274
4275std::pair<unsigned, const TargetRegisterClass *>
4276NVPTXTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
4277                                                  StringRef Constraint,
4278                                                  MVT VT) const {
4279  if (Constraint.size() == 1) {
4280    switch (Constraint[0]) {
4281    case 'b':
4282      return std::make_pair(0U, &NVPTX::Int1RegsRegClass);
4283    case 'c':
4284      return std::make_pair(0U, &NVPTX::Int16RegsRegClass);
4285    case 'h':
4286      return std::make_pair(0U, &NVPTX::Int16RegsRegClass);
4287    case 'r':
4288      return std::make_pair(0U, &NVPTX::Int32RegsRegClass);
4289    case 'l':
4290    case 'N':
4291      return std::make_pair(0U, &NVPTX::Int64RegsRegClass);
4292    case 'f':
4293      return std::make_pair(0U, &NVPTX::Float32RegsRegClass);
4294    case 'd':
4295      return std::make_pair(0U, &NVPTX::Float64RegsRegClass);
4296    }
4297  }
4298  return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
4299}
4300
4301//===----------------------------------------------------------------------===//
4302//                         NVPTX DAG Combining
4303//===----------------------------------------------------------------------===//
4304
4305bool NVPTXTargetLowering::allowFMA(MachineFunction &MF,
4306                                   CodeGenOpt::Level OptLevel) const {
4307  // Always honor command-line argument
4308  if (FMAContractLevelOpt.getNumOccurrences() > 0)
4309    return FMAContractLevelOpt > 0;
4310
4311  // Do not contract if we're not optimizing the code.
4312  if (OptLevel == 0)
4313    return false;
4314
4315  // Honor TargetOptions flags that explicitly say fusion is okay.
4316  if (MF.getTarget().Options.AllowFPOpFusion == FPOpFusion::Fast)
4317    return true;
4318
4319  return allowUnsafeFPMath(MF);
4320}
4321
4322bool NVPTXTargetLowering::allowUnsafeFPMath(MachineFunction &MF) const {
4323  // Honor TargetOptions flags that explicitly say unsafe math is okay.
4324  if (MF.getTarget().Options.UnsafeFPMath)
4325    return true;
4326
4327  // Allow unsafe math if unsafe-fp-math attribute explicitly says so.
4328  const Function &F = MF.getFunction();
4329  if (F.hasFnAttribute("unsafe-fp-math")) {
4330    Attribute Attr = F.getFnAttribute("unsafe-fp-math");
4331    StringRef Val = Attr.getValueAsString();
4332    if (Val == "true")
4333      return true;
4334  }
4335
4336  return false;
4337}
4338
4339/// PerformADDCombineWithOperands - Try DAG combinations for an ADD with
4340/// operands N0 and N1.  This is a helper for PerformADDCombine that is
4341/// called with the default operands, and if that fails, with commuted
4342/// operands.
4343static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
4344                                           TargetLowering::DAGCombinerInfo &DCI,
4345                                             const NVPTXSubtarget &Subtarget,
4346                                             CodeGenOpt::Level OptLevel) {
  SelectionDAG &DAG = DCI.DAG;
  // Skip the vector case; only scalar values are handled here.
  EVT VT = N0.getValueType();
  if (VT.isVector())
    return SDValue();
4352
4353  // fold (add (mul a, b), c) -> (mad a, b, c)
4354  //
4355  if (N0.getOpcode() == ISD::MUL) {
    assert(VT.isInteger());
4357    // For integer:
4358    // Since integer multiply-add costs the same as integer multiply
4359    // but is more costly than integer add, do the fusion only when
4360    // the mul is only used in the add.
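    // For example, (add (mul i32 %a, %b), %c), where the mul has no other
    // uses, becomes a single NVPTXISD::IMAD node (typically selected as
    // mad.lo.s32).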
    if (OptLevel == CodeGenOpt::None || VT != MVT::i32 ||
4362        !N0.getNode()->hasOneUse())
4363      return SDValue();
4364
4365    // Do the folding
4366    return DAG.getNode(NVPTXISD::IMAD, SDLoc(N), VT,
4367                       N0.getOperand(0), N0.getOperand(1), N1);
  } else if (N0.getOpcode() == ISD::FMUL) {
4370    if (VT == MVT::f32 || VT == MVT::f64) {
4371      const auto *TLI = static_cast<const NVPTXTargetLowering *>(
4372          &DAG.getTargetLoweringInfo());
4373      if (!TLI->allowFMA(DAG.getMachineFunction(), OptLevel))
4374        return SDValue();
4375
4376      // For floating point:
4377      // Do the fusion only when the mul has less than 5 uses and all
4378      // are add.
4379      // The heuristic is that if a use is not an add, then that use
4380      // cannot be fused into fma, therefore mul is still needed anyway.
4381      // If there are more than 4 uses, even if they are all add, fusing
      // them will increase register pressure.
4383      //
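      // Roughly: if t = fmul %x, %y feeds only fadd nodes, each add can be
      // rewritten as an fma and t itself becomes dead; a non-add user (or too
      // many users) keeps t live in addition to the fused results.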
4384      int numUses = 0;
4385      int nonAddCount = 0;
4386      for (SDNode::use_iterator UI = N0.getNode()->use_begin(),
4387           UE = N0.getNode()->use_end();
4388           UI != UE; ++UI) {
4389        numUses++;
4390        SDNode *User = *UI;
4391        if (User->getOpcode() != ISD::FADD)
4392          ++nonAddCount;
4393      }
4394      if (numUses >= 5)
4395        return SDValue();
4396      if (nonAddCount) {
4397        int orderNo = N->getIROrder();
4398        int orderNo2 = N0.getNode()->getIROrder();
        // Simple heuristic for estimating potential register pressure: the
        // difference in IR order approximates the distance between def and
        // use, and the longer the distance, the more likely it is to cause
        // register pressure.
4403        if (orderNo - orderNo2 < 500)
4404          return SDValue();
4405
        // Now, check if at least one of the FMUL's operands is live beyond
        // node N, which guarantees that the FMA will not increase register
        // pressure at node N.
4408        bool opIsLive = false;
4409        const SDNode *left = N0.getOperand(0).getNode();
4410        const SDNode *right = N0.getOperand(1).getNode();
4411
4412        if (isa<ConstantSDNode>(left) || isa<ConstantSDNode>(right))
4413          opIsLive = true;
4414
4415        if (!opIsLive)
          for (SDNode::use_iterator UI = left->use_begin(),
                                    UE = left->use_end();
               UI != UE; ++UI) {
4417            SDNode *User = *UI;
4418            int orderNo3 = User->getIROrder();
4419            if (orderNo3 > orderNo) {
4420              opIsLive = true;
4421              break;
4422            }
4423          }
4424
4425        if (!opIsLive)
          for (SDNode::use_iterator UI = right->use_begin(),
                                    UE = right->use_end();
               UI != UE; ++UI) {
4427            SDNode *User = *UI;
4428            int orderNo3 = User->getIROrder();
4429            if (orderNo3 > orderNo) {
4430              opIsLive = true;
4431              break;
4432            }
4433          }
4434
4435        if (!opIsLive)
4436          return SDValue();
4437      }
4438
4439      return DAG.getNode(ISD::FMA, SDLoc(N), VT,
4440                         N0.getOperand(0), N0.getOperand(1), N1);
4441    }
4442  }
4443
4444  return SDValue();
4445}
4446
4447/// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
4448///
4449static SDValue PerformADDCombine(SDNode *N,
4450                                 TargetLowering::DAGCombinerInfo &DCI,
4451                                 const NVPTXSubtarget &Subtarget,
4452                                 CodeGenOpt::Level OptLevel) {
4453  SDValue N0 = N->getOperand(0);
4454  SDValue N1 = N->getOperand(1);
4455
4456  // First try with the default operand order.
4457  if (SDValue Result =
4458          PerformADDCombineWithOperands(N, N0, N1, DCI, Subtarget, OptLevel))
4459    return Result;
4460
4461  // If that didn't work, try again with the operands commuted.
4462  return PerformADDCombineWithOperands(N, N1, N0, DCI, Subtarget, OptLevel);
4463}
4464
4465static SDValue PerformANDCombine(SDNode *N,
4466                                 TargetLowering::DAGCombinerInfo &DCI) {
4467  // The type legalizer turns a vector load of i8 values into a zextload to i16
4468  // registers, optionally ANY_EXTENDs it (if target type is integer),
4469  // and ANDs off the high 8 bits. Since we turn this load into a
4470  // target-specific DAG node, the DAG combiner fails to eliminate these AND
4471  // nodes. Do that here.
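  // Schematically: (and (any_extend (i16 (LoadV2 <2 x i8> ...))), 0xff) can be
  // replaced by the load's result itself, since the load already zero-extends.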
4472  SDValue Val = N->getOperand(0);
4473  SDValue Mask = N->getOperand(1);
4474
4475  if (isa<ConstantSDNode>(Val)) {
4476    std::swap(Val, Mask);
4477  }
4478
4479  SDValue AExt;
4480  // Generally, we will see zextload -> IMOV16rr -> ANY_EXTEND -> and
4481  if (Val.getOpcode() == ISD::ANY_EXTEND) {
4482    AExt = Val;
4483    Val = Val->getOperand(0);
4484  }
4485
4486  if (Val->isMachineOpcode() && Val->getMachineOpcode() == NVPTX::IMOV16rr) {
4487    Val = Val->getOperand(0);
4488  }
4489
4490  if (Val->getOpcode() == NVPTXISD::LoadV2 ||
4491      Val->getOpcode() == NVPTXISD::LoadV4) {
4492    ConstantSDNode *MaskCnst = dyn_cast<ConstantSDNode>(Mask);
4493    if (!MaskCnst) {
4494      // Not an AND with a constant
4495      return SDValue();
4496    }
4497
4498    uint64_t MaskVal = MaskCnst->getZExtValue();
4499    if (MaskVal != 0xff) {
4500      // Not an AND that chops off top 8 bits
4501      return SDValue();
4502    }
4503
4504    MemSDNode *Mem = dyn_cast<MemSDNode>(Val);
4505    if (!Mem) {
4506      // Not a MemSDNode?!?
4507      return SDValue();
4508    }
4509
4510    EVT MemVT = Mem->getMemoryVT();
4511    if (MemVT != MVT::v2i8 && MemVT != MVT::v4i8) {
4512      // We only handle the i8 case
4513      return SDValue();
4514    }
4515
4516    unsigned ExtType =
4517      cast<ConstantSDNode>(Val->getOperand(Val->getNumOperands()-1))->
4518        getZExtValue();
4519    if (ExtType == ISD::SEXTLOAD) {
4520      // If for some reason the load is a sextload, the and is needed to zero
4521      // out the high 8 bits
4522      return SDValue();
4523    }
4524
4525    bool AddTo = false;
4526    if (AExt.getNode() != nullptr) {
4527      // Re-insert the ext as a zext.
4528      Val = DCI.DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N),
4529                            AExt.getValueType(), Val);
4530      AddTo = true;
4531    }
4532
4533    // If we get here, the AND is unnecessary.  Just replace it with the load
4534    DCI.CombineTo(N, Val, AddTo);
4535  }
4536
4537  return SDValue();
4538}
4539
4540static SDValue PerformREMCombine(SDNode *N,
4541                                 TargetLowering::DAGCombinerInfo &DCI,
4542                                 CodeGenOpt::Level OptLevel) {
4543  assert(N->getOpcode() == ISD::SREM || N->getOpcode() == ISD::UREM);
4544
4545  // Don't do anything at less than -O2.
4546  if (OptLevel < CodeGenOpt::Default)
4547    return SDValue();
4548
4549  SelectionDAG &DAG = DCI.DAG;
4550  SDLoc DL(N);
4551  EVT VT = N->getValueType(0);
4552  bool IsSigned = N->getOpcode() == ISD::SREM;
4553  unsigned DivOpc = IsSigned ? ISD::SDIV : ISD::UDIV;
4554
4555  const SDValue &Num = N->getOperand(0);
4556  const SDValue &Den = N->getOperand(1);
4557
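  // If the matching quotient Num / Den is already computed elsewhere in the
  // DAG, rewrite the remainder in terms of it so that only one division is
  // emitted, e.g. 21 % 4 == 21 - (21 / 4) * 4 == 21 - 5 * 4 == 1.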
4558  for (const SDNode *U : Num->uses()) {
4559    if (U->getOpcode() == DivOpc && U->getOperand(0) == Num &&
4560        U->getOperand(1) == Den) {
4561      // Num % Den -> Num - (Num / Den) * Den
4562      return DAG.getNode(ISD::SUB, DL, VT, Num,
4563                         DAG.getNode(ISD::MUL, DL, VT,
4564                                     DAG.getNode(DivOpc, DL, VT, Num, Den),
4565                                     Den));
4566    }
4567  }
4568  return SDValue();
4569}
4570
4571enum OperandSignedness {
4572  Signed = 0,
4573  Unsigned,
4574  Unknown
4575};
4576
4577/// IsMulWideOperandDemotable - Checks if the provided DAG node is an operand
4578/// that can be demoted to \p OptSize bits without loss of information. The
4579/// signedness of the operand, if determinable, is placed in \p S.
4580static bool IsMulWideOperandDemotable(SDValue Op,
4581                                      unsigned OptSize,
4582                                      OperandSignedness &S) {
4583  S = Unknown;
4584
4585  if (Op.getOpcode() == ISD::SIGN_EXTEND ||
4586      Op.getOpcode() == ISD::SIGN_EXTEND_INREG) {
4587    EVT OrigVT = Op.getOperand(0).getValueType();
4588    if (OrigVT.getSizeInBits() <= OptSize) {
4589      S = Signed;
4590      return true;
4591    }
4592  } else if (Op.getOpcode() == ISD::ZERO_EXTEND) {
4593    EVT OrigVT = Op.getOperand(0).getValueType();
4594    if (OrigVT.getSizeInBits() <= OptSize) {
4595      S = Unsigned;
4596      return true;
4597    }
4598  }
4599
4600  return false;
4601}
4602
4603/// AreMulWideOperandsDemotable - Checks if the given LHS and RHS operands can
4604/// be demoted to \p OptSize bits without loss of information. If the operands
4605/// contain a constant, it should appear as the RHS operand. The signedness of
4606/// the operands is placed in \p IsSigned.
4607static bool AreMulWideOperandsDemotable(SDValue LHS, SDValue RHS,
4608                                        unsigned OptSize,
4609                                        bool &IsSigned) {
4610  OperandSignedness LHSSign;
4611
4612  // The LHS operand must be a demotable op
4613  if (!IsMulWideOperandDemotable(LHS, OptSize, LHSSign))
4614    return false;
4615
4616  // We should have been able to determine the signedness from the LHS
4617  if (LHSSign == Unknown)
4618    return false;
4619
4620  IsSigned = (LHSSign == Signed);
4621
4622  // The RHS can be a demotable op or a constant
4623  if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(RHS)) {
4624    const APInt &Val = CI->getAPIntValue();
4625    if (LHSSign == Unsigned) {
4626      return Val.isIntN(OptSize);
4627    } else {
4628      return Val.isSignedIntN(OptSize);
4629    }
4630  } else {
4631    OperandSignedness RHSSign;
4632    if (!IsMulWideOperandDemotable(RHS, OptSize, RHSSign))
4633      return false;
4634
4635    return LHSSign == RHSSign;
4636  }
4637}
4638
4639/// TryMULWIDECombine - Attempt to replace a multiply of M bits with a multiply
4640/// of M/2 bits that produces an M-bit result (i.e. mul.wide). This transform
4641/// works on both multiply DAG nodes and SHL DAG nodes with a constant shift
4642/// amount.
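///
/// For example (illustrative), (i32 (mul (sext i16 %a), (sext i16 %b))) can be
/// turned into MUL_WIDE_SIGNED on the truncated operands (mul.wide.s16 in
/// PTX), and (i32 (shl (zext i16 %a), 4)) is first treated as a multiply by 16
/// and then demoted the same way.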
4643static SDValue TryMULWIDECombine(SDNode *N,
4644                                 TargetLowering::DAGCombinerInfo &DCI) {
4645  EVT MulType = N->getValueType(0);
4646  if (MulType != MVT::i32 && MulType != MVT::i64) {
4647    return SDValue();
4648  }
4649
4650  SDLoc DL(N);
4651  unsigned OptSize = MulType.getSizeInBits() >> 1;
4652  SDValue LHS = N->getOperand(0);
4653  SDValue RHS = N->getOperand(1);
4654
4655  // Canonicalize the multiply so the constant (if any) is on the right
4656  if (N->getOpcode() == ISD::MUL) {
4657    if (isa<ConstantSDNode>(LHS)) {
4658      std::swap(LHS, RHS);
4659    }
4660  }
4661
4662  // If we have a SHL, determine the actual multiply amount
4663  if (N->getOpcode() == ISD::SHL) {
4664    ConstantSDNode *ShlRHS = dyn_cast<ConstantSDNode>(RHS);
4665    if (!ShlRHS) {
4666      return SDValue();
4667    }
4668
4669    APInt ShiftAmt = ShlRHS->getAPIntValue();
4670    unsigned BitWidth = MulType.getSizeInBits();
4671    if (ShiftAmt.sge(0) && ShiftAmt.slt(BitWidth)) {
4672      APInt MulVal = APInt(BitWidth, 1) << ShiftAmt;
4673      RHS = DCI.DAG.getConstant(MulVal, DL, MulType);
4674    } else {
4675      return SDValue();
4676    }
4677  }
4678
4679  bool Signed;
4680  // Verify that our operands are demotable
4681  if (!AreMulWideOperandsDemotable(LHS, RHS, OptSize, Signed)) {
4682    return SDValue();
4683  }
4684
4685  EVT DemotedVT;
4686  if (MulType == MVT::i32) {
4687    DemotedVT = MVT::i16;
4688  } else {
4689    DemotedVT = MVT::i32;
4690  }
4691
4692  // Truncate the operands to the correct size. Note that these are just for
4693  // type consistency and will (likely) be eliminated in later phases.
4694  SDValue TruncLHS =
4695    DCI.DAG.getNode(ISD::TRUNCATE, DL, DemotedVT, LHS);
4696  SDValue TruncRHS =
4697    DCI.DAG.getNode(ISD::TRUNCATE, DL, DemotedVT, RHS);
4698
4699  unsigned Opc;
4700  if (Signed) {
4701    Opc = NVPTXISD::MUL_WIDE_SIGNED;
4702  } else {
4703    Opc = NVPTXISD::MUL_WIDE_UNSIGNED;
4704  }
4705
4706  return DCI.DAG.getNode(Opc, DL, MulType, TruncLHS, TruncRHS);
4707}
4708
4709/// PerformMULCombine - Runs PTX-specific DAG combine patterns on MUL nodes.
4710static SDValue PerformMULCombine(SDNode *N,
4711                                 TargetLowering::DAGCombinerInfo &DCI,
4712                                 CodeGenOpt::Level OptLevel) {
4713  if (OptLevel > 0) {
4714    // Try mul.wide combining at OptLevel > 0
4715    if (SDValue Ret = TryMULWIDECombine(N, DCI))
4716      return Ret;
4717  }
4718
4719  return SDValue();
4720}
4721
4722/// PerformSHLCombine - Runs PTX-specific DAG combine patterns on SHL nodes.
4723static SDValue PerformSHLCombine(SDNode *N,
4724                                 TargetLowering::DAGCombinerInfo &DCI,
4725                                 CodeGenOpt::Level OptLevel) {
4726  if (OptLevel > 0) {
4727    // Try mul.wide combining at OptLevel > 0
4728    if (SDValue Ret = TryMULWIDECombine(N, DCI))
4729      return Ret;
4730  }
4731
4732  return SDValue();
4733}
4734
4735static SDValue PerformSETCCCombine(SDNode *N,
4736                                   TargetLowering::DAGCombinerInfo &DCI) {
4737  EVT CCType = N->getValueType(0);
4738  SDValue A = N->getOperand(0);
4739  SDValue B = N->getOperand(1);
4740
4741  if (CCType != MVT::v2i1 || A.getValueType() != MVT::v2f16)
4742    return SDValue();
4743
4744  SDLoc DL(N);
4745  // setp.f16x2 returns two scalar predicates, which we need to
4746  // convert back to v2i1. The returned result will be scalarized by
4747  // the legalizer, but the comparison will remain a single vector
4748  // instruction.
4749  SDValue CCNode = DCI.DAG.getNode(NVPTXISD::SETP_F16X2, DL,
4750                                   DCI.DAG.getVTList(MVT::i1, MVT::i1),
4751                                   {A, B, N->getOperand(2)});
4752  return DCI.DAG.getNode(ISD::BUILD_VECTOR, DL, CCType, CCNode.getValue(0),
4753                         CCNode.getValue(1));
4754}
4755
4756SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
4757                                               DAGCombinerInfo &DCI) const {
4758  CodeGenOpt::Level OptLevel = getTargetMachine().getOptLevel();
4759  switch (N->getOpcode()) {
4760    default: break;
4761    case ISD::ADD:
4762    case ISD::FADD:
4763      return PerformADDCombine(N, DCI, STI, OptLevel);
4764    case ISD::MUL:
4765      return PerformMULCombine(N, DCI, OptLevel);
4766    case ISD::SHL:
4767      return PerformSHLCombine(N, DCI, OptLevel);
4768    case ISD::AND:
4769      return PerformANDCombine(N, DCI);
4770    case ISD::UREM:
4771    case ISD::SREM:
4772      return PerformREMCombine(N, DCI, OptLevel);
4773    case ISD::SETCC:
4774      return PerformSETCCCombine(N, DCI);
4775  }
4776  return SDValue();
4777}
4778
/// ReplaceLoadVector - Convert vector loads into multi-output scalar loads.
4780static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG,
4781                              SmallVectorImpl<SDValue> &Results) {
4782  EVT ResVT = N->getValueType(0);
4783  SDLoc DL(N);
4784
4785  assert(ResVT.isVector() && "Vector load must have vector type");
4786
4787  // We only handle "native" vector sizes for now, e.g. <4 x double> is not
4788  // legal.  We can (and should) split that into 2 loads of <2 x double> here
4789  // but I'm leaving that as a TODO for now.
4790  assert(ResVT.isSimple() && "Can only handle simple types");
4791  switch (ResVT.getSimpleVT().SimpleTy) {
4792  default:
4793    return;
4794  case MVT::v2i8:
4795  case MVT::v2i16:
4796  case MVT::v2i32:
4797  case MVT::v2i64:
4798  case MVT::v2f16:
4799  case MVT::v2f32:
4800  case MVT::v2f64:
4801  case MVT::v4i8:
4802  case MVT::v4i16:
4803  case MVT::v4i32:
4804  case MVT::v4f16:
4805  case MVT::v4f32:
4806  case MVT::v8f16: // <4 x f16x2>
4807    // This is a "native" vector type
4808    break;
4809  }
4810
4811  LoadSDNode *LD = cast<LoadSDNode>(N);
4812
4813  unsigned Align = LD->getAlignment();
4814  auto &TD = DAG.getDataLayout();
4815  unsigned PrefAlign =
4816      TD.getPrefTypeAlignment(ResVT.getTypeForEVT(*DAG.getContext()));
4817  if (Align < PrefAlign) {
4818    // This load is not sufficiently aligned, so bail out and let this vector
4819    // load be scalarized.  Note that we may still be able to emit smaller
4820    // vector loads.  For example, if we are loading a <4 x float> with an
4821    // alignment of 8, this check will fail but the legalizer will try again
4822    // with 2 x <2 x float>, which will succeed with an alignment of 8.
4823    return;
4824  }
4825
4826  EVT EltVT = ResVT.getVectorElementType();
4827  unsigned NumElts = ResVT.getVectorNumElements();
4828
4829  // Since LoadV2 is a target node, we cannot rely on DAG type legalization.
4830  // Therefore, we must ensure the type is legal.  For i1 and i8, we set the
4831  // loaded type to i16 and propagate the "real" type as the memory type.
4832  bool NeedTrunc = false;
4833  if (EltVT.getSizeInBits() < 16) {
4834    EltVT = MVT::i16;
4835    NeedTrunc = true;
4836  }
4837
4838  unsigned Opcode = 0;
4839  SDVTList LdResVTs;
4840  bool LoadF16x2 = false;
4841
4842  switch (NumElts) {
4843  default:
4844    return;
4845  case 2:
4846    Opcode = NVPTXISD::LoadV2;
4847    LdResVTs = DAG.getVTList(EltVT, EltVT, MVT::Other);
4848    break;
4849  case 4: {
4850    Opcode = NVPTXISD::LoadV4;
4851    EVT ListVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other };
4852    LdResVTs = DAG.getVTList(ListVTs);
4853    break;
4854  }
4855  case 8: {
4856    // v8f16 is a special case. PTX doesn't have ld.v8.f16
4857    // instruction. Instead, we split the vector into v2f16 chunks and
4858    // load them with ld.v4.b32.
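    // The four v2f16 results are then unpacked below into the eight scalar
    // f16 values that make up the original vector.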
4859    assert(EltVT == MVT::f16 && "Unsupported v8 vector type.");
4860    LoadF16x2 = true;
4861    Opcode = NVPTXISD::LoadV4;
4862    EVT ListVTs[] = {MVT::v2f16, MVT::v2f16, MVT::v2f16, MVT::v2f16,
4863                     MVT::Other};
4864    LdResVTs = DAG.getVTList(ListVTs);
4865    break;
4866  }
4867  }
4868
4869  // Copy regular operands
4870  SmallVector<SDValue, 8> OtherOps(N->op_begin(), N->op_end());
4871
4872  // The select routine does not have access to the LoadSDNode instance, so
4873  // pass along the extension information
4874  OtherOps.push_back(DAG.getIntPtrConstant(LD->getExtensionType(), DL));
4875
4876  SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, OtherOps,
4877                                          LD->getMemoryVT(),
4878                                          LD->getMemOperand());
4879
4880  SmallVector<SDValue, 8> ScalarRes;
4881  if (LoadF16x2) {
4882    // Split v2f16 subvectors back into individual elements.
4883    NumElts /= 2;
4884    for (unsigned i = 0; i < NumElts; ++i) {
4885      SDValue SubVector = NewLD.getValue(i);
4886      SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, SubVector,
4887                               DAG.getIntPtrConstant(0, DL));
4888      SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, SubVector,
4889                               DAG.getIntPtrConstant(1, DL));
4890      ScalarRes.push_back(E0);
4891      ScalarRes.push_back(E1);
4892    }
4893  } else {
4894    for (unsigned i = 0; i < NumElts; ++i) {
4895      SDValue Res = NewLD.getValue(i);
4896      if (NeedTrunc)
4897        Res = DAG.getNode(ISD::TRUNCATE, DL, ResVT.getVectorElementType(), Res);
4898      ScalarRes.push_back(Res);
4899    }
4900  }
4901
4902  SDValue LoadChain = NewLD.getValue(NumElts);
4903
4904  SDValue BuildVec = DAG.getBuildVector(ResVT, DL, ScalarRes);
4905
4906  Results.push_back(BuildVec);
4907  Results.push_back(LoadChain);
4908}
4909
4910static void ReplaceINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG,
4911                                     SmallVectorImpl<SDValue> &Results) {
4912  SDValue Chain = N->getOperand(0);
4913  SDValue Intrin = N->getOperand(1);
4914  SDLoc DL(N);
4915
4916  // Get the intrinsic ID
4917  unsigned IntrinNo = cast<ConstantSDNode>(Intrin.getNode())->getZExtValue();
4918  switch (IntrinNo) {
4919  default:
4920    return;
4921  case Intrinsic::nvvm_ldg_global_i:
4922  case Intrinsic::nvvm_ldg_global_f:
4923  case Intrinsic::nvvm_ldg_global_p:
4924  case Intrinsic::nvvm_ldu_global_i:
4925  case Intrinsic::nvvm_ldu_global_f:
4926  case Intrinsic::nvvm_ldu_global_p: {
4927    EVT ResVT = N->getValueType(0);
4928
4929    if (ResVT.isVector()) {
4930      // Vector LDG/LDU
4931
4932      unsigned NumElts = ResVT.getVectorNumElements();
4933      EVT EltVT = ResVT.getVectorElementType();
4934
      // Since LDU/LDG are target nodes, we cannot rely on DAG type
      // legalization. Therefore, we must ensure the type is legal. For i1 and
      // i8, we set the loaded type to i16 and propagate the "real" type as the
      // memory type.
4939      bool NeedTrunc = false;
4940      if (EltVT.getSizeInBits() < 16) {
4941        EltVT = MVT::i16;
4942        NeedTrunc = true;
4943      }
4944
4945      unsigned Opcode = 0;
4946      SDVTList LdResVTs;
4947
4948      switch (NumElts) {
4949      default:
4950        return;
4951      case 2:
4952        switch (IntrinNo) {
4953        default:
4954          return;
4955        case Intrinsic::nvvm_ldg_global_i:
4956        case Intrinsic::nvvm_ldg_global_f:
4957        case Intrinsic::nvvm_ldg_global_p:
4958          Opcode = NVPTXISD::LDGV2;
4959          break;
4960        case Intrinsic::nvvm_ldu_global_i:
4961        case Intrinsic::nvvm_ldu_global_f:
4962        case Intrinsic::nvvm_ldu_global_p:
4963          Opcode = NVPTXISD::LDUV2;
4964          break;
4965        }
4966        LdResVTs = DAG.getVTList(EltVT, EltVT, MVT::Other);
4967        break;
4968      case 4: {
4969        switch (IntrinNo) {
4970        default:
4971          return;
4972        case Intrinsic::nvvm_ldg_global_i:
4973        case Intrinsic::nvvm_ldg_global_f:
4974        case Intrinsic::nvvm_ldg_global_p:
4975          Opcode = NVPTXISD::LDGV4;
4976          break;
4977        case Intrinsic::nvvm_ldu_global_i:
4978        case Intrinsic::nvvm_ldu_global_f:
4979        case Intrinsic::nvvm_ldu_global_p:
4980          Opcode = NVPTXISD::LDUV4;
4981          break;
4982        }
4983        EVT ListVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other };
4984        LdResVTs = DAG.getVTList(ListVTs);
4985        break;
4986      }
4987      }
4988
      SmallVector<SDValue, 8> OtherOps;

      // Copy the regular operands: take the chain, skip operand 1 (the
      // intrinsic ID), and then append the remaining operands.
      OtherOps.push_back(Chain);
      OtherOps.append(N->op_begin() + 2, N->op_end());
4997
4998      MemIntrinsicSDNode *MemSD = cast<MemIntrinsicSDNode>(N);
4999
5000      SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, OtherOps,
5001                                              MemSD->getMemoryVT(),
5002                                              MemSD->getMemOperand());
5003
5004      SmallVector<SDValue, 4> ScalarRes;
5005
5006      for (unsigned i = 0; i < NumElts; ++i) {
5007        SDValue Res = NewLD.getValue(i);
5008        if (NeedTrunc)
5009          Res =
5010              DAG.getNode(ISD::TRUNCATE, DL, ResVT.getVectorElementType(), Res);
5011        ScalarRes.push_back(Res);
5012      }
5013
5014      SDValue LoadChain = NewLD.getValue(NumElts);
5015
5016      SDValue BuildVec =
5017          DAG.getBuildVector(ResVT, DL, ScalarRes);
5018
5019      Results.push_back(BuildVec);
5020      Results.push_back(LoadChain);
5021    } else {
5022      // i8 LDG/LDU
5023      assert(ResVT.isSimple() && ResVT.getSimpleVT().SimpleTy == MVT::i8 &&
5024             "Custom handling of non-i8 ldu/ldg?");
5025
5026      // Just copy all operands as-is
5027      SmallVector<SDValue, 4> Ops(N->op_begin(), N->op_end());
5028
5029      // Force output to i16
5030      SDVTList LdResVTs = DAG.getVTList(MVT::i16, MVT::Other);
5031
5032      MemIntrinsicSDNode *MemSD = cast<MemIntrinsicSDNode>(N);
5033
5034      // We make sure the memory type is i8, which will be used during isel
5035      // to select the proper instruction.
5036      SDValue NewLD =
5037          DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, DL, LdResVTs, Ops,
5038                                  MVT::i8, MemSD->getMemOperand());
5039
5040      Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
5041                                    NewLD.getValue(0)));
5042      Results.push_back(NewLD.getValue(1));
5043    }
5044  }
5045  }
5046}
5047
5048void NVPTXTargetLowering::ReplaceNodeResults(
5049    SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
5050  switch (N->getOpcode()) {
5051  default:
5052    report_fatal_error("Unhandled custom legalization");
5053  case ISD::LOAD:
5054    ReplaceLoadVector(N, DAG, Results);
5055    return;
5056  case ISD::INTRINSIC_W_CHAIN:
5057    ReplaceINTRINSIC_W_CHAIN(N, DAG, Results);
5058    return;
5059  }
5060}
5061
5062// Pin NVPTXTargetObjectFile's vtables to this file.
5063NVPTXTargetObjectFile::~NVPTXTargetObjectFile() {}
5064
5065MCSection *NVPTXTargetObjectFile::SelectSectionForGlobal(
5066    const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const {
5067  return getDataSection();
5068}
5069