1288943Sdim//===-- NVPTXTargetTransformInfo.cpp - NVPTX specific TTI -----------------===//
2277323Sdim//
3353358Sdim// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4353358Sdim// See https://llvm.org/LICENSE.txt for license information.
5353358Sdim// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6277323Sdim//
7277323Sdim//===----------------------------------------------------------------------===//
8277323Sdim
9288943Sdim#include "NVPTXTargetTransformInfo.h"
10288943Sdim#include "NVPTXUtilities.h"
11277323Sdim#include "llvm/Analysis/LoopInfo.h"
12277323Sdim#include "llvm/Analysis/TargetTransformInfo.h"
13277323Sdim#include "llvm/Analysis/ValueTracking.h"
14288943Sdim#include "llvm/CodeGen/BasicTTIImpl.h"
15327952Sdim#include "llvm/CodeGen/CostTable.h"
16327952Sdim#include "llvm/CodeGen/TargetLowering.h"
17360784Sdim#include "llvm/IR/IntrinsicsNVPTX.h"
18277323Sdim#include "llvm/Support/Debug.h"
19277323Sdimusing namespace llvm;
20277323Sdim
21277323Sdim#define DEBUG_TYPE "NVPTXtti"
22277323Sdim
23288943Sdim// Whether the given intrinsic reads threadIdx.x/y/z.
24288943Sdimstatic bool readsThreadIndex(const IntrinsicInst *II) {
25288943Sdim  switch (II->getIntrinsicID()) {
26288943Sdim    default: return false;
27288943Sdim    case Intrinsic::nvvm_read_ptx_sreg_tid_x:
28288943Sdim    case Intrinsic::nvvm_read_ptx_sreg_tid_y:
29288943Sdim    case Intrinsic::nvvm_read_ptx_sreg_tid_z:
30288943Sdim      return true;
31288943Sdim  }
32277323Sdim}
33277323Sdim
34288943Sdimstatic bool readsLaneId(const IntrinsicInst *II) {
35309124Sdim  return II->getIntrinsicID() == Intrinsic::nvvm_read_ptx_sreg_laneid;
36288943Sdim}
37277323Sdim
38288943Sdim// Whether the given intrinsic is an atomic instruction in PTX.
39288943Sdimstatic bool isNVVMAtomic(const IntrinsicInst *II) {
40288943Sdim  switch (II->getIntrinsicID()) {
41288943Sdim    default: return false;
42288943Sdim    case Intrinsic::nvvm_atomic_load_inc_32:
43288943Sdim    case Intrinsic::nvvm_atomic_load_dec_32:
44314564Sdim
45314564Sdim    case Intrinsic::nvvm_atomic_add_gen_f_cta:
46314564Sdim    case Intrinsic::nvvm_atomic_add_gen_f_sys:
47314564Sdim    case Intrinsic::nvvm_atomic_add_gen_i_cta:
48314564Sdim    case Intrinsic::nvvm_atomic_add_gen_i_sys:
49314564Sdim    case Intrinsic::nvvm_atomic_and_gen_i_cta:
50314564Sdim    case Intrinsic::nvvm_atomic_and_gen_i_sys:
51314564Sdim    case Intrinsic::nvvm_atomic_cas_gen_i_cta:
52314564Sdim    case Intrinsic::nvvm_atomic_cas_gen_i_sys:
53314564Sdim    case Intrinsic::nvvm_atomic_dec_gen_i_cta:
54314564Sdim    case Intrinsic::nvvm_atomic_dec_gen_i_sys:
55314564Sdim    case Intrinsic::nvvm_atomic_inc_gen_i_cta:
56314564Sdim    case Intrinsic::nvvm_atomic_inc_gen_i_sys:
57314564Sdim    case Intrinsic::nvvm_atomic_max_gen_i_cta:
58314564Sdim    case Intrinsic::nvvm_atomic_max_gen_i_sys:
59314564Sdim    case Intrinsic::nvvm_atomic_min_gen_i_cta:
60314564Sdim    case Intrinsic::nvvm_atomic_min_gen_i_sys:
61314564Sdim    case Intrinsic::nvvm_atomic_or_gen_i_cta:
62314564Sdim    case Intrinsic::nvvm_atomic_or_gen_i_sys:
63314564Sdim    case Intrinsic::nvvm_atomic_exch_gen_i_cta:
64314564Sdim    case Intrinsic::nvvm_atomic_exch_gen_i_sys:
65314564Sdim    case Intrinsic::nvvm_atomic_xor_gen_i_cta:
66314564Sdim    case Intrinsic::nvvm_atomic_xor_gen_i_sys:
67288943Sdim      return true;
68277323Sdim  }
69288943Sdim}
70277323Sdim
71288943Sdimbool NVPTXTTIImpl::isSourceOfDivergence(const Value *V) {
72288943Sdim  // Without inter-procedural analysis, we conservatively assume that arguments
73288943Sdim  // to __device__ functions are divergent.
74288943Sdim  if (const Argument *Arg = dyn_cast<Argument>(V))
75288943Sdim    return !isKernelFunction(*Arg->getParent());
76277323Sdim
77288943Sdim  if (const Instruction *I = dyn_cast<Instruction>(V)) {
78288943Sdim    // Without pointer analysis, we conservatively assume values loaded from
79288943Sdim    // generic or local address space are divergent.
80288943Sdim    if (const LoadInst *LI = dyn_cast<LoadInst>(I)) {
81288943Sdim      unsigned AS = LI->getPointerAddressSpace();
82288943Sdim      return AS == ADDRESS_SPACE_GENERIC || AS == ADDRESS_SPACE_LOCAL;
83288943Sdim    }
84288943Sdim    // Atomic instructions may cause divergence. Atomic instructions are
85288943Sdim    // executed sequentially across all threads in a warp. Therefore, an earlier
86288943Sdim    // executed thread may see different memory inputs than a later executed
87288943Sdim    // thread. For example, suppose *a = 0 initially.
88288943Sdim    //
89288943Sdim    //   atom.global.add.s32 d, [a], 1
90288943Sdim    //
91288943Sdim    // returns 0 for the first thread that enters the critical region, and 1 for
92288943Sdim    // the second thread.
93288943Sdim    if (I->isAtomic())
94288943Sdim      return true;
95288943Sdim    if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
96288943Sdim      // Instructions that read threadIdx are obviously divergent.
97288943Sdim      if (readsThreadIndex(II) || readsLaneId(II))
98288943Sdim        return true;
99288943Sdim      // Handle the NVPTX atomic instrinsics that cannot be represented as an
100288943Sdim      // atomic IR instruction.
101288943Sdim      if (isNVVMAtomic(II))
102288943Sdim        return true;
103288943Sdim    }
104288943Sdim    // Conservatively consider the return value of function calls as divergent.
105288943Sdim    // We could analyze callees with bodies more precisely using
106288943Sdim    // inter-procedural analysis.
107288943Sdim    if (isa<CallInst>(I))
108288943Sdim      return true;
109277323Sdim  }
110277323Sdim
111288943Sdim  return false;
112277323Sdim}
113277323Sdim
114296417Sdimint NVPTXTTIImpl::getArithmeticInstrCost(
115288943Sdim    unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info,
116288943Sdim    TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo,
117360784Sdim    TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args,
118360784Sdim    const Instruction *CxtI) {
119277323Sdim  // Legalize the type.
120296417Sdim  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
121277323Sdim
122277323Sdim  int ISD = TLI->InstructionOpcodeToISD(Opcode);
123277323Sdim
124277323Sdim  switch (ISD) {
125277323Sdim  default:
126288943Sdim    return BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
127288943Sdim                                         Opd1PropInfo, Opd2PropInfo);
128277323Sdim  case ISD::ADD:
129277323Sdim  case ISD::MUL:
130277323Sdim  case ISD::XOR:
131277323Sdim  case ISD::OR:
132277323Sdim  case ISD::AND:
133277323Sdim    // The machine code (SASS) simulates an i64 with two i32. Therefore, we
134277323Sdim    // estimate that arithmetic operations on i64 are twice as expensive as
135277323Sdim    // those on types that can fit into one machine register.
136277323Sdim    if (LT.second.SimpleTy == MVT::i64)
137277323Sdim      return 2 * LT.first;
138277323Sdim    // Delegate other cases to the basic TTI.
139288943Sdim    return BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
140288943Sdim                                         Opd1PropInfo, Opd2PropInfo);
141277323Sdim  }
142277323Sdim}
143288943Sdim
144321369Sdimvoid NVPTXTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
145288943Sdim                                           TTI::UnrollingPreferences &UP) {
146321369Sdim  BaseT::getUnrollingPreferences(L, SE, UP);
147288943Sdim
148288943Sdim  // Enable partial unrolling and runtime unrolling, but reduce the
149288943Sdim  // threshold.  This partially unrolls small loops which are often
150288943Sdim  // unrolled by the PTX to SASS compiler and unrolling earlier can be
151288943Sdim  // beneficial.
152288943Sdim  UP.Partial = UP.Runtime = true;
153288943Sdim  UP.PartialThreshold = UP.Threshold / 4;
154288943Sdim}
155