// NVPTXTargetTransformInfo.cpp revision 360784
1288943Sdim//===-- NVPTXTargetTransformInfo.cpp - NVPTX specific TTI -----------------===// 2277323Sdim// 3353358Sdim// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4353358Sdim// See https://llvm.org/LICENSE.txt for license information. 5353358Sdim// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6277323Sdim// 7277323Sdim//===----------------------------------------------------------------------===// 8277323Sdim 9288943Sdim#include "NVPTXTargetTransformInfo.h" 10288943Sdim#include "NVPTXUtilities.h" 11277323Sdim#include "llvm/Analysis/LoopInfo.h" 12277323Sdim#include "llvm/Analysis/TargetTransformInfo.h" 13277323Sdim#include "llvm/Analysis/ValueTracking.h" 14288943Sdim#include "llvm/CodeGen/BasicTTIImpl.h" 15327952Sdim#include "llvm/CodeGen/CostTable.h" 16327952Sdim#include "llvm/CodeGen/TargetLowering.h" 17360784Sdim#include "llvm/IR/IntrinsicsNVPTX.h" 18277323Sdim#include "llvm/Support/Debug.h" 19277323Sdimusing namespace llvm; 20277323Sdim 21277323Sdim#define DEBUG_TYPE "NVPTXtti" 22277323Sdim 23288943Sdim// Whether the given intrinsic reads threadIdx.x/y/z. 24288943Sdimstatic bool readsThreadIndex(const IntrinsicInst *II) { 25288943Sdim switch (II->getIntrinsicID()) { 26288943Sdim default: return false; 27288943Sdim case Intrinsic::nvvm_read_ptx_sreg_tid_x: 28288943Sdim case Intrinsic::nvvm_read_ptx_sreg_tid_y: 29288943Sdim case Intrinsic::nvvm_read_ptx_sreg_tid_z: 30288943Sdim return true; 31288943Sdim } 32277323Sdim} 33277323Sdim 34288943Sdimstatic bool readsLaneId(const IntrinsicInst *II) { 35309124Sdim return II->getIntrinsicID() == Intrinsic::nvvm_read_ptx_sreg_laneid; 36288943Sdim} 37277323Sdim 38288943Sdim// Whether the given intrinsic is an atomic instruction in PTX. 
39288943Sdimstatic bool isNVVMAtomic(const IntrinsicInst *II) { 40288943Sdim switch (II->getIntrinsicID()) { 41288943Sdim default: return false; 42288943Sdim case Intrinsic::nvvm_atomic_load_inc_32: 43288943Sdim case Intrinsic::nvvm_atomic_load_dec_32: 44314564Sdim 45314564Sdim case Intrinsic::nvvm_atomic_add_gen_f_cta: 46314564Sdim case Intrinsic::nvvm_atomic_add_gen_f_sys: 47314564Sdim case Intrinsic::nvvm_atomic_add_gen_i_cta: 48314564Sdim case Intrinsic::nvvm_atomic_add_gen_i_sys: 49314564Sdim case Intrinsic::nvvm_atomic_and_gen_i_cta: 50314564Sdim case Intrinsic::nvvm_atomic_and_gen_i_sys: 51314564Sdim case Intrinsic::nvvm_atomic_cas_gen_i_cta: 52314564Sdim case Intrinsic::nvvm_atomic_cas_gen_i_sys: 53314564Sdim case Intrinsic::nvvm_atomic_dec_gen_i_cta: 54314564Sdim case Intrinsic::nvvm_atomic_dec_gen_i_sys: 55314564Sdim case Intrinsic::nvvm_atomic_inc_gen_i_cta: 56314564Sdim case Intrinsic::nvvm_atomic_inc_gen_i_sys: 57314564Sdim case Intrinsic::nvvm_atomic_max_gen_i_cta: 58314564Sdim case Intrinsic::nvvm_atomic_max_gen_i_sys: 59314564Sdim case Intrinsic::nvvm_atomic_min_gen_i_cta: 60314564Sdim case Intrinsic::nvvm_atomic_min_gen_i_sys: 61314564Sdim case Intrinsic::nvvm_atomic_or_gen_i_cta: 62314564Sdim case Intrinsic::nvvm_atomic_or_gen_i_sys: 63314564Sdim case Intrinsic::nvvm_atomic_exch_gen_i_cta: 64314564Sdim case Intrinsic::nvvm_atomic_exch_gen_i_sys: 65314564Sdim case Intrinsic::nvvm_atomic_xor_gen_i_cta: 66314564Sdim case Intrinsic::nvvm_atomic_xor_gen_i_sys: 67288943Sdim return true; 68277323Sdim } 69288943Sdim} 70277323Sdim 71288943Sdimbool NVPTXTTIImpl::isSourceOfDivergence(const Value *V) { 72288943Sdim // Without inter-procedural analysis, we conservatively assume that arguments 73288943Sdim // to __device__ functions are divergent. 
74288943Sdim if (const Argument *Arg = dyn_cast<Argument>(V)) 75288943Sdim return !isKernelFunction(*Arg->getParent()); 76277323Sdim 77288943Sdim if (const Instruction *I = dyn_cast<Instruction>(V)) { 78288943Sdim // Without pointer analysis, we conservatively assume values loaded from 79288943Sdim // generic or local address space are divergent. 80288943Sdim if (const LoadInst *LI = dyn_cast<LoadInst>(I)) { 81288943Sdim unsigned AS = LI->getPointerAddressSpace(); 82288943Sdim return AS == ADDRESS_SPACE_GENERIC || AS == ADDRESS_SPACE_LOCAL; 83288943Sdim } 84288943Sdim // Atomic instructions may cause divergence. Atomic instructions are 85288943Sdim // executed sequentially across all threads in a warp. Therefore, an earlier 86288943Sdim // executed thread may see different memory inputs than a later executed 87288943Sdim // thread. For example, suppose *a = 0 initially. 88288943Sdim // 89288943Sdim // atom.global.add.s32 d, [a], 1 90288943Sdim // 91288943Sdim // returns 0 for the first thread that enters the critical region, and 1 for 92288943Sdim // the second thread. 93288943Sdim if (I->isAtomic()) 94288943Sdim return true; 95288943Sdim if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) { 96288943Sdim // Instructions that read threadIdx are obviously divergent. 97288943Sdim if (readsThreadIndex(II) || readsLaneId(II)) 98288943Sdim return true; 99288943Sdim // Handle the NVPTX atomic instrinsics that cannot be represented as an 100288943Sdim // atomic IR instruction. 101288943Sdim if (isNVVMAtomic(II)) 102288943Sdim return true; 103288943Sdim } 104288943Sdim // Conservatively consider the return value of function calls as divergent. 105288943Sdim // We could analyze callees with bodies more precisely using 106288943Sdim // inter-procedural analysis. 
107288943Sdim if (isa<CallInst>(I)) 108288943Sdim return true; 109277323Sdim } 110277323Sdim 111288943Sdim return false; 112277323Sdim} 113277323Sdim 114296417Sdimint NVPTXTTIImpl::getArithmeticInstrCost( 115288943Sdim unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info, 116288943Sdim TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo, 117360784Sdim TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args, 118360784Sdim const Instruction *CxtI) { 119277323Sdim // Legalize the type. 120296417Sdim std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty); 121277323Sdim 122277323Sdim int ISD = TLI->InstructionOpcodeToISD(Opcode); 123277323Sdim 124277323Sdim switch (ISD) { 125277323Sdim default: 126288943Sdim return BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info, 127288943Sdim Opd1PropInfo, Opd2PropInfo); 128277323Sdim case ISD::ADD: 129277323Sdim case ISD::MUL: 130277323Sdim case ISD::XOR: 131277323Sdim case ISD::OR: 132277323Sdim case ISD::AND: 133277323Sdim // The machine code (SASS) simulates an i64 with two i32. Therefore, we 134277323Sdim // estimate that arithmetic operations on i64 are twice as expensive as 135277323Sdim // those on types that can fit into one machine register. 136277323Sdim if (LT.second.SimpleTy == MVT::i64) 137277323Sdim return 2 * LT.first; 138277323Sdim // Delegate other cases to the basic TTI. 139288943Sdim return BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info, 140288943Sdim Opd1PropInfo, Opd2PropInfo); 141277323Sdim } 142277323Sdim} 143288943Sdim 144321369Sdimvoid NVPTXTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE, 145288943Sdim TTI::UnrollingPreferences &UP) { 146321369Sdim BaseT::getUnrollingPreferences(L, SE, UP); 147288943Sdim 148288943Sdim // Enable partial unrolling and runtime unrolling, but reduce the 149288943Sdim // threshold. 
This partially unrolls small loops which are often 150288943Sdim // unrolled by the PTX to SASS compiler and unrolling earlier can be 151288943Sdim // beneficial. 152288943Sdim UP.Partial = UP.Runtime = true; 153288943Sdim UP.PartialThreshold = UP.Threshold / 4; 154288943Sdim} 155