//===- InstCombineCalls.cpp -----------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the visitCall, visitInvoke, and visitCallBr functions.
//
//===----------------------------------------------------------------------===//

#include "InstCombineInternal.h"
#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/APSInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/Loads.h"
#include "llvm/Analysis/MemoryBuiltins.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsX86.h"
#include "llvm/IR/IntrinsicsARM.h"
#include "llvm/IR/IntrinsicsAArch64.h"
#include "llvm/IR/IntrinsicsNVPTX.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsPowerPC.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Statepoint.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/Support/AtomicOrdering.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/InstCombine/InstCombineWorklist.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/SimplifyLibCalls.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstring>
#include <utility>
#include <vector>

using namespace llvm;
using namespace PatternMatch;

#define DEBUG_TYPE "instcombine"

STATISTIC(NumSimplified, "Number of library calls simplified");

static cl::opt<unsigned> GuardWideningWindow(
    "instcombine-guard-widening-window",
    cl::init(3),
    cl::desc("How wide an instruction window to bypass looking for "
             "another guard"));

/// Return the specified type promoted as it would be to pass through a va_arg
/// area.
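/// For example, i8 and i16 are promoted to i32, while i32, i64, and all
/// non-integer types are returned unchanged.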
static Type *getPromotedType(Type *Ty) {
  if (IntegerType* ITy = dyn_cast<IntegerType>(Ty)) {
    if (ITy->getBitWidth() < 32)
      return Type::getInt32Ty(Ty->getContext());
  }
  return Ty;
}

/// Return a constant boolean vector that has true elements in all positions
/// where the input constant data vector has an element with the sign bit set.
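/// For example, <2 x i32> <i32 -1, i32 3> becomes <2 x i1> <i1 true, i1 false>.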
static Constant *getNegativeIsTrueBoolVec(ConstantDataVector *V) {
  SmallVector<Constant *, 32> BoolVec;
  IntegerType *BoolTy = Type::getInt1Ty(V->getContext());
  for (unsigned I = 0, E = V->getNumElements(); I != E; ++I) {
    Constant *Elt = V->getElementAsConstant(I);
    assert((isa<ConstantInt>(Elt) || isa<ConstantFP>(Elt)) &&
           "Unexpected constant data vector element type");
    bool Sign = V->getElementType()->isIntegerTy()
                    ? cast<ConstantInt>(Elt)->isNegative()
                    : cast<ConstantFP>(Elt)->isNegative();
    BoolVec.push_back(ConstantInt::get(BoolTy, Sign));
  }
  return ConstantVector::get(BoolVec);
}

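/// Simplify a memcpy/memmove-like intrinsic (including the element-wise
/// atomic variants): raise the source/destination alignment to what is known,
/// drop transfers whose destination is known-constant memory, and lower
/// constant-length 1/2/4/8 byte copies to a single integer load + store pair.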
Instruction *InstCombiner::SimplifyAnyMemTransfer(AnyMemTransferInst *MI) {
  unsigned DstAlign = getKnownAlignment(MI->getRawDest(), DL, MI, &AC, &DT);
  unsigned CopyDstAlign = MI->getDestAlignment();
  if (CopyDstAlign < DstAlign) {
    MI->setDestAlignment(DstAlign);
    return MI;
  }

  unsigned SrcAlign = getKnownAlignment(MI->getRawSource(), DL, MI, &AC, &DT);
  unsigned CopySrcAlign = MI->getSourceAlignment();
  if (CopySrcAlign < SrcAlign) {
    MI->setSourceAlignment(SrcAlign);
    return MI;
  }

  // If we have a store to a location which is known constant, we can conclude
  // that the store must be storing the constant value (else the memory
  // wouldn't be constant), and this must be a noop.
  if (AA->pointsToConstantMemory(MI->getDest())) {
    // Set the size of the copy to 0, it will be deleted on the next iteration.
    MI->setLength(Constant::getNullValue(MI->getLength()->getType()));
    return MI;
  }

  // If MemCpyInst length is 1/2/4/8 bytes then replace memcpy with
  // load/store.
  ConstantInt *MemOpLength = dyn_cast<ConstantInt>(MI->getLength());
  if (!MemOpLength) return nullptr;

  // Source and destination pointer types are always "i8*" for intrinsic.  See
  // if the size is something we can handle with a single primitive load/store.
  // A single load+store correctly handles overlapping memory in the memmove
  // case.
  uint64_t Size = MemOpLength->getLimitedValue();
  assert(Size && "0-sized memory transferring should be removed already.");

  if (Size > 8 || (Size&(Size-1)))
    return nullptr;  // If not 1/2/4/8 bytes, exit.

  // If it is an atomic transfer and the alignment is less than the size, we
  // would introduce an unaligned memory access, which CodeGen later turns
  // into a libcall. There is no evident performance gain, so disable the
  // transform for now.
  if (isa<AtomicMemTransferInst>(MI))
    if (CopyDstAlign < Size || CopySrcAlign < Size)
      return nullptr;

  // Use an integer load+store unless we can find something better.
  unsigned SrcAddrSp =
    cast<PointerType>(MI->getArgOperand(1)->getType())->getAddressSpace();
  unsigned DstAddrSp =
    cast<PointerType>(MI->getArgOperand(0)->getType())->getAddressSpace();

  IntegerType* IntType = IntegerType::get(MI->getContext(), Size<<3);
  Type *NewSrcPtrTy = PointerType::get(IntType, SrcAddrSp);
  Type *NewDstPtrTy = PointerType::get(IntType, DstAddrSp);

  // If the memcpy has metadata describing the members, see if we can get the
  // TBAA tag describing our copy.
  MDNode *CopyMD = nullptr;
  if (MDNode *M = MI->getMetadata(LLVMContext::MD_tbaa)) {
    CopyMD = M;
  } else if (MDNode *M = MI->getMetadata(LLVMContext::MD_tbaa_struct)) {
    if (M->getNumOperands() == 3 && M->getOperand(0) &&
        mdconst::hasa<ConstantInt>(M->getOperand(0)) &&
        mdconst::extract<ConstantInt>(M->getOperand(0))->isZero() &&
        M->getOperand(1) &&
        mdconst::hasa<ConstantInt>(M->getOperand(1)) &&
        mdconst::extract<ConstantInt>(M->getOperand(1))->getValue() ==
        Size &&
        M->getOperand(2) && isa<MDNode>(M->getOperand(2)))
      CopyMD = cast<MDNode>(M->getOperand(2));
  }

  Value *Src = Builder.CreateBitCast(MI->getArgOperand(1), NewSrcPtrTy);
  Value *Dest = Builder.CreateBitCast(MI->getArgOperand(0), NewDstPtrTy);
  LoadInst *L = Builder.CreateLoad(IntType, Src);
  // Alignment from the mem intrinsic will be better, so use it.
  L->setAlignment(
      MaybeAlign(CopySrcAlign)); // FIXME: Check if we can use Align instead.
  if (CopyMD)
    L->setMetadata(LLVMContext::MD_tbaa, CopyMD);
  MDNode *LoopMemParallelMD =
    MI->getMetadata(LLVMContext::MD_mem_parallel_loop_access);
  if (LoopMemParallelMD)
    L->setMetadata(LLVMContext::MD_mem_parallel_loop_access, LoopMemParallelMD);
  MDNode *AccessGroupMD = MI->getMetadata(LLVMContext::MD_access_group);
  if (AccessGroupMD)
    L->setMetadata(LLVMContext::MD_access_group, AccessGroupMD);

  StoreInst *S = Builder.CreateStore(L, Dest);
  // Alignment from the mem intrinsic will be better, so use it.
  S->setAlignment(
      MaybeAlign(CopyDstAlign)); // FIXME: Check if we can use Align instead.
  if (CopyMD)
    S->setMetadata(LLVMContext::MD_tbaa, CopyMD);
  if (LoopMemParallelMD)
    S->setMetadata(LLVMContext::MD_mem_parallel_loop_access, LoopMemParallelMD);
  if (AccessGroupMD)
    S->setMetadata(LLVMContext::MD_access_group, AccessGroupMD);

  if (auto *MT = dyn_cast<MemTransferInst>(MI)) {
    // non-atomics can be volatile
    L->setVolatile(MT->isVolatile());
    S->setVolatile(MT->isVolatile());
  }
  if (isa<AtomicMemTransferInst>(MI)) {
    // atomics have to be unordered
    L->setOrdering(AtomicOrdering::Unordered);
    S->setOrdering(AtomicOrdering::Unordered);
  }

  // Set the size of the copy to 0, it will be deleted on the next iteration.
  MI->setLength(Constant::getNullValue(MemOpLength->getType()));
  return MI;
}

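/// Simplify a memset-like intrinsic (including the element-wise atomic
/// variant): raise the destination alignment to what is known, drop stores to
/// known-constant memory, and lower constant-length 1/2/4/8 byte sets of a
/// constant byte value to a single integer store of the splatted pattern.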
Instruction *InstCombiner::SimplifyAnyMemSet(AnyMemSetInst *MI) {
  const unsigned KnownAlignment =
      getKnownAlignment(MI->getDest(), DL, MI, &AC, &DT);
  if (MI->getDestAlignment() < KnownAlignment) {
    MI->setDestAlignment(KnownAlignment);
    return MI;
  }

  // If we have a store to a location which is known constant, we can conclude
  // that the store must be storing the constant value (else the memory
  // wouldn't be constant), and this must be a noop.
  if (AA->pointsToConstantMemory(MI->getDest())) {
    // Set the size of the copy to 0, it will be deleted on the next iteration.
    MI->setLength(Constant::getNullValue(MI->getLength()->getType()));
    return MI;
  }

  // Extract the length and alignment and fill if they are constant.
  ConstantInt *LenC = dyn_cast<ConstantInt>(MI->getLength());
  ConstantInt *FillC = dyn_cast<ConstantInt>(MI->getValue());
  if (!LenC || !FillC || !FillC->getType()->isIntegerTy(8))
    return nullptr;
  const uint64_t Len = LenC->getLimitedValue();
  assert(Len && "0-sized memory setting should be removed already.");
  const Align Alignment = assumeAligned(MI->getDestAlignment());

  // If it is an atomic memset and the alignment is less than the size, we
  // would introduce an unaligned memory access, which CodeGen later turns
  // into a libcall. There is no evident performance gain, so disable the
  // transform for now.
  if (isa<AtomicMemSetInst>(MI))
    if (Alignment < Len)
      return nullptr;

  // memset(s,c,n) -> store s, c (for n=1,2,4,8)
  if (Len <= 8 && isPowerOf2_32((uint32_t)Len)) {
    Type *ITy = IntegerType::get(MI->getContext(), Len*8);  // n=1 -> i8.

    Value *Dest = MI->getDest();
    unsigned DstAddrSp = cast<PointerType>(Dest->getType())->getAddressSpace();
    Type *NewDstPtrTy = PointerType::get(ITy, DstAddrSp);
    Dest = Builder.CreateBitCast(Dest, NewDstPtrTy);

    // Extract the fill value and store.
    uint64_t Fill = FillC->getZExtValue()*0x0101010101010101ULL;
    StoreInst *S = Builder.CreateStore(ConstantInt::get(ITy, Fill), Dest,
                                       MI->isVolatile());
    S->setAlignment(Alignment);
    if (isa<AtomicMemSetInst>(MI))
      S->setOrdering(AtomicOrdering::Unordered);

    // Set the size of the copy to 0, it will be deleted on the next iteration.
    MI->setLength(Constant::getNullValue(LenC->getType()));
    return MI;
  }

  return nullptr;
}

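// Attempt to simplify SSE2/AVX2/AVX512 packed shift intrinsics (shift by an
// immediate, or by a scalar amount held in the low 64 bits of a vector) to
// generic IR shifts when the shift amount is a constant. Out-of-range amounts
// follow the x86 semantics: logical shifts produce zero and arithmetic shifts
// are clamped to (BitWidth - 1).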
static Value *simplifyX86immShift(const IntrinsicInst &II,
                                  InstCombiner::BuilderTy &Builder) {
  bool LogicalShift = false;
  bool ShiftLeft = false;

  switch (II.getIntrinsicID()) {
  default: llvm_unreachable("Unexpected intrinsic!");
  case Intrinsic::x86_sse2_psra_d:
  case Intrinsic::x86_sse2_psra_w:
  case Intrinsic::x86_sse2_psrai_d:
  case Intrinsic::x86_sse2_psrai_w:
  case Intrinsic::x86_avx2_psra_d:
  case Intrinsic::x86_avx2_psra_w:
  case Intrinsic::x86_avx2_psrai_d:
  case Intrinsic::x86_avx2_psrai_w:
  case Intrinsic::x86_avx512_psra_q_128:
  case Intrinsic::x86_avx512_psrai_q_128:
  case Intrinsic::x86_avx512_psra_q_256:
  case Intrinsic::x86_avx512_psrai_q_256:
  case Intrinsic::x86_avx512_psra_d_512:
  case Intrinsic::x86_avx512_psra_q_512:
  case Intrinsic::x86_avx512_psra_w_512:
  case Intrinsic::x86_avx512_psrai_d_512:
  case Intrinsic::x86_avx512_psrai_q_512:
  case Intrinsic::x86_avx512_psrai_w_512:
    LogicalShift = false; ShiftLeft = false;
    break;
  case Intrinsic::x86_sse2_psrl_d:
  case Intrinsic::x86_sse2_psrl_q:
  case Intrinsic::x86_sse2_psrl_w:
  case Intrinsic::x86_sse2_psrli_d:
  case Intrinsic::x86_sse2_psrli_q:
  case Intrinsic::x86_sse2_psrli_w:
  case Intrinsic::x86_avx2_psrl_d:
  case Intrinsic::x86_avx2_psrl_q:
  case Intrinsic::x86_avx2_psrl_w:
  case Intrinsic::x86_avx2_psrli_d:
  case Intrinsic::x86_avx2_psrli_q:
  case Intrinsic::x86_avx2_psrli_w:
  case Intrinsic::x86_avx512_psrl_d_512:
  case Intrinsic::x86_avx512_psrl_q_512:
  case Intrinsic::x86_avx512_psrl_w_512:
  case Intrinsic::x86_avx512_psrli_d_512:
  case Intrinsic::x86_avx512_psrli_q_512:
  case Intrinsic::x86_avx512_psrli_w_512:
    LogicalShift = true; ShiftLeft = false;
    break;
  case Intrinsic::x86_sse2_psll_d:
  case Intrinsic::x86_sse2_psll_q:
  case Intrinsic::x86_sse2_psll_w:
  case Intrinsic::x86_sse2_pslli_d:
  case Intrinsic::x86_sse2_pslli_q:
  case Intrinsic::x86_sse2_pslli_w:
  case Intrinsic::x86_avx2_psll_d:
  case Intrinsic::x86_avx2_psll_q:
  case Intrinsic::x86_avx2_psll_w:
  case Intrinsic::x86_avx2_pslli_d:
  case Intrinsic::x86_avx2_pslli_q:
  case Intrinsic::x86_avx2_pslli_w:
  case Intrinsic::x86_avx512_psll_d_512:
  case Intrinsic::x86_avx512_psll_q_512:
  case Intrinsic::x86_avx512_psll_w_512:
  case Intrinsic::x86_avx512_pslli_d_512:
  case Intrinsic::x86_avx512_pslli_q_512:
  case Intrinsic::x86_avx512_pslli_w_512:
    LogicalShift = true; ShiftLeft = true;
    break;
  }
  assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left");

  // Simplify if count is constant.
  auto Arg1 = II.getArgOperand(1);
  auto CAZ = dyn_cast<ConstantAggregateZero>(Arg1);
  auto CDV = dyn_cast<ConstantDataVector>(Arg1);
  auto CInt = dyn_cast<ConstantInt>(Arg1);
  if (!CAZ && !CDV && !CInt)
    return nullptr;

  APInt Count(64, 0);
  if (CDV) {
    // SSE2/AVX2 uses all the first 64-bits of the 128-bit vector
    // operand to compute the shift amount.
    auto VT = cast<VectorType>(CDV->getType());
    unsigned BitWidth = VT->getElementType()->getPrimitiveSizeInBits();
    assert((64 % BitWidth) == 0 && "Unexpected packed shift size");
    unsigned NumSubElts = 64 / BitWidth;

    // Concatenate the sub-elements to create the 64-bit value.
    for (unsigned i = 0; i != NumSubElts; ++i) {
      unsigned SubEltIdx = (NumSubElts - 1) - i;
      auto SubElt = cast<ConstantInt>(CDV->getElementAsConstant(SubEltIdx));
      Count <<= BitWidth;
      Count |= SubElt->getValue().zextOrTrunc(64);
    }
  }
  else if (CInt)
    Count = CInt->getValue();

  auto Vec = II.getArgOperand(0);
  auto VT = cast<VectorType>(Vec->getType());
  auto SVT = VT->getElementType();
  unsigned VWidth = VT->getNumElements();
  unsigned BitWidth = SVT->getPrimitiveSizeInBits();

  // If shift-by-zero then just return the original value.
  if (Count.isNullValue())
    return Vec;

  // Handle cases when Shift >= BitWidth.
  if (Count.uge(BitWidth)) {
    // If LogicalShift - just return zero.
    if (LogicalShift)
      return ConstantAggregateZero::get(VT);

    // If ArithmeticShift - clamp Shift to (BitWidth - 1).
    Count = APInt(64, BitWidth - 1);
  }

  // Get a constant vector of the same type as the first operand.
  auto ShiftAmt = ConstantInt::get(SVT, Count.zextOrTrunc(BitWidth));
  auto ShiftVec = Builder.CreateVectorSplat(VWidth, ShiftAmt);

  if (ShiftLeft)
    return Builder.CreateShl(Vec, ShiftVec);

  if (LogicalShift)
    return Builder.CreateLShr(Vec, ShiftVec);

  return Builder.CreateAShr(Vec, ShiftVec);
}

// Attempt to simplify AVX2 per-element shift intrinsics to a generic IR shift.
// Unlike the generic IR shifts, the intrinsics have defined behaviour for out
// of range shift amounts (logical - set to zero, arithmetic - splat sign bit).
static Value *simplifyX86varShift(const IntrinsicInst &II,
                                  InstCombiner::BuilderTy &Builder) {
  bool LogicalShift = false;
  bool ShiftLeft = false;

  switch (II.getIntrinsicID()) {
  default: llvm_unreachable("Unexpected intrinsic!");
  case Intrinsic::x86_avx2_psrav_d:
  case Intrinsic::x86_avx2_psrav_d_256:
  case Intrinsic::x86_avx512_psrav_q_128:
  case Intrinsic::x86_avx512_psrav_q_256:
  case Intrinsic::x86_avx512_psrav_d_512:
  case Intrinsic::x86_avx512_psrav_q_512:
  case Intrinsic::x86_avx512_psrav_w_128:
  case Intrinsic::x86_avx512_psrav_w_256:
  case Intrinsic::x86_avx512_psrav_w_512:
    LogicalShift = false;
    ShiftLeft = false;
    break;
  case Intrinsic::x86_avx2_psrlv_d:
  case Intrinsic::x86_avx2_psrlv_d_256:
  case Intrinsic::x86_avx2_psrlv_q:
  case Intrinsic::x86_avx2_psrlv_q_256:
  case Intrinsic::x86_avx512_psrlv_d_512:
  case Intrinsic::x86_avx512_psrlv_q_512:
  case Intrinsic::x86_avx512_psrlv_w_128:
  case Intrinsic::x86_avx512_psrlv_w_256:
  case Intrinsic::x86_avx512_psrlv_w_512:
    LogicalShift = true;
    ShiftLeft = false;
    break;
  case Intrinsic::x86_avx2_psllv_d:
  case Intrinsic::x86_avx2_psllv_d_256:
  case Intrinsic::x86_avx2_psllv_q:
  case Intrinsic::x86_avx2_psllv_q_256:
  case Intrinsic::x86_avx512_psllv_d_512:
  case Intrinsic::x86_avx512_psllv_q_512:
  case Intrinsic::x86_avx512_psllv_w_128:
  case Intrinsic::x86_avx512_psllv_w_256:
  case Intrinsic::x86_avx512_psllv_w_512:
    LogicalShift = true;
    ShiftLeft = true;
    break;
  }
  assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left");

  // Simplify if all shift amounts are constant/undef.
  auto *CShift = dyn_cast<Constant>(II.getArgOperand(1));
  if (!CShift)
    return nullptr;

  auto Vec = II.getArgOperand(0);
  auto VT = cast<VectorType>(II.getType());
  auto SVT = VT->getVectorElementType();
  int NumElts = VT->getNumElements();
  int BitWidth = SVT->getIntegerBitWidth();

  // Collect each element's shift amount.
  // We also collect special cases: UNDEF = -1, OUT-OF-RANGE = BitWidth.
  bool AnyOutOfRange = false;
  SmallVector<int, 8> ShiftAmts;
  for (int I = 0; I < NumElts; ++I) {
    auto *CElt = CShift->getAggregateElement(I);
    if (CElt && isa<UndefValue>(CElt)) {
      ShiftAmts.push_back(-1);
      continue;
    }

    auto *COp = dyn_cast_or_null<ConstantInt>(CElt);
    if (!COp)
      return nullptr;

    // Handle out of range shifts.
    // If LogicalShift - set to BitWidth (special case).
    // If ArithmeticShift - set to (BitWidth - 1) (sign splat).
    APInt ShiftVal = COp->getValue();
    if (ShiftVal.uge(BitWidth)) {
      AnyOutOfRange = LogicalShift;
      ShiftAmts.push_back(LogicalShift ? BitWidth : BitWidth - 1);
      continue;
    }

    ShiftAmts.push_back((int)ShiftVal.getZExtValue());
  }

  // If all elements out of range or UNDEF, return vector of zeros/undefs.
  // ArithmeticShift should only hit this if they are all UNDEF.
  auto OutOfRange = [&](int Idx) { return (Idx < 0) || (BitWidth <= Idx); };
  if (llvm::all_of(ShiftAmts, OutOfRange)) {
    SmallVector<Constant *, 8> ConstantVec;
    for (int Idx : ShiftAmts) {
      if (Idx < 0) {
        ConstantVec.push_back(UndefValue::get(SVT));
      } else {
        assert(LogicalShift && "Logical shift expected");
        ConstantVec.push_back(ConstantInt::getNullValue(SVT));
      }
    }
    return ConstantVector::get(ConstantVec);
  }

  // We can't handle only some out of range values with generic logical shifts.
  if (AnyOutOfRange)
    return nullptr;

  // Build the shift amount constant vector.
  SmallVector<Constant *, 8> ShiftVecAmts;
  for (int Idx : ShiftAmts) {
    if (Idx < 0)
      ShiftVecAmts.push_back(UndefValue::get(SVT));
    else
      ShiftVecAmts.push_back(ConstantInt::get(SVT, Idx));
  }
  auto ShiftVec = ConstantVector::get(ShiftVecAmts);

  if (ShiftLeft)
    return Builder.CreateShl(Vec, ShiftVec);

  if (LogicalShift)
    return Builder.CreateLShr(Vec, ShiftVec);

  return Builder.CreateAShr(Vec, ShiftVec);
}

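// Attempt to lower PACKSS/PACKUS intrinsics on constant operands: clamp both
// source operands into the destination element range (signed or unsigned
// saturation), shuffle the clamped values together per 128-bit lane, and
// truncate to the destination element type.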
static Value *simplifyX86pack(IntrinsicInst &II,
                              InstCombiner::BuilderTy &Builder, bool IsSigned) {
  Value *Arg0 = II.getArgOperand(0);
  Value *Arg1 = II.getArgOperand(1);
  Type *ResTy = II.getType();

  // Fast all undef handling.
  if (isa<UndefValue>(Arg0) && isa<UndefValue>(Arg1))
    return UndefValue::get(ResTy);

  Type *ArgTy = Arg0->getType();
  unsigned NumLanes = ResTy->getPrimitiveSizeInBits() / 128;
  unsigned NumSrcElts = ArgTy->getVectorNumElements();
  assert(ResTy->getVectorNumElements() == (2 * NumSrcElts) &&
         "Unexpected packing types");

  unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes;
  unsigned DstScalarSizeInBits = ResTy->getScalarSizeInBits();
  unsigned SrcScalarSizeInBits = ArgTy->getScalarSizeInBits();
  assert(SrcScalarSizeInBits == (2 * DstScalarSizeInBits) &&
         "Unexpected packing types");

  // Constant folding.
  if (!isa<Constant>(Arg0) || !isa<Constant>(Arg1))
    return nullptr;

  // Clamp Values - signed/unsigned both use signed clamp values, but they
  // differ on the min/max values.
  APInt MinValue, MaxValue;
  if (IsSigned) {
    // PACKSS: Truncate signed value with signed saturation.
    // Source values less than dst minint are saturated to minint.
    // Source values greater than dst maxint are saturated to maxint.
    MinValue =
        APInt::getSignedMinValue(DstScalarSizeInBits).sext(SrcScalarSizeInBits);
    MaxValue =
        APInt::getSignedMaxValue(DstScalarSizeInBits).sext(SrcScalarSizeInBits);
  } else {
    // PACKUS: Truncate signed value with unsigned saturation.
    // Source values less than zero are saturated to zero.
    // Source values greater than dst maxuint are saturated to maxuint.
    MinValue = APInt::getNullValue(SrcScalarSizeInBits);
    MaxValue = APInt::getLowBitsSet(SrcScalarSizeInBits, DstScalarSizeInBits);
  }

  auto *MinC = Constant::getIntegerValue(ArgTy, MinValue);
  auto *MaxC = Constant::getIntegerValue(ArgTy, MaxValue);
  Arg0 = Builder.CreateSelect(Builder.CreateICmpSLT(Arg0, MinC), MinC, Arg0);
  Arg1 = Builder.CreateSelect(Builder.CreateICmpSLT(Arg1, MinC), MinC, Arg1);
  Arg0 = Builder.CreateSelect(Builder.CreateICmpSGT(Arg0, MaxC), MaxC, Arg0);
  Arg1 = Builder.CreateSelect(Builder.CreateICmpSGT(Arg1, MaxC), MaxC, Arg1);

  // Shuffle clamped args together at the lane level.
  SmallVector<unsigned, 32> PackMask;
  for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
    for (unsigned Elt = 0; Elt != NumSrcEltsPerLane; ++Elt)
      PackMask.push_back(Elt + (Lane * NumSrcEltsPerLane));
    for (unsigned Elt = 0; Elt != NumSrcEltsPerLane; ++Elt)
      PackMask.push_back(Elt + (Lane * NumSrcEltsPerLane) + NumSrcElts);
  }
  auto *Shuffle = Builder.CreateShuffleVector(Arg0, Arg1, PackMask);

  // Truncate to dst size.
  return Builder.CreateTrunc(Shuffle, ResTy);
}

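// Attempt to expand MOVMSK intrinsics into a sign-bit compare of the vector
// operand, a bitcast of the resulting <N x i1> mask to an integer, and a
// zero-extension to the result type. x86_mmx operands are left untouched.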
static Value *simplifyX86movmsk(const IntrinsicInst &II,
                                InstCombiner::BuilderTy &Builder) {
  Value *Arg = II.getArgOperand(0);
  Type *ResTy = II.getType();
  Type *ArgTy = Arg->getType();

  // movmsk(undef) -> zero as we must ensure the upper bits are zero.
  if (isa<UndefValue>(Arg))
    return Constant::getNullValue(ResTy);

  // We can't easily peek through x86_mmx types.
  if (!ArgTy->isVectorTy())
    return nullptr;

  // Expand MOVMSK to compare/bitcast/zext:
  // e.g. PMOVMSKB(v16i8 x):
  // %cmp = icmp slt <16 x i8> %x, zeroinitializer
  // %int = bitcast <16 x i1> %cmp to i16
  // %res = zext i16 %int to i32
  unsigned NumElts = ArgTy->getVectorNumElements();
  Type *IntegerVecTy = VectorType::getInteger(cast<VectorType>(ArgTy));
  Type *IntegerTy = Builder.getIntNTy(NumElts);

  Value *Res = Builder.CreateBitCast(Arg, IntegerVecTy);
  Res = Builder.CreateICmpSLT(Res, Constant::getNullValue(IntegerVecTy));
  Res = Builder.CreateBitCast(Res, IntegerTy);
  Res = Builder.CreateZExtOrTrunc(Res, ResTy);
  return Res;
}

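// Attempt to simplify X86 ADDCARRY intrinsics: when the carry-in is known to
// be zero, the operation is a plain unsigned add with overflow, so rebuild
// the {i8 carry-out, result} pair from llvm.uadd.with.overflow.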
static Value *simplifyX86addcarry(const IntrinsicInst &II,
                                  InstCombiner::BuilderTy &Builder) {
  Value *CarryIn = II.getArgOperand(0);
  Value *Op1 = II.getArgOperand(1);
  Value *Op2 = II.getArgOperand(2);
  Type *RetTy = II.getType();
  Type *OpTy = Op1->getType();
  assert(RetTy->getStructElementType(0)->isIntegerTy(8) &&
         RetTy->getStructElementType(1) == OpTy && OpTy == Op2->getType() &&
         "Unexpected types for x86 addcarry");

  // If carry-in is zero, this is just an unsigned add with overflow.
  if (match(CarryIn, m_ZeroInt())) {
    Value *UAdd = Builder.CreateIntrinsic(Intrinsic::uadd_with_overflow, OpTy,
                                          { Op1, Op2 });
    // The types have to be adjusted to match the x86 call types.
    Value *UAddResult = Builder.CreateExtractValue(UAdd, 0);
    Value *UAddOV = Builder.CreateZExt(Builder.CreateExtractValue(UAdd, 1),
                                       Builder.getInt8Ty());
    Value *Res = UndefValue::get(RetTy);
    Res = Builder.CreateInsertValue(Res, UAddOV, 0);
    return Builder.CreateInsertValue(Res, UAddResult, 1);
  }

  return nullptr;
}

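// Attempt to simplify INSERTPS when the control byte is a constant: an
// all-ones zero mask produces a zero vector, and most other cases can be
// expressed as a shufflevector of the two operands (or against a zero
// vector).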
static Value *simplifyX86insertps(const IntrinsicInst &II,
                                  InstCombiner::BuilderTy &Builder) {
  auto *CInt = dyn_cast<ConstantInt>(II.getArgOperand(2));
  if (!CInt)
    return nullptr;

  VectorType *VecTy = cast<VectorType>(II.getType());
  assert(VecTy->getNumElements() == 4 && "insertps with wrong vector type");

  // The immediate permute control byte looks like this:
  //    [3:0] - zero mask for each 32-bit lane
  //    [5:4] - select one 32-bit destination lane
  //    [7:6] - select one 32-bit source lane

  uint8_t Imm = CInt->getZExtValue();
  uint8_t ZMask = Imm & 0xf;
  uint8_t DestLane = (Imm >> 4) & 0x3;
  uint8_t SourceLane = (Imm >> 6) & 0x3;

  ConstantAggregateZero *ZeroVector = ConstantAggregateZero::get(VecTy);

  // If all zero mask bits are set, this was just a weird way to
  // generate a zero vector.
  if (ZMask == 0xf)
    return ZeroVector;

  // Initialize by passing all of the first source bits through.
  uint32_t ShuffleMask[4] = { 0, 1, 2, 3 };

  // We may replace the second operand with the zero vector.
  Value *V1 = II.getArgOperand(1);

  if (ZMask) {
    // If the zero mask is being used with a single input or the zero mask
    // overrides the destination lane, this is a shuffle with the zero vector.
    if ((II.getArgOperand(0) == II.getArgOperand(1)) ||
        (ZMask & (1 << DestLane))) {
      V1 = ZeroVector;
      // We may still move 32-bits of the first source vector from one lane
      // to another.
      ShuffleMask[DestLane] = SourceLane;
      // The zero mask may override the previous insert operation.
      for (unsigned i = 0; i < 4; ++i)
        if ((ZMask >> i) & 0x1)
          ShuffleMask[i] = i + 4;
    } else {
      // TODO: Model this case as 2 shuffles or a 'logical and' plus shuffle?
      return nullptr;
    }
  } else {
    // Replace the selected destination lane with the selected source lane.
    ShuffleMask[DestLane] = SourceLane + 4;
  }

  return Builder.CreateShuffleVector(II.getArgOperand(0), V1, ShuffleMask);
}

/// Attempt to simplify SSE4A EXTRQ/EXTRQI instructions using constant folding
/// or conversion to a shuffle vector.
static Value *simplifyX86extrq(IntrinsicInst &II, Value *Op0,
                               ConstantInt *CILength, ConstantInt *CIIndex,
                               InstCombiner::BuilderTy &Builder) {
  auto LowConstantHighUndef = [&](uint64_t Val) {
    Type *IntTy64 = Type::getInt64Ty(II.getContext());
    Constant *Args[] = {ConstantInt::get(IntTy64, Val),
                        UndefValue::get(IntTy64)};
    return ConstantVector::get(Args);
  };

  // See if we're dealing with constant values.
  Constant *C0 = dyn_cast<Constant>(Op0);
  ConstantInt *CI0 =
      C0 ? dyn_cast_or_null<ConstantInt>(C0->getAggregateElement((unsigned)0))
         : nullptr;

  // Attempt to constant fold.
  if (CILength && CIIndex) {
    // From AMD documentation: "The bit index and field length are each six
    // bits in length other bits of the field are ignored."
    APInt APIndex = CIIndex->getValue().zextOrTrunc(6);
    APInt APLength = CILength->getValue().zextOrTrunc(6);

    unsigned Index = APIndex.getZExtValue();

    // From AMD documentation: "a value of zero in the field length is
    // defined as length of 64".
    unsigned Length = APLength == 0 ? 64 : APLength.getZExtValue();

    // From AMD documentation: "If the sum of the bit index + length field
    // is greater than 64, the results are undefined".
    unsigned End = Index + Length;

    // Note that both field index and field length are 8-bit quantities.
    // Since variables 'Index' and 'Length' are unsigned values
    // obtained from zero-extending field index and field length
    // respectively, their sum should never wrap around.
    if (End > 64)
      return UndefValue::get(II.getType());

    // If we are inserting whole bytes, we can convert this to a shuffle.
    // Lowering can recognize EXTRQI shuffle masks.
    if ((Length % 8) == 0 && (Index % 8) == 0) {
      // Convert bit indices to byte indices.
      Length /= 8;
      Index /= 8;

      Type *IntTy8 = Type::getInt8Ty(II.getContext());
      Type *IntTy32 = Type::getInt32Ty(II.getContext());
      VectorType *ShufTy = VectorType::get(IntTy8, 16);

      SmallVector<Constant *, 16> ShuffleMask;
      for (int i = 0; i != (int)Length; ++i)
        ShuffleMask.push_back(
            Constant::getIntegerValue(IntTy32, APInt(32, i + Index)));
      for (int i = Length; i != 8; ++i)
        ShuffleMask.push_back(
            Constant::getIntegerValue(IntTy32, APInt(32, i + 16)));
      for (int i = 8; i != 16; ++i)
        ShuffleMask.push_back(UndefValue::get(IntTy32));

      Value *SV = Builder.CreateShuffleVector(
          Builder.CreateBitCast(Op0, ShufTy),
          ConstantAggregateZero::get(ShufTy), ConstantVector::get(ShuffleMask));
      return Builder.CreateBitCast(SV, II.getType());
    }

    // Constant Fold - shift Index'th bit to lowest position and mask off
    // Length bits.
    if (CI0) {
      APInt Elt = CI0->getValue();
      Elt.lshrInPlace(Index);
      Elt = Elt.zextOrTrunc(Length);
      return LowConstantHighUndef(Elt.getZExtValue());
    }

    // If we were an EXTRQ call, we'll save registers if we convert to EXTRQI.
    if (II.getIntrinsicID() == Intrinsic::x86_sse4a_extrq) {
      Value *Args[] = {Op0, CILength, CIIndex};
      Module *M = II.getModule();
      Function *F = Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_extrqi);
      return Builder.CreateCall(F, Args);
    }
  }

  // Constant Fold - extraction from zero is always {zero, undef}.
  if (CI0 && CI0->isZero())
    return LowConstantHighUndef(0);

  return nullptr;
}

/// Attempt to simplify SSE4A INSERTQ/INSERTQI instructions using constant
/// folding or conversion to a shuffle vector.
static Value *simplifyX86insertq(IntrinsicInst &II, Value *Op0, Value *Op1,
                                 APInt APLength, APInt APIndex,
                                 InstCombiner::BuilderTy &Builder) {
  // From AMD documentation: "The bit index and field length are each six bits
  // in length other bits of the field are ignored."
  APIndex = APIndex.zextOrTrunc(6);
  APLength = APLength.zextOrTrunc(6);

  // Attempt to constant fold.
  unsigned Index = APIndex.getZExtValue();

  // From AMD documentation: "a value of zero in the field length is
  // defined as length of 64".
  unsigned Length = APLength == 0 ? 64 : APLength.getZExtValue();

  // From AMD documentation: "If the sum of the bit index + length field
  // is greater than 64, the results are undefined".
  unsigned End = Index + Length;

  // Note that both field index and field length are 8-bit quantities.
  // Since variables 'Index' and 'Length' are unsigned values
  // obtained from zero-extending field index and field length
  // respectively, their sum should never wrap around.
  if (End > 64)
    return UndefValue::get(II.getType());

  // If we are inserting whole bytes, we can convert this to a shuffle.
  // Lowering can recognize INSERTQI shuffle masks.
  if ((Length % 8) == 0 && (Index % 8) == 0) {
    // Convert bit indices to byte indices.
    Length /= 8;
    Index /= 8;

    Type *IntTy8 = Type::getInt8Ty(II.getContext());
    Type *IntTy32 = Type::getInt32Ty(II.getContext());
    VectorType *ShufTy = VectorType::get(IntTy8, 16);

    SmallVector<Constant *, 16> ShuffleMask;
    for (int i = 0; i != (int)Index; ++i)
      ShuffleMask.push_back(Constant::getIntegerValue(IntTy32, APInt(32, i)));
    for (int i = 0; i != (int)Length; ++i)
      ShuffleMask.push_back(
          Constant::getIntegerValue(IntTy32, APInt(32, i + 16)));
    for (int i = Index + Length; i != 8; ++i)
      ShuffleMask.push_back(Constant::getIntegerValue(IntTy32, APInt(32, i)));
    for (int i = 8; i != 16; ++i)
      ShuffleMask.push_back(UndefValue::get(IntTy32));

    Value *SV = Builder.CreateShuffleVector(Builder.CreateBitCast(Op0, ShufTy),
                                            Builder.CreateBitCast(Op1, ShufTy),
                                            ConstantVector::get(ShuffleMask));
    return Builder.CreateBitCast(SV, II.getType());
  }

  // See if we're dealing with constant values.
  Constant *C0 = dyn_cast<Constant>(Op0);
  Constant *C1 = dyn_cast<Constant>(Op1);
  ConstantInt *CI00 =
      C0 ? dyn_cast_or_null<ConstantInt>(C0->getAggregateElement((unsigned)0))
         : nullptr;
  ConstantInt *CI10 =
      C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)0))
         : nullptr;

  // Constant Fold - insert bottom Length bits starting at the Index'th bit.
  if (CI00 && CI10) {
    APInt V00 = CI00->getValue();
    APInt V10 = CI10->getValue();
    APInt Mask = APInt::getLowBitsSet(64, Length).shl(Index);
    V00 = V00 & ~Mask;
    V10 = V10.zextOrTrunc(Length).zextOrTrunc(64).shl(Index);
    APInt Val = V00 | V10;
    Type *IntTy64 = Type::getInt64Ty(II.getContext());
    Constant *Args[] = {ConstantInt::get(IntTy64, Val.getZExtValue()),
                        UndefValue::get(IntTy64)};
    return ConstantVector::get(Args);
  }

  // If we were an INSERTQ call, we'll save demanded elements if we convert to
  // INSERTQI.
  if (II.getIntrinsicID() == Intrinsic::x86_sse4a_insertq) {
    Type *IntTy8 = Type::getInt8Ty(II.getContext());
    Constant *CILength = ConstantInt::get(IntTy8, Length, false);
    Constant *CIIndex = ConstantInt::get(IntTy8, Index, false);

    Value *Args[] = {Op0, Op1, CILength, CIIndex};
    Module *M = II.getModule();
    Function *F = Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_insertqi);
    return Builder.CreateCall(F, Args);
  }

  return nullptr;
}

/// Attempt to convert pshufb* to shufflevector if the mask is constant.
static Value *simplifyX86pshufb(const IntrinsicInst &II,
                                InstCombiner::BuilderTy &Builder) {
  Constant *V = dyn_cast<Constant>(II.getArgOperand(1));
  if (!V)
    return nullptr;

  auto *VecTy = cast<VectorType>(II.getType());
  auto *MaskEltTy = Type::getInt32Ty(II.getContext());
  unsigned NumElts = VecTy->getNumElements();
  assert((NumElts == 16 || NumElts == 32 || NumElts == 64) &&
         "Unexpected number of elements in shuffle mask!");

  // Construct a shuffle mask from constant integers or UNDEFs.
  Constant *Indexes[64] = {nullptr};

  // Each byte in the shuffle control mask forms an index to permute the
  // corresponding byte in the destination operand.
  for (unsigned I = 0; I < NumElts; ++I) {
    Constant *COp = V->getAggregateElement(I);
    if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
      return nullptr;

    if (isa<UndefValue>(COp)) {
      Indexes[I] = UndefValue::get(MaskEltTy);
      continue;
    }

    int8_t Index = cast<ConstantInt>(COp)->getValue().getZExtValue();

    // If the most significant bit (bit[7]) of each byte of the shuffle
    // control mask is set, then zero is written in the result byte.
    // The zero vector is in the right-hand side of the resulting
    // shufflevector.

    // The value of each index for the high 128-bit lane is the least
    // significant 4 bits of the respective shuffle control byte.
    Index = ((Index < 0) ? NumElts : Index & 0x0F) + (I & 0xF0);
    Indexes[I] = ConstantInt::get(MaskEltTy, Index);
  }

  auto ShuffleMask = ConstantVector::get(makeArrayRef(Indexes, NumElts));
  auto V1 = II.getArgOperand(0);
  auto V2 = Constant::getNullValue(VecTy);
  return Builder.CreateShuffleVector(V1, V2, ShuffleMask);
}

/// Attempt to convert vpermilvar* to shufflevector if the mask is constant.
static Value *simplifyX86vpermilvar(const IntrinsicInst &II,
                                    InstCombiner::BuilderTy &Builder) {
  Constant *V = dyn_cast<Constant>(II.getArgOperand(1));
  if (!V)
    return nullptr;

  auto *VecTy = cast<VectorType>(II.getType());
  auto *MaskEltTy = Type::getInt32Ty(II.getContext());
  unsigned NumElts = VecTy->getVectorNumElements();
  bool IsPD = VecTy->getScalarType()->isDoubleTy();
  unsigned NumLaneElts = IsPD ? 2 : 4;
  assert(NumElts == 16 || NumElts == 8 || NumElts == 4 || NumElts == 2);

  // Construct a shuffle mask from constant integers or UNDEFs.
  Constant *Indexes[16] = {nullptr};

  // The intrinsics only read one or two bits, clear the rest.
  for (unsigned I = 0; I < NumElts; ++I) {
    Constant *COp = V->getAggregateElement(I);
    if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
      return nullptr;

    if (isa<UndefValue>(COp)) {
      Indexes[I] = UndefValue::get(MaskEltTy);
      continue;
    }

    APInt Index = cast<ConstantInt>(COp)->getValue();
    Index = Index.zextOrTrunc(32).getLoBits(2);

    // The PD variants use bit 1 to select the per-lane element index, so
    // shift down to convert to a generic shuffle mask index.
    if (IsPD)
      Index.lshrInPlace(1);

    // The _256 variants are a bit trickier since the mask bits always index
    // into the corresponding 128-bit half. In order to convert to a generic
    // shuffle, we have to make that explicit.
    Index += APInt(32, (I / NumLaneElts) * NumLaneElts);

    Indexes[I] = ConstantInt::get(MaskEltTy, Index);
  }

  auto ShuffleMask = ConstantVector::get(makeArrayRef(Indexes, NumElts));
  auto V1 = II.getArgOperand(0);
  auto V2 = UndefValue::get(V1->getType());
  return Builder.CreateShuffleVector(V1, V2, ShuffleMask);
}

/// Attempt to convert vpermd/vpermps to shufflevector if the mask is constant.
static Value *simplifyX86vpermv(const IntrinsicInst &II,
                                InstCombiner::BuilderTy &Builder) {
  auto *V = dyn_cast<Constant>(II.getArgOperand(1));
  if (!V)
    return nullptr;

  auto *VecTy = cast<VectorType>(II.getType());
  auto *MaskEltTy = Type::getInt32Ty(II.getContext());
  unsigned Size = VecTy->getNumElements();
  assert((Size == 4 || Size == 8 || Size == 16 || Size == 32 || Size == 64) &&
         "Unexpected shuffle mask size");

  // Construct a shuffle mask from constant integers or UNDEFs.
  Constant *Indexes[64] = {nullptr};

  for (unsigned I = 0; I < Size; ++I) {
    Constant *COp = V->getAggregateElement(I);
    if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
      return nullptr;

    if (isa<UndefValue>(COp)) {
      Indexes[I] = UndefValue::get(MaskEltTy);
      continue;
    }

    uint32_t Index = cast<ConstantInt>(COp)->getZExtValue();
    Index &= Size - 1;
    Indexes[I] = ConstantInt::get(MaskEltTy, Index);
  }

  auto ShuffleMask = ConstantVector::get(makeArrayRef(Indexes, Size));
  auto V1 = II.getArgOperand(0);
  auto V2 = UndefValue::get(VecTy);
  return Builder.CreateShuffleVector(V1, V2, ShuffleMask);
}

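// Lower a masked.load whose mask is all ones or undef to a plain aligned
// load, or, when the pointer is known dereferenceable, to an unconditional
// load followed by a select against the passthru operand.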
// TODO, Obvious Missing Transforms:
// * Narrow width by halfs excluding zero/undef lanes
Value *InstCombiner::simplifyMaskedLoad(IntrinsicInst &II) {
  Value *LoadPtr = II.getArgOperand(0);
  unsigned Alignment = cast<ConstantInt>(II.getArgOperand(1))->getZExtValue();

  // If the mask is all ones or undefs, this is a plain vector load of the 1st
  // argument.
  if (maskIsAllOneOrUndef(II.getArgOperand(2)))
    return Builder.CreateAlignedLoad(II.getType(), LoadPtr, Alignment,
                                     "unmaskedload");

  // If we can unconditionally load from this address, replace with a
  // load/select idiom. TODO: use DT for context sensitive query
  if (isDereferenceableAndAlignedPointer(
          LoadPtr, II.getType(), MaybeAlign(Alignment),
          II.getModule()->getDataLayout(), &II, nullptr)) {
    Value *LI = Builder.CreateAlignedLoad(II.getType(), LoadPtr, Alignment,
                                          "unmaskedload");
    return Builder.CreateSelect(II.getArgOperand(2), LI, II.getArgOperand(3));
  }

  return nullptr;
}

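// A masked.store with an all-zeros mask is dead and one with an all-ones mask
// is a plain store; for other constant masks, use the inactive lanes to
// simplify the stored value via SimplifyDemandedVectorElts.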
1079353358Sdim// TODO, Obvious Missing Transforms:
1080353358Sdim// * Single constant active lane -> store
1081353358Sdim// * Narrow width by halfs excluding zero/undef lanes
1082353358SdimInstruction *InstCombiner::simplifyMaskedStore(IntrinsicInst &II) {
1083309124Sdim  auto *ConstMask = dyn_cast<Constant>(II.getArgOperand(3));
1084309124Sdim  if (!ConstMask)
1085309124Sdim    return nullptr;
1086309124Sdim
1087309124Sdim  // If the mask is all zeros, this instruction does nothing.
1088309124Sdim  if (ConstMask->isNullValue())
1089353358Sdim    return eraseInstFromFunction(II);
1090309124Sdim
1091309124Sdim  // If the mask is all ones, this is a plain vector store of the 1st argument.
1092309124Sdim  if (ConstMask->isAllOnesValue()) {
1093309124Sdim    Value *StorePtr = II.getArgOperand(1);
1094360784Sdim    MaybeAlign Alignment(
1095360784Sdim        cast<ConstantInt>(II.getArgOperand(2))->getZExtValue());
1096309124Sdim    return new StoreInst(II.getArgOperand(0), StorePtr, false, Alignment);
1097309124Sdim  }
1098309124Sdim
1099353358Sdim  // Use masked off lanes to simplify operands via SimplifyDemandedVectorElts
1100353358Sdim  APInt DemandedElts = possiblyDemandedEltsInMask(ConstMask);
1101353358Sdim  APInt UndefElts(DemandedElts.getBitWidth(), 0);
1102353358Sdim  if (Value *V = SimplifyDemandedVectorElts(II.getOperand(0),
1103353358Sdim                                            DemandedElts, UndefElts)) {
1104353358Sdim    II.setOperand(0, V);
1105353358Sdim    return &II;
1106353358Sdim  }
1107353358Sdim
1108309124Sdim  return nullptr;
1109309124Sdim}
1110309124Sdim
1111353358Sdim// TODO: Obvious Missing Transforms:
1112353358Sdim// * Single constant active lane load -> load
1113353358Sdim// * Dereferenceable address & few lanes -> scalarize speculative load/selects
1114353358Sdim// * Adjacent vector addresses -> masked.load
1115353358Sdim// * Narrow width by halves, excluding zero/undef lanes
1116353358Sdim// * Vector splat address w/known mask -> scalar load
1117353358Sdim// * Vector incrementing address -> vector masked load
1118353358SdimInstruction *InstCombiner::simplifyMaskedGather(IntrinsicInst &II) {
1119353358Sdim  return nullptr;
1120353358Sdim}
1121309124Sdim
1122353358Sdim// TODO: Obvious Missing Transforms:
1123353358Sdim// * Single constant active lane -> store
1124353358Sdim// * Adjacent vector addresses -> masked.store
1125353358Sdim// * Narrow store width by halves, excluding zero/undef lanes
1126353358Sdim// * Vector splat address w/known mask -> scalar store
1127353358Sdim// * Vector incrementing address -> vector masked store
1128353358SdimInstruction *InstCombiner::simplifyMaskedScatter(IntrinsicInst &II) {
1129353358Sdim  auto *ConstMask = dyn_cast<Constant>(II.getArgOperand(3));
1130353358Sdim  if (!ConstMask)
1131353358Sdim    return nullptr;
1132353358Sdim
1133353358Sdim  // If the mask is all zeros, a scatter does nothing.
1134353358Sdim  if (ConstMask->isNullValue())
1135353358Sdim    return eraseInstFromFunction(II);
1136353358Sdim
1137353358Sdim  // Use masked off lanes to simplify operands via SimplifyDemandedVectorElts
1138353358Sdim  APInt DemandedElts = possiblyDemandedEltsInMask(ConstMask);
1139353358Sdim  APInt UndefElts(DemandedElts.getBitWidth(), 0);
1140353358Sdim  if (Value *V = SimplifyDemandedVectorElts(II.getOperand(0),
1141353358Sdim                                            DemandedElts, UndefElts)) {
1142353358Sdim    II.setOperand(0, V);
1143353358Sdim    return &II;
1144353358Sdim  }
1145353358Sdim  if (Value *V = SimplifyDemandedVectorElts(II.getOperand(1),
1146353358Sdim                                            DemandedElts, UndefElts)) {
1147353358Sdim    II.setOperand(1, V);
1148353358Sdim    return &II;
1149353358Sdim  }
1150353358Sdim
1151309124Sdim  return nullptr;
1152309124Sdim}
1153309124Sdim
1154341825Sdim/// This function transforms launder.invariant.group and strip.invariant.group
1155341825Sdim/// like:
1156341825Sdim/// launder(launder(%x)) -> launder(%x)       (the result is not the argument)
1157341825Sdim/// launder(strip(%x)) -> launder(%x)
1158341825Sdim/// strip(strip(%x)) -> strip(%x)             (the result is not the argument)
1159341825Sdim/// strip(launder(%x)) -> strip(%x)
1160341825Sdim/// This is legal because it preserves the most recent information about
1161341825Sdim/// the presence or absence of invariant.group.
1162341825Sdimstatic Instruction *simplifyInvariantGroupIntrinsic(IntrinsicInst &II,
1163341825Sdim                                                    InstCombiner &IC) {
1164341825Sdim  auto *Arg = II.getArgOperand(0);
1165341825Sdim  auto *StrippedArg = Arg->stripPointerCasts();
1166341825Sdim  auto *StrippedInvariantGroupsArg = Arg->stripPointerCastsAndInvariantGroups();
1167341825Sdim  if (StrippedArg == StrippedInvariantGroupsArg)
1168341825Sdim    return nullptr; // No launders/strips to remove.
1169341825Sdim
1170341825Sdim  Value *Result = nullptr;
1171341825Sdim
1172341825Sdim  if (II.getIntrinsicID() == Intrinsic::launder_invariant_group)
1173341825Sdim    Result = IC.Builder.CreateLaunderInvariantGroup(StrippedInvariantGroupsArg);
1174341825Sdim  else if (II.getIntrinsicID() == Intrinsic::strip_invariant_group)
1175341825Sdim    Result = IC.Builder.CreateStripInvariantGroup(StrippedInvariantGroupsArg);
1176341825Sdim  else
1177341825Sdim    llvm_unreachable(
1178341825Sdim        "simplifyInvariantGroupIntrinsic only handles launder and strip");
1179341825Sdim  if (Result->getType()->getPointerAddressSpace() !=
1180341825Sdim      II.getType()->getPointerAddressSpace())
1181341825Sdim    Result = IC.Builder.CreateAddrSpaceCast(Result, II.getType());
1182341825Sdim  if (Result->getType() != II.getType())
1183341825Sdim    Result = IC.Builder.CreateBitCast(Result, II.getType());
1184341825Sdim
1185341825Sdim  return cast<Instruction>(Result);
1186341825Sdim}
1187341825Sdim
1188314564Sdimstatic Instruction *foldCttzCtlz(IntrinsicInst &II, InstCombiner &IC) {
1189314564Sdim  assert((II.getIntrinsicID() == Intrinsic::cttz ||
1190314564Sdim          II.getIntrinsicID() == Intrinsic::ctlz) &&
1191314564Sdim         "Expected cttz or ctlz intrinsic");
1192353358Sdim  bool IsTZ = II.getIntrinsicID() == Intrinsic::cttz;
1193314564Sdim  Value *Op0 = II.getArgOperand(0);
1194353358Sdim  Value *X;
1195353358Sdim  // ctlz(bitreverse(x)) -> cttz(x)
1196353358Sdim  // cttz(bitreverse(x)) -> ctlz(x)
1197353358Sdim  if (match(Op0, m_BitReverse(m_Value(X)))) {
1198353358Sdim    Intrinsic::ID ID = IsTZ ? Intrinsic::ctlz : Intrinsic::cttz;
1199353358Sdim    Function *F = Intrinsic::getDeclaration(II.getModule(), ID, II.getType());
1200353358Sdim    return CallInst::Create(F, {X, II.getArgOperand(1)});
1201353358Sdim  }
1202314564Sdim
1203353358Sdim  if (IsTZ) {
1204353358Sdim    // cttz(-x) -> cttz(x)
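    // (Negation is bitwise-not plus one, which leaves the lowest set bit and
    // everything below it unchanged, so the trailing zero count is preserved;
    // e.g. cttz(-4) == cttz(4) == 2.)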
1205353358Sdim    if (match(Op0, m_Neg(m_Value(X)))) {
1206353358Sdim      II.setOperand(0, X);
1207353358Sdim      return &II;
1208353358Sdim    }
1209353358Sdim
1210353358Sdim    // cttz(abs(x)) -> cttz(x)
1211353358Sdim    // cttz(nabs(x)) -> cttz(x)
1212353358Sdim    Value *Y;
1213353358Sdim    SelectPatternFlavor SPF = matchSelectPattern(Op0, X, Y).Flavor;
1214353358Sdim    if (SPF == SPF_ABS || SPF == SPF_NABS) {
1215353358Sdim      II.setOperand(0, X);
1216353358Sdim      return &II;
1217353358Sdim    }
1218353358Sdim  }
1219353358Sdim
1220321369Sdim  KnownBits Known = IC.computeKnownBits(Op0, 0, &II);
1221314564Sdim
1222314564Sdim  // Create a mask for bits above (ctlz) or below (cttz) the first known one.
1223321369Sdim  unsigned PossibleZeros = IsTZ ? Known.countMaxTrailingZeros()
1224321369Sdim                                : Known.countMaxLeadingZeros();
1225321369Sdim  unsigned DefiniteZeros = IsTZ ? Known.countMinTrailingZeros()
1226321369Sdim                                : Known.countMinLeadingZeros();
1227314564Sdim
1228314564Sdim  // If all bits above (ctlz) or below (cttz) the first known one are known
1229314564Sdim  // zero, this value is constant.
1230314564Sdim  // FIXME: This should be in InstSimplify because we're replacing an
1231314564Sdim  // instruction with a constant.
1232321369Sdim  if (PossibleZeros == DefiniteZeros) {
1233321369Sdim    auto *C = ConstantInt::get(Op0->getType(), DefiniteZeros);
1234314564Sdim    return IC.replaceInstUsesWith(II, C);
1235314564Sdim  }
1236314564Sdim
1237314564Sdim  // If the input to cttz/ctlz is known to be non-zero,
1238314564Sdim  // then change the 'ZeroIsUndef' parameter to 'true'
1239314564Sdim  // because we know the zero behavior can't affect the result.
1240321369Sdim  if (!Known.One.isNullValue() ||
1241321369Sdim      isKnownNonZero(Op0, IC.getDataLayout(), 0, &IC.getAssumptionCache(), &II,
1242321369Sdim                     &IC.getDominatorTree())) {
1243314564Sdim    if (!match(II.getArgOperand(1), m_One())) {
1244321369Sdim      II.setOperand(1, IC.Builder.getTrue());
1245314564Sdim      return &II;
1246314564Sdim    }
1247314564Sdim  }
1248314564Sdim
1249321369Sdim  // Add range metadata since known bits can't completely reflect what we know.
1250321369Sdim  // TODO: Handle splat vectors.
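  // E.g. if an i32 ctlz is known to produce between 4 (DefiniteZeros) and 20
  // (PossibleZeros) leading zeros, we attach !range metadata of [4, 21).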
1251321369Sdim  auto *IT = dyn_cast<IntegerType>(Op0->getType());
1252321369Sdim  if (IT && IT->getBitWidth() != 1 && !II.getMetadata(LLVMContext::MD_range)) {
1253321369Sdim    Metadata *LowAndHigh[] = {
1254321369Sdim        ConstantAsMetadata::get(ConstantInt::get(IT, DefiniteZeros)),
1255321369Sdim        ConstantAsMetadata::get(ConstantInt::get(IT, PossibleZeros + 1))};
1256321369Sdim    II.setMetadata(LLVMContext::MD_range,
1257321369Sdim                   MDNode::get(II.getContext(), LowAndHigh));
1258321369Sdim    return &II;
1259321369Sdim  }
1260321369Sdim
1261314564Sdim  return nullptr;
1262314564Sdim}
1263314564Sdim
1264321369Sdimstatic Instruction *foldCtpop(IntrinsicInst &II, InstCombiner &IC) {
1265321369Sdim  assert(II.getIntrinsicID() == Intrinsic::ctpop &&
1266321369Sdim         "Expected ctpop intrinsic");
1267321369Sdim  Value *Op0 = II.getArgOperand(0);
1268353358Sdim  Value *X;
1269353358Sdim  // ctpop(bitreverse(x)) -> ctpop(x)
1270353358Sdim  // ctpop(bswap(x)) -> ctpop(x)
1271353358Sdim  if (match(Op0, m_BitReverse(m_Value(X))) || match(Op0, m_BSwap(m_Value(X)))) {
1272353358Sdim    II.setOperand(0, X);
1273353358Sdim    return &II;
1274353358Sdim  }
1275353358Sdim
1276321369Sdim  // FIXME: Try to simplify vectors of integers.
1277321369Sdim  auto *IT = dyn_cast<IntegerType>(Op0->getType());
1278321369Sdim  if (!IT)
1279321369Sdim    return nullptr;
1280321369Sdim
1281321369Sdim  unsigned BitWidth = IT->getBitWidth();
1282321369Sdim  KnownBits Known(BitWidth);
1283321369Sdim  IC.computeKnownBits(Op0, Known, 0, &II);
1284321369Sdim
1285321369Sdim  unsigned MinCount = Known.countMinPopulation();
1286321369Sdim  unsigned MaxCount = Known.countMaxPopulation();
1287321369Sdim
1288321369Sdim  // Add range metadata since known bits can't completely reflect what we know.
1289321369Sdim  if (IT->getBitWidth() != 1 && !II.getMetadata(LLVMContext::MD_range)) {
1290321369Sdim    Metadata *LowAndHigh[] = {
1291321369Sdim        ConstantAsMetadata::get(ConstantInt::get(IT, MinCount)),
1292321369Sdim        ConstantAsMetadata::get(ConstantInt::get(IT, MaxCount + 1))};
1293321369Sdim    II.setMetadata(LLVMContext::MD_range,
1294321369Sdim                   MDNode::get(II.getContext(), LowAndHigh));
1295321369Sdim    return &II;
1296321369Sdim  }
1297321369Sdim
1298321369Sdim  return nullptr;
1299321369Sdim}
1300321369Sdim
1301309124Sdim// TODO: If the x86 backend knew how to convert a bool vector mask back to an
1302309124Sdim// XMM register mask efficiently, we could transform all x86 masked intrinsics
1303309124Sdim// to LLVM masked intrinsics and remove the x86 masked intrinsic defs.
1304309124Sdimstatic Instruction *simplifyX86MaskedLoad(IntrinsicInst &II, InstCombiner &IC) {
1305309124Sdim  Value *Ptr = II.getOperand(0);
1306309124Sdim  Value *Mask = II.getOperand(1);
1307309124Sdim  Constant *ZeroVec = Constant::getNullValue(II.getType());
1308309124Sdim
1309309124Sdim  // Special case a zero mask since that's not a ConstantDataVector.
1310309124Sdim  // This masked load instruction creates a zero vector.
1311309124Sdim  if (isa<ConstantAggregateZero>(Mask))
1312309124Sdim    return IC.replaceInstUsesWith(II, ZeroVec);
1313309124Sdim
1314309124Sdim  auto *ConstMask = dyn_cast<ConstantDataVector>(Mask);
1315309124Sdim  if (!ConstMask)
1316309124Sdim    return nullptr;
1317309124Sdim
1318309124Sdim  // The mask is constant. Convert this x86 intrinsic to the LLVM intrinsic
1319309124Sdim  // to allow target-independent optimizations.
1320309124Sdim
1321309124Sdim  // First, cast the x86 intrinsic scalar pointer to a vector pointer to match
1322309124Sdim  // the LLVM intrinsic definition for the pointer argument.
1323309124Sdim  unsigned AddrSpace = cast<PointerType>(Ptr->getType())->getAddressSpace();
1324309124Sdim  PointerType *VecPtrTy = PointerType::get(II.getType(), AddrSpace);
1325321369Sdim  Value *PtrCast = IC.Builder.CreateBitCast(Ptr, VecPtrTy, "castvec");
1326309124Sdim
1327309124Sdim  // Second, convert the x86 XMM integer vector mask to a vector of bools based
1328309124Sdim  // on each element's most significant bit (the sign bit).
1329309124Sdim  Constant *BoolMask = getNegativeIsTrueBoolVec(ConstMask);
1330309124Sdim
1331309124Sdim  // The pass-through vector for an x86 masked load is a zero vector.
1332309124Sdim  CallInst *NewMaskedLoad =
1333321369Sdim      IC.Builder.CreateMaskedLoad(PtrCast, 1, BoolMask, ZeroVec);
1334309124Sdim  return IC.replaceInstUsesWith(II, NewMaskedLoad);
1335309124Sdim}
1336309124Sdim
1337309124Sdim// TODO: If the x86 backend knew how to convert a bool vector mask back to an
1338309124Sdim// XMM register mask efficiently, we could transform all x86 masked intrinsics
1339309124Sdim// to LLVM masked intrinsics and remove the x86 masked intrinsic defs.
1340309124Sdimstatic bool simplifyX86MaskedStore(IntrinsicInst &II, InstCombiner &IC) {
1341309124Sdim  Value *Ptr = II.getOperand(0);
1342309124Sdim  Value *Mask = II.getOperand(1);
1343309124Sdim  Value *Vec = II.getOperand(2);
1344309124Sdim
1345309124Sdim  // Special case a zero mask since that's not a ConstantDataVector:
1346309124Sdim  // this masked store instruction does nothing.
1347309124Sdim  if (isa<ConstantAggregateZero>(Mask)) {
1348309124Sdim    IC.eraseInstFromFunction(II);
1349309124Sdim    return true;
1350309124Sdim  }
1351309124Sdim
1352309124Sdim  // The SSE2 version is too weird (e.g., unaligned but non-temporal) to do
1353309124Sdim  // anything else at this level.
1354309124Sdim  if (II.getIntrinsicID() == Intrinsic::x86_sse2_maskmov_dqu)
1355309124Sdim    return false;
1356309124Sdim
1357309124Sdim  auto *ConstMask = dyn_cast<ConstantDataVector>(Mask);
1358309124Sdim  if (!ConstMask)
1359309124Sdim    return false;
1360309124Sdim
1361309124Sdim  // The mask is constant. Convert this x86 intrinsic to the LLVM intrinsic
1362309124Sdim  // to allow target-independent optimizations.
1363309124Sdim
1364309124Sdim  // First, cast the x86 intrinsic scalar pointer to a vector pointer to match
1365309124Sdim  // the LLVM intrinsic definition for the pointer argument.
1366309124Sdim  unsigned AddrSpace = cast<PointerType>(Ptr->getType())->getAddressSpace();
1367309124Sdim  PointerType *VecPtrTy = PointerType::get(Vec->getType(), AddrSpace);
1368321369Sdim  Value *PtrCast = IC.Builder.CreateBitCast(Ptr, VecPtrTy, "castvec");
1369309124Sdim
1370309124Sdim  // Second, convert the x86 XMM integer vector mask to a vector of bools based
1371309124Sdim  // on each element's most significant bit (the sign bit).
1372309124Sdim  Constant *BoolMask = getNegativeIsTrueBoolVec(ConstMask);
1373309124Sdim
1374321369Sdim  IC.Builder.CreateMaskedStore(Vec, PtrCast, 1, BoolMask);
1375309124Sdim
1376309124Sdim  // 'Replace uses' doesn't work for stores. Erase the original masked store.
1377309124Sdim  IC.eraseInstFromFunction(II);
1378309124Sdim  return true;
1379309124Sdim}
1380309124Sdim
1381321369Sdim// Constant fold llvm.amdgcn.fmed3 intrinsics for standard inputs.
1382321369Sdim//
1383321369Sdim// A single NaN input is folded to minnum, so we rely on that folding for
1384321369Sdim// handling NaNs.
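// For example, fmed3(5.0, 1.0, 3.0): the maximum of the three is 5.0 (Src0),
// so the result is maxnum(1.0, 3.0) == 3.0, the median of the inputs.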
1385321369Sdimstatic APFloat fmed3AMDGCN(const APFloat &Src0, const APFloat &Src1,
1386321369Sdim                           const APFloat &Src2) {
1387321369Sdim  APFloat Max3 = maxnum(maxnum(Src0, Src1), Src2);
1388321369Sdim
1389321369Sdim  APFloat::cmpResult Cmp0 = Max3.compare(Src0);
1390321369Sdim  assert(Cmp0 != APFloat::cmpUnordered && "nans handled separately");
1391321369Sdim  if (Cmp0 == APFloat::cmpEqual)
1392321369Sdim    return maxnum(Src1, Src2);
1393321369Sdim
1394321369Sdim  APFloat::cmpResult Cmp1 = Max3.compare(Src1);
1395321369Sdim  assert(Cmp1 != APFloat::cmpUnordered && "nans handled separately");
1396321369Sdim  if (Cmp1 == APFloat::cmpEqual)
1397321369Sdim    return maxnum(Src0, Src2);
1398321369Sdim
1399321369Sdim  return maxnum(Src0, Src1);
1400321369Sdim}
1401321369Sdim
1402341825Sdim/// Convert a table lookup to shufflevector if the mask is constant.
1403341825Sdim/// This could benefit tbl1 if the mask is { 7,6,5,4,3,2,1,0 }, in
1404341825Sdim/// which case we could lower the shufflevector with rev64 instructions
1405341825Sdim/// as it's actually a byte reverse.
1406341825Sdimstatic Value *simplifyNeonTbl1(const IntrinsicInst &II,
1407341825Sdim                               InstCombiner::BuilderTy &Builder) {
1408341825Sdim  // Bail out if the mask is not a constant.
1409341825Sdim  auto *C = dyn_cast<Constant>(II.getArgOperand(1));
1410341825Sdim  if (!C)
1411341825Sdim    return nullptr;
1412341825Sdim
1413341825Sdim  auto *VecTy = cast<VectorType>(II.getType());
1414341825Sdim  unsigned NumElts = VecTy->getNumElements();
1415341825Sdim
1416341825Sdim  // Only perform this transformation for <8 x i8> vector types.
1417341825Sdim  if (!VecTy->getElementType()->isIntegerTy(8) || NumElts != 8)
1418341825Sdim    return nullptr;
1419341825Sdim
1420341825Sdim  uint32_t Indexes[8];
1421341825Sdim
1422341825Sdim  for (unsigned I = 0; I < NumElts; ++I) {
1423341825Sdim    Constant *COp = C->getAggregateElement(I);
1424341825Sdim
1425341825Sdim    if (!COp || !isa<ConstantInt>(COp))
1426341825Sdim      return nullptr;
1427341825Sdim
1428341825Sdim    Indexes[I] = cast<ConstantInt>(COp)->getLimitedValue();
1429341825Sdim
1430341825Sdim    // Make sure the mask indices are in range.
1431341825Sdim    if (Indexes[I] >= NumElts)
1432341825Sdim      return nullptr;
1433341825Sdim  }
1434341825Sdim
1435341825Sdim  auto *ShuffleMask = ConstantDataVector::get(II.getContext(),
1436341825Sdim                                              makeArrayRef(Indexes));
1437341825Sdim  auto *V1 = II.getArgOperand(0);
1438341825Sdim  auto *V2 = Constant::getNullValue(V1->getType());
1439341825Sdim  return Builder.CreateShuffleVector(V1, V2, ShuffleMask);
1440341825Sdim}
1441341825Sdim
1442341825Sdim/// Convert a vector load intrinsic into a simple llvm load instruction.
1443341825Sdim/// This is beneficial when the underlying object being addressed comes
1444341825Sdim/// from a constant, since we get constant-folding for free.
1445341825Sdimstatic Value *simplifyNeonVld1(const IntrinsicInst &II,
1446341825Sdim                               unsigned MemAlign,
1447341825Sdim                               InstCombiner::BuilderTy &Builder) {
1448341825Sdim  auto *IntrAlign = dyn_cast<ConstantInt>(II.getArgOperand(1));
1449341825Sdim
1450341825Sdim  if (!IntrAlign)
1451341825Sdim    return nullptr;
1452341825Sdim
1453341825Sdim  unsigned Alignment = IntrAlign->getLimitedValue() < MemAlign ?
1454341825Sdim                       MemAlign : IntrAlign->getLimitedValue();
1455341825Sdim
1456341825Sdim  if (!isPowerOf2_32(Alignment))
1457341825Sdim    return nullptr;
1458341825Sdim
1459341825Sdim  auto *BCastInst = Builder.CreateBitCast(II.getArgOperand(0),
1460341825Sdim                                          PointerType::get(II.getType(), 0));
1461353358Sdim  return Builder.CreateAlignedLoad(II.getType(), BCastInst, Alignment);
1462341825Sdim}
1463341825Sdim
1464309124Sdim// Returns true iff the 2 intrinsics have the same operands, limiting the
1465309124Sdim// comparison to the first NumOperands.
1466309124Sdimstatic bool haveSameOperands(const IntrinsicInst &I, const IntrinsicInst &E,
1467309124Sdim                             unsigned NumOperands) {
1468309124Sdim  assert(I.getNumArgOperands() >= NumOperands && "Not enough operands");
1469309124Sdim  assert(E.getNumArgOperands() >= NumOperands && "Not enough operands");
1470309124Sdim  for (unsigned i = 0; i < NumOperands; i++)
1471309124Sdim    if (I.getArgOperand(i) != E.getArgOperand(i))
1472309124Sdim      return false;
1473309124Sdim  return true;
1474309124Sdim}
1475309124Sdim
1476309124Sdim// Remove trivially empty start/end intrinsic ranges, i.e. a start
1477309124Sdim// immediately followed by an end (ignoring debuginfo or other
1478309124Sdim// start/end intrinsics in between). As this handles only the most trivial
1479309124Sdim// cases, tracking the nesting level is not needed:
1480309124Sdim//
1481309124Sdim//   call @llvm.foo.start(i1 0) ; &I
1482309124Sdim//   call @llvm.foo.start(i1 0)
1483309124Sdim//   call @llvm.foo.end(i1 0) ; This one will not be skipped: it will be removed
1484309124Sdim//   call @llvm.foo.end(i1 0)
1485309124Sdimstatic bool removeTriviallyEmptyRange(IntrinsicInst &I, unsigned StartID,
1486309124Sdim                                      unsigned EndID, InstCombiner &IC) {
1487309124Sdim  assert(I.getIntrinsicID() == StartID &&
1488309124Sdim         "Start intrinsic does not have expected ID");
1489309124Sdim  BasicBlock::iterator BI(I), BE(I.getParent()->end());
1490309124Sdim  for (++BI; BI != BE; ++BI) {
1491309124Sdim    if (auto *E = dyn_cast<IntrinsicInst>(BI)) {
1492309124Sdim      if (isa<DbgInfoIntrinsic>(E) || E->getIntrinsicID() == StartID)
1493309124Sdim        continue;
1494309124Sdim      if (E->getIntrinsicID() == EndID &&
1495309124Sdim          haveSameOperands(I, *E, E->getNumArgOperands())) {
1496309124Sdim        IC.eraseInstFromFunction(*E);
1497309124Sdim        IC.eraseInstFromFunction(I);
1498309124Sdim        return true;
1499309124Sdim      }
1500309124Sdim    }
1501309124Sdim    break;
1502309124Sdim  }
1503309124Sdim
1504309124Sdim  return false;
1505309124Sdim}
1506309124Sdim
1507321369Sdim// Convert NVVM intrinsics to target-generic LLVM code where possible.
1508321369Sdimstatic Instruction *SimplifyNVVMIntrinsic(IntrinsicInst *II, InstCombiner &IC) {
1509321369Sdim  // Each NVVM intrinsic we can simplify can be replaced with one of:
1510321369Sdim  //
1511321369Sdim  //  * an LLVM intrinsic,
1512321369Sdim  //  * an LLVM cast operation,
1513321369Sdim  //  * an LLVM binary operation, or
1514321369Sdim  //  * ad-hoc LLVM IR for the particular operation.
1515321369Sdim
1516321369Sdim  // Some transformations are only valid when the module's
1517321369Sdim  // flush-denormals-to-zero (ftz) setting is true/false, whereas other
1518321369Sdim  // transformations are valid regardless of the module's ftz setting.
1519321369Sdim  enum FtzRequirementTy {
1520321369Sdim    FTZ_Any,       // Any ftz setting is ok.
1521321369Sdim    FTZ_MustBeOn,  // Transformation is valid only if ftz is on.
1522321369Sdim    FTZ_MustBeOff, // Transformation is valid only if ftz is off.
1523321369Sdim  };
1524321369Sdim  // Classes of NVVM intrinsics that can't be replaced one-to-one with a
1525321369Sdim  // target-generic intrinsic, cast op, or binary op but that we can nonetheless
1526321369Sdim  // simplify.
1527321369Sdim  enum SpecialCase {
1528321369Sdim    SPC_Reciprocal,
1529321369Sdim  };
1530321369Sdim
1531321369Sdim  // SimplifyAction is a poor-man's variant (plus an additional flag) that
1532321369Sdim  // represents how to replace an NVVM intrinsic with target-generic LLVM IR.
1533321369Sdim  struct SimplifyAction {
1534321369Sdim    // Invariant: At most one of these Optionals has a value.
1535321369Sdim    Optional<Intrinsic::ID> IID;
1536321369Sdim    Optional<Instruction::CastOps> CastOp;
1537321369Sdim    Optional<Instruction::BinaryOps> BinaryOp;
1538321369Sdim    Optional<SpecialCase> Special;
1539321369Sdim
1540321369Sdim    FtzRequirementTy FtzRequirement = FTZ_Any;
1541321369Sdim
1542321369Sdim    SimplifyAction() = default;
1543321369Sdim
1544321369Sdim    SimplifyAction(Intrinsic::ID IID, FtzRequirementTy FtzReq)
1545321369Sdim        : IID(IID), FtzRequirement(FtzReq) {}
1546321369Sdim
1547321369Sdim    // Cast operations don't have anything to do with FTZ, so we skip that
1548321369Sdim    // argument.
1549321369Sdim    SimplifyAction(Instruction::CastOps CastOp) : CastOp(CastOp) {}
1550321369Sdim
1551321369Sdim    SimplifyAction(Instruction::BinaryOps BinaryOp, FtzRequirementTy FtzReq)
1552321369Sdim        : BinaryOp(BinaryOp), FtzRequirement(FtzReq) {}
1553321369Sdim
1554321369Sdim    SimplifyAction(SpecialCase Special, FtzRequirementTy FtzReq)
1555321369Sdim        : Special(Special), FtzRequirement(FtzReq) {}
1556321369Sdim  };
1557321369Sdim
1558321369Sdim  // Try to generate a SimplifyAction describing how to replace our
1559321369Sdim  // IntrinsicInstr with target-generic LLVM IR.
1560321369Sdim  const SimplifyAction Action = [II]() -> SimplifyAction {
1561321369Sdim    switch (II->getIntrinsicID()) {
1562321369Sdim    // NVVM intrinsics that map directly to LLVM intrinsics.
1563321369Sdim    case Intrinsic::nvvm_ceil_d:
1564321369Sdim      return {Intrinsic::ceil, FTZ_Any};
1565321369Sdim    case Intrinsic::nvvm_ceil_f:
1566321369Sdim      return {Intrinsic::ceil, FTZ_MustBeOff};
1567321369Sdim    case Intrinsic::nvvm_ceil_ftz_f:
1568321369Sdim      return {Intrinsic::ceil, FTZ_MustBeOn};
1569321369Sdim    case Intrinsic::nvvm_fabs_d:
1570321369Sdim      return {Intrinsic::fabs, FTZ_Any};
1571321369Sdim    case Intrinsic::nvvm_fabs_f:
1572321369Sdim      return {Intrinsic::fabs, FTZ_MustBeOff};
1573321369Sdim    case Intrinsic::nvvm_fabs_ftz_f:
1574321369Sdim      return {Intrinsic::fabs, FTZ_MustBeOn};
1575321369Sdim    case Intrinsic::nvvm_floor_d:
1576321369Sdim      return {Intrinsic::floor, FTZ_Any};
1577321369Sdim    case Intrinsic::nvvm_floor_f:
1578321369Sdim      return {Intrinsic::floor, FTZ_MustBeOff};
1579321369Sdim    case Intrinsic::nvvm_floor_ftz_f:
1580321369Sdim      return {Intrinsic::floor, FTZ_MustBeOn};
1581321369Sdim    case Intrinsic::nvvm_fma_rn_d:
1582321369Sdim      return {Intrinsic::fma, FTZ_Any};
1583321369Sdim    case Intrinsic::nvvm_fma_rn_f:
1584321369Sdim      return {Intrinsic::fma, FTZ_MustBeOff};
1585321369Sdim    case Intrinsic::nvvm_fma_rn_ftz_f:
1586321369Sdim      return {Intrinsic::fma, FTZ_MustBeOn};
1587321369Sdim    case Intrinsic::nvvm_fmax_d:
1588321369Sdim      return {Intrinsic::maxnum, FTZ_Any};
1589321369Sdim    case Intrinsic::nvvm_fmax_f:
1590321369Sdim      return {Intrinsic::maxnum, FTZ_MustBeOff};
1591321369Sdim    case Intrinsic::nvvm_fmax_ftz_f:
1592321369Sdim      return {Intrinsic::maxnum, FTZ_MustBeOn};
1593321369Sdim    case Intrinsic::nvvm_fmin_d:
1594321369Sdim      return {Intrinsic::minnum, FTZ_Any};
1595321369Sdim    case Intrinsic::nvvm_fmin_f:
1596321369Sdim      return {Intrinsic::minnum, FTZ_MustBeOff};
1597321369Sdim    case Intrinsic::nvvm_fmin_ftz_f:
1598321369Sdim      return {Intrinsic::minnum, FTZ_MustBeOn};
1599321369Sdim    case Intrinsic::nvvm_round_d:
1600321369Sdim      return {Intrinsic::round, FTZ_Any};
1601321369Sdim    case Intrinsic::nvvm_round_f:
1602321369Sdim      return {Intrinsic::round, FTZ_MustBeOff};
1603321369Sdim    case Intrinsic::nvvm_round_ftz_f:
1604321369Sdim      return {Intrinsic::round, FTZ_MustBeOn};
1605321369Sdim    case Intrinsic::nvvm_sqrt_rn_d:
1606321369Sdim      return {Intrinsic::sqrt, FTZ_Any};
1607321369Sdim    case Intrinsic::nvvm_sqrt_f:
1608321369Sdim      // nvvm_sqrt_f is a special case.  For most intrinsics, foo_ftz_f is the
1609321369Sdim      // ftz version, and foo_f is the non-ftz version.  But nvvm_sqrt_f adopts
1610321369Sdim      // the ftz-ness of the surrounding code.  sqrt_rn_f and sqrt_rn_ftz_f are
1611321369Sdim      // the versions with explicit ftz-ness.
1612321369Sdim      return {Intrinsic::sqrt, FTZ_Any};
1613321369Sdim    case Intrinsic::nvvm_sqrt_rn_f:
1614321369Sdim      return {Intrinsic::sqrt, FTZ_MustBeOff};
1615321369Sdim    case Intrinsic::nvvm_sqrt_rn_ftz_f:
1616321369Sdim      return {Intrinsic::sqrt, FTZ_MustBeOn};
1617321369Sdim    case Intrinsic::nvvm_trunc_d:
1618321369Sdim      return {Intrinsic::trunc, FTZ_Any};
1619321369Sdim    case Intrinsic::nvvm_trunc_f:
1620321369Sdim      return {Intrinsic::trunc, FTZ_MustBeOff};
1621321369Sdim    case Intrinsic::nvvm_trunc_ftz_f:
1622321369Sdim      return {Intrinsic::trunc, FTZ_MustBeOn};
1623321369Sdim
1624321369Sdim    // NVVM intrinsics that map to LLVM cast operations.
1625321369Sdim    //
1626321369Sdim    // Note that llvm's target-generic conversion operators correspond to the rz
1627321369Sdim    // (round to zero) versions of the nvvm conversion intrinsics, even though
1628321369Sdim    // almost everything else here uses the rn (round to nearest even) nvvm ops.
1629321369Sdim    case Intrinsic::nvvm_d2i_rz:
1630321369Sdim    case Intrinsic::nvvm_f2i_rz:
1631321369Sdim    case Intrinsic::nvvm_d2ll_rz:
1632321369Sdim    case Intrinsic::nvvm_f2ll_rz:
1633321369Sdim      return {Instruction::FPToSI};
1634321369Sdim    case Intrinsic::nvvm_d2ui_rz:
1635321369Sdim    case Intrinsic::nvvm_f2ui_rz:
1636321369Sdim    case Intrinsic::nvvm_d2ull_rz:
1637321369Sdim    case Intrinsic::nvvm_f2ull_rz:
1638321369Sdim      return {Instruction::FPToUI};
1639321369Sdim    case Intrinsic::nvvm_i2d_rz:
1640321369Sdim    case Intrinsic::nvvm_i2f_rz:
1641321369Sdim    case Intrinsic::nvvm_ll2d_rz:
1642321369Sdim    case Intrinsic::nvvm_ll2f_rz:
1643321369Sdim      return {Instruction::SIToFP};
1644321369Sdim    case Intrinsic::nvvm_ui2d_rz:
1645321369Sdim    case Intrinsic::nvvm_ui2f_rz:
1646321369Sdim    case Intrinsic::nvvm_ull2d_rz:
1647321369Sdim    case Intrinsic::nvvm_ull2f_rz:
1648321369Sdim      return {Instruction::UIToFP};
1649321369Sdim
1650321369Sdim    // NVVM intrinsics that map to LLVM binary ops.
1651321369Sdim    case Intrinsic::nvvm_add_rn_d:
1652321369Sdim      return {Instruction::FAdd, FTZ_Any};
1653321369Sdim    case Intrinsic::nvvm_add_rn_f:
1654321369Sdim      return {Instruction::FAdd, FTZ_MustBeOff};
1655321369Sdim    case Intrinsic::nvvm_add_rn_ftz_f:
1656321369Sdim      return {Instruction::FAdd, FTZ_MustBeOn};
1657321369Sdim    case Intrinsic::nvvm_mul_rn_d:
1658321369Sdim      return {Instruction::FMul, FTZ_Any};
1659321369Sdim    case Intrinsic::nvvm_mul_rn_f:
1660321369Sdim      return {Instruction::FMul, FTZ_MustBeOff};
1661321369Sdim    case Intrinsic::nvvm_mul_rn_ftz_f:
1662321369Sdim      return {Instruction::FMul, FTZ_MustBeOn};
1663321369Sdim    case Intrinsic::nvvm_div_rn_d:
1664321369Sdim      return {Instruction::FDiv, FTZ_Any};
1665321369Sdim    case Intrinsic::nvvm_div_rn_f:
1666321369Sdim      return {Instruction::FDiv, FTZ_MustBeOff};
1667321369Sdim    case Intrinsic::nvvm_div_rn_ftz_f:
1668321369Sdim      return {Instruction::FDiv, FTZ_MustBeOn};
1669321369Sdim
1670321369Sdim    // The remainder of cases are NVVM intrinsics that map to LLVM idioms, but
1671321369Sdim    // need special handling.
1672321369Sdim    //
1673321369Sdim    // We seem to be missing intrinsics for rcp.approx.{ftz.}f32, which is just
1674321369Sdim    // as well.
1675321369Sdim    case Intrinsic::nvvm_rcp_rn_d:
1676321369Sdim      return {SPC_Reciprocal, FTZ_Any};
1677321369Sdim    case Intrinsic::nvvm_rcp_rn_f:
1678321369Sdim      return {SPC_Reciprocal, FTZ_MustBeOff};
1679321369Sdim    case Intrinsic::nvvm_rcp_rn_ftz_f:
1680321369Sdim      return {SPC_Reciprocal, FTZ_MustBeOn};
1681321369Sdim
1682321369Sdim    // We do not currently simplify intrinsics that give an approximate answer.
1683321369Sdim    // These include:
1684321369Sdim    //
1685321369Sdim    //   - nvvm_cos_approx_{f,ftz_f}
1686321369Sdim    //   - nvvm_ex2_approx_{d,f,ftz_f}
1687321369Sdim    //   - nvvm_lg2_approx_{d,f,ftz_f}
1688321369Sdim    //   - nvvm_sin_approx_{f,ftz_f}
1689321369Sdim    //   - nvvm_sqrt_approx_{f,ftz_f}
1690321369Sdim    //   - nvvm_rsqrt_approx_{d,f,ftz_f}
1691321369Sdim    //   - nvvm_div_approx_{ftz_d,ftz_f,f}
1692321369Sdim    //   - nvvm_rcp_approx_ftz_d
1693321369Sdim    //
1694321369Sdim    // Ideally we'd encode them as e.g. "fast call @llvm.cos", where "fast"
1695321369Sdim    // means that fastmath is enabled in the intrinsic.  Unfortunately only
1696321369Sdim    // binary operators (currently) have a fastmath bit in SelectionDAG, so this
1697321369Sdim    // information gets lost and we can't select on it.
1698321369Sdim    //
1699321369Sdim    // TODO: div and rcp are lowered to a binary op, so these we could in theory
1700321369Sdim    // lower them to "fast fdiv".
1701321369Sdim
1702321369Sdim    default:
1703321369Sdim      return {};
1704321369Sdim    }
1705321369Sdim  }();
1706321369Sdim
1707321369Sdim  // If Action.FtzRequirementTy is not satisfied by the module's ftz state, we
1708321369Sdim  // can bail out now.  (Notice that in the case that IID is not an NVVM
1709321369Sdim  // intrinsic, we don't have to look up any module metadata, as
1710321369Sdim  // FtzRequirementTy will be FTZ_Any.)
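  // E.g. nvvm_ceil_ftz_f maps to llvm.ceil with FTZ_MustBeOn: the ftz variant
  // flushes f32 denormals, so the rewrite is only equivalent when the module
  // itself is in flush-denormals-to-zero mode.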
1711321369Sdim  if (Action.FtzRequirement != FTZ_Any) {
1712321369Sdim    bool FtzEnabled =
1713321369Sdim        II->getFunction()->getFnAttribute("nvptx-f32ftz").getValueAsString() ==
1714321369Sdim        "true";
1715321369Sdim
1716321369Sdim    if (FtzEnabled != (Action.FtzRequirement == FTZ_MustBeOn))
1717321369Sdim      return nullptr;
1718321369Sdim  }
1719321369Sdim
1720321369Sdim  // Simplify to target-generic intrinsic.
1721321369Sdim  if (Action.IID) {
1722321369Sdim    SmallVector<Value *, 4> Args(II->arg_operands());
1723321369Sdim    // All the target-generic intrinsics currently of interest to us have one
1724321369Sdim    // type argument, equal to that of the nvvm intrinsic's argument.
1725321369Sdim    Type *Tys[] = {II->getArgOperand(0)->getType()};
1726321369Sdim    return CallInst::Create(
1727321369Sdim        Intrinsic::getDeclaration(II->getModule(), *Action.IID, Tys), Args);
1728321369Sdim  }
1729321369Sdim
1730321369Sdim  // Simplify to target-generic binary op.
1731321369Sdim  if (Action.BinaryOp)
1732321369Sdim    return BinaryOperator::Create(*Action.BinaryOp, II->getArgOperand(0),
1733321369Sdim                                  II->getArgOperand(1), II->getName());
1734321369Sdim
1735321369Sdim  // Simplify to target-generic cast op.
1736321369Sdim  if (Action.CastOp)
1737321369Sdim    return CastInst::Create(*Action.CastOp, II->getArgOperand(0), II->getType(),
1738321369Sdim                            II->getName());
1739321369Sdim
1740321369Sdim  // All that's left are the special cases.
1741321369Sdim  if (!Action.Special)
1742321369Sdim    return nullptr;
1743321369Sdim
1744321369Sdim  switch (*Action.Special) {
1745321369Sdim  case SPC_Reciprocal:
1746321369Sdim    // Simplify reciprocal.
1747321369Sdim    return BinaryOperator::Create(
1748321369Sdim        Instruction::FDiv, ConstantFP::get(II->getArgOperand(0)->getType(), 1),
1749321369Sdim        II->getArgOperand(0), II->getName());
1750321369Sdim  }
1751321369Sdim  llvm_unreachable("All SpecialCase enumerators should be handled in switch.");
1752321369Sdim}
1753321369Sdim
1754309124SdimInstruction *InstCombiner::visitVAStartInst(VAStartInst &I) {
1755309124Sdim  removeTriviallyEmptyRange(I, Intrinsic::vastart, Intrinsic::vaend, *this);
1756309124Sdim  return nullptr;
1757309124Sdim}
1758309124Sdim
1759309124SdimInstruction *InstCombiner::visitVACopyInst(VACopyInst &I) {
1760309124Sdim  removeTriviallyEmptyRange(I, Intrinsic::vacopy, Intrinsic::vaend, *this);
1761309124Sdim  return nullptr;
1762309124Sdim}
1763309124Sdim
1764344779Sdimstatic Instruction *canonicalizeConstantArg0ToArg1(CallInst &Call) {
1765344779Sdim  assert(Call.getNumArgOperands() > 1 && "Need at least 2 args to swap");
1766344779Sdim  Value *Arg0 = Call.getArgOperand(0), *Arg1 = Call.getArgOperand(1);
1767344779Sdim  if (isa<Constant>(Arg0) && !isa<Constant>(Arg1)) {
1768344779Sdim    Call.setArgOperand(0, Arg1);
1769344779Sdim    Call.setArgOperand(1, Arg0);
1770344779Sdim    return &Call;
1771344779Sdim  }
1772344779Sdim  return nullptr;
1773344779Sdim}
1774344779Sdim
1775353358SdimInstruction *InstCombiner::foldIntrinsicWithOverflowCommon(IntrinsicInst *II) {
1776353358Sdim  WithOverflowInst *WO = cast<WithOverflowInst>(II);
1777353358Sdim  Value *OperationResult = nullptr;
1778353358Sdim  Constant *OverflowResult = nullptr;
1779353358Sdim  if (OptimizeOverflowCheck(WO->getBinaryOp(), WO->isSigned(), WO->getLHS(),
1780353358Sdim                            WO->getRHS(), *WO, OperationResult, OverflowResult))
1781353358Sdim    return CreateOverflowTuple(WO, OperationResult, OverflowResult);
1782353358Sdim  return nullptr;
1783353358Sdim}
1784353358Sdim
1785309124Sdim/// CallInst simplification. This mostly only handles folding of intrinsic
1786353358Sdim/// instructions. For normal calls, it allows visitCallBase to do the heavy
1787309124Sdim/// lifting.
1788202375SrdivackyInstruction *InstCombiner::visitCallInst(CallInst &CI) {
1789327952Sdim  if (Value *V = SimplifyCall(&CI, SQ.getWithInstruction(&CI)))
1790309124Sdim    return replaceInstUsesWith(CI, V);
1791288943Sdim
1792314564Sdim  if (isFreeCall(&CI, &TLI))
1793202375Srdivacky    return visitFree(CI);
1794202375Srdivacky
1795202375Srdivacky  // If the caller function is nounwind, mark the call as nounwind, even if the
1796202375Srdivacky  // callee isn't.
1797314564Sdim  if (CI.getFunction()->doesNotThrow() && !CI.doesNotThrow()) {
1798202375Srdivacky    CI.setDoesNotThrow();
1799202375Srdivacky    return &CI;
1800202375Srdivacky  }
1801234353Sdim
1802202375Srdivacky  IntrinsicInst *II = dyn_cast<IntrinsicInst>(&CI);
1803353358Sdim  if (!II) return visitCallBase(CI);
1804210299Sed
1805353358Sdim  // Intrinsics cannot occur in an invoke or a callbr, so handle them here
1806353358Sdim  // instead of in visitCallBase.
1807341825Sdim  if (auto *MI = dyn_cast<AnyMemIntrinsic>(II)) {
1808202375Srdivacky    bool Changed = false;
1809202375Srdivacky
1810202375Srdivacky    // memmove/cpy/set of zero bytes is a noop.
1811202375Srdivacky    if (Constant *NumBytes = dyn_cast<Constant>(MI->getLength())) {
1812218893Sdim      if (NumBytes->isNullValue())
1813309124Sdim        return eraseInstFromFunction(CI);
1814202375Srdivacky
1815202375Srdivacky      if (ConstantInt *CI = dyn_cast<ConstantInt>(NumBytes))
1816202375Srdivacky        if (CI->getZExtValue() == 1) {
1817202375Srdivacky          // Replace the instruction with just byte operations.  We would
1818202375Srdivacky          // transform other cases to loads/stores, but we don't know if
1819202375Srdivacky          // alignment is sufficient.
1820202375Srdivacky        }
1821202375Srdivacky    }
1822234353Sdim
1823218893Sdim    // No other transformations apply to volatile transfers.
1824341825Sdim    if (auto *M = dyn_cast<MemIntrinsic>(MI))
1825341825Sdim      if (M->isVolatile())
1826341825Sdim        return nullptr;
1827202375Srdivacky
1828202375Srdivacky    // If we have a memmove and the source operation is a constant global,
1829202375Srdivacky    // then the source and dest pointers can't alias, so we can change this
1830202375Srdivacky    // into a call to memcpy.
1831341825Sdim    if (auto *MMI = dyn_cast<AnyMemMoveInst>(MI)) {
1832202375Srdivacky      if (GlobalVariable *GVSrc = dyn_cast<GlobalVariable>(MMI->getSource()))
1833202375Srdivacky        if (GVSrc->isConstant()) {
1834296417Sdim          Module *M = CI.getModule();
1835341825Sdim          Intrinsic::ID MemCpyID =
1836341825Sdim              isa<AtomicMemMoveInst>(MMI)
1837341825Sdim                  ? Intrinsic::memcpy_element_unordered_atomic
1838341825Sdim                  : Intrinsic::memcpy;
1839224145Sdim          Type *Tys[3] = { CI.getArgOperand(0)->getType(),
1840224145Sdim                           CI.getArgOperand(1)->getType(),
1841224145Sdim                           CI.getArgOperand(2)->getType() };
1842224145Sdim          CI.setCalledFunction(Intrinsic::getDeclaration(M, MemCpyID, Tys));
1843202375Srdivacky          Changed = true;
1844202375Srdivacky        }
1845202375Srdivacky    }
1846202375Srdivacky
1847341825Sdim    if (AnyMemTransferInst *MTI = dyn_cast<AnyMemTransferInst>(MI)) {
1848202375Srdivacky      // memmove(x,x,size) -> noop.
1849202375Srdivacky      if (MTI->getSource() == MTI->getDest())
1850309124Sdim        return eraseInstFromFunction(CI);
1851202375Srdivacky    }
1852202375Srdivacky
1853202375Srdivacky    // If we can determine a pointer alignment that is bigger than currently
1854202375Srdivacky    // set, update the alignment.
1855341825Sdim    if (auto *MTI = dyn_cast<AnyMemTransferInst>(MI)) {
1856341825Sdim      if (Instruction *I = SimplifyAnyMemTransfer(MTI))
1857202375Srdivacky        return I;
1858341825Sdim    } else if (auto *MSI = dyn_cast<AnyMemSetInst>(MI)) {
1859341825Sdim      if (Instruction *I = SimplifyAnyMemSet(MSI))
1860202375Srdivacky        return I;
1861202375Srdivacky    }
1862210299Sed
1863202375Srdivacky    if (Changed) return II;
1864202375Srdivacky  }
1865234353Sdim
1866353358Sdim  // For vector result intrinsics, use the generic demanded vector support.
1867353358Sdim  if (II->getType()->isVectorTy()) {
1868353358Sdim    auto VWidth = II->getType()->getVectorNumElements();
1869353358Sdim    APInt UndefElts(VWidth, 0);
1870353358Sdim    APInt AllOnesEltMask(APInt::getAllOnesValue(VWidth));
1871353358Sdim    if (Value *V = SimplifyDemandedVectorElts(II, AllOnesEltMask, UndefElts)) {
1872353358Sdim      if (V != II)
1873353358Sdim        return replaceInstUsesWith(*II, V);
1874353358Sdim      return II;
1875353358Sdim    }
1876353358Sdim  }
1877353358Sdim
1878321369Sdim  if (Instruction *I = SimplifyNVVMIntrinsic(II, *this))
1879321369Sdim    return I;
1880321369Sdim
1881309124Sdim  auto SimplifyDemandedVectorEltsLow = [this](Value *Op, unsigned Width,
1882309124Sdim                                              unsigned DemandedWidth) {
1883296417Sdim    APInt UndefElts(Width, 0);
1884296417Sdim    APInt DemandedElts = APInt::getLowBitsSet(Width, DemandedWidth);
1885296417Sdim    return SimplifyDemandedVectorElts(Op, DemandedElts, UndefElts);
1886296417Sdim  };
1887296417Sdim
1888353358Sdim  Intrinsic::ID IID = II->getIntrinsicID();
1889353358Sdim  switch (IID) {
1890202375Srdivacky  default: break;
1891314564Sdim  case Intrinsic::objectsize:
1892353358Sdim    if (Value *V = lowerObjectSizeCall(II, DL, &TLI, /*MustSucceed=*/false))
1893353358Sdim      return replaceInstUsesWith(CI, V);
1894276479Sdim    return nullptr;
1895249423Sdim  case Intrinsic::bswap: {
1896249423Sdim    Value *IIOperand = II->getArgOperand(0);
1897276479Sdim    Value *X = nullptr;
1898249423Sdim
1899202375Srdivacky    // bswap(trunc(bswap(x))) -> trunc(lshr(x, c))
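    // E.g. for an i32 %x truncated to i16: the low 16 bits of bswap(%x) are
    // the top two bytes of %x in reversed order, so byte-swapping that
    // truncated value is the same as trunc(%x >> 16).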
1900249423Sdim    if (match(IIOperand, m_Trunc(m_BSwap(m_Value(X))))) {
1901249423Sdim      unsigned C = X->getType()->getPrimitiveSizeInBits() -
1902249423Sdim        IIOperand->getType()->getPrimitiveSizeInBits();
1903249423Sdim      Value *CV = ConstantInt::get(X->getType(), C);
1904321369Sdim      Value *V = Builder.CreateLShr(X, CV);
1905249423Sdim      return new TruncInst(V, IIOperand->getType());
1906202375Srdivacky    }
1907249423Sdim    break;
1908249423Sdim  }
1909309124Sdim  case Intrinsic::masked_load:
1910353358Sdim    if (Value *SimplifiedMaskedOp = simplifyMaskedLoad(*II))
1911309124Sdim      return replaceInstUsesWith(CI, SimplifiedMaskedOp);
1912309124Sdim    break;
1913309124Sdim  case Intrinsic::masked_store:
1914353358Sdim    return simplifyMaskedStore(*II);
1915309124Sdim  case Intrinsic::masked_gather:
1916353358Sdim    return simplifyMaskedGather(*II);
1917309124Sdim  case Intrinsic::masked_scatter:
1918353358Sdim    return simplifyMaskedScatter(*II);
1919341825Sdim  case Intrinsic::launder_invariant_group:
1920341825Sdim  case Intrinsic::strip_invariant_group:
1921341825Sdim    if (auto *SkippedBarrier = simplifyInvariantGroupIntrinsic(*II, *this))
1922341825Sdim      return replaceInstUsesWith(*II, SkippedBarrier);
1923341825Sdim    break;
1924202375Srdivacky  case Intrinsic::powi:
1925210299Sed    if (ConstantInt *Power = dyn_cast<ConstantInt>(II->getArgOperand(1))) {
1926327952Sdim      // 0 and 1 are handled in instsimplify
1927327952Sdim
1928202375Srdivacky      // powi(x, -1) -> 1/x
1929321369Sdim      if (Power->isMinusOne())
1930202375Srdivacky        return BinaryOperator::CreateFDiv(ConstantFP::get(CI.getType(), 1.0),
1931210299Sed                                          II->getArgOperand(0));
1932327952Sdim      // powi(x, 2) -> x*x
1933327952Sdim      if (Power->equalsInt(2))
1934327952Sdim        return BinaryOperator::CreateFMul(II->getArgOperand(0),
1935327952Sdim                                          II->getArgOperand(0));
1936202375Srdivacky    }
1937202375Srdivacky    break;
1938234353Sdim
1939314564Sdim  case Intrinsic::cttz:
1940314564Sdim  case Intrinsic::ctlz:
1941314564Sdim    if (auto *I = foldCttzCtlz(*II, *this))
1942314564Sdim      return I;
1943202375Srdivacky    break;
1944234353Sdim
1945321369Sdim  case Intrinsic::ctpop:
1946321369Sdim    if (auto *I = foldCtpop(*II, *this))
1947321369Sdim      return I;
1948321369Sdim    break;
1949321369Sdim
1950344779Sdim  case Intrinsic::fshl:
1951344779Sdim  case Intrinsic::fshr: {
1952353358Sdim    Value *Op0 = II->getArgOperand(0), *Op1 = II->getArgOperand(1);
1953353358Sdim    Type *Ty = II->getType();
1954353358Sdim    unsigned BitWidth = Ty->getScalarSizeInBits();
1955353358Sdim    Constant *ShAmtC;
1956353358Sdim    if (match(II->getArgOperand(2), m_Constant(ShAmtC)) &&
1957353358Sdim        !isa<ConstantExpr>(ShAmtC) && !ShAmtC->containsConstantExpression()) {
1958353358Sdim      // Canonicalize a shift amount constant operand to modulo the bit-width.
1959353358Sdim      Constant *WidthC = ConstantInt::get(Ty, BitWidth);
1960353358Sdim      Constant *ModuloC = ConstantExpr::getURem(ShAmtC, WidthC);
1961353358Sdim      if (ModuloC != ShAmtC) {
1962353358Sdim        II->setArgOperand(2, ModuloC);
1963353358Sdim        return II;
1964353358Sdim      }
1965353358Sdim      assert(ConstantExpr::getICmp(ICmpInst::ICMP_UGT, WidthC, ShAmtC) ==
1966353358Sdim                 ConstantInt::getTrue(CmpInst::makeCmpResultType(Ty)) &&
1967353358Sdim             "Shift amount expected to be modulo bitwidth");
1968344779Sdim
1969353358Sdim      // Canonicalize funnel shift right by constant to funnel shift left. This
1970353358Sdim      // is not entirely arbitrary. For historical reasons, the backend may
1971353358Sdim      // recognize rotate left patterns but miss rotate right patterns.
1972353358Sdim      if (IID == Intrinsic::fshr) {
1973353358Sdim        // fshr X, Y, C --> fshl X, Y, (BitWidth - C)
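        // E.g. for i32 operands, fshr(X, Y, 8) == fshl(X, Y, 24): both
        // compute (X << 24) | (Y >> 8).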
1974353358Sdim        Constant *LeftShiftC = ConstantExpr::getSub(WidthC, ShAmtC);
1975353358Sdim        Module *Mod = II->getModule();
1976353358Sdim        Function *Fshl = Intrinsic::getDeclaration(Mod, Intrinsic::fshl, Ty);
1977353358Sdim        return CallInst::Create(Fshl, { Op0, Op1, LeftShiftC });
1978353358Sdim      }
1979353358Sdim      assert(IID == Intrinsic::fshl &&
1980353358Sdim             "All funnel shifts by simple constants should go left");
1981344779Sdim
1982353358Sdim      // fshl(X, 0, C) --> shl X, C
1983353358Sdim      // fshl(X, undef, C) --> shl X, C
1984353358Sdim      if (match(Op1, m_ZeroInt()) || match(Op1, m_Undef()))
1985353358Sdim        return BinaryOperator::CreateShl(Op0, ShAmtC);
1986353358Sdim
1987353358Sdim      // fshl(0, X, C) --> lshr X, (BW-C)
1988353358Sdim      // fshl(undef, X, C) --> lshr X, (BW-C)
1989353358Sdim      if (match(Op0, m_ZeroInt()) || match(Op0, m_Undef()))
1990353358Sdim        return BinaryOperator::CreateLShr(Op1,
1991353358Sdim                                          ConstantExpr::getSub(WidthC, ShAmtC));
1992353358Sdim
1993353358Sdim      // fshl i16 X, X, 8 --> bswap i16 X (reduce to more-specific form)
1994353358Sdim      if (Op0 == Op1 && BitWidth == 16 && match(ShAmtC, m_SpecificInt(8))) {
1995353358Sdim        Module *Mod = II->getModule();
1996353358Sdim        Function *Bswap = Intrinsic::getDeclaration(Mod, Intrinsic::bswap, Ty);
1997353358Sdim        return CallInst::Create(Bswap, { Op0 });
1998353358Sdim      }
1999344779Sdim    }
2000344779Sdim
2001353358Sdim    // Left or right might be masked.
2002353358Sdim    if (SimplifyDemandedInstructionBits(*II))
2003353358Sdim      return &CI;
2004353358Sdim
2005344779Sdim    // The shift amount (operand 2) of a funnel shift is modulo the bitwidth,
2006344779Sdim    // so only the low bits of the shift amount are demanded if the bitwidth is
2007344779Sdim    // a power-of-2.
2008344779Sdim    if (!isPowerOf2_32(BitWidth))
2009344779Sdim      break;
2010344779Sdim    APInt Op2Demanded = APInt::getLowBitsSet(BitWidth, Log2_32_Ceil(BitWidth));
2011344779Sdim    KnownBits Op2Known(BitWidth);
2012344779Sdim    if (SimplifyDemandedBits(II, 2, Op2Demanded, Op2Known))
2013344779Sdim      return &CI;
2014344779Sdim    break;
2015344779Sdim  }
2016288943Sdim  case Intrinsic::uadd_with_overflow:
2017353358Sdim  case Intrinsic::sadd_with_overflow: {
2018353358Sdim    if (Instruction *I = canonicalizeConstantArg0ToArg1(CI))
2019353358Sdim      return I;
2020353358Sdim    if (Instruction *I = foldIntrinsicWithOverflowCommon(II))
2021353358Sdim      return I;
2022353358Sdim
2023353358Sdim    // Given 2 constant operands whose sum does not overflow:
2024353358Sdim    // uaddo (X +nuw C0), C1 -> uaddo X, C0 + C1
2025353358Sdim    // saddo (X +nsw C0), C1 -> saddo X, C0 + C1
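    // E.g. uaddo((X +nuw 3), 5) -> uaddo(X, 8), since 3 + 5 does not overflow
    // and the nuw flag guarantees the inner add did not wrap.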
2026353358Sdim    Value *X;
2027353358Sdim    const APInt *C0, *C1;
2028353358Sdim    Value *Arg0 = II->getArgOperand(0);
2029353358Sdim    Value *Arg1 = II->getArgOperand(1);
2030353358Sdim    bool IsSigned = IID == Intrinsic::sadd_with_overflow;
2031353358Sdim    bool HasNWAdd = IsSigned ? match(Arg0, m_NSWAdd(m_Value(X), m_APInt(C0)))
2032353358Sdim                             : match(Arg0, m_NUWAdd(m_Value(X), m_APInt(C0)));
2033353358Sdim    if (HasNWAdd && match(Arg1, m_APInt(C1))) {
2034353358Sdim      bool Overflow;
2035353358Sdim      APInt NewC =
2036353358Sdim          IsSigned ? C1->sadd_ov(*C0, Overflow) : C1->uadd_ov(*C0, Overflow);
2037353358Sdim      if (!Overflow)
2038353358Sdim        return replaceInstUsesWith(
2039353358Sdim            *II, Builder.CreateBinaryIntrinsic(
2040353358Sdim                     IID, X, ConstantInt::get(Arg1->getType(), NewC)));
2041353358Sdim    }
2042353358Sdim    break;
2043353358Sdim  }
2044353358Sdim
2045288943Sdim  case Intrinsic::umul_with_overflow:
2046288943Sdim  case Intrinsic::smul_with_overflow:
2047344779Sdim    if (Instruction *I = canonicalizeConstantArg0ToArg1(CI))
2048344779Sdim      return I;
2049314564Sdim    LLVM_FALLTHROUGH;
2050202375Srdivacky
2051202375Srdivacky  case Intrinsic::usub_with_overflow:
2052353358Sdim    if (Instruction *I = foldIntrinsicWithOverflowCommon(II))
2053353358Sdim      return I;
2054353358Sdim    break;
2055353358Sdim
2056280031Sdim  case Intrinsic::ssub_with_overflow: {
2057353358Sdim    if (Instruction *I = foldIntrinsicWithOverflowCommon(II))
2058353358Sdim      return I;
2059234353Sdim
2060353358Sdim    Constant *C;
2061353358Sdim    Value *Arg0 = II->getArgOperand(0);
2062353358Sdim    Value *Arg1 = II->getArgOperand(1);
2063353358Sdim    // Given a constant C that is not the minimum signed value
2064353358Sdim    // for an integer of a given bit width:
2065353358Sdim    //
2066353358Sdim    // ssubo X, C -> saddo X, -C
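    // E.g. ssubo(X, 7) -> saddo(X, -7). The minimum signed value is excluded
    // because negating it is not representable in the same bit width.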
2067353358Sdim    if (match(Arg1, m_Constant(C)) && C->isNotMinSignedValue()) {
2068353358Sdim      Value *NegVal = ConstantExpr::getNeg(C);
2069353358Sdim      // Build a saddo call that is equivalent to the discovered
2070353358Sdim      // ssubo call.
2071353358Sdim      return replaceInstUsesWith(
2072353358Sdim          *II, Builder.CreateBinaryIntrinsic(Intrinsic::sadd_with_overflow,
2073353358Sdim                                             Arg0, NegVal));
2074353358Sdim    }
2075288943Sdim
2076202375Srdivacky    break;
2077280031Sdim  }
2078202375Srdivacky
2079344779Sdim  case Intrinsic::uadd_sat:
2080344779Sdim  case Intrinsic::sadd_sat:
2081344779Sdim    if (Instruction *I = canonicalizeConstantArg0ToArg1(CI))
2082344779Sdim      return I;
2083344779Sdim    LLVM_FALLTHROUGH;
2084344779Sdim  case Intrinsic::usub_sat:
2085344779Sdim  case Intrinsic::ssub_sat: {
2086353358Sdim    SaturatingInst *SI = cast<SaturatingInst>(II);
2087353358Sdim    Type *Ty = SI->getType();
2088353358Sdim    Value *Arg0 = SI->getLHS();
2089353358Sdim    Value *Arg1 = SI->getRHS();
2090344779Sdim
2091344779Sdim    // Make use of known overflow information.
2092353358Sdim    OverflowResult OR = computeOverflow(SI->getBinaryOp(), SI->isSigned(),
2093353358Sdim                                        Arg0, Arg1, SI);
2094353358Sdim    switch (OR) {
2095353358Sdim      case OverflowResult::MayOverflow:
2096353358Sdim        break;
2097353358Sdim      case OverflowResult::NeverOverflows:
2098353358Sdim        if (SI->isSigned())
2099353358Sdim          return BinaryOperator::CreateNSW(SI->getBinaryOp(), Arg0, Arg1);
2100353358Sdim        else
2101353358Sdim          return BinaryOperator::CreateNUW(SI->getBinaryOp(), Arg0, Arg1);
2102353358Sdim      case OverflowResult::AlwaysOverflowsLow: {
2103353358Sdim        unsigned BitWidth = Ty->getScalarSizeInBits();
2104353358Sdim        APInt Min = APSInt::getMinValue(BitWidth, !SI->isSigned());
2105353358Sdim        return replaceInstUsesWith(*SI, ConstantInt::get(Ty, Min));
2106353358Sdim      }
2107353358Sdim      case OverflowResult::AlwaysOverflowsHigh: {
2108353358Sdim        unsigned BitWidth = Ty->getScalarSizeInBits();
2109353358Sdim        APInt Max = APSInt::getMaxValue(BitWidth, !SI->isSigned());
2110353358Sdim        return replaceInstUsesWith(*SI, ConstantInt::get(Ty, Max));
2111353358Sdim      }
2112280031Sdim    }
2113341825Sdim
2114344779Sdim    // ssub.sat(X, C) -> sadd.sat(X, -C) if C != MIN
2115344779Sdim    Constant *C;
2116344779Sdim    if (IID == Intrinsic::ssub_sat && match(Arg1, m_Constant(C)) &&
2117344779Sdim        C->isNotMinSignedValue()) {
2118344779Sdim      Value *NegVal = ConstantExpr::getNeg(C);
2119344779Sdim      return replaceInstUsesWith(
2120344779Sdim          *II, Builder.CreateBinaryIntrinsic(
2121344779Sdim              Intrinsic::sadd_sat, Arg0, NegVal));
2122344779Sdim    }
2123341825Sdim
2124344779Sdim    // sat(sat(X + Val2) + Val) -> sat(X + (Val+Val2))
2125344779Sdim    // sat(sat(X - Val2) - Val) -> sat(X - (Val+Val2))
2126344779Sdim    // if Val and Val2 have the same sign
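    // E.g. uadd.sat(uadd.sat(X, 10), 20) -> uadd.sat(X, 30): if the inner add
    // already saturated, adding 20 keeps the result saturated, so folding the
    // constants into one add is safe.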
2127344779Sdim    if (auto *Other = dyn_cast<IntrinsicInst>(Arg0)) {
2128344779Sdim      Value *X;
2129344779Sdim      const APInt *Val, *Val2;
2130344779Sdim      APInt NewVal;
2131344779Sdim      bool IsUnsigned =
2132344779Sdim          IID == Intrinsic::uadd_sat || IID == Intrinsic::usub_sat;
2133353358Sdim      if (Other->getIntrinsicID() == IID &&
2134344779Sdim          match(Arg1, m_APInt(Val)) &&
2135344779Sdim          match(Other->getArgOperand(0), m_Value(X)) &&
2136344779Sdim          match(Other->getArgOperand(1), m_APInt(Val2))) {
2137344779Sdim        if (IsUnsigned)
2138344779Sdim          NewVal = Val->uadd_sat(*Val2);
2139344779Sdim        else if (Val->isNonNegative() == Val2->isNonNegative()) {
2140344779Sdim          bool Overflow;
2141344779Sdim          NewVal = Val->sadd_ov(*Val2, Overflow);
2142344779Sdim          if (Overflow) {
2143344779Sdim            // Both adds together may add more than SignedMaxValue
2144344779Sdim            // without saturating the final result.
2145344779Sdim            break;
2146344779Sdim          }
2147344779Sdim        } else {
2148344779Sdim          // Cannot fold saturated addition with different signs.
2149344779Sdim          break;
2150344779Sdim        }
2151344779Sdim
2152344779Sdim        return replaceInstUsesWith(
2153344779Sdim            *II, Builder.CreateBinaryIntrinsic(
2154344779Sdim                     IID, X, ConstantInt::get(II->getType(), NewVal)));
2155344779Sdim      }
2156344779Sdim    }
2157344779Sdim    break;
2158344779Sdim  }
2159344779Sdim
2160344779Sdim  case Intrinsic::minnum:
2161344779Sdim  case Intrinsic::maxnum:
2162344779Sdim  case Intrinsic::minimum:
2163344779Sdim  case Intrinsic::maximum: {
2164344779Sdim    if (Instruction *I = canonicalizeConstantArg0ToArg1(CI))
2165344779Sdim      return I;
2166344779Sdim    Value *Arg0 = II->getArgOperand(0);
2167344779Sdim    Value *Arg1 = II->getArgOperand(1);
2168341825Sdim    Value *X, *Y;
2169341825Sdim    if (match(Arg0, m_FNeg(m_Value(X))) && match(Arg1, m_FNeg(m_Value(Y))) &&
2170341825Sdim        (Arg0->hasOneUse() || Arg1->hasOneUse())) {
2171341825Sdim      // If both operands are negated, invert the call and negate the result:
2172344779Sdim      // min(-X, -Y) --> -(max(X, Y))
2173344779Sdim      // max(-X, -Y) --> -(min(X, Y))
2174344779Sdim      Intrinsic::ID NewIID;
2175344779Sdim      switch (IID) {
2176344779Sdim      case Intrinsic::maxnum:
2177344779Sdim        NewIID = Intrinsic::minnum;
2178344779Sdim        break;
2179344779Sdim      case Intrinsic::minnum:
2180344779Sdim        NewIID = Intrinsic::maxnum;
2181344779Sdim        break;
2182344779Sdim      case Intrinsic::maximum:
2183344779Sdim        NewIID = Intrinsic::minimum;
2184344779Sdim        break;
2185344779Sdim      case Intrinsic::minimum:
2186344779Sdim        NewIID = Intrinsic::maximum;
2187344779Sdim        break;
2188344779Sdim      default:
2189344779Sdim        llvm_unreachable("unexpected intrinsic ID");
2190344779Sdim      }
2191344779Sdim      Value *NewCall = Builder.CreateBinaryIntrinsic(NewIID, X, Y, II);
2192341825Sdim      Instruction *FNeg = BinaryOperator::CreateFNeg(NewCall);
2193341825Sdim      FNeg->copyIRFlags(II);
2194341825Sdim      return FNeg;
2195341825Sdim    }
2196344779Sdim
2197344779Sdim    // m(m(X, C2), C1) -> m(X, C)
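    // E.g. maxnum(maxnum(X, 2.0), 3.0) -> maxnum(X, 3.0), where the new
    // constant is the same fold applied to the two original constants.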
2198344779Sdim    const APFloat *C1, *C2;
2199344779Sdim    if (auto *M = dyn_cast<IntrinsicInst>(Arg0)) {
2200344779Sdim      if (M->getIntrinsicID() == IID && match(Arg1, m_APFloat(C1)) &&
2201344779Sdim          ((match(M->getArgOperand(0), m_Value(X)) &&
2202344779Sdim            match(M->getArgOperand(1), m_APFloat(C2))) ||
2203344779Sdim           (match(M->getArgOperand(1), m_Value(X)) &&
2204344779Sdim            match(M->getArgOperand(0), m_APFloat(C2))))) {
2205344779Sdim        APFloat Res(0.0);
2206344779Sdim        switch (IID) {
2207344779Sdim        case Intrinsic::maxnum:
2208344779Sdim          Res = maxnum(*C1, *C2);
2209344779Sdim          break;
2210344779Sdim        case Intrinsic::minnum:
2211344779Sdim          Res = minnum(*C1, *C2);
2212344779Sdim          break;
2213344779Sdim        case Intrinsic::maximum:
2214344779Sdim          Res = maximum(*C1, *C2);
2215344779Sdim          break;
2216344779Sdim        case Intrinsic::minimum:
2217344779Sdim          Res = minimum(*C1, *C2);
2218344779Sdim          break;
2219344779Sdim        default:
2220344779Sdim          llvm_unreachable("unexpected intrinsic ID");
2221344779Sdim        }
2222344779Sdim        Instruction *NewCall = Builder.CreateBinaryIntrinsic(
2223344779Sdim            IID, X, ConstantFP::get(Arg0->getType(), Res));
2224344779Sdim        NewCall->copyIRFlags(II);
2225344779Sdim        return replaceInstUsesWith(*II, NewCall);
2226344779Sdim      }
2227344779Sdim    }
2228344779Sdim
2229280031Sdim    break;
2230280031Sdim  }
2231314564Sdim  case Intrinsic::fmuladd: {
2232321369Sdim    // Canonicalize a fast fmuladd into separate fmul + fadd instructions.
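    // In IR terms: call fast @llvm.fmuladd(%a, %b, %c)
    //   --> %m = fmul fast %a, %b ; %r = fadd fast %m, %c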
2233327952Sdim    if (II->isFast()) {
2234321369Sdim      BuilderTy::FastMathFlagGuard Guard(Builder);
2235321369Sdim      Builder.setFastMathFlags(II->getFastMathFlags());
2236321369Sdim      Value *Mul = Builder.CreateFMul(II->getArgOperand(0),
2237321369Sdim                                      II->getArgOperand(1));
2238321369Sdim      Value *Add = Builder.CreateFAdd(Mul, II->getArgOperand(2));
2239321369Sdim      Add->takeName(II);
2240321369Sdim      return replaceInstUsesWith(*II, Add);
2241321369Sdim    }
2242321369Sdim
2243360784Sdim    // Try to simplify the underlying FMul.
2244360784Sdim    if (Value *V = SimplifyFMulInst(II->getArgOperand(0), II->getArgOperand(1),
2245360784Sdim                                    II->getFastMathFlags(),
2246360784Sdim                                    SQ.getWithInstruction(II))) {
2247360784Sdim      auto *FAdd = BinaryOperator::CreateFAdd(V, II->getArgOperand(2));
2248360784Sdim      FAdd->copyFastMathFlags(II);
2249360784Sdim      return FAdd;
2250360784Sdim    }
2251360784Sdim
2252321369Sdim    LLVM_FALLTHROUGH;
2253321369Sdim  }
2254321369Sdim  case Intrinsic::fma: {
2255344779Sdim    if (Instruction *I = canonicalizeConstantArg0ToArg1(CI))
2256344779Sdim      return I;
2257344779Sdim
2258344779Sdim    // fma fneg(x), fneg(y), z -> fma x, y, z
2259314564Sdim    Value *Src0 = II->getArgOperand(0);
2260314564Sdim    Value *Src1 = II->getArgOperand(1);
2261341825Sdim    Value *X, *Y;
2262341825Sdim    if (match(Src0, m_FNeg(m_Value(X))) && match(Src1, m_FNeg(m_Value(Y)))) {
2263341825Sdim      II->setArgOperand(0, X);
2264341825Sdim      II->setArgOperand(1, Y);
2265314564Sdim      return II;
2266314564Sdim    }
2267314564Sdim
2268314564Sdim    // fma fabs(x), fabs(x), z -> fma x, x, z
2269341825Sdim    if (match(Src0, m_FAbs(m_Value(X))) &&
2270341825Sdim        match(Src1, m_FAbs(m_Specific(X)))) {
2271341825Sdim      II->setArgOperand(0, X);
2272341825Sdim      II->setArgOperand(1, X);
2273314564Sdim      return II;
2274314564Sdim    }
2275314564Sdim
2276360784Sdim    // Try to simplify the underlying FMul. We can only apply simplifications
2277360784Sdim    // that do not require rounding.
2278360784Sdim    if (Value *V = SimplifyFMAFMul(II->getArgOperand(0), II->getArgOperand(1),
2279360784Sdim                                   II->getFastMathFlags(),
2280360784Sdim                                   SQ.getWithInstruction(II))) {
2281360784Sdim      auto *FAdd = BinaryOperator::CreateFAdd(V, II->getArgOperand(2));
2282341825Sdim      FAdd->copyFastMathFlags(II);
2283341825Sdim      return FAdd;
2284314564Sdim    }
2285314564Sdim
2286314564Sdim    break;
2287314564Sdim  }
2288360784Sdim  case Intrinsic::copysign: {
2289360784Sdim    if (SignBitMustBeZero(II->getArgOperand(1), &TLI)) {
2290360784Sdim      // If we know that the sign argument is positive, reduce to FABS:
2291360784Sdim      // copysign X, Pos --> fabs X
2292360784Sdim      Value *Fabs = Builder.CreateUnaryIntrinsic(Intrinsic::fabs,
2293360784Sdim                                                 II->getArgOperand(0), II);
2294360784Sdim      return replaceInstUsesWith(*II, Fabs);
2295360784Sdim    }
2296360784Sdim    // TODO: There should be a ValueTracking sibling like SignBitMustBeOne.
2297360784Sdim    const APFloat *C;
2298360784Sdim    if (match(II->getArgOperand(1), m_APFloat(C)) && C->isNegative()) {
2299360784Sdim      // If we know that the sign argument is negative, reduce to FNABS:
2300360784Sdim      // copysign X, Neg --> fneg (fabs X)
2301360784Sdim      Value *Fabs = Builder.CreateUnaryIntrinsic(Intrinsic::fabs,
2302360784Sdim                                                 II->getArgOperand(0), II);
2303360784Sdim      return replaceInstUsesWith(*II, Builder.CreateFNegFMF(Fabs, II));
2304360784Sdim    }
2305360784Sdim
2306360784Sdim    // Propagate sign argument through nested calls:
2307360784Sdim    // copysign X, (copysign ?, SignArg) --> copysign X, SignArg
2308360784Sdim    Value *SignArg;
2309360784Sdim    if (match(II->getArgOperand(1),
2310360784Sdim              m_Intrinsic<Intrinsic::copysign>(m_Value(), m_Value(SignArg)))) {
2311360784Sdim      II->setArgOperand(1, SignArg);
2312360784Sdim      return II;
2313360784Sdim    }
2314360784Sdim
2315360784Sdim    break;
2316360784Sdim  }
2317314564Sdim  case Intrinsic::fabs: {
2318314564Sdim    Value *Cond;
2319314564Sdim    Constant *LHS, *RHS;
2320314564Sdim    if (match(II->getArgOperand(0),
2321314564Sdim              m_Select(m_Value(Cond), m_Constant(LHS), m_Constant(RHS)))) {
2322321369Sdim      CallInst *Call0 = Builder.CreateCall(II->getCalledFunction(), {LHS});
2323321369Sdim      CallInst *Call1 = Builder.CreateCall(II->getCalledFunction(), {RHS});
2324314564Sdim      return SelectInst::Create(Cond, Call0, Call1);
2325314564Sdim    }
2326314564Sdim
2327321369Sdim    LLVM_FALLTHROUGH;
2328321369Sdim  }
2329321369Sdim  case Intrinsic::ceil:
2330321369Sdim  case Intrinsic::floor:
2331321369Sdim  case Intrinsic::round:
2332321369Sdim  case Intrinsic::nearbyint:
2333321369Sdim  case Intrinsic::rint:
2334321369Sdim  case Intrinsic::trunc: {
2335321369Sdim    Value *ExtSrc;
2336341825Sdim    if (match(II->getArgOperand(0), m_OneUse(m_FPExt(m_Value(ExtSrc))))) {
2337341825Sdim      // Narrow the call: intrinsic (fpext x) -> fpext (intrinsic x)
2338353358Sdim      Value *NarrowII = Builder.CreateUnaryIntrinsic(IID, ExtSrc, II);
2339341825Sdim      return new FPExtInst(NarrowII, II->getType());
2340321369Sdim    }
2341314564Sdim    break;
2342314564Sdim  }
2343314564Sdim  case Intrinsic::cos:
2344314564Sdim  case Intrinsic::amdgcn_cos: {
2345344779Sdim    Value *X;
2346314564Sdim    Value *Src = II->getArgOperand(0);
2347344779Sdim    if (match(Src, m_FNeg(m_Value(X))) || match(Src, m_FAbs(m_Value(X)))) {
2348314564Sdim      // cos(-x) -> cos(x)
2349314564Sdim      // cos(fabs(x)) -> cos(x)
2350344779Sdim      II->setArgOperand(0, X);
2351314564Sdim      return II;
2352314564Sdim    }
2353314564Sdim    break;
2354314564Sdim  }
2355344779Sdim  case Intrinsic::sin: {
2356344779Sdim    Value *X;
2357344779Sdim    if (match(II->getArgOperand(0), m_OneUse(m_FNeg(m_Value(X))))) {
2358344779Sdim      // sin(-x) --> -sin(x)
2359344779Sdim      Value *NewSin = Builder.CreateUnaryIntrinsic(Intrinsic::sin, X, II);
2360344779Sdim      Instruction *FNeg = BinaryOperator::CreateFNeg(NewSin);
2361344779Sdim      FNeg->copyFastMathFlags(II);
2362344779Sdim      return FNeg;
2363344779Sdim    }
2364344779Sdim    break;
2365344779Sdim  }
2366202375Srdivacky  case Intrinsic::ppc_altivec_lvx:
2367202375Srdivacky  case Intrinsic::ppc_altivec_lvxl:
2368221345Sdim    // Turn PPC lvx -> load if the pointer is known aligned.
2369314564Sdim    if (getOrEnforceKnownAlignment(II->getArgOperand(0), 16, DL, II, &AC,
2370314564Sdim                                   &DT) >= 16) {
2371321369Sdim      Value *Ptr = Builder.CreateBitCast(II->getArgOperand(0),
2372202375Srdivacky                                         PointerType::getUnqual(II->getType()));
2373353358Sdim      return new LoadInst(II->getType(), Ptr);
2374202375Srdivacky    }
2375202375Srdivacky    break;
2376280031Sdim  case Intrinsic::ppc_vsx_lxvw4x:
2377280031Sdim  case Intrinsic::ppc_vsx_lxvd2x: {
2378280031Sdim    // Turn PPC VSX loads into normal loads.
2379321369Sdim    Value *Ptr = Builder.CreateBitCast(II->getArgOperand(0),
2380321369Sdim                                       PointerType::getUnqual(II->getType()));
2381360784Sdim    return new LoadInst(II->getType(), Ptr, Twine(""), false, Align::None());
2382280031Sdim  }
2383202375Srdivacky  case Intrinsic::ppc_altivec_stvx:
2384202375Srdivacky  case Intrinsic::ppc_altivec_stvxl:
2385202375Srdivacky    // Turn stvx -> store if the pointer is known aligned.
2386314564Sdim    if (getOrEnforceKnownAlignment(II->getArgOperand(1), 16, DL, II, &AC,
2387314564Sdim                                   &DT) >= 16) {
2388234353Sdim      Type *OpPtrTy =
2389210299Sed        PointerType::getUnqual(II->getArgOperand(0)->getType());
2390321369Sdim      Value *Ptr = Builder.CreateBitCast(II->getArgOperand(1), OpPtrTy);
2391210299Sed      return new StoreInst(II->getArgOperand(0), Ptr);
2392202375Srdivacky    }
2393202375Srdivacky    break;
2394280031Sdim  case Intrinsic::ppc_vsx_stxvw4x:
2395280031Sdim  case Intrinsic::ppc_vsx_stxvd2x: {
2396280031Sdim    // Turn PPC VSX stores into normal stores.
2397280031Sdim    Type *OpPtrTy = PointerType::getUnqual(II->getArgOperand(0)->getType());
2398321369Sdim    Value *Ptr = Builder.CreateBitCast(II->getArgOperand(1), OpPtrTy);
2399360784Sdim    return new StoreInst(II->getArgOperand(0), Ptr, false, Align::None());
2400280031Sdim  }
2401288943Sdim  case Intrinsic::ppc_qpx_qvlfs:
2402288943Sdim    // Turn PPC QPX qvlfs -> load if the pointer is known aligned.
2403314564Sdim    if (getOrEnforceKnownAlignment(II->getArgOperand(0), 16, DL, II, &AC,
2404314564Sdim                                   &DT) >= 16) {
2405321369Sdim      Type *VTy = VectorType::get(Builder.getFloatTy(),
2406288943Sdim                                  II->getType()->getVectorNumElements());
2407321369Sdim      Value *Ptr = Builder.CreateBitCast(II->getArgOperand(0),
2408288943Sdim                                         PointerType::getUnqual(VTy));
2409353358Sdim      Value *Load = Builder.CreateLoad(VTy, Ptr);
2410288943Sdim      return new FPExtInst(Load, II->getType());
2411288943Sdim    }
2412288943Sdim    break;
2413288943Sdim  case Intrinsic::ppc_qpx_qvlfd:
2414288943Sdim    // Turn PPC QPX qvlfd -> load if the pointer is known aligned.
2415314564Sdim    if (getOrEnforceKnownAlignment(II->getArgOperand(0), 32, DL, II, &AC,
2416314564Sdim                                   &DT) >= 32) {
2417321369Sdim      Value *Ptr = Builder.CreateBitCast(II->getArgOperand(0),
2418288943Sdim                                         PointerType::getUnqual(II->getType()));
2419353358Sdim      return new LoadInst(II->getType(), Ptr);
2420288943Sdim    }
2421288943Sdim    break;
2422288943Sdim  case Intrinsic::ppc_qpx_qvstfs:
2423288943Sdim    // Turn PPC QPX qvstfs -> store if the pointer is known aligned.
2424314564Sdim    if (getOrEnforceKnownAlignment(II->getArgOperand(1), 16, DL, II, &AC,
2425314564Sdim                                   &DT) >= 16) {
2426321369Sdim      Type *VTy = VectorType::get(Builder.getFloatTy(),
2427288943Sdim          II->getArgOperand(0)->getType()->getVectorNumElements());
2428321369Sdim      Value *TOp = Builder.CreateFPTrunc(II->getArgOperand(0), VTy);
2429288943Sdim      Type *OpPtrTy = PointerType::getUnqual(VTy);
2430321369Sdim      Value *Ptr = Builder.CreateBitCast(II->getArgOperand(1), OpPtrTy);
2431288943Sdim      return new StoreInst(TOp, Ptr);
2432288943Sdim    }
2433288943Sdim    break;
2434288943Sdim  case Intrinsic::ppc_qpx_qvstfd:
2435288943Sdim    // Turn PPC QPX qvstfd -> store if the pointer is known aligned.
2436314564Sdim    if (getOrEnforceKnownAlignment(II->getArgOperand(1), 32, DL, II, &AC,
2437314564Sdim                                   &DT) >= 32) {
2438288943Sdim      Type *OpPtrTy =
2439288943Sdim        PointerType::getUnqual(II->getArgOperand(0)->getType());
2440321369Sdim      Value *Ptr = Builder.CreateBitCast(II->getArgOperand(1), OpPtrTy);
2441288943Sdim      return new StoreInst(II->getArgOperand(0), Ptr);
2442288943Sdim    }
2443288943Sdim    break;
2444296417Sdim
2445327952Sdim  case Intrinsic::x86_bmi_bextr_32:
2446327952Sdim  case Intrinsic::x86_bmi_bextr_64:
2447327952Sdim  case Intrinsic::x86_tbm_bextri_u32:
2448327952Sdim  case Intrinsic::x86_tbm_bextri_u64:
2449327952Sdim    // If the RHS is a constant we can try some simplifications.
2450327952Sdim    if (auto *C = dyn_cast<ConstantInt>(II->getArgOperand(1))) {
2451327952Sdim      uint64_t Shift = C->getZExtValue();
2452327952Sdim      uint64_t Length = (Shift >> 8) & 0xff;
2453327952Sdim      Shift &= 0xff;
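      // For example, a control value of 0x0404 encodes Shift = 4 and
      // Length = 4, so the result is (Src >> 4) & 0xF.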
2454327952Sdim      unsigned BitWidth = II->getType()->getIntegerBitWidth();
2455327952Sdim      // If the length is 0 or the shift is out of range, replace with zero.
2456327952Sdim      if (Length == 0 || Shift >= BitWidth)
2457327952Sdim        return replaceInstUsesWith(CI, ConstantInt::get(II->getType(), 0));
2458327952Sdim      // If the LHS is also a constant, we can completely constant fold this.
2459327952Sdim      if (auto *InC = dyn_cast<ConstantInt>(II->getArgOperand(0))) {
2460327952Sdim        uint64_t Result = InC->getZExtValue() >> Shift;
2461327952Sdim        if (Length > BitWidth)
2462327952Sdim          Length = BitWidth;
2463327952Sdim        Result &= maskTrailingOnes<uint64_t>(Length);
2464327952Sdim        return replaceInstUsesWith(CI, ConstantInt::get(II->getType(), Result));
2465327952Sdim      }
2466327952Sdim      // TODO should we turn this into 'and' if shift is 0? Or 'shl' if we
2467327952Sdim      // are only masking bits that a shift already cleared?
2468327952Sdim    }
2469327952Sdim    break;
2470327952Sdim
2471327952Sdim  case Intrinsic::x86_bmi_bzhi_32:
2472327952Sdim  case Intrinsic::x86_bmi_bzhi_64:
2473327952Sdim    // If the RHS is a constant we can try some simplifications.
2474327952Sdim    if (auto *C = dyn_cast<ConstantInt>(II->getArgOperand(1))) {
2475327952Sdim      uint64_t Index = C->getZExtValue() & 0xff;
2476327952Sdim      unsigned BitWidth = II->getType()->getIntegerBitWidth();
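      // For example, bzhi(Src, 8) keeps the low 8 bits (Src & 0xFF); an index
      // of 0 yields 0 and an index >= the bit width returns Src unchanged.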
2477327952Sdim      if (Index >= BitWidth)
2478327952Sdim        return replaceInstUsesWith(CI, II->getArgOperand(0));
2479327952Sdim      if (Index == 0)
2480327952Sdim        return replaceInstUsesWith(CI, ConstantInt::get(II->getType(), 0));
2481327952Sdim      // If the LHS is also a constant, we can completely constant fold this.
2482327952Sdim      if (auto *InC = dyn_cast<ConstantInt>(II->getArgOperand(0))) {
2483327952Sdim        uint64_t Result = InC->getZExtValue();
2484327952Sdim        Result &= maskTrailingOnes<uint64_t>(Index);
2485327952Sdim        return replaceInstUsesWith(CI, ConstantInt::get(II->getType(), Result));
2486327952Sdim      }
2487327952Sdim      // TODO should we convert this to an AND if the RHS is constant?
2488327952Sdim    }
2489327952Sdim    break;
2490360784Sdim  case Intrinsic::x86_bmi_pext_32:
2491360784Sdim  case Intrinsic::x86_bmi_pext_64:
2492360784Sdim    if (auto *MaskC = dyn_cast<ConstantInt>(II->getArgOperand(1))) {
2493360784Sdim      if (MaskC->isNullValue())
2494360784Sdim        return replaceInstUsesWith(CI, ConstantInt::get(II->getType(), 0));
2495360784Sdim      if (MaskC->isAllOnesValue())
2496360784Sdim        return replaceInstUsesWith(CI, II->getArgOperand(0));
2497327952Sdim
2498360784Sdim      if (auto *SrcC = dyn_cast<ConstantInt>(II->getArgOperand(0))) {
2499360784Sdim        uint64_t Src = SrcC->getZExtValue();
2500360784Sdim        uint64_t Mask = MaskC->getZExtValue();
2501360784Sdim        uint64_t Result = 0;
2502360784Sdim        uint64_t BitToSet = 1;
2503360784Sdim
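        // For example, pext(Src = 0b10110010, Mask = 0b11110000) gathers the
        // four bits selected by the mask into the low bits: Result = 0b1011.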
2504360784Sdim        while (Mask) {
2505360784Sdim          // Isolate lowest set bit.
2506360784Sdim          uint64_t BitToTest = Mask & -Mask;
2507360784Sdim          if (BitToTest & Src)
2508360784Sdim            Result |= BitToSet;
2509360784Sdim
2510360784Sdim          BitToSet <<= 1;
2511360784Sdim          // Clear lowest set bit.
2512360784Sdim          Mask &= Mask - 1;
2513360784Sdim        }
2514360784Sdim
2515360784Sdim        return replaceInstUsesWith(CI, ConstantInt::get(II->getType(), Result));
2516360784Sdim      }
2517360784Sdim    }
2518360784Sdim    break;
2519360784Sdim  case Intrinsic::x86_bmi_pdep_32:
2520360784Sdim  case Intrinsic::x86_bmi_pdep_64:
2521360784Sdim    if (auto *MaskC = dyn_cast<ConstantInt>(II->getArgOperand(1))) {
2522360784Sdim      if (MaskC->isNullValue())
2523360784Sdim        return replaceInstUsesWith(CI, ConstantInt::get(II->getType(), 0));
2524360784Sdim      if (MaskC->isAllOnesValue())
2525360784Sdim        return replaceInstUsesWith(CI, II->getArgOperand(0));
2526360784Sdim
2527360784Sdim      if (auto *SrcC = dyn_cast<ConstantInt>(II->getArgOperand(0))) {
2528360784Sdim        uint64_t Src = SrcC->getZExtValue();
2529360784Sdim        uint64_t Mask = MaskC->getZExtValue();
2530360784Sdim        uint64_t Result = 0;
2531360784Sdim        uint64_t BitToTest = 1;
2532360784Sdim
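        // For example, pdep(Src = 0b1011, Mask = 0b11110000) scatters the low
        // four bits of Src into the mask positions: Result = 0b10110000.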
2533360784Sdim        while (Mask) {
2534360784Sdim          // Isolate lowest set bit.
2535360784Sdim          uint64_t BitToSet = Mask & -Mask;
2536360784Sdim          if (BitToTest & Src)
2537360784Sdim            Result |= BitToSet;
2538360784Sdim
2539360784Sdim          BitToTest <<= 1;
2540360784Sdim          // Clear lowest set bit.
2541360784Sdim          Mask &= Mask - 1;
2542360784Sdim        }
2543360784Sdim
2544360784Sdim        return replaceInstUsesWith(CI, ConstantInt::get(II->getType(), Result));
2545360784Sdim      }
2546360784Sdim    }
2547360784Sdim    break;
2548360784Sdim
2549296417Sdim  case Intrinsic::x86_vcvtph2ps_128:
2550296417Sdim  case Intrinsic::x86_vcvtph2ps_256: {
2551296417Sdim    auto Arg = II->getArgOperand(0);
2552296417Sdim    auto ArgType = cast<VectorType>(Arg->getType());
2553296417Sdim    auto RetType = cast<VectorType>(II->getType());
2554296417Sdim    unsigned ArgWidth = ArgType->getNumElements();
2555296417Sdim    unsigned RetWidth = RetType->getNumElements();
2556296417Sdim    assert(RetWidth <= ArgWidth && "Unexpected input/return vector widths");
2557296417Sdim    assert(ArgType->isIntOrIntVectorTy() &&
2558296417Sdim           ArgType->getScalarSizeInBits() == 16 &&
2559296417Sdim           "CVTPH2PS input type should be 16-bit integer vector");
2560296417Sdim    assert(RetType->getScalarType()->isFloatTy() &&
2561296417Sdim           "CVTPH2PS output type should be 32-bit float vector");
2562296417Sdim
2563296417Sdim    // Constant folding: convert to a generic half-to-single (fpext) conversion.
2564296417Sdim    if (isa<ConstantAggregateZero>(Arg))
2565309124Sdim      return replaceInstUsesWith(*II, ConstantAggregateZero::get(RetType));
2566296417Sdim
2567296417Sdim    if (isa<ConstantDataVector>(Arg)) {
2568296417Sdim      auto VectorHalfAsShorts = Arg;
2569296417Sdim      if (RetWidth < ArgWidth) {
2570309124Sdim        SmallVector<uint32_t, 8> SubVecMask;
2571296417Sdim        for (unsigned i = 0; i != RetWidth; ++i)
2572296417Sdim          SubVecMask.push_back((int)i);
2573321369Sdim        VectorHalfAsShorts = Builder.CreateShuffleVector(
2574296417Sdim            Arg, UndefValue::get(ArgType), SubVecMask);
2575296417Sdim      }
2576296417Sdim
2577296417Sdim      auto VectorHalfType =
2578296417Sdim          VectorType::get(Type::getHalfTy(II->getContext()), RetWidth);
2579296417Sdim      auto VectorHalfs =
2580321369Sdim          Builder.CreateBitCast(VectorHalfAsShorts, VectorHalfType);
2581321369Sdim      auto VectorFloats = Builder.CreateFPExt(VectorHalfs, RetType);
2582309124Sdim      return replaceInstUsesWith(*II, VectorFloats);
2583296417Sdim    }
2584296417Sdim
2585296417Sdim    // We only use the lowest lanes of the argument.
2586296417Sdim    if (Value *V = SimplifyDemandedVectorEltsLow(Arg, ArgWidth, RetWidth)) {
2587296417Sdim      II->setArgOperand(0, V);
2588296417Sdim      return II;
2589296417Sdim    }
2590296417Sdim    break;
2591296417Sdim  }
2592296417Sdim
2593218893Sdim  case Intrinsic::x86_sse_cvtss2si:
2594218893Sdim  case Intrinsic::x86_sse_cvtss2si64:
2595218893Sdim  case Intrinsic::x86_sse_cvttss2si:
2596218893Sdim  case Intrinsic::x86_sse_cvttss2si64:
2597218893Sdim  case Intrinsic::x86_sse2_cvtsd2si:
2598218893Sdim  case Intrinsic::x86_sse2_cvtsd2si64:
2599218893Sdim  case Intrinsic::x86_sse2_cvttsd2si:
2600314564Sdim  case Intrinsic::x86_sse2_cvttsd2si64:
2601314564Sdim  case Intrinsic::x86_avx512_vcvtss2si32:
2602314564Sdim  case Intrinsic::x86_avx512_vcvtss2si64:
2603314564Sdim  case Intrinsic::x86_avx512_vcvtss2usi32:
2604314564Sdim  case Intrinsic::x86_avx512_vcvtss2usi64:
2605314564Sdim  case Intrinsic::x86_avx512_vcvtsd2si32:
2606314564Sdim  case Intrinsic::x86_avx512_vcvtsd2si64:
2607314564Sdim  case Intrinsic::x86_avx512_vcvtsd2usi32:
2608314564Sdim  case Intrinsic::x86_avx512_vcvtsd2usi64:
2609314564Sdim  case Intrinsic::x86_avx512_cvttss2si:
2610314564Sdim  case Intrinsic::x86_avx512_cvttss2si64:
2611314564Sdim  case Intrinsic::x86_avx512_cvttss2usi:
2612314564Sdim  case Intrinsic::x86_avx512_cvttss2usi64:
2613314564Sdim  case Intrinsic::x86_avx512_cvttsd2si:
2614314564Sdim  case Intrinsic::x86_avx512_cvttsd2si64:
2615314564Sdim  case Intrinsic::x86_avx512_cvttsd2usi:
2616314564Sdim  case Intrinsic::x86_avx512_cvttsd2usi64: {
2617218893Sdim    // These intrinsics only demand the 0th element of their input vectors. If
2618202375Srdivacky    // we can simplify the input based on that, do so now.
2619296417Sdim    Value *Arg = II->getArgOperand(0);
2620296417Sdim    unsigned VWidth = Arg->getType()->getVectorNumElements();
2621296417Sdim    if (Value *V = SimplifyDemandedVectorEltsLow(Arg, VWidth, 1)) {
2622210299Sed      II->setArgOperand(0, V);
2623202375Srdivacky      return II;
2624202375Srdivacky    }
2625202375Srdivacky    break;
2626202375Srdivacky  }
2627218893Sdim
2628309124Sdim  case Intrinsic::x86_mmx_pmovmskb:
2629309124Sdim  case Intrinsic::x86_sse_movmsk_ps:
2630309124Sdim  case Intrinsic::x86_sse2_movmsk_pd:
2631309124Sdim  case Intrinsic::x86_sse2_pmovmskb_128:
2632309124Sdim  case Intrinsic::x86_avx_movmsk_pd_256:
2633309124Sdim  case Intrinsic::x86_avx_movmsk_ps_256:
2634327952Sdim  case Intrinsic::x86_avx2_pmovmskb:
2635344779Sdim    if (Value *V = simplifyX86movmsk(*II, Builder))
2636309124Sdim      return replaceInstUsesWith(*II, V);
2637309124Sdim    break;
2638309124Sdim
2639309124Sdim  case Intrinsic::x86_sse_comieq_ss:
2640309124Sdim  case Intrinsic::x86_sse_comige_ss:
2641309124Sdim  case Intrinsic::x86_sse_comigt_ss:
2642309124Sdim  case Intrinsic::x86_sse_comile_ss:
2643309124Sdim  case Intrinsic::x86_sse_comilt_ss:
2644309124Sdim  case Intrinsic::x86_sse_comineq_ss:
2645309124Sdim  case Intrinsic::x86_sse_ucomieq_ss:
2646309124Sdim  case Intrinsic::x86_sse_ucomige_ss:
2647309124Sdim  case Intrinsic::x86_sse_ucomigt_ss:
2648309124Sdim  case Intrinsic::x86_sse_ucomile_ss:
2649309124Sdim  case Intrinsic::x86_sse_ucomilt_ss:
2650309124Sdim  case Intrinsic::x86_sse_ucomineq_ss:
2651309124Sdim  case Intrinsic::x86_sse2_comieq_sd:
2652309124Sdim  case Intrinsic::x86_sse2_comige_sd:
2653309124Sdim  case Intrinsic::x86_sse2_comigt_sd:
2654309124Sdim  case Intrinsic::x86_sse2_comile_sd:
2655309124Sdim  case Intrinsic::x86_sse2_comilt_sd:
2656309124Sdim  case Intrinsic::x86_sse2_comineq_sd:
2657309124Sdim  case Intrinsic::x86_sse2_ucomieq_sd:
2658309124Sdim  case Intrinsic::x86_sse2_ucomige_sd:
2659309124Sdim  case Intrinsic::x86_sse2_ucomigt_sd:
2660309124Sdim  case Intrinsic::x86_sse2_ucomile_sd:
2661309124Sdim  case Intrinsic::x86_sse2_ucomilt_sd:
2662314564Sdim  case Intrinsic::x86_sse2_ucomineq_sd:
2663314564Sdim  case Intrinsic::x86_avx512_vcomi_ss:
2664314564Sdim  case Intrinsic::x86_avx512_vcomi_sd:
2665314564Sdim  case Intrinsic::x86_avx512_mask_cmp_ss:
2666314564Sdim  case Intrinsic::x86_avx512_mask_cmp_sd: {
2667309124Sdim    // These intrinsics only demand the 0th element of their input vectors. If
2668309124Sdim    // we can simplify the input based on that, do so now.
2669309124Sdim    bool MadeChange = false;
2670309124Sdim    Value *Arg0 = II->getArgOperand(0);
2671309124Sdim    Value *Arg1 = II->getArgOperand(1);
2672309124Sdim    unsigned VWidth = Arg0->getType()->getVectorNumElements();
2673309124Sdim    if (Value *V = SimplifyDemandedVectorEltsLow(Arg0, VWidth, 1)) {
2674309124Sdim      II->setArgOperand(0, V);
2675309124Sdim      MadeChange = true;
2676309124Sdim    }
2677309124Sdim    if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, 1)) {
2678309124Sdim      II->setArgOperand(1, V);
2679309124Sdim      MadeChange = true;
2680309124Sdim    }
2681309124Sdim    if (MadeChange)
2682309124Sdim      return II;
2683309124Sdim    break;
2684309124Sdim  }
2685341825Sdim  case Intrinsic::x86_avx512_cmp_pd_128:
2686341825Sdim  case Intrinsic::x86_avx512_cmp_pd_256:
2687341825Sdim  case Intrinsic::x86_avx512_cmp_pd_512:
2688341825Sdim  case Intrinsic::x86_avx512_cmp_ps_128:
2689341825Sdim  case Intrinsic::x86_avx512_cmp_ps_256:
2690341825Sdim  case Intrinsic::x86_avx512_cmp_ps_512: {
2691321369Sdim    // Folding cmp(sub(a,b),0) -> cmp(a,b) and cmp(0,sub(a,b)) -> cmp(b,a)
2692321369Sdim    Value *Arg0 = II->getArgOperand(0);
2693321369Sdim    Value *Arg1 = II->getArgOperand(1);
2694341825Sdim    bool Arg0IsZero = match(Arg0, m_PosZeroFP());
2695321369Sdim    if (Arg0IsZero)
2696321369Sdim      std::swap(Arg0, Arg1);
2697321369Sdim    Value *A, *B;
2698321369Sdim    // This fold requires only NINF (no infinities), since inf minus
2699321369Sdim    // inf is NaN.
2700321369Sdim    // NSZ (no signed zeros) is not needed because zeros of either sign
2701321369Sdim    // compare equal for both compares.
2702321369Sdim    // NNAN is not needed because NaNs compare the same for both compares.
2703321369Sdim    // The compare intrinsic relies on the above assumptions and therefore
2704321369Sdim    // doesn't require additional flags.
2705321369Sdim    if ((match(Arg0, m_OneUse(m_FSub(m_Value(A), m_Value(B)))) &&
2706341825Sdim         match(Arg1, m_PosZeroFP()) && isa<Instruction>(Arg0) &&
2707321369Sdim         cast<Instruction>(Arg0)->getFastMathFlags().noInfs())) {
2708321369Sdim      if (Arg0IsZero)
2709321369Sdim        std::swap(A, B);
2710321369Sdim      II->setArgOperand(0, A);
2711321369Sdim      II->setArgOperand(1, B);
2712321369Sdim      return II;
2713321369Sdim    }
2714321369Sdim    break;
2715321369Sdim  }
2716309124Sdim
2717341825Sdim  case Intrinsic::x86_avx512_add_ps_512:
2718341825Sdim  case Intrinsic::x86_avx512_div_ps_512:
2719341825Sdim  case Intrinsic::x86_avx512_mul_ps_512:
2720341825Sdim  case Intrinsic::x86_avx512_sub_ps_512:
2721341825Sdim  case Intrinsic::x86_avx512_add_pd_512:
2722341825Sdim  case Intrinsic::x86_avx512_div_pd_512:
2723341825Sdim  case Intrinsic::x86_avx512_mul_pd_512:
2724341825Sdim  case Intrinsic::x86_avx512_sub_pd_512:
2725314564Sdim    // If the rounding mode is CUR_DIRECTION(4) we can turn these into regular
2726314564Sdim    // IR operations.
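    // For example, x86.avx512.add.ps.512(%a, %b, i32 4) becomes a plain
    // 'fadd %a, %b'.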
2727341825Sdim    if (auto *R = dyn_cast<ConstantInt>(II->getArgOperand(2))) {
2728314564Sdim      if (R->getValue() == 4) {
2729314564Sdim        Value *Arg0 = II->getArgOperand(0);
2730314564Sdim        Value *Arg1 = II->getArgOperand(1);
2731314564Sdim
2732314564Sdim        Value *V;
2733353358Sdim        switch (IID) {
2734314564Sdim        default: llvm_unreachable("Case stmts out of sync!");
2735341825Sdim        case Intrinsic::x86_avx512_add_ps_512:
2736341825Sdim        case Intrinsic::x86_avx512_add_pd_512:
2737321369Sdim          V = Builder.CreateFAdd(Arg0, Arg1);
2738314564Sdim          break;
2739341825Sdim        case Intrinsic::x86_avx512_sub_ps_512:
2740341825Sdim        case Intrinsic::x86_avx512_sub_pd_512:
2741321369Sdim          V = Builder.CreateFSub(Arg0, Arg1);
2742314564Sdim          break;
2743341825Sdim        case Intrinsic::x86_avx512_mul_ps_512:
2744341825Sdim        case Intrinsic::x86_avx512_mul_pd_512:
2745321369Sdim          V = Builder.CreateFMul(Arg0, Arg1);
2746314564Sdim          break;
2747341825Sdim        case Intrinsic::x86_avx512_div_ps_512:
2748341825Sdim        case Intrinsic::x86_avx512_div_pd_512:
2749321369Sdim          V = Builder.CreateFDiv(Arg0, Arg1);
2750314564Sdim          break;
2751314564Sdim        }
2752314564Sdim
2753314564Sdim        return replaceInstUsesWith(*II, V);
2754314564Sdim      }
2755314564Sdim    }
2756314564Sdim    break;
2757314564Sdim
2758314564Sdim  case Intrinsic::x86_avx512_mask_add_ss_round:
2759314564Sdim  case Intrinsic::x86_avx512_mask_div_ss_round:
2760314564Sdim  case Intrinsic::x86_avx512_mask_mul_ss_round:
2761314564Sdim  case Intrinsic::x86_avx512_mask_sub_ss_round:
2762314564Sdim  case Intrinsic::x86_avx512_mask_add_sd_round:
2763314564Sdim  case Intrinsic::x86_avx512_mask_div_sd_round:
2764314564Sdim  case Intrinsic::x86_avx512_mask_mul_sd_round:
2765314564Sdim  case Intrinsic::x86_avx512_mask_sub_sd_round:
2766314564Sdim    // If the rounding mode is CUR_DIRECTION(4) we can turn these into regular
2767314564Sdim    // IR operations.
2768314564Sdim    if (auto *R = dyn_cast<ConstantInt>(II->getArgOperand(4))) {
2769314564Sdim      if (R->getValue() == 4) {
2770314564Sdim        // Extract the element as scalars.
2771314564Sdim        Value *Arg0 = II->getArgOperand(0);
2772314564Sdim        Value *Arg1 = II->getArgOperand(1);
2773321369Sdim        Value *LHS = Builder.CreateExtractElement(Arg0, (uint64_t)0);
2774321369Sdim        Value *RHS = Builder.CreateExtractElement(Arg1, (uint64_t)0);
2775314564Sdim
2776314564Sdim        Value *V;
2777353358Sdim        switch (IID) {
2778314564Sdim        default: llvm_unreachable("Case stmts out of sync!");
2779314564Sdim        case Intrinsic::x86_avx512_mask_add_ss_round:
2780314564Sdim        case Intrinsic::x86_avx512_mask_add_sd_round:
2781321369Sdim          V = Builder.CreateFAdd(LHS, RHS);
2782314564Sdim          break;
2783314564Sdim        case Intrinsic::x86_avx512_mask_sub_ss_round:
2784314564Sdim        case Intrinsic::x86_avx512_mask_sub_sd_round:
2785321369Sdim          V = Builder.CreateFSub(LHS, RHS);
2786314564Sdim          break;
2787314564Sdim        case Intrinsic::x86_avx512_mask_mul_ss_round:
2788314564Sdim        case Intrinsic::x86_avx512_mask_mul_sd_round:
2789321369Sdim          V = Builder.CreateFMul(LHS, RHS);
2790314564Sdim          break;
2791314564Sdim        case Intrinsic::x86_avx512_mask_div_ss_round:
2792314564Sdim        case Intrinsic::x86_avx512_mask_div_sd_round:
2793321369Sdim          V = Builder.CreateFDiv(LHS, RHS);
2794314564Sdim          break;
2795314564Sdim        }
2796314564Sdim
2797314564Sdim        // Handle the masking aspect of the intrinsic.
2798314564Sdim        Value *Mask = II->getArgOperand(3);
2799314564Sdim        auto *C = dyn_cast<ConstantInt>(Mask);
2800314564Sdim        // We don't need a select if we know the mask bit is a 1.
2801314564Sdim        if (!C || !C->getValue()[0]) {
2802314564Sdim          // Cast the mask to an i1 vector and then extract the lowest element.
2803321369Sdim          auto *MaskTy = VectorType::get(Builder.getInt1Ty(),
2804314564Sdim                             cast<IntegerType>(Mask->getType())->getBitWidth());
2805321369Sdim          Mask = Builder.CreateBitCast(Mask, MaskTy);
2806321369Sdim          Mask = Builder.CreateExtractElement(Mask, (uint64_t)0);
2807314564Sdim          // Extract the lowest element from the passthru operand.
2808321369Sdim          Value *Passthru = Builder.CreateExtractElement(II->getArgOperand(2),
2809314564Sdim                                                          (uint64_t)0);
2810321369Sdim          V = Builder.CreateSelect(Mask, V, Passthru);
2811314564Sdim        }
2812314564Sdim
2813314564Sdim        // Insert the result back into the original argument 0.
2814321369Sdim        V = Builder.CreateInsertElement(Arg0, V, (uint64_t)0);
2815314564Sdim
2816314564Sdim        return replaceInstUsesWith(*II, V);
2817314564Sdim      }
2818314564Sdim    }
2819341825Sdim    break;
2820309124Sdim
2821296417Sdim  // Constant fold ashr( <A x Bi>, Ci ).
2822296417Sdim  // Constant fold lshr( <A x Bi>, Ci ).
2823296417Sdim  // Constant fold shl( <A x Bi>, Ci ).
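  // Roughly, a constant in-range count lets psrli.d(V, 3) become a plain
  // 'lshr <4 x i32> V, <3, 3, 3, 3>'; see simplifyX86immShift for the exact
  // handling.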
2824296417Sdim  case Intrinsic::x86_sse2_psrai_d:
2825296417Sdim  case Intrinsic::x86_sse2_psrai_w:
2826296417Sdim  case Intrinsic::x86_avx2_psrai_d:
2827296417Sdim  case Intrinsic::x86_avx2_psrai_w:
2828314564Sdim  case Intrinsic::x86_avx512_psrai_q_128:
2829314564Sdim  case Intrinsic::x86_avx512_psrai_q_256:
2830314564Sdim  case Intrinsic::x86_avx512_psrai_d_512:
2831314564Sdim  case Intrinsic::x86_avx512_psrai_q_512:
2832314564Sdim  case Intrinsic::x86_avx512_psrai_w_512:
2833296417Sdim  case Intrinsic::x86_sse2_psrli_d:
2834296417Sdim  case Intrinsic::x86_sse2_psrli_q:
2835296417Sdim  case Intrinsic::x86_sse2_psrli_w:
2836296417Sdim  case Intrinsic::x86_avx2_psrli_d:
2837296417Sdim  case Intrinsic::x86_avx2_psrli_q:
2838296417Sdim  case Intrinsic::x86_avx2_psrli_w:
2839314564Sdim  case Intrinsic::x86_avx512_psrli_d_512:
2840314564Sdim  case Intrinsic::x86_avx512_psrli_q_512:
2841314564Sdim  case Intrinsic::x86_avx512_psrli_w_512:
2842276479Sdim  case Intrinsic::x86_sse2_pslli_d:
2843276479Sdim  case Intrinsic::x86_sse2_pslli_q:
2844276479Sdim  case Intrinsic::x86_sse2_pslli_w:
2845276479Sdim  case Intrinsic::x86_avx2_pslli_d:
2846276479Sdim  case Intrinsic::x86_avx2_pslli_q:
2847276479Sdim  case Intrinsic::x86_avx2_pslli_w:
2848314564Sdim  case Intrinsic::x86_avx512_pslli_d_512:
2849314564Sdim  case Intrinsic::x86_avx512_pslli_q_512:
2850314564Sdim  case Intrinsic::x86_avx512_pslli_w_512:
2851321369Sdim    if (Value *V = simplifyX86immShift(*II, Builder))
2852309124Sdim      return replaceInstUsesWith(*II, V);
2853296417Sdim    break;
2854296417Sdim
2855296417Sdim  case Intrinsic::x86_sse2_psra_d:
2856296417Sdim  case Intrinsic::x86_sse2_psra_w:
2857296417Sdim  case Intrinsic::x86_avx2_psra_d:
2858296417Sdim  case Intrinsic::x86_avx2_psra_w:
2859314564Sdim  case Intrinsic::x86_avx512_psra_q_128:
2860314564Sdim  case Intrinsic::x86_avx512_psra_q_256:
2861314564Sdim  case Intrinsic::x86_avx512_psra_d_512:
2862314564Sdim  case Intrinsic::x86_avx512_psra_q_512:
2863314564Sdim  case Intrinsic::x86_avx512_psra_w_512:
2864276479Sdim  case Intrinsic::x86_sse2_psrl_d:
2865276479Sdim  case Intrinsic::x86_sse2_psrl_q:
2866276479Sdim  case Intrinsic::x86_sse2_psrl_w:
2867276479Sdim  case Intrinsic::x86_avx2_psrl_d:
2868276479Sdim  case Intrinsic::x86_avx2_psrl_q:
2869276479Sdim  case Intrinsic::x86_avx2_psrl_w:
2870314564Sdim  case Intrinsic::x86_avx512_psrl_d_512:
2871314564Sdim  case Intrinsic::x86_avx512_psrl_q_512:
2872314564Sdim  case Intrinsic::x86_avx512_psrl_w_512:
2873296417Sdim  case Intrinsic::x86_sse2_psll_d:
2874296417Sdim  case Intrinsic::x86_sse2_psll_q:
2875296417Sdim  case Intrinsic::x86_sse2_psll_w:
2876296417Sdim  case Intrinsic::x86_avx2_psll_d:
2877296417Sdim  case Intrinsic::x86_avx2_psll_q:
2878314564Sdim  case Intrinsic::x86_avx2_psll_w:
2879314564Sdim  case Intrinsic::x86_avx512_psll_d_512:
2880314564Sdim  case Intrinsic::x86_avx512_psll_q_512:
2881314564Sdim  case Intrinsic::x86_avx512_psll_w_512: {
2882321369Sdim    if (Value *V = simplifyX86immShift(*II, Builder))
2883309124Sdim      return replaceInstUsesWith(*II, V);
2884223017Sdim
2885296417Sdim    // SSE2/AVX2 uses only the low 64 bits of the 128-bit vector
2886296417Sdim    // operand to compute the shift amount.
2887296417Sdim    Value *Arg1 = II->getArgOperand(1);
2888296417Sdim    assert(Arg1->getType()->getPrimitiveSizeInBits() == 128 &&
2889296417Sdim           "Unexpected packed shift size");
2890296417Sdim    unsigned VWidth = Arg1->getType()->getVectorNumElements();
2891276479Sdim
2892296417Sdim    if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, VWidth / 2)) {
2893296417Sdim      II->setArgOperand(1, V);
2894296417Sdim      return II;
2895276479Sdim    }
2896296417Sdim    break;
2897296417Sdim  }
2898276479Sdim
2899309124Sdim  case Intrinsic::x86_avx2_psllv_d:
2900309124Sdim  case Intrinsic::x86_avx2_psllv_d_256:
2901309124Sdim  case Intrinsic::x86_avx2_psllv_q:
2902309124Sdim  case Intrinsic::x86_avx2_psllv_q_256:
2903314564Sdim  case Intrinsic::x86_avx512_psllv_d_512:
2904314564Sdim  case Intrinsic::x86_avx512_psllv_q_512:
2905314564Sdim  case Intrinsic::x86_avx512_psllv_w_128:
2906314564Sdim  case Intrinsic::x86_avx512_psllv_w_256:
2907314564Sdim  case Intrinsic::x86_avx512_psllv_w_512:
2908309124Sdim  case Intrinsic::x86_avx2_psrav_d:
2909309124Sdim  case Intrinsic::x86_avx2_psrav_d_256:
2910314564Sdim  case Intrinsic::x86_avx512_psrav_q_128:
2911314564Sdim  case Intrinsic::x86_avx512_psrav_q_256:
2912314564Sdim  case Intrinsic::x86_avx512_psrav_d_512:
2913314564Sdim  case Intrinsic::x86_avx512_psrav_q_512:
2914314564Sdim  case Intrinsic::x86_avx512_psrav_w_128:
2915314564Sdim  case Intrinsic::x86_avx512_psrav_w_256:
2916314564Sdim  case Intrinsic::x86_avx512_psrav_w_512:
2917309124Sdim  case Intrinsic::x86_avx2_psrlv_d:
2918309124Sdim  case Intrinsic::x86_avx2_psrlv_d_256:
2919309124Sdim  case Intrinsic::x86_avx2_psrlv_q:
2920309124Sdim  case Intrinsic::x86_avx2_psrlv_q_256:
2921314564Sdim  case Intrinsic::x86_avx512_psrlv_d_512:
2922314564Sdim  case Intrinsic::x86_avx512_psrlv_q_512:
2923314564Sdim  case Intrinsic::x86_avx512_psrlv_w_128:
2924314564Sdim  case Intrinsic::x86_avx512_psrlv_w_256:
2925314564Sdim  case Intrinsic::x86_avx512_psrlv_w_512:
2926321369Sdim    if (Value *V = simplifyX86varShift(*II, Builder))
2927309124Sdim      return replaceInstUsesWith(*II, V);
2928296417Sdim    break;
2929276479Sdim
2930321369Sdim  case Intrinsic::x86_sse2_packssdw_128:
2931321369Sdim  case Intrinsic::x86_sse2_packsswb_128:
2932321369Sdim  case Intrinsic::x86_avx2_packssdw:
2933321369Sdim  case Intrinsic::x86_avx2_packsswb:
2934321369Sdim  case Intrinsic::x86_avx512_packssdw_512:
2935321369Sdim  case Intrinsic::x86_avx512_packsswb_512:
2936353358Sdim    if (Value *V = simplifyX86pack(*II, Builder, true))
2937321369Sdim      return replaceInstUsesWith(*II, V);
2938321369Sdim    break;
2939321369Sdim
2940321369Sdim  case Intrinsic::x86_sse2_packuswb_128:
2941321369Sdim  case Intrinsic::x86_sse41_packusdw:
2942321369Sdim  case Intrinsic::x86_avx2_packusdw:
2943321369Sdim  case Intrinsic::x86_avx2_packuswb:
2944321369Sdim  case Intrinsic::x86_avx512_packusdw_512:
2945321369Sdim  case Intrinsic::x86_avx512_packuswb_512:
2946353358Sdim    if (Value *V = simplifyX86pack(*II, Builder, false))
2947321369Sdim      return replaceInstUsesWith(*II, V);
2948321369Sdim    break;
2949321369Sdim
2950341825Sdim  case Intrinsic::x86_pclmulqdq:
2951341825Sdim  case Intrinsic::x86_pclmulqdq_256:
2952341825Sdim  case Intrinsic::x86_pclmulqdq_512: {
2953321369Sdim    if (auto *C = dyn_cast<ConstantInt>(II->getArgOperand(2))) {
2954321369Sdim      unsigned Imm = C->getZExtValue();
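      // Bit 0 of the immediate selects the high (odd) or low (even) i64
      // element of each 128-bit lane of Arg0; bit 4 does the same for Arg1.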
2955321369Sdim
2956321369Sdim      bool MadeChange = false;
2957321369Sdim      Value *Arg0 = II->getArgOperand(0);
2958321369Sdim      Value *Arg1 = II->getArgOperand(1);
2959321369Sdim      unsigned VWidth = Arg0->getType()->getVectorNumElements();
2960321369Sdim
2961321369Sdim      APInt UndefElts1(VWidth, 0);
2962341825Sdim      APInt DemandedElts1 = APInt::getSplat(VWidth,
2963341825Sdim                                            APInt(2, (Imm & 0x01) ? 2 : 1));
2964341825Sdim      if (Value *V = SimplifyDemandedVectorElts(Arg0, DemandedElts1,
2965321369Sdim                                                UndefElts1)) {
2966321369Sdim        II->setArgOperand(0, V);
2967321369Sdim        MadeChange = true;
2968321369Sdim      }
2969321369Sdim
2970321369Sdim      APInt UndefElts2(VWidth, 0);
2971341825Sdim      APInt DemandedElts2 = APInt::getSplat(VWidth,
2972341825Sdim                                            APInt(2, (Imm & 0x10) ? 2 : 1));
2973341825Sdim      if (Value *V = SimplifyDemandedVectorElts(Arg1, DemandedElts2,
2974321369Sdim                                                UndefElts2)) {
2975321369Sdim        II->setArgOperand(1, V);
2976321369Sdim        MadeChange = true;
2977321369Sdim      }
2978321369Sdim
2979341825Sdim      // If either input's demanded elements are all undef, the result is zero.
2980341825Sdim      if (DemandedElts1.isSubsetOf(UndefElts1) ||
2981341825Sdim          DemandedElts2.isSubsetOf(UndefElts2))
2982321369Sdim        return replaceInstUsesWith(*II,
2983321369Sdim                                   ConstantAggregateZero::get(II->getType()));
2984321369Sdim
2985321369Sdim      if (MadeChange)
2986321369Sdim        return II;
2987321369Sdim    }
2988321369Sdim    break;
2989321369Sdim  }
2990321369Sdim
2991296417Sdim  case Intrinsic::x86_sse41_insertps:
2992321369Sdim    if (Value *V = simplifyX86insertps(*II, Builder))
2993309124Sdim      return replaceInstUsesWith(*II, V);
2994296417Sdim    break;
2995296417Sdim
2996296417Sdim  case Intrinsic::x86_sse4a_extrq: {
2997296417Sdim    Value *Op0 = II->getArgOperand(0);
2998296417Sdim    Value *Op1 = II->getArgOperand(1);
2999296417Sdim    unsigned VWidth0 = Op0->getType()->getVectorNumElements();
3000296417Sdim    unsigned VWidth1 = Op1->getType()->getVectorNumElements();
3001296417Sdim    assert(Op0->getType()->getPrimitiveSizeInBits() == 128 &&
3002296417Sdim           Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 &&
3003296417Sdim           VWidth1 == 16 && "Unexpected operand sizes");
3004296417Sdim
3005296417Sdim    // See if we're dealing with constant values.
3006296417Sdim    Constant *C1 = dyn_cast<Constant>(Op1);
3007296417Sdim    ConstantInt *CILength =
3008314564Sdim        C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)0))
3009296417Sdim           : nullptr;
3010296417Sdim    ConstantInt *CIIndex =
3011314564Sdim        C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)1))
3012296417Sdim           : nullptr;
3013296417Sdim
3014296417Sdim    // Attempt to simplify to a constant, shuffle vector or EXTRQI call.
3015321369Sdim    if (Value *V = simplifyX86extrq(*II, Op0, CILength, CIIndex, Builder))
3016309124Sdim      return replaceInstUsesWith(*II, V);
3017296417Sdim
3018296417Sdim    // EXTRQ only uses the lowest 64-bits of the first 128-bit vector
3019296417Sdim    // operands and the lowest 16-bits of the second.
3020309124Sdim    bool MadeChange = false;
3021296417Sdim    if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) {
3022296417Sdim      II->setArgOperand(0, V);
3023309124Sdim      MadeChange = true;
3024223017Sdim    }
3025296417Sdim    if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 2)) {
3026296417Sdim      II->setArgOperand(1, V);
3027309124Sdim      MadeChange = true;
3028309124Sdim    }
3029309124Sdim    if (MadeChange)
3030296417Sdim      return II;
3031223017Sdim    break;
3032223017Sdim  }
3033296417Sdim
3034296417Sdim  case Intrinsic::x86_sse4a_extrqi: {
3035296417Sdim    // EXTRQI: Extract Length bits starting from Index. Zero-pad the remaining
3036296417Sdim    // bits of the lower 64 bits. The upper 64 bits are undefined.
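    // For example, Length = 8 and Index = 4 extracts bits [11:4] of the low
    // 64 bits and zero-extends them into the low 64 bits of the result.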
3037296417Sdim    Value *Op0 = II->getArgOperand(0);
3038296417Sdim    unsigned VWidth = Op0->getType()->getVectorNumElements();
3039296417Sdim    assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 &&
3040296417Sdim           "Unexpected operand size");
3041296417Sdim
3042296417Sdim    // See if we're dealing with constant values.
3043296417Sdim    ConstantInt *CILength = dyn_cast<ConstantInt>(II->getArgOperand(1));
3044296417Sdim    ConstantInt *CIIndex = dyn_cast<ConstantInt>(II->getArgOperand(2));
3045296417Sdim
3046296417Sdim    // Attempt to simplify to a constant or shuffle vector.
3047321369Sdim    if (Value *V = simplifyX86extrq(*II, Op0, CILength, CIIndex, Builder))
3048309124Sdim      return replaceInstUsesWith(*II, V);
3049296417Sdim
3050296417Sdim    // EXTRQI only uses the lowest 64-bits of the first 128-bit vector
3051296417Sdim    // operand.
3052296417Sdim    if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) {
3053296417Sdim      II->setArgOperand(0, V);
3054296417Sdim      return II;
3055296417Sdim    }
3056288943Sdim    break;
3057296417Sdim  }
3058280031Sdim
3059296417Sdim  case Intrinsic::x86_sse4a_insertq: {
3060296417Sdim    Value *Op0 = II->getArgOperand(0);
3061296417Sdim    Value *Op1 = II->getArgOperand(1);
3062296417Sdim    unsigned VWidth = Op0->getType()->getVectorNumElements();
3063296417Sdim    assert(Op0->getType()->getPrimitiveSizeInBits() == 128 &&
3064296417Sdim           Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 &&
3065296417Sdim           Op1->getType()->getVectorNumElements() == 2 &&
3066296417Sdim           "Unexpected operand size");
3067280031Sdim
3068296417Sdim    // See if we're dealing with constant values.
3069296417Sdim    Constant *C1 = dyn_cast<Constant>(Op1);
3070296417Sdim    ConstantInt *CI11 =
3071314564Sdim        C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)1))
3072296417Sdim           : nullptr;
3073280031Sdim
3074296417Sdim    // Attempt to simplify to a constant, shuffle vector or INSERTQI call.
3075296417Sdim    if (CI11) {
3076309124Sdim      const APInt &V11 = CI11->getValue();
3077296417Sdim      APInt Len = V11.zextOrTrunc(6);
3078296417Sdim      APInt Idx = V11.lshr(8).zextOrTrunc(6);
3079321369Sdim      if (Value *V = simplifyX86insertq(*II, Op0, Op1, Len, Idx, Builder))
3080309124Sdim        return replaceInstUsesWith(*II, V);
3081296417Sdim    }
3082276479Sdim
3083296417Sdim    // INSERTQ only uses the lowest 64-bits of the first 128-bit vector
3084296417Sdim    // operand.
3085296417Sdim    if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) {
3086296417Sdim      II->setArgOperand(0, V);
3087296417Sdim      return II;
3088296417Sdim    }
3089296417Sdim    break;
3090296417Sdim  }
3091276479Sdim
3092296417Sdim  case Intrinsic::x86_sse4a_insertqi: {
3093296417Sdim    // INSERTQI: Extract the lowest Length bits from the lower half of the
3094296417Sdim    // second source and insert them over the first source starting at bit
3095296417Sdim    // Index. The upper 64 bits are undefined.
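    // For example, Length = 8 and Index = 16 inserts the low 8 bits of the
    // second source over bits [23:16] of the first source.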
3096296417Sdim    Value *Op0 = II->getArgOperand(0);
3097296417Sdim    Value *Op1 = II->getArgOperand(1);
3098296417Sdim    unsigned VWidth0 = Op0->getType()->getVectorNumElements();
3099296417Sdim    unsigned VWidth1 = Op1->getType()->getVectorNumElements();
3100296417Sdim    assert(Op0->getType()->getPrimitiveSizeInBits() == 128 &&
3101296417Sdim           Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 &&
3102296417Sdim           VWidth1 == 2 && "Unexpected operand sizes");
3103296417Sdim
3104296417Sdim    // See if we're dealing with constant values.
3105296417Sdim    ConstantInt *CILength = dyn_cast<ConstantInt>(II->getArgOperand(2));
3106296417Sdim    ConstantInt *CIIndex = dyn_cast<ConstantInt>(II->getArgOperand(3));
3107296417Sdim
3108296417Sdim    // Attempt to simplify to a constant or shuffle vector.
3109296417Sdim    if (CILength && CIIndex) {
3110296417Sdim      APInt Len = CILength->getValue().zextOrTrunc(6);
3111296417Sdim      APInt Idx = CIIndex->getValue().zextOrTrunc(6);
3112321369Sdim      if (Value *V = simplifyX86insertq(*II, Op0, Op1, Len, Idx, Builder))
3113309124Sdim        return replaceInstUsesWith(*II, V);
3114276479Sdim    }
3115296417Sdim
3116296417Sdim    // INSERTQI only uses the lowest 64-bits of the first two 128-bit vector
3117296417Sdim    // operands.
3118309124Sdim    bool MadeChange = false;
3119296417Sdim    if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) {
3120296417Sdim      II->setArgOperand(0, V);
3121309124Sdim      MadeChange = true;
3122296417Sdim    }
3123296417Sdim    if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 1)) {
3124296417Sdim      II->setArgOperand(1, V);
3125309124Sdim      MadeChange = true;
3126309124Sdim    }
3127309124Sdim    if (MadeChange)
3128296417Sdim      return II;
3129276479Sdim    break;
3130276479Sdim  }
3131276479Sdim
3132276479Sdim  case Intrinsic::x86_sse41_pblendvb:
3133276479Sdim  case Intrinsic::x86_sse41_blendvps:
3134276479Sdim  case Intrinsic::x86_sse41_blendvpd:
3135276479Sdim  case Intrinsic::x86_avx_blendv_ps_256:
3136276479Sdim  case Intrinsic::x86_avx_blendv_pd_256:
3137276479Sdim  case Intrinsic::x86_avx2_pblendvb: {
3138344779Sdim    // fold (blend A, A, Mask) -> A
3139296417Sdim    Value *Op0 = II->getArgOperand(0);
3140296417Sdim    Value *Op1 = II->getArgOperand(1);
3141276479Sdim    Value *Mask = II->getArgOperand(2);
3142296417Sdim    if (Op0 == Op1)
3143309124Sdim      return replaceInstUsesWith(CI, Op0);
3144296417Sdim
3145296417Sdim    // Zero Mask - select 1st argument.
3146296417Sdim    if (isa<ConstantAggregateZero>(Mask))
3147309124Sdim      return replaceInstUsesWith(CI, Op0);
3148296417Sdim
3149296417Sdim    // Constant Mask - select 1st/2nd argument lane based on top bit of mask.
3150309124Sdim    if (auto *ConstantMask = dyn_cast<ConstantDataVector>(Mask)) {
3151309124Sdim      Constant *NewSelector = getNegativeIsTrueBoolVec(ConstantMask);
3152296417Sdim      return SelectInst::Create(NewSelector, Op1, Op0, "blendv");
3153276479Sdim    }
3154344779Sdim
3155344779Sdim    // Convert to a vector select if we can bypass casts and find a boolean
3156344779Sdim    // vector condition value.
3157344779Sdim    Value *BoolVec;
3158344779Sdim    Mask = peekThroughBitcast(Mask);
3159344779Sdim    if (match(Mask, m_SExt(m_Value(BoolVec))) &&
3160344779Sdim        BoolVec->getType()->isVectorTy() &&
3161344779Sdim        BoolVec->getType()->getScalarSizeInBits() == 1) {
3162344779Sdim      assert(Mask->getType()->getPrimitiveSizeInBits() ==
3163344779Sdim             II->getType()->getPrimitiveSizeInBits() &&
3164344779Sdim             "Not expecting mask and operands with different sizes");
3165344779Sdim
3166344779Sdim      unsigned NumMaskElts = Mask->getType()->getVectorNumElements();
3167344779Sdim      unsigned NumOperandElts = II->getType()->getVectorNumElements();
3168344779Sdim      if (NumMaskElts == NumOperandElts)
3169344779Sdim        return SelectInst::Create(BoolVec, Op1, Op0);
3170344779Sdim
3171344779Sdim      // If the mask has fewer elements than the operands, each mask bit maps to
3172344779Sdim      // multiple elements of the operands. Bitcast back and forth.
3173344779Sdim      if (NumMaskElts < NumOperandElts) {
3174344779Sdim        Value *CastOp0 = Builder.CreateBitCast(Op0, Mask->getType());
3175344779Sdim        Value *CastOp1 = Builder.CreateBitCast(Op1, Mask->getType());
3176344779Sdim        Value *Sel = Builder.CreateSelect(BoolVec, CastOp1, CastOp0);
3177344779Sdim        return new BitCastInst(Sel, II->getType());
3178344779Sdim      }
3179344779Sdim    }
3180344779Sdim
3181296417Sdim    break;
3182276479Sdim  }
3183276479Sdim
3184296417Sdim  case Intrinsic::x86_ssse3_pshuf_b_128:
3185309124Sdim  case Intrinsic::x86_avx2_pshuf_b:
3186314564Sdim  case Intrinsic::x86_avx512_pshuf_b_512:
3187321369Sdim    if (Value *V = simplifyX86pshufb(*II, Builder))
3188309124Sdim      return replaceInstUsesWith(*II, V);
3189309124Sdim    break;
3190296417Sdim
3191276479Sdim  case Intrinsic::x86_avx_vpermilvar_ps:
3192276479Sdim  case Intrinsic::x86_avx_vpermilvar_ps_256:
3193314564Sdim  case Intrinsic::x86_avx512_vpermilvar_ps_512:
3194276479Sdim  case Intrinsic::x86_avx_vpermilvar_pd:
3195309124Sdim  case Intrinsic::x86_avx_vpermilvar_pd_256:
3196314564Sdim  case Intrinsic::x86_avx512_vpermilvar_pd_512:
3197321369Sdim    if (Value *V = simplifyX86vpermilvar(*II, Builder))
3198309124Sdim      return replaceInstUsesWith(*II, V);
3199309124Sdim    break;
3200276479Sdim
3201309124Sdim  case Intrinsic::x86_avx2_permd:
3202309124Sdim  case Intrinsic::x86_avx2_permps:
3203341825Sdim  case Intrinsic::x86_avx512_permvar_df_256:
3204341825Sdim  case Intrinsic::x86_avx512_permvar_df_512:
3205341825Sdim  case Intrinsic::x86_avx512_permvar_di_256:
3206341825Sdim  case Intrinsic::x86_avx512_permvar_di_512:
3207341825Sdim  case Intrinsic::x86_avx512_permvar_hi_128:
3208341825Sdim  case Intrinsic::x86_avx512_permvar_hi_256:
3209341825Sdim  case Intrinsic::x86_avx512_permvar_hi_512:
3210341825Sdim  case Intrinsic::x86_avx512_permvar_qi_128:
3211341825Sdim  case Intrinsic::x86_avx512_permvar_qi_256:
3212341825Sdim  case Intrinsic::x86_avx512_permvar_qi_512:
3213341825Sdim  case Intrinsic::x86_avx512_permvar_sf_512:
3214341825Sdim  case Intrinsic::x86_avx512_permvar_si_512:
3215321369Sdim    if (Value *V = simplifyX86vpermv(*II, Builder))
3216309124Sdim      return replaceInstUsesWith(*II, V);
3217309124Sdim    break;
3218309124Sdim
3219309124Sdim  case Intrinsic::x86_avx_maskload_ps:
3220309124Sdim  case Intrinsic::x86_avx_maskload_pd:
3221309124Sdim  case Intrinsic::x86_avx_maskload_ps_256:
3222309124Sdim  case Intrinsic::x86_avx_maskload_pd_256:
3223309124Sdim  case Intrinsic::x86_avx2_maskload_d:
3224309124Sdim  case Intrinsic::x86_avx2_maskload_q:
3225309124Sdim  case Intrinsic::x86_avx2_maskload_d_256:
3226309124Sdim  case Intrinsic::x86_avx2_maskload_q_256:
3227309124Sdim    if (Instruction *I = simplifyX86MaskedLoad(*II, *this))
3228309124Sdim      return I;
3229309124Sdim    break;
3230309124Sdim
3231309124Sdim  case Intrinsic::x86_sse2_maskmov_dqu:
3232309124Sdim  case Intrinsic::x86_avx_maskstore_ps:
3233309124Sdim  case Intrinsic::x86_avx_maskstore_pd:
3234309124Sdim  case Intrinsic::x86_avx_maskstore_ps_256:
3235309124Sdim  case Intrinsic::x86_avx_maskstore_pd_256:
3236309124Sdim  case Intrinsic::x86_avx2_maskstore_d:
3237309124Sdim  case Intrinsic::x86_avx2_maskstore_q:
3238309124Sdim  case Intrinsic::x86_avx2_maskstore_d_256:
3239309124Sdim  case Intrinsic::x86_avx2_maskstore_q_256:
3240309124Sdim    if (simplifyX86MaskedStore(*II, *this))
3241309124Sdim      return nullptr;
3242309124Sdim    break;
3243309124Sdim
3244353358Sdim  case Intrinsic::x86_addcarry_32:
3245353358Sdim  case Intrinsic::x86_addcarry_64:
3246353358Sdim    if (Value *V = simplifyX86addcarry(*II, Builder))
3247309124Sdim      return replaceInstUsesWith(*II, V);
3248296417Sdim    break;
3249296417Sdim
3250202375Srdivacky  case Intrinsic::ppc_altivec_vperm:
3251202375Srdivacky    // Turn vperm(V1,V2,mask) -> shuffle(V1,V2,mask) if mask is a constant.
3252276479Sdim    // Note that ppc_altivec_vperm has a big-endian bias, so when creating
3253276479Sdim    // a vector shuffle for little endian, we must undo the transformation
3254276479Sdim    // performed on vec_perm in altivec.h.  That is, we must complement
3255276479Sdim    // the permutation mask with respect to 31 and reverse the order of
3256276479Sdim    // V1 and V2.
3257234353Sdim    if (Constant *Mask = dyn_cast<Constant>(II->getArgOperand(2))) {
3258234353Sdim      assert(Mask->getType()->getVectorNumElements() == 16 &&
3259234353Sdim             "Bad type for intrinsic!");
3260234353Sdim
3261202375Srdivacky      // Check that all of the elements are integer constants or undefs.
3262202375Srdivacky      bool AllEltsOk = true;
3263202375Srdivacky      for (unsigned i = 0; i != 16; ++i) {
3264234353Sdim        Constant *Elt = Mask->getAggregateElement(i);
3265276479Sdim        if (!Elt || !(isa<ConstantInt>(Elt) || isa<UndefValue>(Elt))) {
3266202375Srdivacky          AllEltsOk = false;
3267202375Srdivacky          break;
3268202375Srdivacky        }
3269202375Srdivacky      }
3270234353Sdim
3271202375Srdivacky      if (AllEltsOk) {
3272202375Srdivacky        // Cast the input vectors to byte vectors.
3273321369Sdim        Value *Op0 = Builder.CreateBitCast(II->getArgOperand(0),
3274321369Sdim                                           Mask->getType());
3275321369Sdim        Value *Op1 = Builder.CreateBitCast(II->getArgOperand(1),
3276321369Sdim                                           Mask->getType());
3277202375Srdivacky        Value *Result = UndefValue::get(Op0->getType());
3278234353Sdim
3279202375Srdivacky        // Only extract each element once.
3280202375Srdivacky        Value *ExtractedElts[32];
3281202375Srdivacky        memset(ExtractedElts, 0, sizeof(ExtractedElts));
3282234353Sdim
3283202375Srdivacky        for (unsigned i = 0; i != 16; ++i) {
3284234353Sdim          if (isa<UndefValue>(Mask->getAggregateElement(i)))
3285202375Srdivacky            continue;
3286234353Sdim          unsigned Idx =
3287234353Sdim            cast<ConstantInt>(Mask->getAggregateElement(i))->getZExtValue();
3288202375Srdivacky          Idx &= 31;  // Match the hardware behavior.
3289288943Sdim          if (DL.isLittleEndian())
3290276479Sdim            Idx = 31 - Idx;
3291234353Sdim
3292276479Sdim          if (!ExtractedElts[Idx]) {
3293288943Sdim            Value *Op0ToUse = (DL.isLittleEndian()) ? Op1 : Op0;
3294288943Sdim            Value *Op1ToUse = (DL.isLittleEndian()) ? Op0 : Op1;
3295234353Sdim            ExtractedElts[Idx] =
3296321369Sdim              Builder.CreateExtractElement(Idx < 16 ? Op0ToUse : Op1ToUse,
3297321369Sdim                                           Builder.getInt32(Idx&15));
3298202375Srdivacky          }
3299234353Sdim
3300202375Srdivacky          // Insert this value into the result vector.
3301321369Sdim          Result = Builder.CreateInsertElement(Result, ExtractedElts[Idx],
3302321369Sdim                                               Builder.getInt32(i));
3303202375Srdivacky        }
3304202375Srdivacky        return CastInst::Create(Instruction::BitCast, Result, CI.getType());
3305202375Srdivacky      }
3306202375Srdivacky    }
3307202375Srdivacky    break;
3308202375Srdivacky
3309341825Sdim  case Intrinsic::arm_neon_vld1: {
3310341825Sdim    unsigned MemAlign = getKnownAlignment(II->getArgOperand(0),
3311341825Sdim                                          DL, II, &AC, &DT);
3312341825Sdim    if (Value *V = simplifyNeonVld1(*II, MemAlign, Builder))
3313341825Sdim      return replaceInstUsesWith(*II, V);
3314341825Sdim    break;
3315341825Sdim  }
3316341825Sdim
3317218893Sdim  case Intrinsic::arm_neon_vld2:
3318218893Sdim  case Intrinsic::arm_neon_vld3:
3319218893Sdim  case Intrinsic::arm_neon_vld4:
3320218893Sdim  case Intrinsic::arm_neon_vld2lane:
3321218893Sdim  case Intrinsic::arm_neon_vld3lane:
3322218893Sdim  case Intrinsic::arm_neon_vld4lane:
3323218893Sdim  case Intrinsic::arm_neon_vst1:
3324218893Sdim  case Intrinsic::arm_neon_vst2:
3325218893Sdim  case Intrinsic::arm_neon_vst3:
3326218893Sdim  case Intrinsic::arm_neon_vst4:
3327218893Sdim  case Intrinsic::arm_neon_vst2lane:
3328218893Sdim  case Intrinsic::arm_neon_vst3lane:
3329218893Sdim  case Intrinsic::arm_neon_vst4lane: {
3330314564Sdim    unsigned MemAlign =
3331314564Sdim        getKnownAlignment(II->getArgOperand(0), DL, II, &AC, &DT);
3332218893Sdim    unsigned AlignArg = II->getNumArgOperands() - 1;
3333218893Sdim    ConstantInt *IntrAlign = dyn_cast<ConstantInt>(II->getArgOperand(AlignArg));
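    // If the intrinsic's alignment argument understates what we can prove
    // about the pointer, raise it so the backend can emit a better-aligned
    // access.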
3334218893Sdim    if (IntrAlign && IntrAlign->getZExtValue() < MemAlign) {
3335218893Sdim      II->setArgOperand(AlignArg,
3336218893Sdim                        ConstantInt::get(Type::getInt32Ty(II->getContext()),
3337218893Sdim                                         MemAlign, false));
3338218893Sdim      return II;
3339218893Sdim    }
3340218893Sdim    break;
3341218893Sdim  }
3342218893Sdim
3343341825Sdim  case Intrinsic::arm_neon_vtbl1:
3344341825Sdim  case Intrinsic::aarch64_neon_tbl1:
3345341825Sdim    if (Value *V = simplifyNeonTbl1(*II, Builder))
3346341825Sdim      return replaceInstUsesWith(*II, V);
3347341825Sdim    break;
3348341825Sdim
3349239462Sdim  case Intrinsic::arm_neon_vmulls:
3350276479Sdim  case Intrinsic::arm_neon_vmullu:
3351276479Sdim  case Intrinsic::aarch64_neon_smull:
3352276479Sdim  case Intrinsic::aarch64_neon_umull: {
3353239462Sdim    Value *Arg0 = II->getArgOperand(0);
3354239462Sdim    Value *Arg1 = II->getArgOperand(1);
3355239462Sdim
3356239462Sdim    // Handle mul by zero first:
3357239462Sdim    if (isa<ConstantAggregateZero>(Arg0) || isa<ConstantAggregateZero>(Arg1)) {
3358309124Sdim      return replaceInstUsesWith(CI, ConstantAggregateZero::get(II->getType()));
3359239462Sdim    }
3360239462Sdim
3361239462Sdim    // Check for constant LHS & RHS - in this case we just simplify.
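    // The unsigned variants (vmullu, umull) zero-extend their operands; the
    // signed variants (vmulls, smull) sign-extend them.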
3362353358Sdim    bool Zext = (IID == Intrinsic::arm_neon_vmullu ||
3363353358Sdim                 IID == Intrinsic::aarch64_neon_umull);
3364239462Sdim    VectorType *NewVT = cast<VectorType>(II->getType());
3365276479Sdim    if (Constant *CV0 = dyn_cast<Constant>(Arg0)) {
3366276479Sdim      if (Constant *CV1 = dyn_cast<Constant>(Arg1)) {
3367276479Sdim        CV0 = ConstantExpr::getIntegerCast(CV0, NewVT, /*isSigned=*/!Zext);
3368276479Sdim        CV1 = ConstantExpr::getIntegerCast(CV1, NewVT, /*isSigned=*/!Zext);
3369276479Sdim
3370309124Sdim        return replaceInstUsesWith(CI, ConstantExpr::getMul(CV0, CV1));
3371239462Sdim      }
3372239462Sdim
3373276479Sdim      // Couldn't simplify - canonicalize constant to the RHS.
3374239462Sdim      std::swap(Arg0, Arg1);
3375239462Sdim    }
3376239462Sdim
3377239462Sdim    // Handle mul by one:
3378276479Sdim    if (Constant *CV1 = dyn_cast<Constant>(Arg1))
3379239462Sdim      if (ConstantInt *Splat =
3380276479Sdim              dyn_cast_or_null<ConstantInt>(CV1->getSplatValue()))
3381276479Sdim        if (Splat->isOne())
3382276479Sdim          return CastInst::CreateIntegerCast(Arg0, II->getType(),
3383276479Sdim                                             /*isSigned=*/!Zext);
3384276479Sdim
3385276479Sdim    break;
3386276479Sdim  }
3387341825Sdim  case Intrinsic::arm_neon_aesd:
3388341825Sdim  case Intrinsic::arm_neon_aese:
3389341825Sdim  case Intrinsic::aarch64_crypto_aesd:
3390341825Sdim  case Intrinsic::aarch64_crypto_aese: {
3391341825Sdim    Value *DataArg = II->getArgOperand(0);
3392341825Sdim    Value *KeyArg  = II->getArgOperand(1);
3393341825Sdim
3394341825Sdim    // Try to use the builtin XOR in AESE and AESD to eliminate a prior XOR
3395341825Sdim    Value *Data, *Key;
3396341825Sdim    if (match(KeyArg, m_ZeroInt()) &&
3397341825Sdim        match(DataArg, m_Xor(m_Value(Data), m_Value(Key)))) {
3398341825Sdim      II->setArgOperand(0, Data);
3399341825Sdim      II->setArgOperand(1, Key);
3400341825Sdim      return II;
3401341825Sdim    }
3402341825Sdim    break;
3403341825Sdim  }
3404360784Sdim  case Intrinsic::arm_mve_pred_i2v: {
3405360784Sdim    Value *Arg = II->getArgOperand(0);
3406360784Sdim    Value *ArgArg;
3407360784Sdim    if (match(Arg, m_Intrinsic<Intrinsic::arm_mve_pred_v2i>(m_Value(ArgArg))) &&
3408360784Sdim        II->getType() == ArgArg->getType())
3409360784Sdim      return replaceInstUsesWith(*II, ArgArg);
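    // i2v(v2i(x) ^ m), where the low 16 bits of m are all ones, is a lane-wise
    // negation of the original vector predicate.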
3410360784Sdim    Constant *XorMask;
3411360784Sdim    if (match(Arg,
3412360784Sdim              m_Xor(m_Intrinsic<Intrinsic::arm_mve_pred_v2i>(m_Value(ArgArg)),
3413360784Sdim                    m_Constant(XorMask))) &&
3414360784Sdim        II->getType() == ArgArg->getType()) {
3415360784Sdim      if (auto *CI = dyn_cast<ConstantInt>(XorMask)) {
3416360784Sdim        if (CI->getValue().trunc(16).isAllOnesValue()) {
3417360784Sdim          auto TrueVector = Builder.CreateVectorSplat(
3418360784Sdim              II->getType()->getVectorNumElements(), Builder.getTrue());
3419360784Sdim          return BinaryOperator::Create(Instruction::Xor, ArgArg, TrueVector);
3420360784Sdim        }
3421360784Sdim      }
3422360784Sdim    }
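    // Only the low 16 bits of the scalar predicate are meaningful, so let
    // SimplifyDemandedBits treat the upper bits as don't-care.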
3423360784Sdim    KnownBits ScalarKnown(32);
3424360784Sdim    if (SimplifyDemandedBits(II, 0, APInt::getLowBitsSet(32, 16),
3425360784Sdim                             ScalarKnown, 0))
3426360784Sdim      return II;
3427360784Sdim    break;
3428360784Sdim  }
3429360784Sdim  case Intrinsic::arm_mve_pred_v2i: {
3430360784Sdim    Value *Arg = II->getArgOperand(0);
3431360784Sdim    Value *ArgArg;
3432360784Sdim    if (match(Arg, m_Intrinsic<Intrinsic::arm_mve_pred_i2v>(m_Value(ArgArg))))
3433360784Sdim      return replaceInstUsesWith(*II, ArgArg);
3434360784Sdim    if (!II->getMetadata(LLVMContext::MD_range)) {
3435360784Sdim      Type *IntTy32 = Type::getInt32Ty(II->getContext());
3436360784Sdim      Metadata *M[] = {
3437360784Sdim        ConstantAsMetadata::get(ConstantInt::get(IntTy32, 0)),
3438360784Sdim        ConstantAsMetadata::get(ConstantInt::get(IntTy32, 0xFFFF))
3439360784Sdim      };
3440360784Sdim      II->setMetadata(LLVMContext::MD_range, MDNode::get(II->getContext(), M));
3441360784Sdim      return II;
3442360784Sdim    }
3443360784Sdim    break;
3444360784Sdim  }
3445360784Sdim  case Intrinsic::arm_mve_vadc:
3446360784Sdim  case Intrinsic::arm_mve_vadc_predicated: {
3447360784Sdim    unsigned CarryOp =
3448360784Sdim        (II->getIntrinsicID() == Intrinsic::arm_mve_vadc_predicated) ? 3 : 2;
3449360784Sdim    assert(II->getArgOperand(CarryOp)->getType()->getScalarSizeInBits() == 32 &&
3450360784Sdim           "Bad type for intrinsic!");
3451360784Sdim
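    // Only bit 29 of the carry-in operand (the FPSCR.C position) is actually
    // read, so the remaining bits of that argument are don't-care.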
3452360784Sdim    KnownBits CarryKnown(32);
3453360784Sdim    if (SimplifyDemandedBits(II, CarryOp, APInt::getOneBitSet(32, 29),
3454360784Sdim                             CarryKnown))
3455360784Sdim      return II;
3456360784Sdim    break;
3457360784Sdim  }
3458321369Sdim  case Intrinsic::amdgcn_rcp: {
3459321369Sdim    Value *Src = II->getArgOperand(0);
3460276479Sdim
3461321369Sdim    // TODO: Move to ConstantFolding/InstSimplify?
3462321369Sdim    if (isa<UndefValue>(Src))
3463321369Sdim      return replaceInstUsesWith(CI, Src);
3464321369Sdim
3465321369Sdim    if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
3466276479Sdim      const APFloat &ArgVal = C->getValueAPF();
3467360784Sdim      APFloat Val(ArgVal.getSemantics(), 1);
3468276479Sdim      APFloat::opStatus Status = Val.divide(ArgVal,
3469276479Sdim                                            APFloat::rmNearestTiesToEven);
3470276479Sdim      // Only do this if it was exact and therefore not dependent on the
3471276479Sdim      // rounding mode.
3472276479Sdim      if (Status == APFloat::opOK)
3473309124Sdim        return replaceInstUsesWith(CI, ConstantFP::get(II->getContext(), Val));
3474239462Sdim    }
3475239462Sdim
3476239462Sdim    break;
3477239462Sdim  }
3478321369Sdim  case Intrinsic::amdgcn_rsq: {
3479321369Sdim    Value *Src = II->getArgOperand(0);
3480321369Sdim
3481321369Sdim    // TODO: Move to ConstantFolding/InstSimplify?
3482321369Sdim    if (isa<UndefValue>(Src))
3483321369Sdim      return replaceInstUsesWith(CI, Src);
3484321369Sdim    break;
3485321369Sdim  }
3486309124Sdim  case Intrinsic::amdgcn_frexp_mant:
3487309124Sdim  case Intrinsic::amdgcn_frexp_exp: {
3488309124Sdim    Value *Src = II->getArgOperand(0);
3489309124Sdim    if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
3490309124Sdim      int Exp;
3491309124Sdim      APFloat Significand = frexp(C->getValueAPF(), Exp,
3492309124Sdim                                  APFloat::rmNearestTiesToEven);
3493309124Sdim
3494353358Sdim      if (IID == Intrinsic::amdgcn_frexp_mant) {
3495309124Sdim        return replaceInstUsesWith(CI, ConstantFP::get(II->getContext(),
3496309124Sdim                                                       Significand));
3497309124Sdim      }
3498309124Sdim
3499309124Sdim      // Match instruction special case behavior.
3500309124Sdim      if (Exp == APFloat::IEK_NaN || Exp == APFloat::IEK_Inf)
3501309124Sdim        Exp = 0;
3502309124Sdim
3503309124Sdim      return replaceInstUsesWith(CI, ConstantInt::get(II->getType(), Exp));
3504309124Sdim    }
3505309124Sdim
3506309124Sdim    if (isa<UndefValue>(Src))
3507309124Sdim      return replaceInstUsesWith(CI, UndefValue::get(II->getType()));
3508309124Sdim
3509309124Sdim    break;
3510309124Sdim  }
3511314564Sdim  case Intrinsic::amdgcn_class: {
3512314564Sdim    enum  {
3513314564Sdim      S_NAN = 1 << 0,        // Signaling NaN
3514314564Sdim      Q_NAN = 1 << 1,        // Quiet NaN
3515314564Sdim      N_INFINITY = 1 << 2,   // Negative infinity
3516314564Sdim      N_NORMAL = 1 << 3,     // Negative normal
3517314564Sdim      N_SUBNORMAL = 1 << 4,  // Negative subnormal
3518314564Sdim      N_ZERO = 1 << 5,       // Negative zero
3519314564Sdim      P_ZERO = 1 << 6,       // Positive zero
3520314564Sdim      P_SUBNORMAL = 1 << 7,  // Positive subnormal
3521314564Sdim      P_NORMAL = 1 << 8,     // Positive normal
3522314564Sdim      P_INFINITY = 1 << 9    // Positive infinity
3523314564Sdim    };
3524314564Sdim
3525314564Sdim    const uint32_t FullMask = S_NAN | Q_NAN | N_INFINITY | N_NORMAL |
3526314564Sdim      N_SUBNORMAL | N_ZERO | P_ZERO | P_SUBNORMAL | P_NORMAL | P_INFINITY;
3527314564Sdim
3528314564Sdim    Value *Src0 = II->getArgOperand(0);
3529314564Sdim    Value *Src1 = II->getArgOperand(1);
3530314564Sdim    const ConstantInt *CMask = dyn_cast<ConstantInt>(Src1);
3531314564Sdim    if (!CMask) {
3532314564Sdim      if (isa<UndefValue>(Src0))
3533314564Sdim        return replaceInstUsesWith(*II, UndefValue::get(II->getType()));
3534314564Sdim
3535314564Sdim      if (isa<UndefValue>(Src1))
3536314564Sdim        return replaceInstUsesWith(*II, ConstantInt::get(II->getType(), false));
3537314564Sdim      break;
3538314564Sdim    }
3539314564Sdim
3540314564Sdim    uint32_t Mask = CMask->getZExtValue();
3541314564Sdim
3542314564Sdim    // If all tests are made, it doesn't matter what the value is.
3543314564Sdim    if ((Mask & FullMask) == FullMask)
3544314564Sdim      return replaceInstUsesWith(*II, ConstantInt::get(II->getType(), true));
3545314564Sdim
3546314564Sdim    if ((Mask & FullMask) == 0)
3547314564Sdim      return replaceInstUsesWith(*II, ConstantInt::get(II->getType(), false));
3548314564Sdim
3549314564Sdim    if (Mask == (S_NAN | Q_NAN)) {
3550314564Sdim      // Equivalent of isnan. Replace with standard fcmp.
3551321369Sdim      Value *FCmp = Builder.CreateFCmpUNO(Src0, Src0);
3552314564Sdim      FCmp->takeName(II);
3553314564Sdim      return replaceInstUsesWith(*II, FCmp);
3554314564Sdim    }
3555314564Sdim
3556344779Sdim    if (Mask == (N_ZERO | P_ZERO)) {
3557344779Sdim      // Equivalent of == 0.
3558344779Sdim      Value *FCmp = Builder.CreateFCmpOEQ(
3559344779Sdim        Src0, ConstantFP::get(Src0->getType(), 0.0));
3560344779Sdim
3561344779Sdim      FCmp->takeName(II);
3562344779Sdim      return replaceInstUsesWith(*II, FCmp);
3563344779Sdim    }
3564344779Sdim
3565344779Sdim    // fp_class (nnan x), qnan|snan|other -> fp_class (nnan x), other
3566344779Sdim    if (((Mask & S_NAN) || (Mask & Q_NAN)) && isKnownNeverNaN(Src0, &TLI)) {
3567344779Sdim      II->setArgOperand(1, ConstantInt::get(Src1->getType(),
3568344779Sdim                                            Mask & ~(S_NAN | Q_NAN)));
3569344779Sdim      return II;
3570344779Sdim    }
3571344779Sdim
3572314564Sdim    const ConstantFP *CVal = dyn_cast<ConstantFP>(Src0);
3573314564Sdim    if (!CVal) {
3574314564Sdim      if (isa<UndefValue>(Src0))
3575314564Sdim        return replaceInstUsesWith(*II, UndefValue::get(II->getType()));
3576314564Sdim
3577314564Sdim      // Clamp mask to used bits
3578314564Sdim      if ((Mask & FullMask) != Mask) {
3579321369Sdim        CallInst *NewCall = Builder.CreateCall(II->getCalledFunction(),
3580314564Sdim          { Src0, ConstantInt::get(Src1->getType(), Mask & FullMask) }
3581314564Sdim        );
3582314564Sdim
3583314564Sdim        NewCall->takeName(II);
3584314564Sdim        return replaceInstUsesWith(*II, NewCall);
3585314564Sdim      }
3586314564Sdim
3587314564Sdim      break;
3588314564Sdim    }
3589314564Sdim
3590314564Sdim    const APFloat &Val = CVal->getValueAPF();
3591314564Sdim
3592314564Sdim    bool Result =
3593314564Sdim      ((Mask & S_NAN) && Val.isNaN() && Val.isSignaling()) ||
3594314564Sdim      ((Mask & Q_NAN) && Val.isNaN() && !Val.isSignaling()) ||
3595314564Sdim      ((Mask & N_INFINITY) && Val.isInfinity() && Val.isNegative()) ||
3596314564Sdim      ((Mask & N_NORMAL) && Val.isNormal() && Val.isNegative()) ||
3597314564Sdim      ((Mask & N_SUBNORMAL) && Val.isDenormal() && Val.isNegative()) ||
3598314564Sdim      ((Mask & N_ZERO) && Val.isZero() && Val.isNegative()) ||
3599314564Sdim      ((Mask & P_ZERO) && Val.isZero() && !Val.isNegative()) ||
3600314564Sdim      ((Mask & P_SUBNORMAL) && Val.isDenormal() && !Val.isNegative()) ||
3601314564Sdim      ((Mask & P_NORMAL) && Val.isNormal() && !Val.isNegative()) ||
3602314564Sdim      ((Mask & P_INFINITY) && Val.isInfinity() && !Val.isNegative());
3603314564Sdim
3604314564Sdim    return replaceInstUsesWith(*II, ConstantInt::get(II->getType(), Result));
3605314564Sdim  }
3606321369Sdim  case Intrinsic::amdgcn_cvt_pkrtz: {
3607321369Sdim    Value *Src0 = II->getArgOperand(0);
3608321369Sdim    Value *Src1 = II->getArgOperand(1);
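    // Constant fold: convert both sources to half with round-toward-zero and
    // build the packed <2 x half> result directly.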
3609321369Sdim    if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Src0)) {
3610321369Sdim      if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Src1)) {
3611321369Sdim        const fltSemantics &HalfSem
3612321369Sdim          = II->getType()->getScalarType()->getFltSemantics();
3613321369Sdim        bool LosesInfo;
3614321369Sdim        APFloat Val0 = C0->getValueAPF();
3615321369Sdim        APFloat Val1 = C1->getValueAPF();
3616321369Sdim        Val0.convert(HalfSem, APFloat::rmTowardZero, &LosesInfo);
3617321369Sdim        Val1.convert(HalfSem, APFloat::rmTowardZero, &LosesInfo);
3618321369Sdim
3619321369Sdim        Constant *Folded = ConstantVector::get({
3620321369Sdim            ConstantFP::get(II->getContext(), Val0),
3621321369Sdim            ConstantFP::get(II->getContext(), Val1) });
3622321369Sdim        return replaceInstUsesWith(*II, Folded);
3623321369Sdim      }
3624321369Sdim    }
3625321369Sdim
3626321369Sdim    if (isa<UndefValue>(Src0) && isa<UndefValue>(Src1))
3627321369Sdim      return replaceInstUsesWith(*II, UndefValue::get(II->getType()));
3628321369Sdim
3629321369Sdim    break;
3630321369Sdim  }
3631329410Sdim  case Intrinsic::amdgcn_cvt_pknorm_i16:
3632329410Sdim  case Intrinsic::amdgcn_cvt_pknorm_u16:
3633329410Sdim  case Intrinsic::amdgcn_cvt_pk_i16:
3634329410Sdim  case Intrinsic::amdgcn_cvt_pk_u16: {
3635329410Sdim    Value *Src0 = II->getArgOperand(0);
3636329410Sdim    Value *Src1 = II->getArgOperand(1);
3637329410Sdim
3638329410Sdim    if (isa<UndefValue>(Src0) && isa<UndefValue>(Src1))
3639329410Sdim      return replaceInstUsesWith(*II, UndefValue::get(II->getType()));
3640329410Sdim
3641329410Sdim    break;
3642329410Sdim  }
3643321369Sdim  case Intrinsic::amdgcn_ubfe:
3644321369Sdim  case Intrinsic::amdgcn_sbfe: {
3645321369Sdim    // Decompose simple cases into standard shifts.
3646321369Sdim    Value *Src = II->getArgOperand(0);
3647321369Sdim    if (isa<UndefValue>(Src))
3648321369Sdim      return replaceInstUsesWith(*II, Src);
3649321369Sdim
3650321369Sdim    unsigned Width;
3651321369Sdim    Type *Ty = II->getType();
3652321369Sdim    unsigned IntSize = Ty->getIntegerBitWidth();
3653321369Sdim
3654321369Sdim    ConstantInt *CWidth = dyn_cast<ConstantInt>(II->getArgOperand(2));
3655321369Sdim    if (CWidth) {
3656321369Sdim      Width = CWidth->getZExtValue();
3657321369Sdim      if ((Width & (IntSize - 1)) == 0)
3658321369Sdim        return replaceInstUsesWith(*II, ConstantInt::getNullValue(Ty));
3659321369Sdim
3660321369Sdim      if (Width >= IntSize) {
3661321369Sdim        // Hardware ignores high bits, so remove those.
3662321369Sdim        II->setArgOperand(2, ConstantInt::get(CWidth->getType(),
3663321369Sdim                                              Width & (IntSize - 1)));
3664321369Sdim        return II;
3665321369Sdim      }
3666321369Sdim    }
3667321369Sdim
3668321369Sdim    unsigned Offset;
3669321369Sdim    ConstantInt *COffset = dyn_cast<ConstantInt>(II->getArgOperand(1));
3670321369Sdim    if (COffset) {
3671321369Sdim      Offset = COffset->getZExtValue();
3672321369Sdim      if (Offset >= IntSize) {
3673321369Sdim        II->setArgOperand(1, ConstantInt::get(COffset->getType(),
3674321369Sdim                                              Offset & (IntSize - 1)));
3675321369Sdim        return II;
3676321369Sdim      }
3677321369Sdim    }
3678321369Sdim
3679353358Sdim    bool Signed = IID == Intrinsic::amdgcn_sbfe;
3680321369Sdim
3681321369Sdim    if (!CWidth || !COffset)
3682321369Sdim      break;
3683321369Sdim
3684344779Sdim    // The case of Width == 0 is handled above, which makes this transformation
3685344779Sdim    // safe.  If Width == 0, then the ashr and lshr instructions would produce
3686344779Sdim    // poison values since the shift amount would be equal to the bit size.
3687344779Sdim    assert(Width != 0);
3688344779Sdim
3689321369Sdim    // TODO: This allows folding to undef when the hardware has specific
3690321369Sdim    // behavior?
3691321369Sdim    if (Offset + Width < IntSize) {
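      // The field lies strictly inside the word: shift it up to the top bits,
      // then shift back down, sign-filling for sbfe and zero-filling for ubfe.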
3692321369Sdim      Value *Shl = Builder.CreateShl(Src, IntSize - Offset - Width);
3693321369Sdim      Value *RightShift = Signed ? Builder.CreateAShr(Shl, IntSize - Width)
3694321369Sdim                                 : Builder.CreateLShr(Shl, IntSize - Width);
3695321369Sdim      RightShift->takeName(II);
3696321369Sdim      return replaceInstUsesWith(*II, RightShift);
3697321369Sdim    }
3698321369Sdim
3699321369Sdim    Value *RightShift = Signed ? Builder.CreateAShr(Src, Offset)
3700321369Sdim                               : Builder.CreateLShr(Src, Offset);
3701321369Sdim
3702321369Sdim    RightShift->takeName(II);
3703321369Sdim    return replaceInstUsesWith(*II, RightShift);
3704321369Sdim  }
3705321369Sdim  case Intrinsic::amdgcn_exp:
3706321369Sdim  case Intrinsic::amdgcn_exp_compr: {
3707353358Sdim    ConstantInt *En = cast<ConstantInt>(II->getArgOperand(1));
3708321369Sdim    unsigned EnBits = En->getZExtValue();
3709321369Sdim    if (EnBits == 0xf)
3710321369Sdim      break; // All inputs enabled.
3711321369Sdim
3712353358Sdim    bool IsCompr = IID == Intrinsic::amdgcn_exp_compr;
3713321369Sdim    bool Changed = false;
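    // Operands whose channels are all disabled by the enable mask are ignored
    // by the export, so replace them with undef.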
3714321369Sdim    for (int I = 0; I < (IsCompr ? 2 : 4); ++I) {
3715321369Sdim      if ((!IsCompr && (EnBits & (1 << I)) == 0) ||
3716321369Sdim          (IsCompr && ((EnBits & (0x3 << (2 * I))) == 0))) {
3717321369Sdim        Value *Src = II->getArgOperand(I + 2);
3718321369Sdim        if (!isa<UndefValue>(Src)) {
3719321369Sdim          II->setArgOperand(I + 2, UndefValue::get(Src->getType()));
3720321369Sdim          Changed = true;
3721321369Sdim        }
3722321369Sdim      }
3723321369Sdim    }
3724321369Sdim
3725321369Sdim    if (Changed)
3726321369Sdim      return II;
3727321369Sdim
3728321369Sdim    break;
3729321369Sdim  }
3730321369Sdim  case Intrinsic::amdgcn_fmed3: {
3731321369Sdim    // Note this does not preserve proper sNaN behavior if IEEE-mode is enabled
3732321369Sdim    // for the shader.
3733321369Sdim
3734321369Sdim    Value *Src0 = II->getArgOperand(0);
3735321369Sdim    Value *Src1 = II->getArgOperand(1);
3736321369Sdim    Value *Src2 = II->getArgOperand(2);
3737321369Sdim
3738341825Sdim    // Checking for NaN before canonicalization provides better fidelity when
3739341825Sdim    // mapping other operations onto fmed3 since the order of operands is
3740341825Sdim    // unchanged.
3741341825Sdim    CallInst *NewCall = nullptr;
3742341825Sdim    if (match(Src0, m_NaN()) || isa<UndefValue>(Src0)) {
3743341825Sdim      NewCall = Builder.CreateMinNum(Src1, Src2);
3744341825Sdim    } else if (match(Src1, m_NaN()) || isa<UndefValue>(Src1)) {
3745341825Sdim      NewCall = Builder.CreateMinNum(Src0, Src2);
3746341825Sdim    } else if (match(Src2, m_NaN()) || isa<UndefValue>(Src2)) {
3747341825Sdim      NewCall = Builder.CreateMaxNum(Src0, Src1);
3748341825Sdim    }
3749341825Sdim
3750341825Sdim    if (NewCall) {
3751341825Sdim      NewCall->copyFastMathFlags(II);
3752341825Sdim      NewCall->takeName(II);
3753341825Sdim      return replaceInstUsesWith(*II, NewCall);
3754341825Sdim    }
3755341825Sdim
3756321369Sdim    bool Swap = false;
3757321369Sdim    // Canonicalize constants to RHS operands.
3758321369Sdim    //
3759321369Sdim    // fmed3(c0, x, c1) -> fmed3(x, c0, c1)
3760321369Sdim    if (isa<Constant>(Src0) && !isa<Constant>(Src1)) {
3761321369Sdim      std::swap(Src0, Src1);
3762321369Sdim      Swap = true;
3763321369Sdim    }
3764321369Sdim
3765321369Sdim    if (isa<Constant>(Src1) && !isa<Constant>(Src2)) {
3766321369Sdim      std::swap(Src1, Src2);
3767321369Sdim      Swap = true;
3768321369Sdim    }
3769321369Sdim
3770321369Sdim    if (isa<Constant>(Src0) && !isa<Constant>(Src1)) {
3771321369Sdim      std::swap(Src0, Src1);
3772321369Sdim      Swap = true;
3773321369Sdim    }
3774321369Sdim
3775321369Sdim    if (Swap) {
3776321369Sdim      II->setArgOperand(0, Src0);
3777321369Sdim      II->setArgOperand(1, Src1);
3778321369Sdim      II->setArgOperand(2, Src2);
3779321369Sdim      return II;
3780321369Sdim    }
3781321369Sdim
3782321369Sdim    if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Src0)) {
3783321369Sdim      if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Src1)) {
3784321369Sdim        if (const ConstantFP *C2 = dyn_cast<ConstantFP>(Src2)) {
3785321369Sdim          APFloat Result = fmed3AMDGCN(C0->getValueAPF(), C1->getValueAPF(),
3786321369Sdim                                       C2->getValueAPF());
3787321369Sdim          return replaceInstUsesWith(*II,
3788321369Sdim            ConstantFP::get(Builder.getContext(), Result));
3789321369Sdim        }
3790321369Sdim      }
3791321369Sdim    }
3792321369Sdim
3793321369Sdim    break;
3794321369Sdim  }
3795321369Sdim  case Intrinsic::amdgcn_icmp:
3796321369Sdim  case Intrinsic::amdgcn_fcmp: {
3797353358Sdim    const ConstantInt *CC = cast<ConstantInt>(II->getArgOperand(2));
3798321369Sdim    // Guard against invalid arguments.
3799321369Sdim    int64_t CCVal = CC->getZExtValue();
3800353358Sdim    bool IsInteger = IID == Intrinsic::amdgcn_icmp;
3801321369Sdim    if ((IsInteger && (CCVal < CmpInst::FIRST_ICMP_PREDICATE ||
3802321369Sdim                       CCVal > CmpInst::LAST_ICMP_PREDICATE)) ||
3803321369Sdim        (!IsInteger && (CCVal < CmpInst::FIRST_FCMP_PREDICATE ||
3804321369Sdim                        CCVal > CmpInst::LAST_FCMP_PREDICATE)))
3805321369Sdim      break;
3806321369Sdim
3807321369Sdim    Value *Src0 = II->getArgOperand(0);
3808321369Sdim    Value *Src1 = II->getArgOperand(1);
3809321369Sdim
3810321369Sdim    if (auto *CSrc0 = dyn_cast<Constant>(Src0)) {
3811321369Sdim      if (auto *CSrc1 = dyn_cast<Constant>(Src1)) {
3812321369Sdim        Constant *CCmp = ConstantExpr::getCompare(CCVal, CSrc0, CSrc1);
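        // A comparison that folds to false yields an all-zero mask regardless
        // of EXEC; a comparison that folds to true becomes a read of EXEC
        // below.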
3813321369Sdim        if (CCmp->isNullValue()) {
3814321369Sdim          return replaceInstUsesWith(
3815321369Sdim              *II, ConstantExpr::getSExt(CCmp, II->getType()));
3816321369Sdim        }
3817321369Sdim
3818321369Sdim        // The result of V_ICMP/V_FCMP assembly instructions (which this
3819321369Sdim        // intrinsic exposes) is one bit per thread, masked with the EXEC
3820321369Sdim        // register (which contains the bitmask of live threads). So a
3821321369Sdim        // comparison that always returns true is the same as a read of the
3822321369Sdim        // EXEC register.
3823353358Sdim        Function *NewF = Intrinsic::getDeclaration(
3824321369Sdim            II->getModule(), Intrinsic::read_register, II->getType());
3825321369Sdim        Metadata *MDArgs[] = {MDString::get(II->getContext(), "exec")};
3826321369Sdim        MDNode *MD = MDNode::get(II->getContext(), MDArgs);
3827321369Sdim        Value *Args[] = {MetadataAsValue::get(II->getContext(), MD)};
3828321369Sdim        CallInst *NewCall = Builder.CreateCall(NewF, Args);
3829321369Sdim        NewCall->addAttribute(AttributeList::FunctionIndex,
3830321369Sdim                              Attribute::Convergent);
3831321369Sdim        NewCall->takeName(II);
3832321369Sdim        return replaceInstUsesWith(*II, NewCall);
3833321369Sdim      }
3834321369Sdim
3835321369Sdim      // Canonicalize constants to RHS.
3836321369Sdim      CmpInst::Predicate SwapPred
3837321369Sdim        = CmpInst::getSwappedPredicate(static_cast<CmpInst::Predicate>(CCVal));
3838321369Sdim      II->setArgOperand(0, Src1);
3839321369Sdim      II->setArgOperand(1, Src0);
3840321369Sdim      II->setArgOperand(2, ConstantInt::get(CC->getType(),
3841321369Sdim                                            static_cast<int>(SwapPred)));
3842321369Sdim      return II;
3843321369Sdim    }
3844321369Sdim
3845321369Sdim    if (CCVal != CmpInst::ICMP_EQ && CCVal != CmpInst::ICMP_NE)
3846321369Sdim      break;
3847321369Sdim
3848321369Sdim    // Canonicalize compare eq with true value to compare != 0
3849321369Sdim    // llvm.amdgcn.icmp(zext (i1 x), 1, eq)
3850321369Sdim    //   -> llvm.amdgcn.icmp(zext (i1 x), 0, ne)
3851321369Sdim    // llvm.amdgcn.icmp(sext (i1 x), -1, eq)
3852321369Sdim    //   -> llvm.amdgcn.icmp(sext (i1 x), 0, ne)
3853321369Sdim    Value *ExtSrc;
3854321369Sdim    if (CCVal == CmpInst::ICMP_EQ &&
3855321369Sdim        ((match(Src1, m_One()) && match(Src0, m_ZExt(m_Value(ExtSrc)))) ||
3856321369Sdim         (match(Src1, m_AllOnes()) && match(Src0, m_SExt(m_Value(ExtSrc))))) &&
3857321369Sdim        ExtSrc->getType()->isIntegerTy(1)) {
3858321369Sdim      II->setArgOperand(1, ConstantInt::getNullValue(Src1->getType()));
3859321369Sdim      II->setArgOperand(2, ConstantInt::get(CC->getType(), CmpInst::ICMP_NE));
3860321369Sdim      return II;
3861321369Sdim    }
3862321369Sdim
3863321369Sdim    CmpInst::Predicate SrcPred;
3864321369Sdim    Value *SrcLHS;
3865321369Sdim    Value *SrcRHS;
3866321369Sdim
3867321369Sdim    // Fold compare eq/ne with 0 from a compare result as the predicate to the
3868321369Sdim    // intrinsic. The typical use is a wave vote function in the library, which
3869321369Sdim    // will be fed from a user code condition compared with 0. Fold in the
3870321369Sdim    // redundant compare.
3871321369Sdim
3872321369Sdim    // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, ne)
3873321369Sdim    //   -> llvm.amdgcn.[if]cmp(a, b, pred)
3874321369Sdim    //
3875321369Sdim    // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, eq)
3876321369Sdim    //   -> llvm.amdgcn.[if]cmp(a, b, inv pred)
3877321369Sdim    if (match(Src1, m_Zero()) &&
3878321369Sdim        match(Src0,
3879321369Sdim              m_ZExtOrSExt(m_Cmp(SrcPred, m_Value(SrcLHS), m_Value(SrcRHS))))) {
3880321369Sdim      if (CCVal == CmpInst::ICMP_EQ)
3881321369Sdim        SrcPred = CmpInst::getInversePredicate(SrcPred);
3882321369Sdim
3883321369Sdim      Intrinsic::ID NewIID = CmpInst::isFPPredicate(SrcPred) ?
3884321369Sdim        Intrinsic::amdgcn_fcmp : Intrinsic::amdgcn_icmp;
3885321369Sdim
3886344779Sdim      Type *Ty = SrcLHS->getType();
3887344779Sdim      if (auto *CmpType = dyn_cast<IntegerType>(Ty)) {
3888344779Sdim        // Promote to next legal integer type.
3889344779Sdim        unsigned Width = CmpType->getBitWidth();
3890344779Sdim        unsigned NewWidth = Width;
3891344779Sdim
3892344779Sdim        // Don't do anything for i1 comparisons.
3893344779Sdim        if (Width == 1)
3894344779Sdim          break;
3895344779Sdim
3896344779Sdim        if (Width <= 16)
3897344779Sdim          NewWidth = 16;
3898344779Sdim        else if (Width <= 32)
3899344779Sdim          NewWidth = 32;
3900344779Sdim        else if (Width <= 64)
3901344779Sdim          NewWidth = 64;
3902344779Sdim        else if (Width > 64)
3903344779Sdim          break; // Can't handle this.
3904344779Sdim
3905344779Sdim        if (Width != NewWidth) {
3906344779Sdim          IntegerType *CmpTy = Builder.getIntNTy(NewWidth);
3907344779Sdim          if (CmpInst::isSigned(SrcPred)) {
3908344779Sdim            SrcLHS = Builder.CreateSExt(SrcLHS, CmpTy);
3909344779Sdim            SrcRHS = Builder.CreateSExt(SrcRHS, CmpTy);
3910344779Sdim          } else {
3911344779Sdim            SrcLHS = Builder.CreateZExt(SrcLHS, CmpTy);
3912344779Sdim            SrcRHS = Builder.CreateZExt(SrcRHS, CmpTy);
3913344779Sdim          }
3914344779Sdim        }
3915344779Sdim      } else if (!Ty->isFloatTy() && !Ty->isDoubleTy() && !Ty->isHalfTy())
3916344779Sdim        break;
3917344779Sdim
3918353358Sdim      Function *NewF =
3919353358Sdim          Intrinsic::getDeclaration(II->getModule(), NewIID,
3920353358Sdim                                    { II->getType(),
3921353358Sdim                                      SrcLHS->getType() });
3922321369Sdim      Value *Args[] = { SrcLHS, SrcRHS,
3923321369Sdim                        ConstantInt::get(CC->getType(), SrcPred) };
3924321369Sdim      CallInst *NewCall = Builder.CreateCall(NewF, Args);
3925321369Sdim      NewCall->takeName(II);
3926321369Sdim      return replaceInstUsesWith(*II, NewCall);
3927321369Sdim    }
3928321369Sdim
3929321369Sdim    break;
3930321369Sdim  }
3931327952Sdim  case Intrinsic::amdgcn_wqm_vote: {
3932327952Sdim    // wqm_vote is identity when the argument is constant.
3933327952Sdim    if (!isa<Constant>(II->getArgOperand(0)))
3934327952Sdim      break;
3935327952Sdim
3936327952Sdim    return replaceInstUsesWith(*II, II->getArgOperand(0));
3937327952Sdim  }
3938327952Sdim  case Intrinsic::amdgcn_kill: {
3939327952Sdim    const ConstantInt *C = dyn_cast<ConstantInt>(II->getArgOperand(0));
3940327952Sdim    if (!C || !C->getZExtValue())
3941327952Sdim      break;
3942327952Sdim
3943327952Sdim    // amdgcn.kill(i1 1) is a no-op
3944327952Sdim    return eraseInstFromFunction(CI);
3945327952Sdim  }
3946341825Sdim  case Intrinsic::amdgcn_update_dpp: {
3947341825Sdim    Value *Old = II->getArgOperand(0);
3948341825Sdim
3949353358Sdim    auto BC = cast<ConstantInt>(II->getArgOperand(5));
3950353358Sdim    auto RM = cast<ConstantInt>(II->getArgOperand(3));
3951353358Sdim    auto BM = cast<ConstantInt>(II->getArgOperand(4));
3952353358Sdim    if (BC->isZeroValue() ||
3953341825Sdim        RM->getZExtValue() != 0xF ||
3954341825Sdim        BM->getZExtValue() != 0xF ||
3955341825Sdim        isa<UndefValue>(Old))
3956341825Sdim      break;
3957341825Sdim
3958341825Sdim    // If bound_ctrl = 1 and row mask = bank mask = 0xf, we can omit the old value.
3959341825Sdim    II->setOperand(0, UndefValue::get(Old->getType()));
3960341825Sdim    return II;
3961341825Sdim  }
3962353358Sdim  case Intrinsic::amdgcn_readfirstlane:
3963353358Sdim  case Intrinsic::amdgcn_readlane: {
3964353358Sdim    // A constant value is trivially uniform.
3965353358Sdim    if (Constant *C = dyn_cast<Constant>(II->getArgOperand(0)))
3966353358Sdim      return replaceInstUsesWith(*II, C);
3967353358Sdim
3968353358Sdim    // The remaining folds may not be safe if the exec mask differs between the
3969353358Sdim    // def and the use.
3970353358Sdim    Value *Src = II->getArgOperand(0);
3971353358Sdim    Instruction *SrcInst = dyn_cast<Instruction>(Src);
3972353358Sdim    if (SrcInst && SrcInst->getParent() != II->getParent())
3973353358Sdim      break;
3974353358Sdim
3975353358Sdim    // readfirstlane (readfirstlane x) -> readfirstlane x
3976353358Sdim    // readlane (readfirstlane x), y -> readfirstlane x
3977353358Sdim    if (match(Src, m_Intrinsic<Intrinsic::amdgcn_readfirstlane>()))
3978353358Sdim      return replaceInstUsesWith(*II, Src);
3979353358Sdim
3980353358Sdim    if (IID == Intrinsic::amdgcn_readfirstlane) {
3981353358Sdim      // readfirstlane (readlane x, y) -> readlane x, y
3982353358Sdim      if (match(Src, m_Intrinsic<Intrinsic::amdgcn_readlane>()))
3983353358Sdim        return replaceInstUsesWith(*II, Src);
3984353358Sdim    } else {
3985353358Sdim      // readlane (readlane x, y), y -> readlane x, y
3986353358Sdim      if (match(Src, m_Intrinsic<Intrinsic::amdgcn_readlane>(
3987353358Sdim                  m_Value(), m_Specific(II->getArgOperand(1)))))
3988353358Sdim        return replaceInstUsesWith(*II, Src);
3989353358Sdim    }
3990353358Sdim
3991353358Sdim    break;
3992353358Sdim  }
3993202375Srdivacky  case Intrinsic::stackrestore: {
3994202375Srdivacky    // If the save is right next to the restore, remove the restore.  This can
3995202375Srdivacky    // happen when variable allocas are DCE'd.
3996210299Sed    if (IntrinsicInst *SS = dyn_cast<IntrinsicInst>(II->getArgOperand(0))) {
3997202375Srdivacky      if (SS->getIntrinsicID() == Intrinsic::stacksave) {
3998341825Sdim        // Skip over debug info.
3999341825Sdim        if (SS->getNextNonDebugInstruction() == II) {
4000309124Sdim          return eraseInstFromFunction(CI);
4001341825Sdim        }
4002202375Srdivacky      }
4003202375Srdivacky    }
4004234353Sdim
4005202375Srdivacky    // Scan down this block to see if there is another stack restore in the
4006202375Srdivacky    // same block without an intervening call/alloca.
4007296417Sdim    BasicBlock::iterator BI(II);
4008344779Sdim    Instruction *TI = II->getParent()->getTerminator();
4009202375Srdivacky    bool CannotRemove = false;
4010202375Srdivacky    for (++BI; &*BI != TI; ++BI) {
4011239462Sdim      if (isa<AllocaInst>(BI)) {
4012202375Srdivacky        CannotRemove = true;
4013202375Srdivacky        break;
4014202375Srdivacky      }
4015202375Srdivacky      if (CallInst *BCI = dyn_cast<CallInst>(BI)) {
4016353358Sdim        if (auto *II2 = dyn_cast<IntrinsicInst>(BCI)) {
4017202375Srdivacky          // If there is a stackrestore below this one, remove this one.
4018353358Sdim          if (II2->getIntrinsicID() == Intrinsic::stackrestore)
4019309124Sdim            return eraseInstFromFunction(CI);
4020309124Sdim
4021309124Sdim          // Bail if we cross over an intrinsic with side effects, such as
4022360784Sdim          // llvm.stacksave or llvm.read_register.
4023353358Sdim          if (II2->mayHaveSideEffects()) {
4024309124Sdim            CannotRemove = true;
4025309124Sdim            break;
4026309124Sdim          }
4027202375Srdivacky        } else {
4028202375Srdivacky          // If we found a non-intrinsic call, we can't remove the stack
4029202375Srdivacky          // restore.
4030202375Srdivacky          CannotRemove = true;
4031202375Srdivacky          break;
4032202375Srdivacky        }
4033202375Srdivacky      }
4034202375Srdivacky    }
4035234353Sdim
4036226633Sdim    // If the stack restore is in a return or resume block and there are no
4037226633Sdim    // allocas or calls between the restore and the return, nuke the restore.
4039234353Sdim    if (!CannotRemove && (isa<ReturnInst>(TI) || isa<ResumeInst>(TI)))
4040309124Sdim      return eraseInstFromFunction(CI);
4041202375Srdivacky    break;
4042202375Srdivacky  }
4043309124Sdim  case Intrinsic::lifetime_start:
4044314564Sdim    // Asan needs to poison memory to detect invalid accesses, which is possible
4045314564Sdim    // even for an empty lifetime range.
4046327952Sdim    if (II->getFunction()->hasFnAttribute(Attribute::SanitizeAddress) ||
4047360784Sdim        II->getFunction()->hasFnAttribute(Attribute::SanitizeMemory) ||
4048327952Sdim        II->getFunction()->hasFnAttribute(Attribute::SanitizeHWAddress))
4049314564Sdim      break;
4050314564Sdim
4051309124Sdim    if (removeTriviallyEmptyRange(*II, Intrinsic::lifetime_start,
4052309124Sdim                                  Intrinsic::lifetime_end, *this))
4053309124Sdim      return nullptr;
4054296417Sdim    break;
4055280031Sdim  case Intrinsic::assume: {
4056309124Sdim    Value *IIOperand = II->getArgOperand(0);
4057341825Sdim    // Remove an assume if it is followed by an identical assume.
4058341825Sdim    // TODO: Do we need this? Unless there are conflicting assumptions, the
4059341825Sdim    // computeKnownBits(IIOperand) below here eliminates redundant assumes.
4060341825Sdim    Instruction *Next = II->getNextNonDebugInstruction();
4061341825Sdim    if (match(Next, m_Intrinsic<Intrinsic::assume>(m_Specific(IIOperand))))
4062309124Sdim      return eraseInstFromFunction(CI);
4063309124Sdim
4064280031Sdim    // Canonicalize assume(a && b) -> assume(a); assume(b);
4065280031Sdim    // Note: New assumption intrinsics created here are registered by
4066280031Sdim    // the InstCombineIRInserter object.
4067353358Sdim    FunctionType *AssumeIntrinsicTy = II->getFunctionType();
4068353358Sdim    Value *AssumeIntrinsic = II->getCalledValue();
4069353358Sdim    Value *A, *B;
4070280031Sdim    if (match(IIOperand, m_And(m_Value(A), m_Value(B)))) {
4071353358Sdim      Builder.CreateCall(AssumeIntrinsicTy, AssumeIntrinsic, A, II->getName());
4072353358Sdim      Builder.CreateCall(AssumeIntrinsicTy, AssumeIntrinsic, B, II->getName());
4073309124Sdim      return eraseInstFromFunction(*II);
4074280031Sdim    }
4075280031Sdim    // assume(!(a || b)) -> assume(!a); assume(!b);
4076280031Sdim    if (match(IIOperand, m_Not(m_Or(m_Value(A), m_Value(B))))) {
4077353358Sdim      Builder.CreateCall(AssumeIntrinsicTy, AssumeIntrinsic,
4078353358Sdim                         Builder.CreateNot(A), II->getName());
4079353358Sdim      Builder.CreateCall(AssumeIntrinsicTy, AssumeIntrinsic,
4080353358Sdim                         Builder.CreateNot(B), II->getName());
4081309124Sdim      return eraseInstFromFunction(*II);
4082280031Sdim    }
4083280031Sdim
4084280031Sdim    // assume( (load addr) != null ) -> add 'nonnull' metadata to load
4085280031Sdim    // (if assume is valid at the load)
4086314564Sdim    CmpInst::Predicate Pred;
4087314564Sdim    Instruction *LHS;
4088314564Sdim    if (match(IIOperand, m_ICmp(Pred, m_Instruction(LHS), m_Zero())) &&
4089314564Sdim        Pred == ICmpInst::ICMP_NE && LHS->getOpcode() == Instruction::Load &&
4090314564Sdim        LHS->getType()->isPointerTy() &&
4091314564Sdim        isValidAssumeForContext(II, LHS, &DT)) {
4092314564Sdim      MDNode *MD = MDNode::get(II->getContext(), None);
4093314564Sdim      LHS->setMetadata(LLVMContext::MD_nonnull, MD);
4094314564Sdim      return eraseInstFromFunction(*II);
4095314564Sdim
4096280031Sdim      // TODO: apply nonnull return attributes to calls and invokes
4097280031Sdim      // TODO: apply range metadata for range check patterns?
4098280031Sdim    }
4099314564Sdim
4100280031Sdim    // If there is a dominating assume with the same condition as this one,
4101280031Sdim    // then this one is redundant, and should be removed.
4102321369Sdim    KnownBits Known(1);
4103321369Sdim    computeKnownBits(IIOperand, Known, 0, II);
4104321369Sdim    if (Known.isAllOnes())
4105309124Sdim      return eraseInstFromFunction(*II);
4106280031Sdim
4107314564Sdim    // Update the cache of affected values for this assumption (we might be
4108314564Sdim    // here because we just simplified the condition).
4109314564Sdim    AC.updateAffectedValues(II);
4110280031Sdim    break;
4111202375Srdivacky  }
4112280031Sdim  case Intrinsic::experimental_gc_relocate: {
4113360784Sdim    auto &GCR = *cast<GCRelocateInst>(II);
4114360784Sdim
4115360784Sdim    // If we have two copies of the same pointer in the statepoint argument
4116360784Sdim    // list, canonicalize to one.  This may let us common gc.relocates.
4117360784Sdim    if (GCR.getBasePtr() == GCR.getDerivedPtr() &&
4118360784Sdim        GCR.getBasePtrIndex() != GCR.getDerivedPtrIndex()) {
4119360784Sdim      auto *OpIntTy = GCR.getOperand(2)->getType();
4120360784Sdim      II->setOperand(2, ConstantInt::get(OpIntTy, GCR.getBasePtrIndex()));
4121360784Sdim      return II;
4122360784Sdim    }
4123360784Sdim
4124280031Sdim    // Translate facts known about a pointer before relocating into
4125280031Sdim    // facts about the relocate value, while being careful to
4126280031Sdim    // preserve relocation semantics.
4127360784Sdim    Value *DerivedPtr = GCR.getDerivedPtr();
4128202375Srdivacky
4129280031Sdim    // Remove the relocation if unused, note that this check is required
4130280031Sdim    // to prevent the cases below from looping forever.
4131280031Sdim    if (II->use_empty())
4132309124Sdim      return eraseInstFromFunction(*II);
4133280031Sdim
4134280031Sdim    // Undef is undef, even after relocation.
4135280031Sdim    // TODO: provide a hook for this in GCStrategy.  This is clearly legal for
4136280031Sdim    // most practical collectors, but there was discussion in the review thread
4137280031Sdim    // about whether it was legal for all possible collectors.
4138309124Sdim    if (isa<UndefValue>(DerivedPtr))
4139309124Sdim      // Use undef of gc_relocate's type to replace it.
4140309124Sdim      return replaceInstUsesWith(*II, UndefValue::get(II->getType()));
4141280031Sdim
4142309124Sdim    if (auto *PT = dyn_cast<PointerType>(II->getType())) {
4143309124Sdim      // The relocation of null will be null for most any collector.
4144309124Sdim      // TODO: provide a hook for this in GCStrategy.  There might be some
4145309124Sdim      // weird collector this property does not hold for.
4146309124Sdim      if (isa<ConstantPointerNull>(DerivedPtr))
4147309124Sdim        // Use null-pointer of gc_relocate's type to replace it.
4148309124Sdim        return replaceInstUsesWith(*II, ConstantPointerNull::get(PT));
4149280031Sdim
4150309124Sdim      // isKnownNonNull -> nonnull attribute
4151344779Sdim      if (!II->hasRetAttr(Attribute::NonNull) &&
4152344779Sdim          isKnownNonZero(DerivedPtr, DL, 0, &AC, II, &DT)) {
4153321369Sdim        II->addAttribute(AttributeList::ReturnIndex, Attribute::NonNull);
4154344779Sdim        return II;
4155344779Sdim      }
4156288943Sdim    }
4157280031Sdim
4158280031Sdim    // TODO: bitcast(relocate(p)) -> relocate(bitcast(p))
4159280031Sdim    // Canonicalize on the type from the uses to the defs
4160288943Sdim
4161280031Sdim    // TODO: relocate((gep p, C, C2, ...)) -> gep(relocate(p), C, C2, ...)
4162309124Sdim    break;
4163280031Sdim  }
4164321369Sdim
4165321369Sdim  case Intrinsic::experimental_guard: {
4166341825Sdim    // Is this guard followed by another guard?  We scan forward over a small
4167341825Sdim    // fixed window of instructions to handle common cases with conditions
4168341825Sdim    // computed between guards.
4169360784Sdim    Instruction *NextInst = II->getNextNonDebugInstruction();
4170341825Sdim    for (unsigned i = 0; i < GuardWideningWindow; i++) {
4171341825Sdim      // Note: Using context-free form to avoid compile time blow up
4172341825Sdim      if (!isSafeToSpeculativelyExecute(NextInst))
4173341825Sdim        break;
4174360784Sdim      NextInst = NextInst->getNextNonDebugInstruction();
4175341825Sdim    }
4176321369Sdim    Value *NextCond = nullptr;
4177321369Sdim    if (match(NextInst,
4178321369Sdim              m_Intrinsic<Intrinsic::experimental_guard>(m_Value(NextCond)))) {
4179321369Sdim      Value *CurrCond = II->getArgOperand(0);
4180321369Sdim
4181321369Sdim      // Remove a guard if it is immediately preceded by an identical guard.
4182321369Sdim      // Otherwise canonicalize guard(a); guard(b) -> guard(a & b).
4183360784Sdim      if (CurrCond != NextCond) {
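        // Hoist the (already verified speculatable) instructions between the
        // two guards above the first one so that NextCond is available there,
        // then AND the two conditions together.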
4184360784Sdim        Instruction *MoveI = II->getNextNonDebugInstruction();
4185360784Sdim        while (MoveI != NextInst) {
4186360784Sdim          auto *Temp = MoveI;
4187360784Sdim          MoveI = MoveI->getNextNonDebugInstruction();
4188360784Sdim          Temp->moveBefore(II);
4189360784Sdim        }
4190360784Sdim        II->setArgOperand(0, Builder.CreateAnd(CurrCond, NextCond));
4191341825Sdim      }
4192360784Sdim      eraseInstFromFunction(*NextInst);
4193360784Sdim      return II;
4194321369Sdim    }
4195321369Sdim    break;
4196280031Sdim  }
4197321369Sdim  }
4198353358Sdim  return visitCallBase(*II);
4199202375Srdivacky}
4200202375Srdivacky
4201321369Sdim// Fence instruction simplification
4202321369SdimInstruction *InstCombiner::visitFenceInst(FenceInst &FI) {
4203321369Sdim  // Remove identical consecutive fences.
4204341825Sdim  Instruction *Next = FI.getNextNonDebugInstruction();
4205341825Sdim  if (auto *NFI = dyn_cast<FenceInst>(Next))
4206321369Sdim    if (FI.isIdenticalTo(NFI))
4207321369Sdim      return eraseInstFromFunction(FI);
4208321369Sdim  return nullptr;
4209321369Sdim}
4210321369Sdim
4211202375Srdivacky// InvokeInst simplification
4212202375SrdivackyInstruction *InstCombiner::visitInvokeInst(InvokeInst &II) {
4213353358Sdim  return visitCallBase(II);
4214202375Srdivacky}
4215202375Srdivacky
4216353358Sdim// CallBrInst simplification
4217353358SdimInstruction *InstCombiner::visitCallBrInst(CallBrInst &CBI) {
4218353358Sdim  return visitCallBase(CBI);
4219353358Sdim}
4220353358Sdim
4221309124Sdim/// If this cast does not affect the value passed through the varargs area, we
4222309124Sdim/// can eliminate the use of the cast.
4223353358Sdimstatic bool isSafeToEliminateVarargsCast(const CallBase &Call,
4224288943Sdim                                         const DataLayout &DL,
4225288943Sdim                                         const CastInst *const CI,
4226202375Srdivacky                                         const int ix) {
4227202375Srdivacky  if (!CI->isLosslessCast())
4228202375Srdivacky    return false;
4229202375Srdivacky
4230280031Sdim  // If this is a GC intrinsic, avoid munging types.  We need types for
4231280031Sdim  // statepoint reconstruction in SelectionDAG.
4232280031Sdim  // TODO: This is probably something which should be expanded to all
4233280031Sdim  // intrinsics since the entire point of intrinsics is that
4234280031Sdim  // they are understandable by the optimizer.
4235353358Sdim  if (isStatepoint(&Call) || isGCRelocate(&Call) || isGCResult(&Call))
4236280031Sdim    return false;
4237280031Sdim
4238276479Sdim  // The size of ByVal or InAlloca arguments is derived from the type, so we
4239202375Srdivacky  // can't change to a type with a different size.  If the size were
4240202375Srdivacky  // passed explicitly we could avoid this check.
4241353358Sdim  if (!Call.isByValOrInAllocaArgument(ix))
4242202375Srdivacky    return true;
4243202375Srdivacky
4244234353Sdim  Type* SrcTy =
4245202375Srdivacky            cast<PointerType>(CI->getOperand(0)->getType())->getElementType();
4246353358Sdim  Type *DstTy = Call.isByValArgument(ix)
4247353358Sdim                    ? Call.getParamByValType(ix)
4248353358Sdim                    : cast<PointerType>(CI->getType())->getElementType();
4249202375Srdivacky  if (!SrcTy->isSized() || !DstTy->isSized())
4250202375Srdivacky    return false;
4251288943Sdim  if (DL.getTypeAllocSize(SrcTy) != DL.getTypeAllocSize(DstTy))
4252202375Srdivacky    return false;
4253202375Srdivacky  return true;
4254202375Srdivacky}
4255202375Srdivacky
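/// Try to simplify a call to a library function via LibCallSimplifier.
/// Returns a non-null instruction if the call was folded, or null otherwise.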
4256288943SdimInstruction *InstCombiner::tryOptimizeCall(CallInst *CI) {
4257276479Sdim  if (!CI->getCalledFunction()) return nullptr;
4258204961Srdivacky
4259288943Sdim  auto InstCombineRAUW = [this](Instruction *From, Value *With) {
4260309124Sdim    replaceInstUsesWith(*From, With);
4261288943Sdim  };
4262344779Sdim  auto InstCombineErase = [this](Instruction *I) {
4263344779Sdim    eraseInstFromFunction(*I);
4264344779Sdim  };
4265353358Sdim  LibCallSimplifier Simplifier(DL, &TLI, ORE, BFI, PSI, InstCombineRAUW,
4266344779Sdim                               InstCombineErase);
4267288943Sdim  if (Value *With = Simplifier.optimizeCall(CI)) {
4268249423Sdim    ++NumSimplified;
4269309124Sdim    return CI->use_empty() ? CI : replaceInstUsesWith(*CI, With);
4270249423Sdim  }
4271243830Sdim
4272276479Sdim  return nullptr;
4273204961Srdivacky}
4274204961Srdivacky
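/// Find the unique llvm.init.trampoline that writes to the alloca behind
/// TrampMem.  Returns null if TrampMem is not backed by a simple alloca or is
/// used by anything other than trampoline intrinsics.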
4275309124Sdimstatic IntrinsicInst *findInitTrampolineFromAlloca(Value *TrampMem) {
4276226633Sdim  // Strip off at most one level of pointer casts, looking for an alloca.  This
4277226633Sdim  // is good enough in practice and simpler than handling any number of casts.
4278226633Sdim  Value *Underlying = TrampMem->stripPointerCasts();
4279226633Sdim  if (Underlying != TrampMem &&
4280276479Sdim      (!Underlying->hasOneUse() || Underlying->user_back() != TrampMem))
4281276479Sdim    return nullptr;
4282226633Sdim  if (!isa<AllocaInst>(Underlying))
4283276479Sdim    return nullptr;
4284226633Sdim
4285276479Sdim  IntrinsicInst *InitTrampoline = nullptr;
4286276479Sdim  for (User *U : TrampMem->users()) {
4287276479Sdim    IntrinsicInst *II = dyn_cast<IntrinsicInst>(U);
4288226633Sdim    if (!II)
4289276479Sdim      return nullptr;
4290226633Sdim    if (II->getIntrinsicID() == Intrinsic::init_trampoline) {
4291226633Sdim      if (InitTrampoline)
4292226633Sdim        // More than one init_trampoline writes to this value.  Give up.
4293276479Sdim        return nullptr;
4294226633Sdim      InitTrampoline = II;
4295226633Sdim      continue;
4296226633Sdim    }
4297226633Sdim    if (II->getIntrinsicID() == Intrinsic::adjust_trampoline)
4298226633Sdim      // Allow any number of calls to adjust.trampoline.
4299226633Sdim      continue;
4300276479Sdim    return nullptr;
4301226633Sdim  }
4302226633Sdim
4303226633Sdim  // No call to init.trampoline found.
4304226633Sdim  if (!InitTrampoline)
4305276479Sdim    return nullptr;
4306226633Sdim
4307226633Sdim  // Check that the alloca is being used in the expected way.
4308226633Sdim  if (InitTrampoline->getOperand(0) != TrampMem)
4309276479Sdim    return nullptr;
4310226633Sdim
4311226633Sdim  return InitTrampoline;
4312226633Sdim}
4313226633Sdim
4314309124Sdimstatic IntrinsicInst *findInitTrampolineFromBB(IntrinsicInst *AdjustTramp,
4315226633Sdim                                               Value *TrampMem) {
4316226633Sdim  // Visit all the previous instructions in the basic block, and try to find a
4317226633Sdim  // init.trampoline which has a direct path to the adjust.trampoline.
4318296417Sdim  for (BasicBlock::iterator I = AdjustTramp->getIterator(),
4319296417Sdim                            E = AdjustTramp->getParent()->begin();
4320296417Sdim       I != E;) {
4321296417Sdim    Instruction *Inst = &*--I;
4322226633Sdim    if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst))
4323226633Sdim      if (II->getIntrinsicID() == Intrinsic::init_trampoline &&
4324226633Sdim          II->getOperand(0) == TrampMem)
4325226633Sdim        return II;
4326226633Sdim    if (Inst->mayWriteToMemory())
4327276479Sdim      return nullptr;
4328226633Sdim  }
4329276479Sdim  return nullptr;
4330226633Sdim}
4331226633Sdim
4332226633Sdim// Given a call to llvm.adjust.trampoline, find and return the corresponding
4333226633Sdim// call to llvm.init.trampoline if the call to the trampoline can be optimized
4334226633Sdim// to a direct call to a function.  Otherwise return NULL.
4335309124Sdimstatic IntrinsicInst *findInitTrampoline(Value *Callee) {
4336226633Sdim  Callee = Callee->stripPointerCasts();
4337226633Sdim  IntrinsicInst *AdjustTramp = dyn_cast<IntrinsicInst>(Callee);
4338226633Sdim  if (!AdjustTramp ||
4339226633Sdim      AdjustTramp->getIntrinsicID() != Intrinsic::adjust_trampoline)
4340276479Sdim    return nullptr;
4341226633Sdim
4342226633Sdim  Value *TrampMem = AdjustTramp->getOperand(0);
4343226633Sdim
4344309124Sdim  if (IntrinsicInst *IT = findInitTrampolineFromAlloca(TrampMem))
4345226633Sdim    return IT;
4346309124Sdim  if (IntrinsicInst *IT = findInitTrampolineFromBB(AdjustTramp, TrampMem))
4347226633Sdim    return IT;
4348276479Sdim  return nullptr;
4349226633Sdim}
4350226633Sdim
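/// If this is a call to a recognized allocation function, add dereferenceable
/// or dereferenceable_or_null attributes to the returned pointer based on the
/// constant allocation size, when it can be determined.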
4351360784Sdimstatic void annotateAnyAllocSite(CallBase &Call, const TargetLibraryInfo *TLI) {
4352360784Sdim  unsigned NumArgs = Call.getNumArgOperands();
4353360784Sdim  ConstantInt *Op0C = dyn_cast<ConstantInt>(Call.getOperand(0));
4354360784Sdim  ConstantInt *Op1C =
4355360784Sdim      (NumArgs == 1) ? nullptr : dyn_cast<ConstantInt>(Call.getOperand(1));
4356360784Sdim  // Bail out if the allocation size is zero.
4357360784Sdim  if ((Op0C && Op0C->isNullValue()) || (Op1C && Op1C->isNullValue()))
4358360784Sdim    return;
4359360784Sdim
4360360784Sdim  if (isMallocLikeFn(&Call, TLI) && Op0C) {
4361360784Sdim    if (isOpNewLikeFn(&Call, TLI))
4362360784Sdim      Call.addAttribute(AttributeList::ReturnIndex,
4363360784Sdim                        Attribute::getWithDereferenceableBytes(
4364360784Sdim                            Call.getContext(), Op0C->getZExtValue()));
4365360784Sdim    else
4366360784Sdim      Call.addAttribute(AttributeList::ReturnIndex,
4367360784Sdim                        Attribute::getWithDereferenceableOrNullBytes(
4368360784Sdim                            Call.getContext(), Op0C->getZExtValue()));
4369360784Sdim  } else if (isReallocLikeFn(&Call, TLI) && Op1C) {
4370360784Sdim    Call.addAttribute(AttributeList::ReturnIndex,
4371360784Sdim                      Attribute::getWithDereferenceableOrNullBytes(
4372360784Sdim                          Call.getContext(), Op1C->getZExtValue()));
4373360784Sdim  } else if (isCallocLikeFn(&Call, TLI) && Op0C && Op1C) {
4374360784Sdim    bool Overflow;
4375360784Sdim    const APInt &N = Op0C->getValue();
4376360784Sdim    APInt Size = N.umul_ov(Op1C->getValue(), Overflow);
4377360784Sdim    if (!Overflow)
4378360784Sdim      Call.addAttribute(AttributeList::ReturnIndex,
4379360784Sdim                        Attribute::getWithDereferenceableOrNullBytes(
4380360784Sdim                            Call.getContext(), Size.getZExtValue()));
4381360784Sdim  } else if (isStrdupLikeFn(&Call, TLI)) {
4382360784Sdim    uint64_t Len = GetStringLength(Call.getOperand(0));
4383360784Sdim    if (Len) {
4384360784Sdim      // strdup
4385360784Sdim      if (NumArgs == 1)
4386360784Sdim        Call.addAttribute(AttributeList::ReturnIndex,
4387360784Sdim                          Attribute::getWithDereferenceableOrNullBytes(
4388360784Sdim                              Call.getContext(), Len));
4389360784Sdim      // strndup
4390360784Sdim      else if (NumArgs == 2 && Op1C)
4391360784Sdim        Call.addAttribute(
4392360784Sdim            AttributeList::ReturnIndex,
4393360784Sdim            Attribute::getWithDereferenceableOrNullBytes(
4394360784Sdim                Call.getContext(), std::min(Len, Op1C->getZExtValue() + 1)));
4395360784Sdim    }
4396360784Sdim  }
4397360784Sdim}
4398360784Sdim
4399353358Sdim/// Improvements for call, callbr and invoke instructions.
4400353358SdimInstruction *InstCombiner::visitCallBase(CallBase &Call) {
4401360784Sdim  if (isAllocationFn(&Call, &TLI))
4402360784Sdim    annotateAnyAllocSite(Call, &TLI);
4403239462Sdim
4404202375Srdivacky  bool Changed = false;
4405202375Srdivacky
4406288943Sdim  // Mark any parameters that are known to be non-null with the nonnull
4407288943Sdim  // attribute.  This is helpful for inlining calls to functions with null
4408288943Sdim  // checks on their arguments.
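  // For example (illustrative): if %p is already known to be non-null at this
  // point (say, because it was dereferenced earlier on every path here), then
  //   call void @use(i8* %p)
  // becomes
  //   call void @use(i8* nonnull %p)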
4409321369Sdim  SmallVector<unsigned, 4> ArgNos;
4410288943Sdim  unsigned ArgNo = 0;
4411296417Sdim
4412353358Sdim  for (Value *V : Call.args()) {
4413309124Sdim    if (V->getType()->isPointerTy() &&
4414353358Sdim        !Call.paramHasAttr(ArgNo, Attribute::NonNull) &&
4415353358Sdim        isKnownNonZero(V, DL, 0, &AC, &Call, &DT))
4416321369Sdim      ArgNos.push_back(ArgNo);
4417288943Sdim    ArgNo++;
4418288943Sdim  }
4419296417Sdim
4420353358Sdim  assert(ArgNo == Call.arg_size() && "sanity check");
4421288943Sdim
4422321369Sdim  if (!ArgNos.empty()) {
4423353358Sdim    AttributeList AS = Call.getAttributes();
4424353358Sdim    LLVMContext &Ctx = Call.getContext();
4425321369Sdim    AS = AS.addParamAttribute(Ctx, ArgNos,
4426321369Sdim                              Attribute::get(Ctx, Attribute::NonNull));
4427353358Sdim    Call.setAttributes(AS);
4428296417Sdim    Changed = true;
4429296417Sdim  }
4430296417Sdim
4431218893Sdim  // If the callee is a pointer to a function, attempt to move any casts to the
4432353358Sdim  // arguments of the call/callbr/invoke.
4433353358Sdim  Value *Callee = Call.getCalledValue();
4434353358Sdim  if (!isa<Function>(Callee) && transformConstExprCastCall(Call))
4435276479Sdim    return nullptr;
4436202375Srdivacky
4437309124Sdim  if (Function *CalleeF = dyn_cast<Function>(Callee)) {
4438309124Sdim    // Remove the convergent attr on calls when the callee is not convergent.
4439353358Sdim    if (Call.isConvergent() && !CalleeF->isConvergent() &&
4440309124Sdim        !CalleeF->isIntrinsic()) {
4441353358Sdim      LLVM_DEBUG(dbgs() << "Removing convergent attr from instr " << Call
4442353358Sdim                        << "\n");
4443353358Sdim      Call.setNotConvergent();
4444353358Sdim      return &Call;
4445309124Sdim    }
4446309124Sdim
4447203954Srdivacky    // If the call and callee calling conventions don't match, this call must
4448203954Srdivacky    // be unreachable, as the call is undefined.
4449353358Sdim    if (CalleeF->getCallingConv() != Call.getCallingConv() &&
4450203954Srdivacky        // Only do this for calls to a function with a body.  A prototype may
4451203954Srdivacky        // not actually end up matching the implementation's calling conv for a
4452203954Srdivacky        // variety of reasons (e.g. it may be written in assembly).
4453203954Srdivacky        !CalleeF->isDeclaration()) {
4454353358Sdim      Instruction *OldCall = &Call;
4455353358Sdim      CreateNonTerminatorUnreachable(OldCall);
4456249423Sdim      // If OldCall does not return void then replaceAllUsesWith undef.
4457202375Srdivacky      // This allows ValueHandlers and custom metadata to adjust themselves.
4458202375Srdivacky      if (!OldCall->getType()->isVoidTy())
4459309124Sdim        replaceInstUsesWith(*OldCall, UndefValue::get(OldCall->getType()));
4460203954Srdivacky      if (isa<CallInst>(OldCall))
4461309124Sdim        return eraseInstFromFunction(*OldCall);
4462234353Sdim
4463353358Sdim      // We cannot remove an invoke or a callbr, because it would change the
4464353358Sdim      // CFG; just change the callee to a null pointer.
4465353358Sdim      cast<CallBase>(OldCall)->setCalledFunction(
4466353358Sdim          CalleeF->getFunctionType(),
4467353358Sdim          Constant::getNullValue(CalleeF->getType()));
4468276479Sdim      return nullptr;
4469202375Srdivacky    }
4470309124Sdim  }
4471202375Srdivacky
4472341825Sdim  if ((isa<ConstantPointerNull>(Callee) &&
4473353358Sdim       !NullPointerIsDefined(Call.getFunction())) ||
4474341825Sdim      isa<UndefValue>(Callee)) {
4475353358Sdim    // If Call does not return void then replaceAllUsesWith undef.
4476202375Srdivacky    // This allows ValueHandlers and custom metadata to adjust themselves.
4477353358Sdim    if (!Call.getType()->isVoidTy())
4478353358Sdim      replaceInstUsesWith(Call, UndefValue::get(Call.getType()));
4479202375Srdivacky
4480353358Sdim    if (Call.isTerminator()) {
4481353358Sdim      // Can't remove an invoke or callbr because we cannot change the CFG.
4482276479Sdim      return nullptr;
4483202375Srdivacky    }
4484239462Sdim
4485353358Sdim    // This instruction is not reachable, just remove it.
4486353358Sdim    CreateNonTerminatorUnreachable(&Call);
4487353358Sdim    return eraseInstFromFunction(Call);
4488202375Srdivacky  }
4489202375Srdivacky
4490309124Sdim  if (IntrinsicInst *II = findInitTrampoline(Callee))
4491353358Sdim    return transformCallThroughTrampoline(Call, *II);
4492202375Srdivacky
4493226633Sdim  PointerType *PTy = cast<PointerType>(Callee->getType());
4494226633Sdim  FunctionType *FTy = cast<FunctionType>(PTy->getElementType());
4495202375Srdivacky  if (FTy->isVarArg()) {
4496234353Sdim    int ix = FTy->getNumParams();
4497202375Srdivacky    // See if we can optimize any arguments passed through the varargs area of
4498202375Srdivacky    // the call.
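    // For example (illustrative), a cast that does not change how the value
    // is passed in the varargs area:
    //   %c = bitcast i32* %p to i8*
    //   call void (i8*, ...) @vfn(i8* %fixed, i8* %c)
    // can pass %p directly once the cast is known to be safe to eliminate.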
4499353358Sdim    for (auto I = Call.arg_begin() + FTy->getNumParams(), E = Call.arg_end();
4500353358Sdim         I != E; ++I, ++ix) {
4501202375Srdivacky      CastInst *CI = dyn_cast<CastInst>(*I);
4502353358Sdim      if (CI && isSafeToEliminateVarargsCast(Call, DL, CI, ix)) {
4503202375Srdivacky        *I = CI->getOperand(0);
4504353358Sdim
4505353358Sdim        // Update the byval type to match the argument type.
4506353358Sdim        if (Call.isByValArgument(ix)) {
4507353358Sdim          Call.removeParamAttr(ix, Attribute::ByVal);
4508353358Sdim          Call.addParamAttr(
4509353358Sdim              ix, Attribute::getWithByValType(
4510353358Sdim                      Call.getContext(),
4511353358Sdim                      CI->getOperand(0)->getType()->getPointerElementType()));
4512353358Sdim        }
4513202375Srdivacky        Changed = true;
4514202375Srdivacky      }
4515202375Srdivacky    }
4516202375Srdivacky  }
4517202375Srdivacky
4518353358Sdim  if (isa<InlineAsm>(Callee) && !Call.doesNotThrow()) {
4519202375Srdivacky    // Inline asm calls cannot throw - mark them 'nounwind'.
4520353358Sdim    Call.setDoesNotThrow();
4521202375Srdivacky    Changed = true;
4522202375Srdivacky  }
4523202375Srdivacky
4524243830Sdim  // Try to optimize the call if possible; we require DataLayout for most of
4525204961Srdivacky  // this.  None of these calls are seen as possibly dead, so go ahead and
4526204961Srdivacky  // delete the instruction now.
4527353358Sdim  if (CallInst *CI = dyn_cast<CallInst>(&Call)) {
4528288943Sdim    Instruction *I = tryOptimizeCall(CI);
4529204961Srdivacky    // If we changed something return the result, etc. Otherwise let
4530204961Srdivacky    // the fallthrough check.
4531309124Sdim    if (I) return eraseInstFromFunction(*I);
4532204961Srdivacky  }
4533204961Srdivacky
4534360784Sdim  if (isAllocLikeFn(&Call, &TLI))
4535360784Sdim    return visitAllocSite(Call);
4536360784Sdim
4537353358Sdim  return Changed ? &Call : nullptr;
4538202375Srdivacky}
4539202375Srdivacky
4540309124Sdim/// If the callee is a constexpr cast of a function, attempt to move the cast to
4541353358Sdim/// the arguments of the call/callbr/invoke.
4542353358Sdimbool InstCombiner::transformConstExprCastCall(CallBase &Call) {
4543353358Sdim  auto *Callee = dyn_cast<Function>(Call.getCalledValue()->stripPointerCasts());
4544276479Sdim  if (!Callee)
4545202375Srdivacky    return false;
4546314564Sdim
4547341825Sdim  // If this is a call to a thunk function, don't remove the cast. Thunks are
4548341825Sdim  // used to transparently forward all incoming parameters and outgoing return
4549341825Sdim  // values, so it's important to leave the cast in place.
4550280031Sdim  if (Callee->hasFnAttribute("thunk"))
4551280031Sdim    return false;
4552314564Sdim
4553341825Sdim  // If this is a musttail call, the callee's prototype must match the caller's
4554341825Sdim  // prototype with the exception of pointee types. The code below doesn't
4555341825Sdim  // implement that, so we can't do this transform.
4556341825Sdim  // TODO: Do the transform if it only requires adding pointer casts.
4557353358Sdim  if (Call.isMustTailCall())
4558341825Sdim    return false;
4559341825Sdim
4560353358Sdim  Instruction *Caller = &Call;
4561353358Sdim  const AttributeList &CallerPAL = Call.getAttributes();
4562202375Srdivacky
4563202375Srdivacky  // Okay, this is a cast from a function to a different type.  Unless doing so
4564202375Srdivacky  // would cause a type conversion of one of our arguments, change this call to
4565202375Srdivacky  // be a direct call with arguments casted to the appropriate types.
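  // For example (illustrative IR), a call through a cast of the callee such as
  //   %v = call i8* bitcast (i32* (i32)* @f to i8* (i32)*)(i32 42)
  // can become a direct call plus a cast of the result:
  //   %c = call i32* @f(i32 42)
  //   %v = bitcast i32* %c to i8*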
4566226633Sdim  FunctionType *FT = Callee->getFunctionType();
4567226633Sdim  Type *OldRetTy = Caller->getType();
4568226633Sdim  Type *NewRetTy = FT->getReturnType();
4569202375Srdivacky
4570202375Srdivacky  // Check to see if we are changing the return type...
4571202375Srdivacky  if (OldRetTy != NewRetTy) {
4572276479Sdim
4573276479Sdim    if (NewRetTy->isStructTy())
4574276479Sdim      return false; // TODO: Handle multiple return values.
4575276479Sdim
4576280031Sdim    if (!CastInst::isBitOrNoopPointerCastable(NewRetTy, OldRetTy, DL)) {
4577261991Sdim      if (Callee->isDeclaration())
4578261991Sdim        return false;   // Cannot transform this return value.
4579202375Srdivacky
4580261991Sdim      if (!Caller->use_empty() &&
4581261991Sdim          // void -> non-void is handled specially
4582261991Sdim          !NewRetTy->isVoidTy())
4583280031Sdim        return false;   // Cannot transform this return value.
4584261991Sdim    }
4585202375Srdivacky
4586202375Srdivacky    if (!CallerPAL.isEmpty() && !Caller->use_empty()) {
4587321369Sdim      AttrBuilder RAttrs(CallerPAL, AttributeList::ReturnIndex);
4588288943Sdim      if (RAttrs.overlaps(AttributeFuncs::typeIncompatible(NewRetTy)))
4589202375Srdivacky        return false;   // Attribute not compatible with transformed value.
4590202375Srdivacky    }
4591202375Srdivacky
4592353358Sdim    // If the callbase is an invoke/callbr instruction, and the return value is
4593353358Sdim    // used by a PHI node in a successor, we cannot change the return type of
4594353358Sdim    // the call because there is no place to put the cast instruction (without
4595353358Sdim    // breaking the critical edge).  Bail out in this case.
4596353358Sdim    if (!Caller->use_empty()) {
4597202375Srdivacky      if (InvokeInst *II = dyn_cast<InvokeInst>(Caller))
4598276479Sdim        for (User *U : II->users())
4599276479Sdim          if (PHINode *PN = dyn_cast<PHINode>(U))
4600202375Srdivacky            if (PN->getParent() == II->getNormalDest() ||
4601202375Srdivacky                PN->getParent() == II->getUnwindDest())
4602202375Srdivacky              return false;
4603353358Sdim      // FIXME: Be conservative for callbr to avoid a quadratic search.
4604353358Sdim      if (isa<CallBrInst>(Caller))
4605353358Sdim        return false;
4606353358Sdim    }
4607202375Srdivacky  }
4608202375Srdivacky
4609353358Sdim  unsigned NumActualArgs = Call.arg_size();
4610202375Srdivacky  unsigned NumCommonArgs = std::min(FT->getNumParams(), NumActualArgs);
4611202375Srdivacky
4612280031Sdim  // Prevent us turning:
4613280031Sdim  // declare void @takes_i32_inalloca(i32* inalloca)
4614280031Sdim  //  call void bitcast (void (i32*)* @takes_i32_inalloca to void (i32)*)(i32 0)
4615280031Sdim  //
4616280031Sdim  // into:
4617280031Sdim  //  call void @takes_i32_inalloca(i32* null)
4618288943Sdim  //
4619288943Sdim  //  Similarly, avoid folding away bitcasts of byval calls.
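  //  Both attributes carry pointee-type-dependent ABI semantics, so dropping
  //  the cast could change how the argument is actually passed; the check
  //  below is deliberately conservative.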
4620288943Sdim  if (Callee->getAttributes().hasAttrSomewhere(Attribute::InAlloca) ||
4621288943Sdim      Callee->getAttributes().hasAttrSomewhere(Attribute::ByVal))
4622280031Sdim    return false;
4623280031Sdim
4624353358Sdim  auto AI = Call.arg_begin();
4625202375Srdivacky  for (unsigned i = 0, e = NumCommonArgs; i != e; ++i, ++AI) {
4626226633Sdim    Type *ParamTy = FT->getParamType(i);
4627226633Sdim    Type *ActTy = (*AI)->getType();
4628202375Srdivacky
4629280031Sdim    if (!CastInst::isBitOrNoopPointerCastable(ActTy, ParamTy, DL))
4630202375Srdivacky      return false;   // Cannot transform this parameter value.
4631202375Srdivacky
4632321369Sdim    if (AttrBuilder(CallerPAL.getParamAttributes(i))
4633321369Sdim            .overlaps(AttributeFuncs::typeIncompatible(ParamTy)))
4634202375Srdivacky      return false;   // Attribute not compatible with transformed value.
4635234353Sdim
4636353358Sdim    if (Call.isInAllocaArgument(i))
4637276479Sdim      return false;   // Cannot transform to and from inalloca.
4638276479Sdim
4639218893Sdim    // If the parameter is passed as a byval argument, then we have to have a
4640218893Sdim    // sized type and the sized type has to have the same size as the old type.
4641321369Sdim    if (ParamTy != ActTy && CallerPAL.hasParamAttribute(i, Attribute::ByVal)) {
4642226633Sdim      PointerType *ParamPTy = dyn_cast<PointerType>(ParamTy);
4643288943Sdim      if (!ParamPTy || !ParamPTy->getElementType()->isSized())
4644218893Sdim        return false;
4645234353Sdim
4646353358Sdim      Type *CurElTy = Call.getParamByValType(i);
4647288943Sdim      if (DL.getTypeAllocSize(CurElTy) !=
4648288943Sdim          DL.getTypeAllocSize(ParamPTy->getElementType()))
4649218893Sdim        return false;
4650218893Sdim    }
4651202375Srdivacky  }
4652202375Srdivacky
4653219077Sdim  if (Callee->isDeclaration()) {
4654219077Sdim    // Do not delete arguments unless we have a function body.
4655219077Sdim    if (FT->getNumParams() < NumActualArgs && !FT->isVarArg())
4656219077Sdim      return false;
4657202375Srdivacky
4658219077Sdim    // If the callee is just a declaration, don't change the varargsness of the
4659219077Sdim    // call.  We don't want to introduce a varargs call where one doesn't
4660219077Sdim    // already exist.
4661353358Sdim    PointerType *APTy = cast<PointerType>(Call.getCalledValue()->getType());
4662219077Sdim    if (FT->isVarArg()!=cast<FunctionType>(APTy->getElementType())->isVarArg())
4663219077Sdim      return false;
4664234353Sdim
4665234353Sdim    // If both the callee and the cast type are varargs, we still have to make
4666234353Sdim    // sure the number of fixed parameters are the same or we have the same
4667234353Sdim    // ABI issues as if we introduce a varargs call.
4668234353Sdim    if (FT->isVarArg() &&
4669234353Sdim        cast<FunctionType>(APTy->getElementType())->isVarArg() &&
4670234353Sdim        FT->getNumParams() !=
4671234353Sdim        cast<FunctionType>(APTy->getElementType())->getNumParams())
4672234353Sdim      return false;
4673219077Sdim  }
4674234353Sdim
4675202375Srdivacky  if (FT->getNumParams() < NumActualArgs && FT->isVarArg() &&
4676321369Sdim      !CallerPAL.isEmpty()) {
4677202375Srdivacky    // In this case we have more arguments than the new function type, but we
4678202375Srdivacky    // won't be dropping them.  Check that these extra arguments have attributes
4679202375Srdivacky    // that are compatible with being a vararg call argument.
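    // In particular, an sret attribute must apply to one of the fixed
    // parameters; if its index points past them, it would land on an argument
    // in the varargs area, which we do not allow.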
4680321369Sdim    unsigned SRetIdx;
4681321369Sdim    if (CallerPAL.hasAttrSomewhere(Attribute::StructRet, &SRetIdx) &&
4682321369Sdim        SRetIdx > FT->getNumParams())
4683321369Sdim      return false;
4684321369Sdim  }
4685249423Sdim
4686202375Srdivacky  // Okay, we decided that this is a safe thing to do: go ahead and start
4687219077Sdim  // inserting cast instructions as necessary.
4688321369Sdim  SmallVector<Value *, 8> Args;
4689321369Sdim  SmallVector<AttributeSet, 8> ArgAttrs;
4690202375Srdivacky  Args.reserve(NumActualArgs);
4691321369Sdim  ArgAttrs.reserve(NumActualArgs);
4692202375Srdivacky
4693202375Srdivacky  // Get any return attributes.
4694321369Sdim  AttrBuilder RAttrs(CallerPAL, AttributeList::ReturnIndex);
4695202375Srdivacky
4696202375Srdivacky  // If the return value is not being used, the type may not be compatible
4697202375Srdivacky  // with the existing attributes.  Wipe out any problematic attributes.
4698288943Sdim  RAttrs.remove(AttributeFuncs::typeIncompatible(NewRetTy));
4699202375Srdivacky
4700353358Sdim  LLVMContext &Ctx = Call.getContext();
4701353358Sdim  AI = Call.arg_begin();
4702202375Srdivacky  for (unsigned i = 0; i != NumCommonArgs; ++i, ++AI) {
4703226633Sdim    Type *ParamTy = FT->getParamType(i);
4704261991Sdim
4705321369Sdim    Value *NewArg = *AI;
4706321369Sdim    if ((*AI)->getType() != ParamTy)
4707321369Sdim      NewArg = Builder.CreateBitOrPointerCast(*AI, ParamTy);
4708321369Sdim    Args.push_back(NewArg);
4709202375Srdivacky
4710202375Srdivacky    // Add any parameter attributes.
4711353358Sdim    if (CallerPAL.hasParamAttribute(i, Attribute::ByVal)) {
4712353358Sdim      AttrBuilder AB(CallerPAL.getParamAttributes(i));
4713353358Sdim      AB.addByValAttr(NewArg->getType()->getPointerElementType());
4714353358Sdim      ArgAttrs.push_back(AttributeSet::get(Ctx, AB));
4715353358Sdim    } else
4716353358Sdim      ArgAttrs.push_back(CallerPAL.getParamAttributes(i));
4717202375Srdivacky  }
4718202375Srdivacky
4719202375Srdivacky  // If the function takes more arguments than the call was taking, add them
4720202375Srdivacky  // now.
4721321369Sdim  for (unsigned i = NumCommonArgs; i != FT->getNumParams(); ++i) {
4722202375Srdivacky    Args.push_back(Constant::getNullValue(FT->getParamType(i)));
4723321369Sdim    ArgAttrs.push_back(AttributeSet());
4724321369Sdim  }
4725202375Srdivacky
4726202375Srdivacky  // Handle calls that pass more arguments than the function type declares.
4727202375Srdivacky  if (FT->getNumParams() < NumActualArgs) {
4728249423Sdim    // TODO: if (!FT->isVarArg()) this call may be unreachable. PR14722
4729249423Sdim    if (FT->isVarArg()) {
4730202375Srdivacky      // Add all of the arguments in their promoted form to the arg list.
4731202375Srdivacky      for (unsigned i = FT->getNumParams(); i != NumActualArgs; ++i, ++AI) {
4732226633Sdim        Type *PTy = getPromotedType((*AI)->getType());
4733321369Sdim        Value *NewArg = *AI;
4734202375Srdivacky        if (PTy != (*AI)->getType()) {
4735202375Srdivacky          // Must promote to pass through va_arg area!
4736202375Srdivacky          Instruction::CastOps opcode =
4737202375Srdivacky            CastInst::getCastOpcode(*AI, false, PTy, false);
4738321369Sdim          NewArg = Builder.CreateCast(opcode, *AI, PTy);
4739202375Srdivacky        }
4740321369Sdim        Args.push_back(NewArg);
4741202375Srdivacky
4742202375Srdivacky        // Add any parameter attributes.
4743321369Sdim        ArgAttrs.push_back(CallerPAL.getParamAttributes(i));
4744202375Srdivacky      }
4745202375Srdivacky    }
4746202375Srdivacky  }
4747202375Srdivacky
4748249423Sdim  AttributeSet FnAttrs = CallerPAL.getFnAttributes();
4749202375Srdivacky
4750202375Srdivacky  if (NewRetTy->isVoidTy())
4751202375Srdivacky    Caller->setName("");   // Void type should not have a name.
4752202375Srdivacky
4753321369Sdim  assert((ArgAttrs.size() == FT->getNumParams() || FT->isVarArg()) &&
4754321369Sdim         "missing argument attributes");
4755321369Sdim  AttributeList NewCallerPAL = AttributeList::get(
4756321369Sdim      Ctx, FnAttrs, AttributeSet::get(Ctx, RAttrs), ArgAttrs);
4757202375Srdivacky
4758296417Sdim  SmallVector<OperandBundleDef, 1> OpBundles;
4759353358Sdim  Call.getOperandBundlesAsDefs(OpBundles);
4760296417Sdim
4761353358Sdim  CallBase *NewCall;
4762202375Srdivacky  if (InvokeInst *II = dyn_cast<InvokeInst>(Caller)) {
4763353358Sdim    NewCall = Builder.CreateInvoke(Callee, II->getNormalDest(),
4764353358Sdim                                   II->getUnwindDest(), Args, OpBundles);
4765353358Sdim  } else if (CallBrInst *CBI = dyn_cast<CallBrInst>(Caller)) {
4766353358Sdim    NewCall = Builder.CreateCallBr(Callee, CBI->getDefaultDest(),
4767353358Sdim                                   CBI->getIndirectDests(), Args, OpBundles);
4768202375Srdivacky  } else {
4769353358Sdim    NewCall = Builder.CreateCall(Callee, Args, OpBundles);
4770353358Sdim    cast<CallInst>(NewCall)->setTailCallKind(
4771353358Sdim        cast<CallInst>(Caller)->getTailCallKind());
4772202375Srdivacky  }
4773353358Sdim  NewCall->takeName(Caller);
4774353358Sdim  NewCall->setCallingConv(Call.getCallingConv());
4775353358Sdim  NewCall->setAttributes(NewCallerPAL);
4776202375Srdivacky
4777321369Sdim  // Preserve the weight metadata for the new call instruction. The metadata
4778321369Sdim  // is used by SamplePGO to check callsite's hotness.
4779321369Sdim  uint64_t W;
4780321369Sdim  if (Caller->extractProfTotalWeight(W))
4781353358Sdim    NewCall->setProfWeight(W);
4782321369Sdim
4783202375Srdivacky  // Insert a cast of the return type as necessary.
4784353358Sdim  Instruction *NC = NewCall;
4785202375Srdivacky  Value *NV = NC;
4786202375Srdivacky  if (OldRetTy != NV->getType() && !Caller->use_empty()) {
4787202375Srdivacky    if (!NV->getType()->isVoidTy()) {
4788280031Sdim      NV = NC = CastInst::CreateBitOrPointerCast(NC, OldRetTy);
4789223017Sdim      NC->setDebugLoc(Caller->getDebugLoc());
4790202375Srdivacky
4791353358Sdim      // If this is an invoke/callbr instruction, we should insert it after the
4792353358Sdim      // first non-phi instruction in the normal successor block.
4793202375Srdivacky      if (InvokeInst *II = dyn_cast<InvokeInst>(Caller)) {
4794226633Sdim        BasicBlock::iterator I = II->getNormalDest()->getFirstInsertionPt();
4795202375Srdivacky        InsertNewInstBefore(NC, *I);
4796353358Sdim      } else if (CallBrInst *CBI = dyn_cast<CallBrInst>(Caller)) {
4797353358Sdim        BasicBlock::iterator I = CBI->getDefaultDest()->getFirstInsertionPt();
4798353358Sdim        InsertNewInstBefore(NC, *I);
4799202375Srdivacky      } else {
4800218893Sdim        // Otherwise, it's a call; just insert the cast right after the call.
4801202375Srdivacky        InsertNewInstBefore(NC, *Caller);
4802202375Srdivacky      }
4803202375Srdivacky      Worklist.AddUsersToWorkList(*Caller);
4804202375Srdivacky    } else {
4805202375Srdivacky      NV = UndefValue::get(Caller->getType());
4806202375Srdivacky    }
4807202375Srdivacky  }
4808202375Srdivacky
4809202375Srdivacky  if (!Caller->use_empty())
4810309124Sdim    replaceInstUsesWith(*Caller, NV);
4811280031Sdim  else if (Caller->hasValueHandle()) {
4812280031Sdim    if (OldRetTy == NV->getType())
4813280031Sdim      ValueHandleBase::ValueIsRAUWd(Caller, NV);
4814280031Sdim    else
4815280031Sdim      // We cannot call ValueIsRAUWd with a different type, and the
4816280031Sdim      // actual tracked value will disappear.
4817280031Sdim      ValueHandleBase::ValueIsDeleted(Caller);
4818280031Sdim  }
4819223017Sdim
4820309124Sdim  eraseInstFromFunction(*Caller);
4821202375Srdivacky  return true;
4822202375Srdivacky}
4823202375Srdivacky
4824309124Sdim/// Turn a call to a function created by init_trampoline / adjust_trampoline
4825309124Sdim/// intrinsic pair into a direct call to the underlying function.
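/// For example (illustrative), a call through the pointer produced by
/// llvm.adjust.trampoline becomes a direct call to the trampoline's target
/// function, with the static chain value from llvm.init.trampoline passed in
/// the target's 'nest' parameter (when it has one).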
4826226633SdimInstruction *
4827353358SdimInstCombiner::transformCallThroughTrampoline(CallBase &Call,
4828353358Sdim                                             IntrinsicInst &Tramp) {
4829353358Sdim  Value *Callee = Call.getCalledValue();
4830353358Sdim  Type *CalleeTy = Callee->getType();
4831353358Sdim  FunctionType *FTy = Call.getFunctionType();
4832353358Sdim  AttributeList Attrs = Call.getAttributes();
4833202375Srdivacky
4834202375Srdivacky  // If the call already has the 'nest' attribute somewhere then give up -
4835202375Srdivacky  // otherwise 'nest' would occur twice after splicing in the chain.
4836249423Sdim  if (Attrs.hasAttrSomewhere(Attribute::Nest))
4837276479Sdim    return nullptr;
4838202375Srdivacky
4839353358Sdim  Function *NestF = cast<Function>(Tramp.getArgOperand(1)->stripPointerCasts());
4840353358Sdim  FunctionType *NestFTy = NestF->getFunctionType();
4841202375Srdivacky
4842321369Sdim  AttributeList NestAttrs = NestF->getAttributes();
4843202375Srdivacky  if (!NestAttrs.isEmpty()) {
4844321369Sdim    unsigned NestArgNo = 0;
4845276479Sdim    Type *NestTy = nullptr;
4846249423Sdim    AttributeSet NestAttr;
4847202375Srdivacky
4848202375Srdivacky    // Look for a parameter marked with the 'nest' attribute.
4849202375Srdivacky    for (FunctionType::param_iterator I = NestFTy->param_begin(),
4850321369Sdim                                      E = NestFTy->param_end();
4851321369Sdim         I != E; ++NestArgNo, ++I) {
4852321369Sdim      AttributeSet AS = NestAttrs.getParamAttributes(NestArgNo);
4853321369Sdim      if (AS.hasAttribute(Attribute::Nest)) {
4854202375Srdivacky        // Record the parameter type and any other attributes.
4855202375Srdivacky        NestTy = *I;
4856321369Sdim        NestAttr = AS;
4857202375Srdivacky        break;
4858202375Srdivacky      }
4859321369Sdim    }
4860202375Srdivacky
4861202375Srdivacky    if (NestTy) {
4862202375Srdivacky      std::vector<Value*> NewArgs;
4863321369Sdim      std::vector<AttributeSet> NewArgAttrs;
4864353358Sdim      NewArgs.reserve(Call.arg_size() + 1);
4865353358Sdim      NewArgAttrs.reserve(Call.arg_size());
4866202375Srdivacky
4867202375Srdivacky      // Insert the nest argument into the call argument list, which may
4868202375Srdivacky      // mean appending it.  Likewise for attributes.
4869202375Srdivacky
4870202375Srdivacky      {
4871321369Sdim        unsigned ArgNo = 0;
4872353358Sdim        auto I = Call.arg_begin(), E = Call.arg_end();
4873202375Srdivacky        do {
4874321369Sdim          if (ArgNo == NestArgNo) {
4875202375Srdivacky            // Add the chain argument and attributes.
4876353358Sdim            Value *NestVal = Tramp.getArgOperand(2);
4877202375Srdivacky            if (NestVal->getType() != NestTy)
4878321369Sdim              NestVal = Builder.CreateBitCast(NestVal, NestTy, "nest");
4879202375Srdivacky            NewArgs.push_back(NestVal);
4880321369Sdim            NewArgAttrs.push_back(NestAttr);
4881202375Srdivacky          }
4882202375Srdivacky
4883202375Srdivacky          if (I == E)
4884202375Srdivacky            break;
4885202375Srdivacky
4886202375Srdivacky          // Add the original argument and attributes.
4887202375Srdivacky          NewArgs.push_back(*I);
4888321369Sdim          NewArgAttrs.push_back(Attrs.getParamAttributes(ArgNo));
4889202375Srdivacky
4890321369Sdim          ++ArgNo;
4891309124Sdim          ++I;
4892314564Sdim        } while (true);
4893202375Srdivacky      }
4894202375Srdivacky
4895202375Srdivacky      // The trampoline may have been bitcast to a bogus type (FTy).
4896202375Srdivacky      // Handle this by synthesizing a new function type, equal to FTy
4897202375Srdivacky      // with the chain parameter inserted.
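      // For example (illustrative), if FTy is `i32 (i32)` and the 'nest'
      // parameter comes first, the synthesized type is `i32 (i8*, i32)`
      // (assuming an i8* chain type).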
4898202375Srdivacky
4899224145Sdim      std::vector<Type*> NewTypes;
4900202375Srdivacky      NewTypes.reserve(FTy->getNumParams()+1);
4901202375Srdivacky
4902202375Srdivacky      // Insert the chain's type into the list of parameter types, which may
4903202375Srdivacky      // mean appending it.
4904202375Srdivacky      {
4905321369Sdim        unsigned ArgNo = 0;
4906202375Srdivacky        FunctionType::param_iterator I = FTy->param_begin(),
4907202375Srdivacky          E = FTy->param_end();
4908202375Srdivacky
4909202375Srdivacky        do {
4910321369Sdim          if (ArgNo == NestArgNo)
4911202375Srdivacky            // Add the chain's type.
4912202375Srdivacky            NewTypes.push_back(NestTy);
4913202375Srdivacky
4914202375Srdivacky          if (I == E)
4915202375Srdivacky            break;
4916202375Srdivacky
4917202375Srdivacky          // Add the original type.
4918202375Srdivacky          NewTypes.push_back(*I);
4919202375Srdivacky
4920321369Sdim          ++ArgNo;
4921309124Sdim          ++I;
4922314564Sdim        } while (true);
4923202375Srdivacky      }
4924202375Srdivacky
4925202375Srdivacky      // Replace the trampoline call with a direct call.  Let the generic
4926202375Srdivacky      // code sort out any function type mismatches.
4927234353Sdim      FunctionType *NewFTy = FunctionType::get(FTy->getReturnType(), NewTypes,
4928202375Srdivacky                                                FTy->isVarArg());
4929202375Srdivacky      Constant *NewCallee =
4930202375Srdivacky        NestF->getType() == PointerType::getUnqual(NewFTy) ?
4931234353Sdim        NestF : ConstantExpr::getBitCast(NestF,
4932202375Srdivacky                                         PointerType::getUnqual(NewFTy));
4933321369Sdim      AttributeList NewPAL =
4934321369Sdim          AttributeList::get(FTy->getContext(), Attrs.getFnAttributes(),
4935321369Sdim                             Attrs.getRetAttributes(), NewArgAttrs);
4936202375Srdivacky
4937309124Sdim      SmallVector<OperandBundleDef, 1> OpBundles;
4938353358Sdim      Call.getOperandBundlesAsDefs(OpBundles);
4939309124Sdim
4940202375Srdivacky      Instruction *NewCaller;
4941353358Sdim      if (InvokeInst *II = dyn_cast<InvokeInst>(&Call)) {
4942353358Sdim        NewCaller = InvokeInst::Create(NewFTy, NewCallee,
4943202375Srdivacky                                       II->getNormalDest(), II->getUnwindDest(),
4944309124Sdim                                       NewArgs, OpBundles);
4945202375Srdivacky        cast<InvokeInst>(NewCaller)->setCallingConv(II->getCallingConv());
4946202375Srdivacky        cast<InvokeInst>(NewCaller)->setAttributes(NewPAL);
4947353358Sdim      } else if (CallBrInst *CBI = dyn_cast<CallBrInst>(&Call)) {
4948353358Sdim        NewCaller =
4949353358Sdim            CallBrInst::Create(NewFTy, NewCallee, CBI->getDefaultDest(),
4950353358Sdim                               CBI->getIndirectDests(), NewArgs, OpBundles);
4951353358Sdim        cast<CallBrInst>(NewCaller)->setCallingConv(CBI->getCallingConv());
4952353358Sdim        cast<CallBrInst>(NewCaller)->setAttributes(NewPAL);
4953202375Srdivacky      } else {
4954353358Sdim        NewCaller = CallInst::Create(NewFTy, NewCallee, NewArgs, OpBundles);
4955314564Sdim        cast<CallInst>(NewCaller)->setTailCallKind(
4956353358Sdim            cast<CallInst>(Call).getTailCallKind());
4957314564Sdim        cast<CallInst>(NewCaller)->setCallingConv(
4958353358Sdim            cast<CallInst>(Call).getCallingConv());
4959202375Srdivacky        cast<CallInst>(NewCaller)->setAttributes(NewPAL);
4960202375Srdivacky      }
4961353358Sdim      NewCaller->setDebugLoc(Call.getDebugLoc());
4962223017Sdim
4963223017Sdim      return NewCaller;
4964202375Srdivacky    }
4965202375Srdivacky  }
4966202375Srdivacky
4967202375Srdivacky  // Replace the trampoline call with a direct call.  Since there is no 'nest'
4968202375Srdivacky  // parameter, there is no need to adjust the argument list.  Let the generic
4969202375Srdivacky  // code sort out any function type mismatches.
4970353358Sdim  Constant *NewCallee = ConstantExpr::getBitCast(NestF, CalleeTy);
4971353358Sdim  Call.setCalledFunction(FTy, NewCallee);
4972353358Sdim  return &Call;
4973202375Srdivacky}
4974