1//===---- CGBuiltin.cpp - Emit LLVM Code for builtins ---------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This contains code to emit Builtin calls as LLVM code.
10//
11//===----------------------------------------------------------------------===//
12
13#include "ABIInfo.h"
14#include "CGCUDARuntime.h"
15#include "CGCXXABI.h"
16#include "CGObjCRuntime.h"
17#include "CGOpenCLRuntime.h"
18#include "CGRecordLayout.h"
19#include "CodeGenFunction.h"
20#include "CodeGenModule.h"
21#include "ConstantEmitter.h"
22#include "PatternInit.h"
23#include "TargetInfo.h"
24#include "clang/AST/ASTContext.h"
25#include "clang/AST/Attr.h"
26#include "clang/AST/Decl.h"
27#include "clang/AST/OSLog.h"
28#include "clang/AST/OperationKinds.h"
29#include "clang/Basic/TargetBuiltins.h"
30#include "clang/Basic/TargetInfo.h"
31#include "clang/Basic/TargetOptions.h"
32#include "clang/CodeGen/CGFunctionInfo.h"
33#include "clang/Frontend/FrontendDiagnostic.h"
34#include "llvm/ADT/APFloat.h"
35#include "llvm/ADT/APInt.h"
36#include "llvm/ADT/FloatingPointMode.h"
37#include "llvm/ADT/SmallPtrSet.h"
38#include "llvm/ADT/StringExtras.h"
39#include "llvm/Analysis/ValueTracking.h"
40#include "llvm/IR/DataLayout.h"
41#include "llvm/IR/InlineAsm.h"
42#include "llvm/IR/Intrinsics.h"
43#include "llvm/IR/IntrinsicsAArch64.h"
44#include "llvm/IR/IntrinsicsAMDGPU.h"
45#include "llvm/IR/IntrinsicsARM.h"
46#include "llvm/IR/IntrinsicsBPF.h"
47#include "llvm/IR/IntrinsicsHexagon.h"
48#include "llvm/IR/IntrinsicsNVPTX.h"
49#include "llvm/IR/IntrinsicsPowerPC.h"
50#include "llvm/IR/IntrinsicsR600.h"
51#include "llvm/IR/IntrinsicsRISCV.h"
52#include "llvm/IR/IntrinsicsS390.h"
53#include "llvm/IR/IntrinsicsVE.h"
54#include "llvm/IR/IntrinsicsWebAssembly.h"
55#include "llvm/IR/IntrinsicsX86.h"
56#include "llvm/IR/MDBuilder.h"
57#include "llvm/IR/MatrixBuilder.h"
58#include "llvm/Support/ConvertUTF.h"
59#include "llvm/Support/MathExtras.h"
60#include "llvm/Support/ScopedPrinter.h"
61#include "llvm/TargetParser/AArch64TargetParser.h"
62#include "llvm/TargetParser/X86TargetParser.h"
63#include <optional>
64#include <sstream>
65
66using namespace clang;
67using namespace CodeGen;
68using namespace llvm;
69
70static void initializeAlloca(CodeGenFunction &CGF, AllocaInst *AI, Value *Size,
71                             Align AlignmentInBytes) {
72  ConstantInt *Byte;
73  switch (CGF.getLangOpts().getTrivialAutoVarInit()) {
74  case LangOptions::TrivialAutoVarInitKind::Uninitialized:
75    // Nothing to initialize.
76    return;
77  case LangOptions::TrivialAutoVarInitKind::Zero:
78    Byte = CGF.Builder.getInt8(0x00);
79    break;
80  case LangOptions::TrivialAutoVarInitKind::Pattern: {
81    llvm::Type *Int8 = llvm::IntegerType::getInt8Ty(CGF.CGM.getLLVMContext());
82    Byte = llvm::dyn_cast<llvm::ConstantInt>(
83        initializationPatternFor(CGF.CGM, Int8));
84    break;
85  }
86  }
87  if (CGF.CGM.stopAutoInit())
88    return;
89  auto *I = CGF.Builder.CreateMemSet(AI, Byte, Size, AlignmentInBytes);
90  I->addAnnotationMetadata("auto-init");
91}
92
93/// getBuiltinLibFunction - Given a builtin id for a function like
94/// "__builtin_fabsf", return a Function* for "fabsf".
95llvm::Constant *CodeGenModule::getBuiltinLibFunction(const FunctionDecl *FD,
96                                                     unsigned BuiltinID) {
97  assert(Context.BuiltinInfo.isLibFunction(BuiltinID));
98
99  // Get the name, skip over the __builtin_ prefix (if necessary).
100  StringRef Name;
101  GlobalDecl D(FD);
102
103  // TODO: This list should be expanded or refactored after all GCC-compatible
104  // std libcall builtins are implemented.
105  static SmallDenseMap<unsigned, StringRef, 64> F128Builtins{
106      {Builtin::BI__builtin___fprintf_chk, "__fprintf_chkieee128"},
107      {Builtin::BI__builtin___printf_chk, "__printf_chkieee128"},
108      {Builtin::BI__builtin___snprintf_chk, "__snprintf_chkieee128"},
109      {Builtin::BI__builtin___sprintf_chk, "__sprintf_chkieee128"},
110      {Builtin::BI__builtin___vfprintf_chk, "__vfprintf_chkieee128"},
111      {Builtin::BI__builtin___vprintf_chk, "__vprintf_chkieee128"},
112      {Builtin::BI__builtin___vsnprintf_chk, "__vsnprintf_chkieee128"},
113      {Builtin::BI__builtin___vsprintf_chk, "__vsprintf_chkieee128"},
114      {Builtin::BI__builtin_fprintf, "__fprintfieee128"},
115      {Builtin::BI__builtin_printf, "__printfieee128"},
116      {Builtin::BI__builtin_snprintf, "__snprintfieee128"},
117      {Builtin::BI__builtin_sprintf, "__sprintfieee128"},
118      {Builtin::BI__builtin_vfprintf, "__vfprintfieee128"},
119      {Builtin::BI__builtin_vprintf, "__vprintfieee128"},
120      {Builtin::BI__builtin_vsnprintf, "__vsnprintfieee128"},
121      {Builtin::BI__builtin_vsprintf, "__vsprintfieee128"},
122      {Builtin::BI__builtin_fscanf, "__fscanfieee128"},
123      {Builtin::BI__builtin_scanf, "__scanfieee128"},
124      {Builtin::BI__builtin_sscanf, "__sscanfieee128"},
125      {Builtin::BI__builtin_vfscanf, "__vfscanfieee128"},
126      {Builtin::BI__builtin_vscanf, "__vscanfieee128"},
127      {Builtin::BI__builtin_vsscanf, "__vsscanfieee128"},
128      {Builtin::BI__builtin_nexttowardf128, "__nexttowardieee128"},
129  };
130
131  // The AIX library functions frexpl, ldexpl, and modfl are for 128-bit
132  // IBM 'long double' (i.e. __ibm128). Map to the 'double' versions
133  // if it is 64-bit 'long double' mode.
134  static SmallDenseMap<unsigned, StringRef, 4> AIXLongDouble64Builtins{
135      {Builtin::BI__builtin_frexpl, "frexp"},
136      {Builtin::BI__builtin_ldexpl, "ldexp"},
137      {Builtin::BI__builtin_modfl, "modf"},
138  };
139
140  // If the builtin has been declared explicitly with an assembler label,
141  // use the mangled name. This differs from the plain label on platforms
142  // that prefix labels.
143  if (FD->hasAttr<AsmLabelAttr>())
144    Name = getMangledName(D);
145  else {
146    // TODO: This mutation should also be applied to other targets other than
147    // PPC, after backend supports IEEE 128-bit style libcalls.
148    if (getTriple().isPPC64() &&
149        &getTarget().getLongDoubleFormat() == &llvm::APFloat::IEEEquad() &&
150        F128Builtins.contains(BuiltinID))
151      Name = F128Builtins[BuiltinID];
152    else if (getTriple().isOSAIX() &&
153             &getTarget().getLongDoubleFormat() ==
154                 &llvm::APFloat::IEEEdouble() &&
155             AIXLongDouble64Builtins.contains(BuiltinID))
156      Name = AIXLongDouble64Builtins[BuiltinID];
157    else
158      Name = Context.BuiltinInfo.getName(BuiltinID).substr(10);
159  }
160
161  llvm::FunctionType *Ty =
162    cast<llvm::FunctionType>(getTypes().ConvertType(FD->getType()));
163
164  return GetOrCreateLLVMFunction(Name, Ty, D, /*ForVTable=*/false);
165}
166
167/// Emit the conversions required to turn the given value into an
168/// integer of the given size.
169static Value *EmitToInt(CodeGenFunction &CGF, llvm::Value *V,
170                        QualType T, llvm::IntegerType *IntType) {
171  V = CGF.EmitToMemory(V, T);
172
173  if (V->getType()->isPointerTy())
174    return CGF.Builder.CreatePtrToInt(V, IntType);
175
176  assert(V->getType() == IntType);
177  return V;
178}
179
180static Value *EmitFromInt(CodeGenFunction &CGF, llvm::Value *V,
181                          QualType T, llvm::Type *ResultType) {
182  V = CGF.EmitFromMemory(V, T);
183
184  if (ResultType->isPointerTy())
185    return CGF.Builder.CreateIntToPtr(V, ResultType);
186
187  assert(V->getType() == ResultType);
188  return V;
189}
190
191static Address CheckAtomicAlignment(CodeGenFunction &CGF, const CallExpr *E) {
192  ASTContext &Ctx = CGF.getContext();
193  Address Ptr = CGF.EmitPointerWithAlignment(E->getArg(0));
194  unsigned Bytes = Ptr.getElementType()->isPointerTy()
195                       ? Ctx.getTypeSizeInChars(Ctx.VoidPtrTy).getQuantity()
196                       : Ptr.getElementType()->getScalarSizeInBits() / 8;
197  unsigned Align = Ptr.getAlignment().getQuantity();
198  if (Align % Bytes != 0) {
199    DiagnosticsEngine &Diags = CGF.CGM.getDiags();
200    Diags.Report(E->getBeginLoc(), diag::warn_sync_op_misaligned);
201    // Force address to be at least naturally-aligned.
202    return Ptr.withAlignment(CharUnits::fromQuantity(Bytes));
203  }
204  return Ptr;
205}
206
207/// Utility to insert an atomic instruction based on Intrinsic::ID
208/// and the expression node.
209static Value *MakeBinaryAtomicValue(
210    CodeGenFunction &CGF, llvm::AtomicRMWInst::BinOp Kind, const CallExpr *E,
211    AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent) {
212
213  QualType T = E->getType();
214  assert(E->getArg(0)->getType()->isPointerType());
215  assert(CGF.getContext().hasSameUnqualifiedType(T,
216                                  E->getArg(0)->getType()->getPointeeType()));
217  assert(CGF.getContext().hasSameUnqualifiedType(T, E->getArg(1)->getType()));
218
219  Address DestAddr = CheckAtomicAlignment(CGF, E);
220
221  llvm::IntegerType *IntType = llvm::IntegerType::get(
222      CGF.getLLVMContext(), CGF.getContext().getTypeSize(T));
223
224  llvm::Value *Val = CGF.EmitScalarExpr(E->getArg(1));
225  llvm::Type *ValueType = Val->getType();
226  Val = EmitToInt(CGF, Val, T, IntType);
227
228  llvm::Value *Result =
229      CGF.Builder.CreateAtomicRMW(Kind, DestAddr, Val, Ordering);
230  return EmitFromInt(CGF, Result, T, ValueType);
231}
232
233static Value *EmitNontemporalStore(CodeGenFunction &CGF, const CallExpr *E) {
234  Value *Val = CGF.EmitScalarExpr(E->getArg(0));
235  Address Addr = CGF.EmitPointerWithAlignment(E->getArg(1));
236
237  Val = CGF.EmitToMemory(Val, E->getArg(0)->getType());
238  LValue LV = CGF.MakeAddrLValue(Addr, E->getArg(0)->getType());
239  LV.setNontemporal(true);
240  CGF.EmitStoreOfScalar(Val, LV, false);
241  return nullptr;
242}
243
244static Value *EmitNontemporalLoad(CodeGenFunction &CGF, const CallExpr *E) {
245  Address Addr = CGF.EmitPointerWithAlignment(E->getArg(0));
246
247  LValue LV = CGF.MakeAddrLValue(Addr, E->getType());
248  LV.setNontemporal(true);
249  return CGF.EmitLoadOfScalar(LV, E->getExprLoc());
250}
251
252static RValue EmitBinaryAtomic(CodeGenFunction &CGF,
253                               llvm::AtomicRMWInst::BinOp Kind,
254                               const CallExpr *E) {
255  return RValue::get(MakeBinaryAtomicValue(CGF, Kind, E));
256}
257
258/// Utility to insert an atomic instruction based Intrinsic::ID and
259/// the expression node, where the return value is the result of the
260/// operation.
261static RValue EmitBinaryAtomicPost(CodeGenFunction &CGF,
262                                   llvm::AtomicRMWInst::BinOp Kind,
263                                   const CallExpr *E,
264                                   Instruction::BinaryOps Op,
265                                   bool Invert = false) {
266  QualType T = E->getType();
267  assert(E->getArg(0)->getType()->isPointerType());
268  assert(CGF.getContext().hasSameUnqualifiedType(T,
269                                  E->getArg(0)->getType()->getPointeeType()));
270  assert(CGF.getContext().hasSameUnqualifiedType(T, E->getArg(1)->getType()));
271
272  Address DestAddr = CheckAtomicAlignment(CGF, E);
273
274  llvm::IntegerType *IntType = llvm::IntegerType::get(
275      CGF.getLLVMContext(), CGF.getContext().getTypeSize(T));
276
277  llvm::Value *Val = CGF.EmitScalarExpr(E->getArg(1));
278  llvm::Type *ValueType = Val->getType();
279  Val = EmitToInt(CGF, Val, T, IntType);
280
281  llvm::Value *Result = CGF.Builder.CreateAtomicRMW(
282      Kind, DestAddr, Val, llvm::AtomicOrdering::SequentiallyConsistent);
283  Result = CGF.Builder.CreateBinOp(Op, Result, Val);
284  if (Invert)
285    Result =
286        CGF.Builder.CreateBinOp(llvm::Instruction::Xor, Result,
287                                llvm::ConstantInt::getAllOnesValue(IntType));
288  Result = EmitFromInt(CGF, Result, T, ValueType);
289  return RValue::get(Result);
290}
291
292/// Utility to insert an atomic cmpxchg instruction.
293///
294/// @param CGF The current codegen function.
295/// @param E   Builtin call expression to convert to cmpxchg.
296///            arg0 - address to operate on
297///            arg1 - value to compare with
298///            arg2 - new value
299/// @param ReturnBool Specifies whether to return success flag of
300///                   cmpxchg result or the old value.
301///
302/// @returns result of cmpxchg, according to ReturnBool
303///
304/// Note: In order to lower Microsoft's _InterlockedCompareExchange* intrinsics
305/// invoke the function EmitAtomicCmpXchgForMSIntrin.
306static Value *MakeAtomicCmpXchgValue(CodeGenFunction &CGF, const CallExpr *E,
307                                     bool ReturnBool) {
308  QualType T = ReturnBool ? E->getArg(1)->getType() : E->getType();
309  Address DestAddr = CheckAtomicAlignment(CGF, E);
310
311  llvm::IntegerType *IntType = llvm::IntegerType::get(
312      CGF.getLLVMContext(), CGF.getContext().getTypeSize(T));
313
314  Value *Cmp = CGF.EmitScalarExpr(E->getArg(1));
315  llvm::Type *ValueType = Cmp->getType();
316  Cmp = EmitToInt(CGF, Cmp, T, IntType);
317  Value *New = EmitToInt(CGF, CGF.EmitScalarExpr(E->getArg(2)), T, IntType);
318
319  Value *Pair = CGF.Builder.CreateAtomicCmpXchg(
320      DestAddr, Cmp, New, llvm::AtomicOrdering::SequentiallyConsistent,
321      llvm::AtomicOrdering::SequentiallyConsistent);
322  if (ReturnBool)
323    // Extract boolean success flag and zext it to int.
324    return CGF.Builder.CreateZExt(CGF.Builder.CreateExtractValue(Pair, 1),
325                                  CGF.ConvertType(E->getType()));
326  else
327    // Extract old value and emit it using the same type as compare value.
328    return EmitFromInt(CGF, CGF.Builder.CreateExtractValue(Pair, 0), T,
329                       ValueType);
330}
331
332/// This function should be invoked to emit atomic cmpxchg for Microsoft's
333/// _InterlockedCompareExchange* intrinsics which have the following signature:
334/// T _InterlockedCompareExchange(T volatile *Destination,
335///                               T Exchange,
336///                               T Comparand);
337///
338/// Whereas the llvm 'cmpxchg' instruction has the following syntax:
339/// cmpxchg *Destination, Comparand, Exchange.
340/// So we need to swap Comparand and Exchange when invoking
341/// CreateAtomicCmpXchg. That is the reason we could not use the above utility
342/// function MakeAtomicCmpXchgValue since it expects the arguments to be
343/// already swapped.
344
345static
346Value *EmitAtomicCmpXchgForMSIntrin(CodeGenFunction &CGF, const CallExpr *E,
347    AtomicOrdering SuccessOrdering = AtomicOrdering::SequentiallyConsistent) {
348  assert(E->getArg(0)->getType()->isPointerType());
349  assert(CGF.getContext().hasSameUnqualifiedType(
350      E->getType(), E->getArg(0)->getType()->getPointeeType()));
351  assert(CGF.getContext().hasSameUnqualifiedType(E->getType(),
352                                                 E->getArg(1)->getType()));
353  assert(CGF.getContext().hasSameUnqualifiedType(E->getType(),
354                                                 E->getArg(2)->getType()));
355
356  Address DestAddr = CheckAtomicAlignment(CGF, E);
357
358  auto *Comparand = CGF.EmitScalarExpr(E->getArg(2));
359  auto *Exchange = CGF.EmitScalarExpr(E->getArg(1));
360
361  // For Release ordering, the failure ordering should be Monotonic.
362  auto FailureOrdering = SuccessOrdering == AtomicOrdering::Release ?
363                         AtomicOrdering::Monotonic :
364                         SuccessOrdering;
365
366  // The atomic instruction is marked volatile for consistency with MSVC. This
367  // blocks the few atomics optimizations that LLVM has. If we want to optimize
368  // _Interlocked* operations in the future, we will have to remove the volatile
369  // marker.
370  auto *Result = CGF.Builder.CreateAtomicCmpXchg(
371      DestAddr, Comparand, Exchange, SuccessOrdering, FailureOrdering);
372  Result->setVolatile(true);
373  return CGF.Builder.CreateExtractValue(Result, 0);
374}
375
376// 64-bit Microsoft platforms support 128 bit cmpxchg operations. They are
377// prototyped like this:
378//
379// unsigned char _InterlockedCompareExchange128...(
380//     __int64 volatile * _Destination,
381//     __int64 _ExchangeHigh,
382//     __int64 _ExchangeLow,
383//     __int64 * _ComparandResult);
384//
385// Note that Destination is assumed to be at least 16-byte aligned, despite
386// being typed int64.
387
388static Value *EmitAtomicCmpXchg128ForMSIntrin(CodeGenFunction &CGF,
389                                              const CallExpr *E,
390                                              AtomicOrdering SuccessOrdering) {
391  assert(E->getNumArgs() == 4);
392  llvm::Value *DestPtr = CGF.EmitScalarExpr(E->getArg(0));
393  llvm::Value *ExchangeHigh = CGF.EmitScalarExpr(E->getArg(1));
394  llvm::Value *ExchangeLow = CGF.EmitScalarExpr(E->getArg(2));
395  Address ComparandAddr = CGF.EmitPointerWithAlignment(E->getArg(3));
396
397  assert(DestPtr->getType()->isPointerTy());
398  assert(!ExchangeHigh->getType()->isPointerTy());
399  assert(!ExchangeLow->getType()->isPointerTy());
400
401  // For Release ordering, the failure ordering should be Monotonic.
402  auto FailureOrdering = SuccessOrdering == AtomicOrdering::Release
403                             ? AtomicOrdering::Monotonic
404                             : SuccessOrdering;
405
406  // Convert to i128 pointers and values. Alignment is also overridden for
407  // destination pointer.
408  llvm::Type *Int128Ty = llvm::IntegerType::get(CGF.getLLVMContext(), 128);
409  Address DestAddr(DestPtr, Int128Ty,
410                   CGF.getContext().toCharUnitsFromBits(128));
411  ComparandAddr = ComparandAddr.withElementType(Int128Ty);
412
413  // (((i128)hi) << 64) | ((i128)lo)
414  ExchangeHigh = CGF.Builder.CreateZExt(ExchangeHigh, Int128Ty);
415  ExchangeLow = CGF.Builder.CreateZExt(ExchangeLow, Int128Ty);
416  ExchangeHigh =
417      CGF.Builder.CreateShl(ExchangeHigh, llvm::ConstantInt::get(Int128Ty, 64));
418  llvm::Value *Exchange = CGF.Builder.CreateOr(ExchangeHigh, ExchangeLow);
419
420  // Load the comparand for the instruction.
421  llvm::Value *Comparand = CGF.Builder.CreateLoad(ComparandAddr);
422
423  auto *CXI = CGF.Builder.CreateAtomicCmpXchg(DestAddr, Comparand, Exchange,
424                                              SuccessOrdering, FailureOrdering);
425
426  // The atomic instruction is marked volatile for consistency with MSVC. This
427  // blocks the few atomics optimizations that LLVM has. If we want to optimize
428  // _Interlocked* operations in the future, we will have to remove the volatile
429  // marker.
430  CXI->setVolatile(true);
431
432  // Store the result as an outparameter.
433  CGF.Builder.CreateStore(CGF.Builder.CreateExtractValue(CXI, 0),
434                          ComparandAddr);
435
436  // Get the success boolean and zero extend it to i8.
437  Value *Success = CGF.Builder.CreateExtractValue(CXI, 1);
438  return CGF.Builder.CreateZExt(Success, CGF.Int8Ty);
439}
440
441static Value *EmitAtomicIncrementValue(CodeGenFunction &CGF, const CallExpr *E,
442    AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent) {
443  assert(E->getArg(0)->getType()->isPointerType());
444
445  auto *IntTy = CGF.ConvertType(E->getType());
446  Address DestAddr = CheckAtomicAlignment(CGF, E);
447  auto *Result = CGF.Builder.CreateAtomicRMW(
448      AtomicRMWInst::Add, DestAddr, ConstantInt::get(IntTy, 1), Ordering);
449  return CGF.Builder.CreateAdd(Result, ConstantInt::get(IntTy, 1));
450}
451
452static Value *EmitAtomicDecrementValue(
453    CodeGenFunction &CGF, const CallExpr *E,
454    AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent) {
455  assert(E->getArg(0)->getType()->isPointerType());
456
457  auto *IntTy = CGF.ConvertType(E->getType());
458  Address DestAddr = CheckAtomicAlignment(CGF, E);
459  auto *Result = CGF.Builder.CreateAtomicRMW(
460      AtomicRMWInst::Sub, DestAddr, ConstantInt::get(IntTy, 1), Ordering);
461  return CGF.Builder.CreateSub(Result, ConstantInt::get(IntTy, 1));
462}
463
464// Build a plain volatile load.
465static Value *EmitISOVolatileLoad(CodeGenFunction &CGF, const CallExpr *E) {
466  Value *Ptr = CGF.EmitScalarExpr(E->getArg(0));
467  QualType ElTy = E->getArg(0)->getType()->getPointeeType();
468  CharUnits LoadSize = CGF.getContext().getTypeSizeInChars(ElTy);
469  llvm::Type *ITy =
470      llvm::IntegerType::get(CGF.getLLVMContext(), LoadSize.getQuantity() * 8);
471  llvm::LoadInst *Load = CGF.Builder.CreateAlignedLoad(ITy, Ptr, LoadSize);
472  Load->setVolatile(true);
473  return Load;
474}
475
476// Build a plain volatile store.
477static Value *EmitISOVolatileStore(CodeGenFunction &CGF, const CallExpr *E) {
478  Value *Ptr = CGF.EmitScalarExpr(E->getArg(0));
479  Value *Value = CGF.EmitScalarExpr(E->getArg(1));
480  QualType ElTy = E->getArg(0)->getType()->getPointeeType();
481  CharUnits StoreSize = CGF.getContext().getTypeSizeInChars(ElTy);
482  llvm::StoreInst *Store =
483      CGF.Builder.CreateAlignedStore(Value, Ptr, StoreSize);
484  Store->setVolatile(true);
485  return Store;
486}
487
488// Emit a simple mangled intrinsic that has 1 argument and a return type
489// matching the argument type. Depending on mode, this may be a constrained
490// floating-point intrinsic.
491static Value *emitUnaryMaybeConstrainedFPBuiltin(CodeGenFunction &CGF,
492                                const CallExpr *E, unsigned IntrinsicID,
493                                unsigned ConstrainedIntrinsicID) {
494  llvm::Value *Src0 = CGF.EmitScalarExpr(E->getArg(0));
495
496  CodeGenFunction::CGFPOptionsRAII FPOptsRAII(CGF, E);
497  if (CGF.Builder.getIsFPConstrained()) {
498    Function *F = CGF.CGM.getIntrinsic(ConstrainedIntrinsicID, Src0->getType());
499    return CGF.Builder.CreateConstrainedFPCall(F, { Src0 });
500  } else {
501    Function *F = CGF.CGM.getIntrinsic(IntrinsicID, Src0->getType());
502    return CGF.Builder.CreateCall(F, Src0);
503  }
504}
505
506// Emit an intrinsic that has 2 operands of the same type as its result.
507// Depending on mode, this may be a constrained floating-point intrinsic.
508static Value *emitBinaryMaybeConstrainedFPBuiltin(CodeGenFunction &CGF,
509                                const CallExpr *E, unsigned IntrinsicID,
510                                unsigned ConstrainedIntrinsicID) {
511  llvm::Value *Src0 = CGF.EmitScalarExpr(E->getArg(0));
512  llvm::Value *Src1 = CGF.EmitScalarExpr(E->getArg(1));
513
514  if (CGF.Builder.getIsFPConstrained()) {
515    CodeGenFunction::CGFPOptionsRAII FPOptsRAII(CGF, E);
516    Function *F = CGF.CGM.getIntrinsic(ConstrainedIntrinsicID, Src0->getType());
517    return CGF.Builder.CreateConstrainedFPCall(F, { Src0, Src1 });
518  } else {
519    Function *F = CGF.CGM.getIntrinsic(IntrinsicID, Src0->getType());
520    return CGF.Builder.CreateCall(F, { Src0, Src1 });
521  }
522}
523
524// Has second type mangled argument.
525static Value *emitBinaryExpMaybeConstrainedFPBuiltin(
526    CodeGenFunction &CGF, const CallExpr *E, llvm::Intrinsic::ID IntrinsicID,
527    llvm::Intrinsic::ID ConstrainedIntrinsicID) {
528  llvm::Value *Src0 = CGF.EmitScalarExpr(E->getArg(0));
529  llvm::Value *Src1 = CGF.EmitScalarExpr(E->getArg(1));
530
531  if (CGF.Builder.getIsFPConstrained()) {
532    CodeGenFunction::CGFPOptionsRAII FPOptsRAII(CGF, E);
533    Function *F = CGF.CGM.getIntrinsic(ConstrainedIntrinsicID,
534                                       {Src0->getType(), Src1->getType()});
535    return CGF.Builder.CreateConstrainedFPCall(F, {Src0, Src1});
536  }
537
538  Function *F =
539      CGF.CGM.getIntrinsic(IntrinsicID, {Src0->getType(), Src1->getType()});
540  return CGF.Builder.CreateCall(F, {Src0, Src1});
541}
542
543// Emit an intrinsic that has 3 operands of the same type as its result.
544// Depending on mode, this may be a constrained floating-point intrinsic.
545static Value *emitTernaryMaybeConstrainedFPBuiltin(CodeGenFunction &CGF,
546                                 const CallExpr *E, unsigned IntrinsicID,
547                                 unsigned ConstrainedIntrinsicID) {
548  llvm::Value *Src0 = CGF.EmitScalarExpr(E->getArg(0));
549  llvm::Value *Src1 = CGF.EmitScalarExpr(E->getArg(1));
550  llvm::Value *Src2 = CGF.EmitScalarExpr(E->getArg(2));
551
552  if (CGF.Builder.getIsFPConstrained()) {
553    CodeGenFunction::CGFPOptionsRAII FPOptsRAII(CGF, E);
554    Function *F = CGF.CGM.getIntrinsic(ConstrainedIntrinsicID, Src0->getType());
555    return CGF.Builder.CreateConstrainedFPCall(F, { Src0, Src1, Src2 });
556  } else {
557    Function *F = CGF.CGM.getIntrinsic(IntrinsicID, Src0->getType());
558    return CGF.Builder.CreateCall(F, { Src0, Src1, Src2 });
559  }
560}
561
562// Emit an intrinsic where all operands are of the same type as the result.
563// Depending on mode, this may be a constrained floating-point intrinsic.
564static Value *emitCallMaybeConstrainedFPBuiltin(CodeGenFunction &CGF,
565                                                unsigned IntrinsicID,
566                                                unsigned ConstrainedIntrinsicID,
567                                                llvm::Type *Ty,
568                                                ArrayRef<Value *> Args) {
569  Function *F;
570  if (CGF.Builder.getIsFPConstrained())
571    F = CGF.CGM.getIntrinsic(ConstrainedIntrinsicID, Ty);
572  else
573    F = CGF.CGM.getIntrinsic(IntrinsicID, Ty);
574
575  if (CGF.Builder.getIsFPConstrained())
576    return CGF.Builder.CreateConstrainedFPCall(F, Args);
577  else
578    return CGF.Builder.CreateCall(F, Args);
579}
580
581// Emit a simple mangled intrinsic that has 1 argument and a return type
582// matching the argument type.
583static Value *emitUnaryBuiltin(CodeGenFunction &CGF, const CallExpr *E,
584                               unsigned IntrinsicID,
585                               llvm::StringRef Name = "") {
586  llvm::Value *Src0 = CGF.EmitScalarExpr(E->getArg(0));
587
588  Function *F = CGF.CGM.getIntrinsic(IntrinsicID, Src0->getType());
589  return CGF.Builder.CreateCall(F, Src0, Name);
590}
591
592// Emit an intrinsic that has 2 operands of the same type as its result.
593static Value *emitBinaryBuiltin(CodeGenFunction &CGF,
594                                const CallExpr *E,
595                                unsigned IntrinsicID) {
596  llvm::Value *Src0 = CGF.EmitScalarExpr(E->getArg(0));
597  llvm::Value *Src1 = CGF.EmitScalarExpr(E->getArg(1));
598
599  Function *F = CGF.CGM.getIntrinsic(IntrinsicID, Src0->getType());
600  return CGF.Builder.CreateCall(F, { Src0, Src1 });
601}
602
603// Emit an intrinsic that has 3 operands of the same type as its result.
604static Value *emitTernaryBuiltin(CodeGenFunction &CGF,
605                                 const CallExpr *E,
606                                 unsigned IntrinsicID) {
607  llvm::Value *Src0 = CGF.EmitScalarExpr(E->getArg(0));
608  llvm::Value *Src1 = CGF.EmitScalarExpr(E->getArg(1));
609  llvm::Value *Src2 = CGF.EmitScalarExpr(E->getArg(2));
610
611  Function *F = CGF.CGM.getIntrinsic(IntrinsicID, Src0->getType());
612  return CGF.Builder.CreateCall(F, { Src0, Src1, Src2 });
613}
614
615// Emit an intrinsic that has 1 float or double operand, and 1 integer.
616static Value *emitFPIntBuiltin(CodeGenFunction &CGF,
617                               const CallExpr *E,
618                               unsigned IntrinsicID) {
619  llvm::Value *Src0 = CGF.EmitScalarExpr(E->getArg(0));
620  llvm::Value *Src1 = CGF.EmitScalarExpr(E->getArg(1));
621
622  Function *F = CGF.CGM.getIntrinsic(IntrinsicID, Src0->getType());
623  return CGF.Builder.CreateCall(F, {Src0, Src1});
624}
625
626// Emit an intrinsic that has overloaded integer result and fp operand.
627static Value *
628emitMaybeConstrainedFPToIntRoundBuiltin(CodeGenFunction &CGF, const CallExpr *E,
629                                        unsigned IntrinsicID,
630                                        unsigned ConstrainedIntrinsicID) {
631  llvm::Type *ResultType = CGF.ConvertType(E->getType());
632  llvm::Value *Src0 = CGF.EmitScalarExpr(E->getArg(0));
633
634  if (CGF.Builder.getIsFPConstrained()) {
635    CodeGenFunction::CGFPOptionsRAII FPOptsRAII(CGF, E);
636    Function *F = CGF.CGM.getIntrinsic(ConstrainedIntrinsicID,
637                                       {ResultType, Src0->getType()});
638    return CGF.Builder.CreateConstrainedFPCall(F, {Src0});
639  } else {
640    Function *F =
641        CGF.CGM.getIntrinsic(IntrinsicID, {ResultType, Src0->getType()});
642    return CGF.Builder.CreateCall(F, Src0);
643  }
644}
645
646static Value *emitFrexpBuiltin(CodeGenFunction &CGF, const CallExpr *E,
647                               llvm::Intrinsic::ID IntrinsicID) {
648  llvm::Value *Src0 = CGF.EmitScalarExpr(E->getArg(0));
649  llvm::Value *Src1 = CGF.EmitScalarExpr(E->getArg(1));
650
651  QualType IntPtrTy = E->getArg(1)->getType()->getPointeeType();
652  llvm::Type *IntTy = CGF.ConvertType(IntPtrTy);
653  llvm::Function *F =
654      CGF.CGM.getIntrinsic(IntrinsicID, {Src0->getType(), IntTy});
655  llvm::Value *Call = CGF.Builder.CreateCall(F, Src0);
656
657  llvm::Value *Exp = CGF.Builder.CreateExtractValue(Call, 1);
658  LValue LV = CGF.MakeNaturalAlignAddrLValue(Src1, IntPtrTy);
659  CGF.EmitStoreOfScalar(Exp, LV);
660
661  return CGF.Builder.CreateExtractValue(Call, 0);
662}
663
664/// EmitFAbs - Emit a call to @llvm.fabs().
665static Value *EmitFAbs(CodeGenFunction &CGF, Value *V) {
666  Function *F = CGF.CGM.getIntrinsic(Intrinsic::fabs, V->getType());
667  llvm::CallInst *Call = CGF.Builder.CreateCall(F, V);
668  Call->setDoesNotAccessMemory();
669  return Call;
670}
671
672/// Emit the computation of the sign bit for a floating point value. Returns
673/// the i1 sign bit value.
674static Value *EmitSignBit(CodeGenFunction &CGF, Value *V) {
675  LLVMContext &C = CGF.CGM.getLLVMContext();
676
677  llvm::Type *Ty = V->getType();
678  int Width = Ty->getPrimitiveSizeInBits();
679  llvm::Type *IntTy = llvm::IntegerType::get(C, Width);
680  V = CGF.Builder.CreateBitCast(V, IntTy);
681  if (Ty->isPPC_FP128Ty()) {
682    // We want the sign bit of the higher-order double. The bitcast we just
683    // did works as if the double-double was stored to memory and then
684    // read as an i128. The "store" will put the higher-order double in the
685    // lower address in both little- and big-Endian modes, but the "load"
686    // will treat those bits as a different part of the i128: the low bits in
687    // little-Endian, the high bits in big-Endian. Therefore, on big-Endian
688    // we need to shift the high bits down to the low before truncating.
689    Width >>= 1;
690    if (CGF.getTarget().isBigEndian()) {
691      Value *ShiftCst = llvm::ConstantInt::get(IntTy, Width);
692      V = CGF.Builder.CreateLShr(V, ShiftCst);
693    }
694    // We are truncating value in order to extract the higher-order
695    // double, which we will be using to extract the sign from.
696    IntTy = llvm::IntegerType::get(C, Width);
697    V = CGF.Builder.CreateTrunc(V, IntTy);
698  }
699  Value *Zero = llvm::Constant::getNullValue(IntTy);
700  return CGF.Builder.CreateICmpSLT(V, Zero);
701}
702
703static RValue emitLibraryCall(CodeGenFunction &CGF, const FunctionDecl *FD,
704                              const CallExpr *E, llvm::Constant *calleeValue) {
705  CGCallee callee = CGCallee::forDirect(calleeValue, GlobalDecl(FD));
706  return CGF.EmitCall(E->getCallee()->getType(), callee, E, ReturnValueSlot());
707}
708
709/// Emit a call to llvm.{sadd,uadd,ssub,usub,smul,umul}.with.overflow.*
710/// depending on IntrinsicID.
711///
712/// \arg CGF The current codegen function.
713/// \arg IntrinsicID The ID for the Intrinsic we wish to generate.
714/// \arg X The first argument to the llvm.*.with.overflow.*.
715/// \arg Y The second argument to the llvm.*.with.overflow.*.
716/// \arg Carry The carry returned by the llvm.*.with.overflow.*.
717/// \returns The result (i.e. sum/product) returned by the intrinsic.
718static llvm::Value *EmitOverflowIntrinsic(CodeGenFunction &CGF,
719                                          const llvm::Intrinsic::ID IntrinsicID,
720                                          llvm::Value *X, llvm::Value *Y,
721                                          llvm::Value *&Carry) {
722  // Make sure we have integers of the same width.
723  assert(X->getType() == Y->getType() &&
724         "Arguments must be the same type. (Did you forget to make sure both "
725         "arguments have the same integer width?)");
726
727  Function *Callee = CGF.CGM.getIntrinsic(IntrinsicID, X->getType());
728  llvm::Value *Tmp = CGF.Builder.CreateCall(Callee, {X, Y});
729  Carry = CGF.Builder.CreateExtractValue(Tmp, 1);
730  return CGF.Builder.CreateExtractValue(Tmp, 0);
731}
732
733static Value *emitRangedBuiltin(CodeGenFunction &CGF,
734                                unsigned IntrinsicID,
735                                int low, int high) {
736    llvm::MDBuilder MDHelper(CGF.getLLVMContext());
737    llvm::MDNode *RNode = MDHelper.createRange(APInt(32, low), APInt(32, high));
738    Function *F = CGF.CGM.getIntrinsic(IntrinsicID, {});
739    llvm::Instruction *Call = CGF.Builder.CreateCall(F);
740    Call->setMetadata(llvm::LLVMContext::MD_range, RNode);
741    Call->setMetadata(llvm::LLVMContext::MD_noundef,
742                      llvm::MDNode::get(CGF.getLLVMContext(), std::nullopt));
743    return Call;
744}
745
746namespace {
747  struct WidthAndSignedness {
748    unsigned Width;
749    bool Signed;
750  };
751}
752
753static WidthAndSignedness
754getIntegerWidthAndSignedness(const clang::ASTContext &context,
755                             const clang::QualType Type) {
756  assert(Type->isIntegerType() && "Given type is not an integer.");
757  unsigned Width = Type->isBooleanType()  ? 1
758                   : Type->isBitIntType() ? context.getIntWidth(Type)
759                                          : context.getTypeInfo(Type).Width;
760  bool Signed = Type->isSignedIntegerType();
761  return {Width, Signed};
762}
763
764// Given one or more integer types, this function produces an integer type that
765// encompasses them: any value in one of the given types could be expressed in
766// the encompassing type.
767static struct WidthAndSignedness
768EncompassingIntegerType(ArrayRef<struct WidthAndSignedness> Types) {
769  assert(Types.size() > 0 && "Empty list of types.");
770
771  // If any of the given types is signed, we must return a signed type.
772  bool Signed = false;
773  for (const auto &Type : Types) {
774    Signed |= Type.Signed;
775  }
776
777  // The encompassing type must have a width greater than or equal to the width
778  // of the specified types.  Additionally, if the encompassing type is signed,
779  // its width must be strictly greater than the width of any unsigned types
780  // given.
781  unsigned Width = 0;
782  for (const auto &Type : Types) {
783    unsigned MinWidth = Type.Width + (Signed && !Type.Signed);
784    if (Width < MinWidth) {
785      Width = MinWidth;
786    }
787  }
788
789  return {Width, Signed};
790}
791
792Value *CodeGenFunction::EmitVAStartEnd(Value *ArgValue, bool IsStart) {
793  Intrinsic::ID inst = IsStart ? Intrinsic::vastart : Intrinsic::vaend;
794  return Builder.CreateCall(CGM.getIntrinsic(inst), ArgValue);
795}
796
797/// Checks if using the result of __builtin_object_size(p, @p From) in place of
798/// __builtin_object_size(p, @p To) is correct
799static bool areBOSTypesCompatible(int From, int To) {
800  // Note: Our __builtin_object_size implementation currently treats Type=0 and
801  // Type=2 identically. Encoding this implementation detail here may make
802  // improving __builtin_object_size difficult in the future, so it's omitted.
803  return From == To || (From == 0 && To == 1) || (From == 3 && To == 2);
804}
805
806static llvm::Value *
807getDefaultBuiltinObjectSizeResult(unsigned Type, llvm::IntegerType *ResType) {
808  return ConstantInt::get(ResType, (Type & 2) ? 0 : -1, /*isSigned=*/true);
809}
810
811llvm::Value *
812CodeGenFunction::evaluateOrEmitBuiltinObjectSize(const Expr *E, unsigned Type,
813                                                 llvm::IntegerType *ResType,
814                                                 llvm::Value *EmittedE,
815                                                 bool IsDynamic) {
816  uint64_t ObjectSize;
817  if (!E->tryEvaluateObjectSize(ObjectSize, getContext(), Type))
818    return emitBuiltinObjectSize(E, Type, ResType, EmittedE, IsDynamic);
819  return ConstantInt::get(ResType, ObjectSize, /*isSigned=*/true);
820}
821
822const FieldDecl *CodeGenFunction::FindFlexibleArrayMemberField(
823    ASTContext &Ctx, const RecordDecl *RD, StringRef Name, uint64_t &Offset) {
824  const LangOptions::StrictFlexArraysLevelKind StrictFlexArraysLevel =
825      getLangOpts().getStrictFlexArraysLevel();
826  uint32_t FieldNo = 0;
827
828  if (RD->isImplicit())
829    return nullptr;
830
831  for (const FieldDecl *FD : RD->fields()) {
832    if ((Name.empty() || FD->getNameAsString() == Name) &&
833        Decl::isFlexibleArrayMemberLike(
834            Ctx, FD, FD->getType(), StrictFlexArraysLevel,
835            /*IgnoreTemplateOrMacroSubstitution=*/true)) {
836      const ASTRecordLayout &Layout = Ctx.getASTRecordLayout(RD);
837      Offset += Layout.getFieldOffset(FieldNo);
838      return FD;
839    }
840
841    QualType Ty = FD->getType();
842    if (Ty->isRecordType()) {
843      if (const FieldDecl *Field = FindFlexibleArrayMemberField(
844              Ctx, Ty->getAsRecordDecl(), Name, Offset)) {
845        const ASTRecordLayout &Layout = Ctx.getASTRecordLayout(RD);
846        Offset += Layout.getFieldOffset(FieldNo);
847        return Field;
848      }
849    }
850
851    if (!RD->isUnion())
852      ++FieldNo;
853  }
854
855  return nullptr;
856}
857
858static unsigned CountCountedByAttrs(const RecordDecl *RD) {
859  unsigned Num = 0;
860
861  for (const Decl *D : RD->decls()) {
862    if (const auto *FD = dyn_cast<FieldDecl>(D);
863        FD && FD->hasAttr<CountedByAttr>()) {
864      return ++Num;
865    }
866
867    if (const auto *Rec = dyn_cast<RecordDecl>(D))
868      Num += CountCountedByAttrs(Rec);
869  }
870
871  return Num;
872}
873
874llvm::Value *
875CodeGenFunction::emitFlexibleArrayMemberSize(const Expr *E, unsigned Type,
876                                             llvm::IntegerType *ResType) {
877  // The code generated here calculates the size of a struct with a flexible
878  // array member that uses the counted_by attribute. There are two instances
879  // we handle:
880  //
881  //       struct s {
882  //         unsigned long flags;
883  //         int count;
884  //         int array[] __attribute__((counted_by(count)));
885  //       }
886  //
887  //   1) bdos of the flexible array itself:
888  //
889  //     __builtin_dynamic_object_size(p->array, 1) ==
890  //         p->count * sizeof(*p->array)
891  //
892  //   2) bdos of a pointer into the flexible array:
893  //
894  //     __builtin_dynamic_object_size(&p->array[42], 1) ==
895  //         (p->count - 42) * sizeof(*p->array)
896  //
897  //   2) bdos of the whole struct, including the flexible array:
898  //
899  //     __builtin_dynamic_object_size(p, 1) ==
900  //        max(sizeof(struct s),
901  //            offsetof(struct s, array) + p->count * sizeof(*p->array))
902  //
903  ASTContext &Ctx = getContext();
904  const Expr *Base = E->IgnoreParenImpCasts();
905  const Expr *Idx = nullptr;
906
907  if (const auto *UO = dyn_cast<UnaryOperator>(Base);
908      UO && UO->getOpcode() == UO_AddrOf) {
909    Expr *SubExpr = UO->getSubExpr()->IgnoreParenImpCasts();
910    if (const auto *ASE = dyn_cast<ArraySubscriptExpr>(SubExpr)) {
911      Base = ASE->getBase()->IgnoreParenImpCasts();
912      Idx = ASE->getIdx()->IgnoreParenImpCasts();
913
914      if (const auto *IL = dyn_cast<IntegerLiteral>(Idx)) {
915        int64_t Val = IL->getValue().getSExtValue();
916        if (Val < 0)
917          return getDefaultBuiltinObjectSizeResult(Type, ResType);
918
919        if (Val == 0)
920          // The index is 0, so we don't need to take it into account.
921          Idx = nullptr;
922      }
923    } else {
924      // Potential pointer to another element in the struct.
925      Base = SubExpr;
926    }
927  }
928
929  // Get the flexible array member Decl.
930  const RecordDecl *OuterRD = nullptr;
931  std::string FAMName;
932  if (const auto *ME = dyn_cast<MemberExpr>(Base)) {
933    // Check if \p Base is referencing the FAM itself.
934    const ValueDecl *VD = ME->getMemberDecl();
935    OuterRD = VD->getDeclContext()->getOuterLexicalRecordContext();
936    FAMName = VD->getNameAsString();
937  } else if (const auto *DRE = dyn_cast<DeclRefExpr>(Base)) {
938    // Check if we're pointing to the whole struct.
939    QualType Ty = DRE->getDecl()->getType();
940    if (Ty->isPointerType())
941      Ty = Ty->getPointeeType();
942    OuterRD = Ty->getAsRecordDecl();
943
944    // If we have a situation like this:
945    //
946    //     struct union_of_fams {
947    //         int flags;
948    //         union {
949    //             signed char normal_field;
950    //             struct {
951    //                 int count1;
952    //                 int arr1[] __counted_by(count1);
953    //             };
954    //             struct {
955    //                 signed char count2;
956    //                 int arr2[] __counted_by(count2);
957    //             };
958    //         };
959    //    };
960    //
961    // We don't konw which 'count' to use in this scenario:
962    //
963    //     size_t get_size(struct union_of_fams *p) {
964    //         return __builtin_dynamic_object_size(p, 1);
965    //     }
966    //
967    // Instead of calculating a wrong number, we give up.
968    if (OuterRD && CountCountedByAttrs(OuterRD) > 1)
969      return nullptr;
970  }
971
972  if (!OuterRD)
973    return nullptr;
974
975  uint64_t Offset = 0;
976  const FieldDecl *FAMDecl =
977      FindFlexibleArrayMemberField(Ctx, OuterRD, FAMName, Offset);
978  Offset = Ctx.toCharUnitsFromBits(Offset).getQuantity();
979
980  if (!FAMDecl || !FAMDecl->hasAttr<CountedByAttr>())
981    // No flexible array member found or it doesn't have the "counted_by"
982    // attribute.
983    return nullptr;
984
985  const FieldDecl *CountedByFD = FindCountedByField(FAMDecl);
986  if (!CountedByFD)
987    // Can't find the field referenced by the "counted_by" attribute.
988    return nullptr;
989
990  // Build a load of the counted_by field.
991  bool IsSigned = CountedByFD->getType()->isSignedIntegerType();
992  Value *CountedByInst = EmitCountedByFieldExpr(Base, FAMDecl, CountedByFD);
993  if (!CountedByInst)
994    return getDefaultBuiltinObjectSizeResult(Type, ResType);
995
996  CountedByInst = Builder.CreateIntCast(CountedByInst, ResType, IsSigned);
997
998  // Build a load of the index and subtract it from the count.
999  Value *IdxInst = nullptr;
1000  if (Idx) {
1001    if (Idx->HasSideEffects(getContext()))
1002      // We can't have side-effects.
1003      return getDefaultBuiltinObjectSizeResult(Type, ResType);
1004
1005    bool IdxSigned = Idx->getType()->isSignedIntegerType();
1006    IdxInst = EmitAnyExprToTemp(Idx).getScalarVal();
1007    IdxInst = Builder.CreateIntCast(IdxInst, ResType, IdxSigned);
1008
1009    // We go ahead with the calculation here. If the index turns out to be
1010    // negative, we'll catch it at the end.
1011    CountedByInst =
1012        Builder.CreateSub(CountedByInst, IdxInst, "", !IsSigned, IsSigned);
1013  }
1014
1015  // Calculate how large the flexible array member is in bytes.
1016  const ArrayType *ArrayTy = Ctx.getAsArrayType(FAMDecl->getType());
1017  CharUnits Size = Ctx.getTypeSizeInChars(ArrayTy->getElementType());
1018  llvm::Constant *ElemSize =
1019      llvm::ConstantInt::get(ResType, Size.getQuantity(), IsSigned);
1020  Value *FAMSize =
1021      Builder.CreateMul(CountedByInst, ElemSize, "", !IsSigned, IsSigned);
1022  FAMSize = Builder.CreateIntCast(FAMSize, ResType, IsSigned);
1023  Value *Res = FAMSize;
1024
1025  if (isa<DeclRefExpr>(Base)) {
1026    // The whole struct is specificed in the __bdos.
1027    const ASTRecordLayout &Layout = Ctx.getASTRecordLayout(OuterRD);
1028
1029    // Get the offset of the FAM.
1030    llvm::Constant *FAMOffset = ConstantInt::get(ResType, Offset, IsSigned);
1031    Value *OffsetAndFAMSize =
1032        Builder.CreateAdd(FAMOffset, Res, "", !IsSigned, IsSigned);
1033
1034    // Get the full size of the struct.
1035    llvm::Constant *SizeofStruct =
1036        ConstantInt::get(ResType, Layout.getSize().getQuantity(), IsSigned);
1037
1038    // max(sizeof(struct s),
1039    //     offsetof(struct s, array) + p->count * sizeof(*p->array))
1040    Res = IsSigned
1041              ? Builder.CreateBinaryIntrinsic(llvm::Intrinsic::smax,
1042                                              OffsetAndFAMSize, SizeofStruct)
1043              : Builder.CreateBinaryIntrinsic(llvm::Intrinsic::umax,
1044                                              OffsetAndFAMSize, SizeofStruct);
1045  }
1046
1047  // A negative \p IdxInst or \p CountedByInst means that the index lands
1048  // outside of the flexible array member. If that's the case, we want to
1049  // return 0.
1050  Value *Cmp = Builder.CreateIsNotNeg(CountedByInst);
1051  if (IdxInst)
1052    Cmp = Builder.CreateAnd(Builder.CreateIsNotNeg(IdxInst), Cmp);
1053
1054  return Builder.CreateSelect(Cmp, Res, ConstantInt::get(ResType, 0, IsSigned));
1055}
1056
1057/// Returns a Value corresponding to the size of the given expression.
1058/// This Value may be either of the following:
1059///   - A llvm::Argument (if E is a param with the pass_object_size attribute on
1060///     it)
1061///   - A call to the @llvm.objectsize intrinsic
1062///
1063/// EmittedE is the result of emitting `E` as a scalar expr. If it's non-null
1064/// and we wouldn't otherwise try to reference a pass_object_size parameter,
1065/// we'll call @llvm.objectsize on EmittedE, rather than emitting E.
1066llvm::Value *
1067CodeGenFunction::emitBuiltinObjectSize(const Expr *E, unsigned Type,
1068                                       llvm::IntegerType *ResType,
1069                                       llvm::Value *EmittedE, bool IsDynamic) {
1070  // We need to reference an argument if the pointer is a parameter with the
1071  // pass_object_size attribute.
1072  if (auto *D = dyn_cast<DeclRefExpr>(E->IgnoreParenImpCasts())) {
1073    auto *Param = dyn_cast<ParmVarDecl>(D->getDecl());
1074    auto *PS = D->getDecl()->getAttr<PassObjectSizeAttr>();
1075    if (Param != nullptr && PS != nullptr &&
1076        areBOSTypesCompatible(PS->getType(), Type)) {
1077      auto Iter = SizeArguments.find(Param);
1078      assert(Iter != SizeArguments.end());
1079
1080      const ImplicitParamDecl *D = Iter->second;
1081      auto DIter = LocalDeclMap.find(D);
1082      assert(DIter != LocalDeclMap.end());
1083
1084      return EmitLoadOfScalar(DIter->second, /*Volatile=*/false,
1085                              getContext().getSizeType(), E->getBeginLoc());
1086    }
1087  }
1088
1089  if (IsDynamic) {
1090    // Emit special code for a flexible array member with the "counted_by"
1091    // attribute.
1092    if (Value *V = emitFlexibleArrayMemberSize(E, Type, ResType))
1093      return V;
1094  }
1095
1096  // LLVM can't handle Type=3 appropriately, and __builtin_object_size shouldn't
1097  // evaluate E for side-effects. In either case, we shouldn't lower to
1098  // @llvm.objectsize.
1099  if (Type == 3 || (!EmittedE && E->HasSideEffects(getContext())))
1100    return getDefaultBuiltinObjectSizeResult(Type, ResType);
1101
1102  Value *Ptr = EmittedE ? EmittedE : EmitScalarExpr(E);
1103  assert(Ptr->getType()->isPointerTy() &&
1104         "Non-pointer passed to __builtin_object_size?");
1105
1106  Function *F =
1107      CGM.getIntrinsic(Intrinsic::objectsize, {ResType, Ptr->getType()});
1108
1109  // LLVM only supports 0 and 2, make sure that we pass along that as a boolean.
1110  Value *Min = Builder.getInt1((Type & 2) != 0);
1111  // For GCC compatibility, __builtin_object_size treat NULL as unknown size.
1112  Value *NullIsUnknown = Builder.getTrue();
1113  Value *Dynamic = Builder.getInt1(IsDynamic);
1114  return Builder.CreateCall(F, {Ptr, Min, NullIsUnknown, Dynamic});
1115}
1116
1117namespace {
1118/// A struct to generically describe a bit test intrinsic.
1119struct BitTest {
1120  enum ActionKind : uint8_t { TestOnly, Complement, Reset, Set };
1121  enum InterlockingKind : uint8_t {
1122    Unlocked,
1123    Sequential,
1124    Acquire,
1125    Release,
1126    NoFence
1127  };
1128
1129  ActionKind Action;
1130  InterlockingKind Interlocking;
1131  bool Is64Bit;
1132
1133  static BitTest decodeBitTestBuiltin(unsigned BuiltinID);
1134};
1135} // namespace
1136
1137BitTest BitTest::decodeBitTestBuiltin(unsigned BuiltinID) {
1138  switch (BuiltinID) {
1139    // Main portable variants.
1140  case Builtin::BI_bittest:
1141    return {TestOnly, Unlocked, false};
1142  case Builtin::BI_bittestandcomplement:
1143    return {Complement, Unlocked, false};
1144  case Builtin::BI_bittestandreset:
1145    return {Reset, Unlocked, false};
1146  case Builtin::BI_bittestandset:
1147    return {Set, Unlocked, false};
1148  case Builtin::BI_interlockedbittestandreset:
1149    return {Reset, Sequential, false};
1150  case Builtin::BI_interlockedbittestandset:
1151    return {Set, Sequential, false};
1152
1153    // X86-specific 64-bit variants.
1154  case Builtin::BI_bittest64:
1155    return {TestOnly, Unlocked, true};
1156  case Builtin::BI_bittestandcomplement64:
1157    return {Complement, Unlocked, true};
1158  case Builtin::BI_bittestandreset64:
1159    return {Reset, Unlocked, true};
1160  case Builtin::BI_bittestandset64:
1161    return {Set, Unlocked, true};
1162  case Builtin::BI_interlockedbittestandreset64:
1163    return {Reset, Sequential, true};
1164  case Builtin::BI_interlockedbittestandset64:
1165    return {Set, Sequential, true};
1166
1167    // ARM/AArch64-specific ordering variants.
1168  case Builtin::BI_interlockedbittestandset_acq:
1169    return {Set, Acquire, false};
1170  case Builtin::BI_interlockedbittestandset_rel:
1171    return {Set, Release, false};
1172  case Builtin::BI_interlockedbittestandset_nf:
1173    return {Set, NoFence, false};
1174  case Builtin::BI_interlockedbittestandreset_acq:
1175    return {Reset, Acquire, false};
1176  case Builtin::BI_interlockedbittestandreset_rel:
1177    return {Reset, Release, false};
1178  case Builtin::BI_interlockedbittestandreset_nf:
1179    return {Reset, NoFence, false};
1180  }
1181  llvm_unreachable("expected only bittest intrinsics");
1182}
1183
1184static char bitActionToX86BTCode(BitTest::ActionKind A) {
1185  switch (A) {
1186  case BitTest::TestOnly:   return '\0';
1187  case BitTest::Complement: return 'c';
1188  case BitTest::Reset:      return 'r';
1189  case BitTest::Set:        return 's';
1190  }
1191  llvm_unreachable("invalid action");
1192}
1193
1194static llvm::Value *EmitX86BitTestIntrinsic(CodeGenFunction &CGF,
1195                                            BitTest BT,
1196                                            const CallExpr *E, Value *BitBase,
1197                                            Value *BitPos) {
1198  char Action = bitActionToX86BTCode(BT.Action);
1199  char SizeSuffix = BT.Is64Bit ? 'q' : 'l';
1200
1201  // Build the assembly.
1202  SmallString<64> Asm;
1203  raw_svector_ostream AsmOS(Asm);
1204  if (BT.Interlocking != BitTest::Unlocked)
1205    AsmOS << "lock ";
1206  AsmOS << "bt";
1207  if (Action)
1208    AsmOS << Action;
1209  AsmOS << SizeSuffix << " $2, ($1)";
1210
1211  // Build the constraints. FIXME: We should support immediates when possible.
1212  std::string Constraints = "={@ccc},r,r,~{cc},~{memory}";
1213  std::string_view MachineClobbers = CGF.getTarget().getClobbers();
1214  if (!MachineClobbers.empty()) {
1215    Constraints += ',';
1216    Constraints += MachineClobbers;
1217  }
1218  llvm::IntegerType *IntType = llvm::IntegerType::get(
1219      CGF.getLLVMContext(),
1220      CGF.getContext().getTypeSize(E->getArg(1)->getType()));
1221  llvm::FunctionType *FTy =
1222      llvm::FunctionType::get(CGF.Int8Ty, {CGF.UnqualPtrTy, IntType}, false);
1223
1224  llvm::InlineAsm *IA =
1225      llvm::InlineAsm::get(FTy, Asm, Constraints, /*hasSideEffects=*/true);
1226  return CGF.Builder.CreateCall(IA, {BitBase, BitPos});
1227}
1228
1229static llvm::AtomicOrdering
1230getBitTestAtomicOrdering(BitTest::InterlockingKind I) {
1231  switch (I) {
1232  case BitTest::Unlocked:   return llvm::AtomicOrdering::NotAtomic;
1233  case BitTest::Sequential: return llvm::AtomicOrdering::SequentiallyConsistent;
1234  case BitTest::Acquire:    return llvm::AtomicOrdering::Acquire;
1235  case BitTest::Release:    return llvm::AtomicOrdering::Release;
1236  case BitTest::NoFence:    return llvm::AtomicOrdering::Monotonic;
1237  }
1238  llvm_unreachable("invalid interlocking");
1239}
1240
1241/// Emit a _bittest* intrinsic. These intrinsics take a pointer to an array of
1242/// bits and a bit position and read and optionally modify the bit at that
1243/// position. The position index can be arbitrarily large, i.e. it can be larger
1244/// than 31 or 63, so we need an indexed load in the general case.
1245static llvm::Value *EmitBitTestIntrinsic(CodeGenFunction &CGF,
1246                                         unsigned BuiltinID,
1247                                         const CallExpr *E) {
1248  Value *BitBase = CGF.EmitScalarExpr(E->getArg(0));
1249  Value *BitPos = CGF.EmitScalarExpr(E->getArg(1));
1250
1251  BitTest BT = BitTest::decodeBitTestBuiltin(BuiltinID);
1252
1253  // X86 has special BT, BTC, BTR, and BTS instructions that handle the array
1254  // indexing operation internally. Use them if possible.
1255  if (CGF.getTarget().getTriple().isX86())
1256    return EmitX86BitTestIntrinsic(CGF, BT, E, BitBase, BitPos);
1257
1258  // Otherwise, use generic code to load one byte and test the bit. Use all but
1259  // the bottom three bits as the array index, and the bottom three bits to form
1260  // a mask.
1261  // Bit = BitBaseI8[BitPos >> 3] & (1 << (BitPos & 0x7)) != 0;
1262  Value *ByteIndex = CGF.Builder.CreateAShr(
1263      BitPos, llvm::ConstantInt::get(BitPos->getType(), 3), "bittest.byteidx");
1264  Value *BitBaseI8 = CGF.Builder.CreatePointerCast(BitBase, CGF.Int8PtrTy);
1265  Address ByteAddr(CGF.Builder.CreateInBoundsGEP(CGF.Int8Ty, BitBaseI8,
1266                                                 ByteIndex, "bittest.byteaddr"),
1267                   CGF.Int8Ty, CharUnits::One());
1268  Value *PosLow =
1269      CGF.Builder.CreateAnd(CGF.Builder.CreateTrunc(BitPos, CGF.Int8Ty),
1270                            llvm::ConstantInt::get(CGF.Int8Ty, 0x7));
1271
1272  // The updating instructions will need a mask.
1273  Value *Mask = nullptr;
1274  if (BT.Action != BitTest::TestOnly) {
1275    Mask = CGF.Builder.CreateShl(llvm::ConstantInt::get(CGF.Int8Ty, 1), PosLow,
1276                                 "bittest.mask");
1277  }
1278
1279  // Check the action and ordering of the interlocked intrinsics.
1280  llvm::AtomicOrdering Ordering = getBitTestAtomicOrdering(BT.Interlocking);
1281
1282  Value *OldByte = nullptr;
1283  if (Ordering != llvm::AtomicOrdering::NotAtomic) {
1284    // Emit a combined atomicrmw load/store operation for the interlocked
1285    // intrinsics.
1286    llvm::AtomicRMWInst::BinOp RMWOp = llvm::AtomicRMWInst::Or;
1287    if (BT.Action == BitTest::Reset) {
1288      Mask = CGF.Builder.CreateNot(Mask);
1289      RMWOp = llvm::AtomicRMWInst::And;
1290    }
1291    OldByte = CGF.Builder.CreateAtomicRMW(RMWOp, ByteAddr, Mask, Ordering);
1292  } else {
1293    // Emit a plain load for the non-interlocked intrinsics.
1294    OldByte = CGF.Builder.CreateLoad(ByteAddr, "bittest.byte");
1295    Value *NewByte = nullptr;
1296    switch (BT.Action) {
1297    case BitTest::TestOnly:
1298      // Don't store anything.
1299      break;
1300    case BitTest::Complement:
1301      NewByte = CGF.Builder.CreateXor(OldByte, Mask);
1302      break;
1303    case BitTest::Reset:
1304      NewByte = CGF.Builder.CreateAnd(OldByte, CGF.Builder.CreateNot(Mask));
1305      break;
1306    case BitTest::Set:
1307      NewByte = CGF.Builder.CreateOr(OldByte, Mask);
1308      break;
1309    }
1310    if (NewByte)
1311      CGF.Builder.CreateStore(NewByte, ByteAddr);
1312  }
1313
1314  // However we loaded the old byte, either by plain load or atomicrmw, shift
1315  // the bit into the low position and mask it to 0 or 1.
1316  Value *ShiftedByte = CGF.Builder.CreateLShr(OldByte, PosLow, "bittest.shr");
1317  return CGF.Builder.CreateAnd(
1318      ShiftedByte, llvm::ConstantInt::get(CGF.Int8Ty, 1), "bittest.res");
1319}
1320
1321static llvm::Value *emitPPCLoadReserveIntrinsic(CodeGenFunction &CGF,
1322                                                unsigned BuiltinID,
1323                                                const CallExpr *E) {
1324  Value *Addr = CGF.EmitScalarExpr(E->getArg(0));
1325
1326  SmallString<64> Asm;
1327  raw_svector_ostream AsmOS(Asm);
1328  llvm::IntegerType *RetType = CGF.Int32Ty;
1329
1330  switch (BuiltinID) {
1331  case clang::PPC::BI__builtin_ppc_ldarx:
1332    AsmOS << "ldarx ";
1333    RetType = CGF.Int64Ty;
1334    break;
1335  case clang::PPC::BI__builtin_ppc_lwarx:
1336    AsmOS << "lwarx ";
1337    RetType = CGF.Int32Ty;
1338    break;
1339  case clang::PPC::BI__builtin_ppc_lharx:
1340    AsmOS << "lharx ";
1341    RetType = CGF.Int16Ty;
1342    break;
1343  case clang::PPC::BI__builtin_ppc_lbarx:
1344    AsmOS << "lbarx ";
1345    RetType = CGF.Int8Ty;
1346    break;
1347  default:
1348    llvm_unreachable("Expected only PowerPC load reserve intrinsics");
1349  }
1350
1351  AsmOS << "$0, ${1:y}";
1352
1353  std::string Constraints = "=r,*Z,~{memory}";
1354  std::string_view MachineClobbers = CGF.getTarget().getClobbers();
1355  if (!MachineClobbers.empty()) {
1356    Constraints += ',';
1357    Constraints += MachineClobbers;
1358  }
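  // At this point, e.g. for __builtin_ppc_lwarx, the asm template is
  // "lwarx $0, ${1:y}" and the constraints are "=r,*Z,~{memory}" plus any
  // target-specific machine clobbers.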
1359
1360  llvm::Type *PtrType = CGF.UnqualPtrTy;
1361  llvm::FunctionType *FTy = llvm::FunctionType::get(RetType, {PtrType}, false);
1362
1363  llvm::InlineAsm *IA =
1364      llvm::InlineAsm::get(FTy, Asm, Constraints, /*hasSideEffects=*/true);
1365  llvm::CallInst *CI = CGF.Builder.CreateCall(IA, {Addr});
1366  CI->addParamAttr(
1367      0, Attribute::get(CGF.getLLVMContext(), Attribute::ElementType, RetType));
1368  return CI;
1369}
1370
1371namespace {
1372enum class MSVCSetJmpKind {
1373  _setjmpex,
1374  _setjmp3,
1375  _setjmp
1376};
1377}
1378
1379/// MSVC handles setjmp a bit differently on different platforms. On every
1380/// architecture except 32-bit x86, the frame address is passed. On x86, extra
1381/// parameters can be passed as variadic arguments, but we always pass none.
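/// For example, on AArch64 the second argument is the result of
/// llvm.sponentry, while other non-x86 targets pass llvm.frameaddress(0).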
1382static RValue EmitMSVCRTSetJmp(CodeGenFunction &CGF, MSVCSetJmpKind SJKind,
1383                               const CallExpr *E) {
1384  llvm::Value *Arg1 = nullptr;
1385  llvm::Type *Arg1Ty = nullptr;
1386  StringRef Name;
1387  bool IsVarArg = false;
1388  if (SJKind == MSVCSetJmpKind::_setjmp3) {
1389    Name = "_setjmp3";
1390    Arg1Ty = CGF.Int32Ty;
1391    Arg1 = llvm::ConstantInt::get(CGF.IntTy, 0);
1392    IsVarArg = true;
1393  } else {
1394    Name = SJKind == MSVCSetJmpKind::_setjmp ? "_setjmp" : "_setjmpex";
1395    Arg1Ty = CGF.Int8PtrTy;
1396    if (CGF.getTarget().getTriple().getArch() == llvm::Triple::aarch64) {
1397      Arg1 = CGF.Builder.CreateCall(
1398          CGF.CGM.getIntrinsic(Intrinsic::sponentry, CGF.AllocaInt8PtrTy));
1399    } else
1400      Arg1 = CGF.Builder.CreateCall(
1401          CGF.CGM.getIntrinsic(Intrinsic::frameaddress, CGF.AllocaInt8PtrTy),
1402          llvm::ConstantInt::get(CGF.Int32Ty, 0));
1403  }
1404
1405  // Mark the call site and declaration with ReturnsTwice.
1406  llvm::Type *ArgTypes[2] = {CGF.Int8PtrTy, Arg1Ty};
1407  llvm::AttributeList ReturnsTwiceAttr = llvm::AttributeList::get(
1408      CGF.getLLVMContext(), llvm::AttributeList::FunctionIndex,
1409      llvm::Attribute::ReturnsTwice);
1410  llvm::FunctionCallee SetJmpFn = CGF.CGM.CreateRuntimeFunction(
1411      llvm::FunctionType::get(CGF.IntTy, ArgTypes, IsVarArg), Name,
1412      ReturnsTwiceAttr, /*Local=*/true);
1413
1414  llvm::Value *Buf = CGF.Builder.CreateBitOrPointerCast(
1415      CGF.EmitScalarExpr(E->getArg(0)), CGF.Int8PtrTy);
1416  llvm::Value *Args[] = {Buf, Arg1};
1417  llvm::CallBase *CB = CGF.EmitRuntimeCallOrInvoke(SetJmpFn, Args);
1418  CB->setAttributes(ReturnsTwiceAttr);
1419  return RValue::get(CB);
1420}
1421
1422// Many of the MSVC builtins are available on x64, ARM, and AArch64; to avoid
1423// repeating code, we handle them here.
1424enum class CodeGenFunction::MSVCIntrin {
1425  _BitScanForward,
1426  _BitScanReverse,
1427  _InterlockedAnd,
1428  _InterlockedDecrement,
1429  _InterlockedExchange,
1430  _InterlockedExchangeAdd,
1431  _InterlockedExchangeSub,
1432  _InterlockedIncrement,
1433  _InterlockedOr,
1434  _InterlockedXor,
1435  _InterlockedExchangeAdd_acq,
1436  _InterlockedExchangeAdd_rel,
1437  _InterlockedExchangeAdd_nf,
1438  _InterlockedExchange_acq,
1439  _InterlockedExchange_rel,
1440  _InterlockedExchange_nf,
1441  _InterlockedCompareExchange_acq,
1442  _InterlockedCompareExchange_rel,
1443  _InterlockedCompareExchange_nf,
1444  _InterlockedCompareExchange128,
1445  _InterlockedCompareExchange128_acq,
1446  _InterlockedCompareExchange128_rel,
1447  _InterlockedCompareExchange128_nf,
1448  _InterlockedOr_acq,
1449  _InterlockedOr_rel,
1450  _InterlockedOr_nf,
1451  _InterlockedXor_acq,
1452  _InterlockedXor_rel,
1453  _InterlockedXor_nf,
1454  _InterlockedAnd_acq,
1455  _InterlockedAnd_rel,
1456  _InterlockedAnd_nf,
1457  _InterlockedIncrement_acq,
1458  _InterlockedIncrement_rel,
1459  _InterlockedIncrement_nf,
1460  _InterlockedDecrement_acq,
1461  _InterlockedDecrement_rel,
1462  _InterlockedDecrement_nf,
1463  __fastfail,
1464};
1465
1466static std::optional<CodeGenFunction::MSVCIntrin>
1467translateArmToMsvcIntrin(unsigned BuiltinID) {
1468  using MSVCIntrin = CodeGenFunction::MSVCIntrin;
1469  switch (BuiltinID) {
1470  default:
1471    return std::nullopt;
1472  case clang::ARM::BI_BitScanForward:
1473  case clang::ARM::BI_BitScanForward64:
1474    return MSVCIntrin::_BitScanForward;
1475  case clang::ARM::BI_BitScanReverse:
1476  case clang::ARM::BI_BitScanReverse64:
1477    return MSVCIntrin::_BitScanReverse;
1478  case clang::ARM::BI_InterlockedAnd64:
1479    return MSVCIntrin::_InterlockedAnd;
1480  case clang::ARM::BI_InterlockedExchange64:
1481    return MSVCIntrin::_InterlockedExchange;
1482  case clang::ARM::BI_InterlockedExchangeAdd64:
1483    return MSVCIntrin::_InterlockedExchangeAdd;
1484  case clang::ARM::BI_InterlockedExchangeSub64:
1485    return MSVCIntrin::_InterlockedExchangeSub;
1486  case clang::ARM::BI_InterlockedOr64:
1487    return MSVCIntrin::_InterlockedOr;
1488  case clang::ARM::BI_InterlockedXor64:
1489    return MSVCIntrin::_InterlockedXor;
1490  case clang::ARM::BI_InterlockedDecrement64:
1491    return MSVCIntrin::_InterlockedDecrement;
1492  case clang::ARM::BI_InterlockedIncrement64:
1493    return MSVCIntrin::_InterlockedIncrement;
1494  case clang::ARM::BI_InterlockedExchangeAdd8_acq:
1495  case clang::ARM::BI_InterlockedExchangeAdd16_acq:
1496  case clang::ARM::BI_InterlockedExchangeAdd_acq:
1497  case clang::ARM::BI_InterlockedExchangeAdd64_acq:
1498    return MSVCIntrin::_InterlockedExchangeAdd_acq;
1499  case clang::ARM::BI_InterlockedExchangeAdd8_rel:
1500  case clang::ARM::BI_InterlockedExchangeAdd16_rel:
1501  case clang::ARM::BI_InterlockedExchangeAdd_rel:
1502  case clang::ARM::BI_InterlockedExchangeAdd64_rel:
1503    return MSVCIntrin::_InterlockedExchangeAdd_rel;
1504  case clang::ARM::BI_InterlockedExchangeAdd8_nf:
1505  case clang::ARM::BI_InterlockedExchangeAdd16_nf:
1506  case clang::ARM::BI_InterlockedExchangeAdd_nf:
1507  case clang::ARM::BI_InterlockedExchangeAdd64_nf:
1508    return MSVCIntrin::_InterlockedExchangeAdd_nf;
1509  case clang::ARM::BI_InterlockedExchange8_acq:
1510  case clang::ARM::BI_InterlockedExchange16_acq:
1511  case clang::ARM::BI_InterlockedExchange_acq:
1512  case clang::ARM::BI_InterlockedExchange64_acq:
1513    return MSVCIntrin::_InterlockedExchange_acq;
1514  case clang::ARM::BI_InterlockedExchange8_rel:
1515  case clang::ARM::BI_InterlockedExchange16_rel:
1516  case clang::ARM::BI_InterlockedExchange_rel:
1517  case clang::ARM::BI_InterlockedExchange64_rel:
1518    return MSVCIntrin::_InterlockedExchange_rel;
1519  case clang::ARM::BI_InterlockedExchange8_nf:
1520  case clang::ARM::BI_InterlockedExchange16_nf:
1521  case clang::ARM::BI_InterlockedExchange_nf:
1522  case clang::ARM::BI_InterlockedExchange64_nf:
1523    return MSVCIntrin::_InterlockedExchange_nf;
1524  case clang::ARM::BI_InterlockedCompareExchange8_acq:
1525  case clang::ARM::BI_InterlockedCompareExchange16_acq:
1526  case clang::ARM::BI_InterlockedCompareExchange_acq:
1527  case clang::ARM::BI_InterlockedCompareExchange64_acq:
1528    return MSVCIntrin::_InterlockedCompareExchange_acq;
1529  case clang::ARM::BI_InterlockedCompareExchange8_rel:
1530  case clang::ARM::BI_InterlockedCompareExchange16_rel:
1531  case clang::ARM::BI_InterlockedCompareExchange_rel:
1532  case clang::ARM::BI_InterlockedCompareExchange64_rel:
1533    return MSVCIntrin::_InterlockedCompareExchange_rel;
1534  case clang::ARM::BI_InterlockedCompareExchange8_nf:
1535  case clang::ARM::BI_InterlockedCompareExchange16_nf:
1536  case clang::ARM::BI_InterlockedCompareExchange_nf:
1537  case clang::ARM::BI_InterlockedCompareExchange64_nf:
1538    return MSVCIntrin::_InterlockedCompareExchange_nf;
1539  case clang::ARM::BI_InterlockedOr8_acq:
1540  case clang::ARM::BI_InterlockedOr16_acq:
1541  case clang::ARM::BI_InterlockedOr_acq:
1542  case clang::ARM::BI_InterlockedOr64_acq:
1543    return MSVCIntrin::_InterlockedOr_acq;
1544  case clang::ARM::BI_InterlockedOr8_rel:
1545  case clang::ARM::BI_InterlockedOr16_rel:
1546  case clang::ARM::BI_InterlockedOr_rel:
1547  case clang::ARM::BI_InterlockedOr64_rel:
1548    return MSVCIntrin::_InterlockedOr_rel;
1549  case clang::ARM::BI_InterlockedOr8_nf:
1550  case clang::ARM::BI_InterlockedOr16_nf:
1551  case clang::ARM::BI_InterlockedOr_nf:
1552  case clang::ARM::BI_InterlockedOr64_nf:
1553    return MSVCIntrin::_InterlockedOr_nf;
1554  case clang::ARM::BI_InterlockedXor8_acq:
1555  case clang::ARM::BI_InterlockedXor16_acq:
1556  case clang::ARM::BI_InterlockedXor_acq:
1557  case clang::ARM::BI_InterlockedXor64_acq:
1558    return MSVCIntrin::_InterlockedXor_acq;
1559  case clang::ARM::BI_InterlockedXor8_rel:
1560  case clang::ARM::BI_InterlockedXor16_rel:
1561  case clang::ARM::BI_InterlockedXor_rel:
1562  case clang::ARM::BI_InterlockedXor64_rel:
1563    return MSVCIntrin::_InterlockedXor_rel;
1564  case clang::ARM::BI_InterlockedXor8_nf:
1565  case clang::ARM::BI_InterlockedXor16_nf:
1566  case clang::ARM::BI_InterlockedXor_nf:
1567  case clang::ARM::BI_InterlockedXor64_nf:
1568    return MSVCIntrin::_InterlockedXor_nf;
1569  case clang::ARM::BI_InterlockedAnd8_acq:
1570  case clang::ARM::BI_InterlockedAnd16_acq:
1571  case clang::ARM::BI_InterlockedAnd_acq:
1572  case clang::ARM::BI_InterlockedAnd64_acq:
1573    return MSVCIntrin::_InterlockedAnd_acq;
1574  case clang::ARM::BI_InterlockedAnd8_rel:
1575  case clang::ARM::BI_InterlockedAnd16_rel:
1576  case clang::ARM::BI_InterlockedAnd_rel:
1577  case clang::ARM::BI_InterlockedAnd64_rel:
1578    return MSVCIntrin::_InterlockedAnd_rel;
1579  case clang::ARM::BI_InterlockedAnd8_nf:
1580  case clang::ARM::BI_InterlockedAnd16_nf:
1581  case clang::ARM::BI_InterlockedAnd_nf:
1582  case clang::ARM::BI_InterlockedAnd64_nf:
1583    return MSVCIntrin::_InterlockedAnd_nf;
1584  case clang::ARM::BI_InterlockedIncrement16_acq:
1585  case clang::ARM::BI_InterlockedIncrement_acq:
1586  case clang::ARM::BI_InterlockedIncrement64_acq:
1587    return MSVCIntrin::_InterlockedIncrement_acq;
1588  case clang::ARM::BI_InterlockedIncrement16_rel:
1589  case clang::ARM::BI_InterlockedIncrement_rel:
1590  case clang::ARM::BI_InterlockedIncrement64_rel:
1591    return MSVCIntrin::_InterlockedIncrement_rel;
1592  case clang::ARM::BI_InterlockedIncrement16_nf:
1593  case clang::ARM::BI_InterlockedIncrement_nf:
1594  case clang::ARM::BI_InterlockedIncrement64_nf:
1595    return MSVCIntrin::_InterlockedIncrement_nf;
1596  case clang::ARM::BI_InterlockedDecrement16_acq:
1597  case clang::ARM::BI_InterlockedDecrement_acq:
1598  case clang::ARM::BI_InterlockedDecrement64_acq:
1599    return MSVCIntrin::_InterlockedDecrement_acq;
1600  case clang::ARM::BI_InterlockedDecrement16_rel:
1601  case clang::ARM::BI_InterlockedDecrement_rel:
1602  case clang::ARM::BI_InterlockedDecrement64_rel:
1603    return MSVCIntrin::_InterlockedDecrement_rel;
1604  case clang::ARM::BI_InterlockedDecrement16_nf:
1605  case clang::ARM::BI_InterlockedDecrement_nf:
1606  case clang::ARM::BI_InterlockedDecrement64_nf:
1607    return MSVCIntrin::_InterlockedDecrement_nf;
1608  }
1609  llvm_unreachable("must return from switch");
1610}
1611
1612static std::optional<CodeGenFunction::MSVCIntrin>
1613translateAarch64ToMsvcIntrin(unsigned BuiltinID) {
1614  using MSVCIntrin = CodeGenFunction::MSVCIntrin;
1615  switch (BuiltinID) {
1616  default:
1617    return std::nullopt;
1618  case clang::AArch64::BI_BitScanForward:
1619  case clang::AArch64::BI_BitScanForward64:
1620    return MSVCIntrin::_BitScanForward;
1621  case clang::AArch64::BI_BitScanReverse:
1622  case clang::AArch64::BI_BitScanReverse64:
1623    return MSVCIntrin::_BitScanReverse;
1624  case clang::AArch64::BI_InterlockedAnd64:
1625    return MSVCIntrin::_InterlockedAnd;
1626  case clang::AArch64::BI_InterlockedExchange64:
1627    return MSVCIntrin::_InterlockedExchange;
1628  case clang::AArch64::BI_InterlockedExchangeAdd64:
1629    return MSVCIntrin::_InterlockedExchangeAdd;
1630  case clang::AArch64::BI_InterlockedExchangeSub64:
1631    return MSVCIntrin::_InterlockedExchangeSub;
1632  case clang::AArch64::BI_InterlockedOr64:
1633    return MSVCIntrin::_InterlockedOr;
1634  case clang::AArch64::BI_InterlockedXor64:
1635    return MSVCIntrin::_InterlockedXor;
1636  case clang::AArch64::BI_InterlockedDecrement64:
1637    return MSVCIntrin::_InterlockedDecrement;
1638  case clang::AArch64::BI_InterlockedIncrement64:
1639    return MSVCIntrin::_InterlockedIncrement;
1640  case clang::AArch64::BI_InterlockedExchangeAdd8_acq:
1641  case clang::AArch64::BI_InterlockedExchangeAdd16_acq:
1642  case clang::AArch64::BI_InterlockedExchangeAdd_acq:
1643  case clang::AArch64::BI_InterlockedExchangeAdd64_acq:
1644    return MSVCIntrin::_InterlockedExchangeAdd_acq;
1645  case clang::AArch64::BI_InterlockedExchangeAdd8_rel:
1646  case clang::AArch64::BI_InterlockedExchangeAdd16_rel:
1647  case clang::AArch64::BI_InterlockedExchangeAdd_rel:
1648  case clang::AArch64::BI_InterlockedExchangeAdd64_rel:
1649    return MSVCIntrin::_InterlockedExchangeAdd_rel;
1650  case clang::AArch64::BI_InterlockedExchangeAdd8_nf:
1651  case clang::AArch64::BI_InterlockedExchangeAdd16_nf:
1652  case clang::AArch64::BI_InterlockedExchangeAdd_nf:
1653  case clang::AArch64::BI_InterlockedExchangeAdd64_nf:
1654    return MSVCIntrin::_InterlockedExchangeAdd_nf;
1655  case clang::AArch64::BI_InterlockedExchange8_acq:
1656  case clang::AArch64::BI_InterlockedExchange16_acq:
1657  case clang::AArch64::BI_InterlockedExchange_acq:
1658  case clang::AArch64::BI_InterlockedExchange64_acq:
1659    return MSVCIntrin::_InterlockedExchange_acq;
1660  case clang::AArch64::BI_InterlockedExchange8_rel:
1661  case clang::AArch64::BI_InterlockedExchange16_rel:
1662  case clang::AArch64::BI_InterlockedExchange_rel:
1663  case clang::AArch64::BI_InterlockedExchange64_rel:
1664    return MSVCIntrin::_InterlockedExchange_rel;
1665  case clang::AArch64::BI_InterlockedExchange8_nf:
1666  case clang::AArch64::BI_InterlockedExchange16_nf:
1667  case clang::AArch64::BI_InterlockedExchange_nf:
1668  case clang::AArch64::BI_InterlockedExchange64_nf:
1669    return MSVCIntrin::_InterlockedExchange_nf;
1670  case clang::AArch64::BI_InterlockedCompareExchange8_acq:
1671  case clang::AArch64::BI_InterlockedCompareExchange16_acq:
1672  case clang::AArch64::BI_InterlockedCompareExchange_acq:
1673  case clang::AArch64::BI_InterlockedCompareExchange64_acq:
1674    return MSVCIntrin::_InterlockedCompareExchange_acq;
1675  case clang::AArch64::BI_InterlockedCompareExchange8_rel:
1676  case clang::AArch64::BI_InterlockedCompareExchange16_rel:
1677  case clang::AArch64::BI_InterlockedCompareExchange_rel:
1678  case clang::AArch64::BI_InterlockedCompareExchange64_rel:
1679    return MSVCIntrin::_InterlockedCompareExchange_rel;
1680  case clang::AArch64::BI_InterlockedCompareExchange8_nf:
1681  case clang::AArch64::BI_InterlockedCompareExchange16_nf:
1682  case clang::AArch64::BI_InterlockedCompareExchange_nf:
1683  case clang::AArch64::BI_InterlockedCompareExchange64_nf:
1684    return MSVCIntrin::_InterlockedCompareExchange_nf;
1685  case clang::AArch64::BI_InterlockedCompareExchange128:
1686    return MSVCIntrin::_InterlockedCompareExchange128;
1687  case clang::AArch64::BI_InterlockedCompareExchange128_acq:
1688    return MSVCIntrin::_InterlockedCompareExchange128_acq;
1689  case clang::AArch64::BI_InterlockedCompareExchange128_nf:
1690    return MSVCIntrin::_InterlockedCompareExchange128_nf;
1691  case clang::AArch64::BI_InterlockedCompareExchange128_rel:
1692    return MSVCIntrin::_InterlockedCompareExchange128_rel;
1693  case clang::AArch64::BI_InterlockedOr8_acq:
1694  case clang::AArch64::BI_InterlockedOr16_acq:
1695  case clang::AArch64::BI_InterlockedOr_acq:
1696  case clang::AArch64::BI_InterlockedOr64_acq:
1697    return MSVCIntrin::_InterlockedOr_acq;
1698  case clang::AArch64::BI_InterlockedOr8_rel:
1699  case clang::AArch64::BI_InterlockedOr16_rel:
1700  case clang::AArch64::BI_InterlockedOr_rel:
1701  case clang::AArch64::BI_InterlockedOr64_rel:
1702    return MSVCIntrin::_InterlockedOr_rel;
1703  case clang::AArch64::BI_InterlockedOr8_nf:
1704  case clang::AArch64::BI_InterlockedOr16_nf:
1705  case clang::AArch64::BI_InterlockedOr_nf:
1706  case clang::AArch64::BI_InterlockedOr64_nf:
1707    return MSVCIntrin::_InterlockedOr_nf;
1708  case clang::AArch64::BI_InterlockedXor8_acq:
1709  case clang::AArch64::BI_InterlockedXor16_acq:
1710  case clang::AArch64::BI_InterlockedXor_acq:
1711  case clang::AArch64::BI_InterlockedXor64_acq:
1712    return MSVCIntrin::_InterlockedXor_acq;
1713  case clang::AArch64::BI_InterlockedXor8_rel:
1714  case clang::AArch64::BI_InterlockedXor16_rel:
1715  case clang::AArch64::BI_InterlockedXor_rel:
1716  case clang::AArch64::BI_InterlockedXor64_rel:
1717    return MSVCIntrin::_InterlockedXor_rel;
1718  case clang::AArch64::BI_InterlockedXor8_nf:
1719  case clang::AArch64::BI_InterlockedXor16_nf:
1720  case clang::AArch64::BI_InterlockedXor_nf:
1721  case clang::AArch64::BI_InterlockedXor64_nf:
1722    return MSVCIntrin::_InterlockedXor_nf;
1723  case clang::AArch64::BI_InterlockedAnd8_acq:
1724  case clang::AArch64::BI_InterlockedAnd16_acq:
1725  case clang::AArch64::BI_InterlockedAnd_acq:
1726  case clang::AArch64::BI_InterlockedAnd64_acq:
1727    return MSVCIntrin::_InterlockedAnd_acq;
1728  case clang::AArch64::BI_InterlockedAnd8_rel:
1729  case clang::AArch64::BI_InterlockedAnd16_rel:
1730  case clang::AArch64::BI_InterlockedAnd_rel:
1731  case clang::AArch64::BI_InterlockedAnd64_rel:
1732    return MSVCIntrin::_InterlockedAnd_rel;
1733  case clang::AArch64::BI_InterlockedAnd8_nf:
1734  case clang::AArch64::BI_InterlockedAnd16_nf:
1735  case clang::AArch64::BI_InterlockedAnd_nf:
1736  case clang::AArch64::BI_InterlockedAnd64_nf:
1737    return MSVCIntrin::_InterlockedAnd_nf;
1738  case clang::AArch64::BI_InterlockedIncrement16_acq:
1739  case clang::AArch64::BI_InterlockedIncrement_acq:
1740  case clang::AArch64::BI_InterlockedIncrement64_acq:
1741    return MSVCIntrin::_InterlockedIncrement_acq;
1742  case clang::AArch64::BI_InterlockedIncrement16_rel:
1743  case clang::AArch64::BI_InterlockedIncrement_rel:
1744  case clang::AArch64::BI_InterlockedIncrement64_rel:
1745    return MSVCIntrin::_InterlockedIncrement_rel;
1746  case clang::AArch64::BI_InterlockedIncrement16_nf:
1747  case clang::AArch64::BI_InterlockedIncrement_nf:
1748  case clang::AArch64::BI_InterlockedIncrement64_nf:
1749    return MSVCIntrin::_InterlockedIncrement_nf;
1750  case clang::AArch64::BI_InterlockedDecrement16_acq:
1751  case clang::AArch64::BI_InterlockedDecrement_acq:
1752  case clang::AArch64::BI_InterlockedDecrement64_acq:
1753    return MSVCIntrin::_InterlockedDecrement_acq;
1754  case clang::AArch64::BI_InterlockedDecrement16_rel:
1755  case clang::AArch64::BI_InterlockedDecrement_rel:
1756  case clang::AArch64::BI_InterlockedDecrement64_rel:
1757    return MSVCIntrin::_InterlockedDecrement_rel;
1758  case clang::AArch64::BI_InterlockedDecrement16_nf:
1759  case clang::AArch64::BI_InterlockedDecrement_nf:
1760  case clang::AArch64::BI_InterlockedDecrement64_nf:
1761    return MSVCIntrin::_InterlockedDecrement_nf;
1762  }
1763  llvm_unreachable("must return from switch");
1764}
1765
1766static std::optional<CodeGenFunction::MSVCIntrin>
1767translateX86ToMsvcIntrin(unsigned BuiltinID) {
1768  using MSVCIntrin = CodeGenFunction::MSVCIntrin;
1769  switch (BuiltinID) {
1770  default:
1771    return std::nullopt;
1772  case clang::X86::BI_BitScanForward:
1773  case clang::X86::BI_BitScanForward64:
1774    return MSVCIntrin::_BitScanForward;
1775  case clang::X86::BI_BitScanReverse:
1776  case clang::X86::BI_BitScanReverse64:
1777    return MSVCIntrin::_BitScanReverse;
1778  case clang::X86::BI_InterlockedAnd64:
1779    return MSVCIntrin::_InterlockedAnd;
1780  case clang::X86::BI_InterlockedCompareExchange128:
1781    return MSVCIntrin::_InterlockedCompareExchange128;
1782  case clang::X86::BI_InterlockedExchange64:
1783    return MSVCIntrin::_InterlockedExchange;
1784  case clang::X86::BI_InterlockedExchangeAdd64:
1785    return MSVCIntrin::_InterlockedExchangeAdd;
1786  case clang::X86::BI_InterlockedExchangeSub64:
1787    return MSVCIntrin::_InterlockedExchangeSub;
1788  case clang::X86::BI_InterlockedOr64:
1789    return MSVCIntrin::_InterlockedOr;
1790  case clang::X86::BI_InterlockedXor64:
1791    return MSVCIntrin::_InterlockedXor;
1792  case clang::X86::BI_InterlockedDecrement64:
1793    return MSVCIntrin::_InterlockedDecrement;
1794  case clang::X86::BI_InterlockedIncrement64:
1795    return MSVCIntrin::_InterlockedIncrement;
1796  }
1797  llvm_unreachable("must return from switch");
1798}
1799
1800// Emit an MSVC intrinsic. Assumes that arguments have *not* been evaluated.
1801Value *CodeGenFunction::EmitMSVCBuiltinExpr(MSVCIntrin BuiltinID,
1802                                            const CallExpr *E) {
1803  switch (BuiltinID) {
1804  case MSVCIntrin::_BitScanForward:
1805  case MSVCIntrin::_BitScanReverse: {
1806    Address IndexAddress(EmitPointerWithAlignment(E->getArg(0)));
1807    Value *ArgValue = EmitScalarExpr(E->getArg(1));
1808
1809    llvm::Type *ArgType = ArgValue->getType();
1810    llvm::Type *IndexType = IndexAddress.getElementType();
1811    llvm::Type *ResultType = ConvertType(E->getType());
1812
1813    Value *ArgZero = llvm::Constant::getNullValue(ArgType);
1814    Value *ResZero = llvm::Constant::getNullValue(ResultType);
1815    Value *ResOne = llvm::ConstantInt::get(ResultType, 1);
1816
1817    BasicBlock *Begin = Builder.GetInsertBlock();
1818    BasicBlock *End = createBasicBlock("bitscan_end", this->CurFn);
1819    Builder.SetInsertPoint(End);
1820    PHINode *Result = Builder.CreatePHI(ResultType, 2, "bitscan_result");
1821
1822    Builder.SetInsertPoint(Begin);
1823    Value *IsZero = Builder.CreateICmpEQ(ArgValue, ArgZero);
1824    BasicBlock *NotZero = createBasicBlock("bitscan_not_zero", this->CurFn);
1825    Builder.CreateCondBr(IsZero, End, NotZero);
1826    Result->addIncoming(ResZero, Begin);
1827
1828    Builder.SetInsertPoint(NotZero);
1829
1830    if (BuiltinID == MSVCIntrin::_BitScanForward) {
1831      Function *F = CGM.getIntrinsic(Intrinsic::cttz, ArgType);
1832      Value *ZeroCount = Builder.CreateCall(F, {ArgValue, Builder.getTrue()});
1833      ZeroCount = Builder.CreateIntCast(ZeroCount, IndexType, false);
1834      Builder.CreateStore(ZeroCount, IndexAddress, false);
1835    } else {
1836      unsigned ArgWidth = cast<llvm::IntegerType>(ArgType)->getBitWidth();
1837      Value *ArgTypeLastIndex = llvm::ConstantInt::get(IndexType, ArgWidth - 1);
1838
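      // _BitScanReverse reports the index of the highest set bit, i.e.
      // (ArgWidth - 1) minus the number of leading zeros.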
1839      Function *F = CGM.getIntrinsic(Intrinsic::ctlz, ArgType);
1840      Value *ZeroCount = Builder.CreateCall(F, {ArgValue, Builder.getTrue()});
1841      ZeroCount = Builder.CreateIntCast(ZeroCount, IndexType, false);
1842      Value *Index = Builder.CreateNSWSub(ArgTypeLastIndex, ZeroCount);
1843      Builder.CreateStore(Index, IndexAddress, false);
1844    }
1845    Builder.CreateBr(End);
1846    Result->addIncoming(ResOne, NotZero);
1847
1848    Builder.SetInsertPoint(End);
1849    return Result;
1850  }
1851  case MSVCIntrin::_InterlockedAnd:
1852    return MakeBinaryAtomicValue(*this, AtomicRMWInst::And, E);
1853  case MSVCIntrin::_InterlockedExchange:
1854    return MakeBinaryAtomicValue(*this, AtomicRMWInst::Xchg, E);
1855  case MSVCIntrin::_InterlockedExchangeAdd:
1856    return MakeBinaryAtomicValue(*this, AtomicRMWInst::Add, E);
1857  case MSVCIntrin::_InterlockedExchangeSub:
1858    return MakeBinaryAtomicValue(*this, AtomicRMWInst::Sub, E);
1859  case MSVCIntrin::_InterlockedOr:
1860    return MakeBinaryAtomicValue(*this, AtomicRMWInst::Or, E);
1861  case MSVCIntrin::_InterlockedXor:
1862    return MakeBinaryAtomicValue(*this, AtomicRMWInst::Xor, E);
1863  case MSVCIntrin::_InterlockedExchangeAdd_acq:
1864    return MakeBinaryAtomicValue(*this, AtomicRMWInst::Add, E,
1865                                 AtomicOrdering::Acquire);
1866  case MSVCIntrin::_InterlockedExchangeAdd_rel:
1867    return MakeBinaryAtomicValue(*this, AtomicRMWInst::Add, E,
1868                                 AtomicOrdering::Release);
1869  case MSVCIntrin::_InterlockedExchangeAdd_nf:
1870    return MakeBinaryAtomicValue(*this, AtomicRMWInst::Add, E,
1871                                 AtomicOrdering::Monotonic);
1872  case MSVCIntrin::_InterlockedExchange_acq:
1873    return MakeBinaryAtomicValue(*this, AtomicRMWInst::Xchg, E,
1874                                 AtomicOrdering::Acquire);
1875  case MSVCIntrin::_InterlockedExchange_rel:
1876    return MakeBinaryAtomicValue(*this, AtomicRMWInst::Xchg, E,
1877                                 AtomicOrdering::Release);
1878  case MSVCIntrin::_InterlockedExchange_nf:
1879    return MakeBinaryAtomicValue(*this, AtomicRMWInst::Xchg, E,
1880                                 AtomicOrdering::Monotonic);
1881  case MSVCIntrin::_InterlockedCompareExchange_acq:
1882    return EmitAtomicCmpXchgForMSIntrin(*this, E, AtomicOrdering::Acquire);
1883  case MSVCIntrin::_InterlockedCompareExchange_rel:
1884    return EmitAtomicCmpXchgForMSIntrin(*this, E, AtomicOrdering::Release);
1885  case MSVCIntrin::_InterlockedCompareExchange_nf:
1886    return EmitAtomicCmpXchgForMSIntrin(*this, E, AtomicOrdering::Monotonic);
1887  case MSVCIntrin::_InterlockedCompareExchange128:
1888    return EmitAtomicCmpXchg128ForMSIntrin(
1889        *this, E, AtomicOrdering::SequentiallyConsistent);
1890  case MSVCIntrin::_InterlockedCompareExchange128_acq:
1891    return EmitAtomicCmpXchg128ForMSIntrin(*this, E, AtomicOrdering::Acquire);
1892  case MSVCIntrin::_InterlockedCompareExchange128_rel:
1893    return EmitAtomicCmpXchg128ForMSIntrin(*this, E, AtomicOrdering::Release);
1894  case MSVCIntrin::_InterlockedCompareExchange128_nf:
1895    return EmitAtomicCmpXchg128ForMSIntrin(*this, E, AtomicOrdering::Monotonic);
1896  case MSVCIntrin::_InterlockedOr_acq:
1897    return MakeBinaryAtomicValue(*this, AtomicRMWInst::Or, E,
1898                                 AtomicOrdering::Acquire);
1899  case MSVCIntrin::_InterlockedOr_rel:
1900    return MakeBinaryAtomicValue(*this, AtomicRMWInst::Or, E,
1901                                 AtomicOrdering::Release);
1902  case MSVCIntrin::_InterlockedOr_nf:
1903    return MakeBinaryAtomicValue(*this, AtomicRMWInst::Or, E,
1904                                 AtomicOrdering::Monotonic);
1905  case MSVCIntrin::_InterlockedXor_acq:
1906    return MakeBinaryAtomicValue(*this, AtomicRMWInst::Xor, E,
1907                                 AtomicOrdering::Acquire);
1908  case MSVCIntrin::_InterlockedXor_rel:
1909    return MakeBinaryAtomicValue(*this, AtomicRMWInst::Xor, E,
1910                                 AtomicOrdering::Release);
1911  case MSVCIntrin::_InterlockedXor_nf:
1912    return MakeBinaryAtomicValue(*this, AtomicRMWInst::Xor, E,
1913                                 AtomicOrdering::Monotonic);
1914  case MSVCIntrin::_InterlockedAnd_acq:
1915    return MakeBinaryAtomicValue(*this, AtomicRMWInst::And, E,
1916                                 AtomicOrdering::Acquire);
1917  case MSVCIntrin::_InterlockedAnd_rel:
1918    return MakeBinaryAtomicValue(*this, AtomicRMWInst::And, E,
1919                                 AtomicOrdering::Release);
1920  case MSVCIntrin::_InterlockedAnd_nf:
1921    return MakeBinaryAtomicValue(*this, AtomicRMWInst::And, E,
1922                                 AtomicOrdering::Monotonic);
1923  case MSVCIntrin::_InterlockedIncrement_acq:
1924    return EmitAtomicIncrementValue(*this, E, AtomicOrdering::Acquire);
1925  case MSVCIntrin::_InterlockedIncrement_rel:
1926    return EmitAtomicIncrementValue(*this, E, AtomicOrdering::Release);
1927  case MSVCIntrin::_InterlockedIncrement_nf:
1928    return EmitAtomicIncrementValue(*this, E, AtomicOrdering::Monotonic);
1929  case MSVCIntrin::_InterlockedDecrement_acq:
1930    return EmitAtomicDecrementValue(*this, E, AtomicOrdering::Acquire);
1931  case MSVCIntrin::_InterlockedDecrement_rel:
1932    return EmitAtomicDecrementValue(*this, E, AtomicOrdering::Release);
1933  case MSVCIntrin::_InterlockedDecrement_nf:
1934    return EmitAtomicDecrementValue(*this, E, AtomicOrdering::Monotonic);
1935
1936  case MSVCIntrin::_InterlockedDecrement:
1937    return EmitAtomicDecrementValue(*this, E);
1938  case MSVCIntrin::_InterlockedIncrement:
1939    return EmitAtomicIncrementValue(*this, E);
1940
1941  case MSVCIntrin::__fastfail: {
1942    // Request immediate process termination from the kernel. The instruction
1943    // sequences to do this are documented on MSDN:
1944    // https://msdn.microsoft.com/en-us/library/dn774154.aspx
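    // The fast-fail code (the call's only argument) is pinned to the register
    // named in the constraint string below: {cx} on x86 and x86_64, {r0} on
    // Thumb, and {w0} on AArch64.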
1945    llvm::Triple::ArchType ISA = getTarget().getTriple().getArch();
1946    StringRef Asm, Constraints;
1947    switch (ISA) {
1948    default:
1949      ErrorUnsupported(E, "__fastfail call for this architecture");
1950      break;
1951    case llvm::Triple::x86:
1952    case llvm::Triple::x86_64:
1953      Asm = "int $$0x29";
1954      Constraints = "{cx}";
1955      break;
1956    case llvm::Triple::thumb:
1957      Asm = "udf #251";
1958      Constraints = "{r0}";
1959      break;
1960    case llvm::Triple::aarch64:
1961      Asm = "brk #0xF003";
1962      Constraints = "{w0}";
1963    }
1964    llvm::FunctionType *FTy = llvm::FunctionType::get(VoidTy, {Int32Ty}, false);
1965    llvm::InlineAsm *IA =
1966        llvm::InlineAsm::get(FTy, Asm, Constraints, /*hasSideEffects=*/true);
1967    llvm::AttributeList NoReturnAttr = llvm::AttributeList::get(
1968        getLLVMContext(), llvm::AttributeList::FunctionIndex,
1969        llvm::Attribute::NoReturn);
1970    llvm::CallInst *CI = Builder.CreateCall(IA, EmitScalarExpr(E->getArg(0)));
1971    CI->setAttributes(NoReturnAttr);
1972    return CI;
1973  }
1974  }
1975  llvm_unreachable("Incorrect MSVC intrinsic!");
1976}
1977
1978namespace {
1979// ARC cleanup for __builtin_os_log_format
1980struct CallObjCArcUse final : EHScopeStack::Cleanup {
1981  CallObjCArcUse(llvm::Value *object) : object(object) {}
1982  llvm::Value *object;
1983
1984  void Emit(CodeGenFunction &CGF, Flags flags) override {
1985    CGF.EmitARCIntrinsicUse(object);
1986  }
1987};
1988}
1989
1990Value *CodeGenFunction::EmitCheckedArgForBuiltin(const Expr *E,
1991                                                 BuiltinCheckKind Kind) {
1992  assert((Kind == BCK_CLZPassedZero || Kind == BCK_CTZPassedZero)
1993          && "Unsupported builtin check kind");
1994
1995  Value *ArgValue = EmitScalarExpr(E);
1996  if (!SanOpts.has(SanitizerKind::Builtin))
1997    return ArgValue;
1998
1999  SanitizerScope SanScope(this);
2000  Value *Cond = Builder.CreateICmpNE(
2001      ArgValue, llvm::Constant::getNullValue(ArgValue->getType()));
2002  EmitCheck(std::make_pair(Cond, SanitizerKind::Builtin),
2003            SanitizerHandler::InvalidBuiltin,
2004            {EmitCheckSourceLocation(E->getExprLoc()),
2005             llvm::ConstantInt::get(Builder.getInt8Ty(), Kind)},
2006            std::nullopt);
2007  return ArgValue;
2008}
2009
2010static Value *EmitAbs(CodeGenFunction &CGF, Value *ArgValue, bool HasNSW) {
2011  return CGF.Builder.CreateBinaryIntrinsic(
2012      Intrinsic::abs, ArgValue,
2013      ConstantInt::get(CGF.Builder.getInt1Ty(), HasNSW));
2014}
2015
2016static Value *EmitOverflowCheckedAbs(CodeGenFunction &CGF, const CallExpr *E,
2017                                     bool SanitizeOverflow) {
2018  Value *ArgValue = CGF.EmitScalarExpr(E->getArg(0));
2019
2020  // Try to eliminate overflow check.
2021  if (const auto *VCI = dyn_cast<llvm::ConstantInt>(ArgValue)) {
2022    if (!VCI->isMinSignedValue())
2023      return EmitAbs(CGF, ArgValue, true);
2024  }
2025
2026  CodeGenFunction::SanitizerScope SanScope(&CGF);
2027
2028  Constant *Zero = Constant::getNullValue(ArgValue->getType());
2029  Value *ResultAndOverflow = CGF.Builder.CreateBinaryIntrinsic(
2030      Intrinsic::ssub_with_overflow, Zero, ArgValue);
2031  Value *Result = CGF.Builder.CreateExtractValue(ResultAndOverflow, 0);
2032  Value *NotOverflow = CGF.Builder.CreateNot(
2033      CGF.Builder.CreateExtractValue(ResultAndOverflow, 1));
2034
2035  // TODO: support -ftrapv-handler.
2036  if (SanitizeOverflow) {
2037    CGF.EmitCheck({{NotOverflow, SanitizerKind::SignedIntegerOverflow}},
2038                  SanitizerHandler::NegateOverflow,
2039                  {CGF.EmitCheckSourceLocation(E->getArg(0)->getExprLoc()),
2040                   CGF.EmitCheckTypeDescriptor(E->getType())},
2041                  {ArgValue});
2042  } else
2043    CGF.EmitTrapCheck(NotOverflow, SanitizerHandler::SubOverflow);
2044
2045  Value *CmpResult = CGF.Builder.CreateICmpSLT(ArgValue, Zero, "abscond");
2046  return CGF.Builder.CreateSelect(CmpResult, Result, ArgValue, "abs");
2047}
2048
2049/// Get the argument type for arguments to os_log_helper.
2050static CanQualType getOSLogArgType(ASTContext &C, int Size) {
2051  QualType UnsignedTy = C.getIntTypeForBitwidth(Size * 8, /*Signed=*/false);
2052  return C.getCanonicalType(UnsignedTy);
2053}
2054
2055llvm::Function *CodeGenFunction::generateBuiltinOSLogHelperFunction(
2056    const analyze_os_log::OSLogBufferLayout &Layout,
2057    CharUnits BufferAlignment) {
2058  ASTContext &Ctx = getContext();
2059
2060  llvm::SmallString<64> Name;
2061  {
2062    raw_svector_ostream OS(Name);
2063    OS << "__os_log_helper";
2064    OS << "_" << BufferAlignment.getQuantity();
2065    OS << "_" << int(Layout.getSummaryByte());
2066    OS << "_" << int(Layout.getNumArgsByte());
2067    for (const auto &Item : Layout.Items)
2068      OS << "_" << int(Item.getSizeByte()) << "_"
2069         << int(Item.getDescriptorByte());
2070  }
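  // The name encodes the buffer alignment, the summary and argument-count
  // bytes, and each item's size and descriptor bytes, e.g. a name such as
  // "__os_log_helper_1_2_1_8_34" (values illustrative only).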
2071
2072  if (llvm::Function *F = CGM.getModule().getFunction(Name))
2073    return F;
2074
2075  llvm::SmallVector<QualType, 4> ArgTys;
2076  FunctionArgList Args;
2077  Args.push_back(ImplicitParamDecl::Create(
2078      Ctx, nullptr, SourceLocation(), &Ctx.Idents.get("buffer"), Ctx.VoidPtrTy,
2079      ImplicitParamKind::Other));
2080  ArgTys.emplace_back(Ctx.VoidPtrTy);
2081
2082  for (unsigned int I = 0, E = Layout.Items.size(); I < E; ++I) {
2083    char Size = Layout.Items[I].getSizeByte();
2084    if (!Size)
2085      continue;
2086
2087    QualType ArgTy = getOSLogArgType(Ctx, Size);
2088    Args.push_back(ImplicitParamDecl::Create(
2089        Ctx, nullptr, SourceLocation(),
2090        &Ctx.Idents.get(std::string("arg") + llvm::to_string(I)), ArgTy,
2091        ImplicitParamKind::Other));
2092    ArgTys.emplace_back(ArgTy);
2093  }
2094
2095  QualType ReturnTy = Ctx.VoidTy;
2096
2097  // The helper function has linkonce_odr linkage to enable the linker to merge
2098  // identical functions. To ensure the merging always happens, 'noinline' is
2099  // attached to the function when compiling with -Oz.
2100  const CGFunctionInfo &FI =
2101      CGM.getTypes().arrangeBuiltinFunctionDeclaration(ReturnTy, Args);
2102  llvm::FunctionType *FuncTy = CGM.getTypes().GetFunctionType(FI);
2103  llvm::Function *Fn = llvm::Function::Create(
2104      FuncTy, llvm::GlobalValue::LinkOnceODRLinkage, Name, &CGM.getModule());
2105  Fn->setVisibility(llvm::GlobalValue::HiddenVisibility);
2106  CGM.SetLLVMFunctionAttributes(GlobalDecl(), FI, Fn, /*IsThunk=*/false);
2107  CGM.SetLLVMFunctionAttributesForDefinition(nullptr, Fn);
2108  Fn->setDoesNotThrow();
2109
2110  // Attach 'noinline' at -Oz.
2111  if (CGM.getCodeGenOpts().OptimizeSize == 2)
2112    Fn->addFnAttr(llvm::Attribute::NoInline);
2113
2114  auto NL = ApplyDebugLocation::CreateEmpty(*this);
2115  StartFunction(GlobalDecl(), ReturnTy, Fn, FI, Args);
2116
2117  // Create a scope with an artificial location for the body of this function.
2118  auto AL = ApplyDebugLocation::CreateArtificial(*this);
2119
2120  CharUnits Offset;
2121  Address BufAddr =
2122      Address(Builder.CreateLoad(GetAddrOfLocalVar(Args[0]), "buf"), Int8Ty,
2123              BufferAlignment);
2124  Builder.CreateStore(Builder.getInt8(Layout.getSummaryByte()),
2125                      Builder.CreateConstByteGEP(BufAddr, Offset++, "summary"));
2126  Builder.CreateStore(Builder.getInt8(Layout.getNumArgsByte()),
2127                      Builder.CreateConstByteGEP(BufAddr, Offset++, "numArgs"));
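  // The serialized buffer layout is therefore [summary][numArgs] followed by
  // [descriptor][size][data...] for each item, written at increasing offsets.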
2128
2129  unsigned I = 1;
2130  for (const auto &Item : Layout.Items) {
2131    Builder.CreateStore(
2132        Builder.getInt8(Item.getDescriptorByte()),
2133        Builder.CreateConstByteGEP(BufAddr, Offset++, "argDescriptor"));
2134    Builder.CreateStore(
2135        Builder.getInt8(Item.getSizeByte()),
2136        Builder.CreateConstByteGEP(BufAddr, Offset++, "argSize"));
2137
2138    CharUnits Size = Item.size();
2139    if (!Size.getQuantity())
2140      continue;
2141
2142    Address Arg = GetAddrOfLocalVar(Args[I]);
2143    Address Addr = Builder.CreateConstByteGEP(BufAddr, Offset, "argData");
2144    Addr = Addr.withElementType(Arg.getElementType());
2145    Builder.CreateStore(Builder.CreateLoad(Arg), Addr);
2146    Offset += Size;
2147    ++I;
2148  }
2149
2150  FinishFunction();
2151
2152  return Fn;
2153}
2154
2155RValue CodeGenFunction::emitBuiltinOSLogFormat(const CallExpr &E) {
2156  assert(E.getNumArgs() >= 2 &&
2157         "__builtin_os_log_format takes at least 2 arguments");
2158  ASTContext &Ctx = getContext();
2159  analyze_os_log::OSLogBufferLayout Layout;
2160  analyze_os_log::computeOSLogBufferLayout(Ctx, &E, Layout);
2161  Address BufAddr = EmitPointerWithAlignment(E.getArg(0));
2162  llvm::SmallVector<llvm::Value *, 4> RetainableOperands;
2163
2164  // Ignore argument 1, the format string. It is not currently used.
2165  CallArgList Args;
2166  Args.add(RValue::get(BufAddr.getPointer()), Ctx.VoidPtrTy);
2167
2168  for (const auto &Item : Layout.Items) {
2169    int Size = Item.getSizeByte();
2170    if (!Size)
2171      continue;
2172
2173    llvm::Value *ArgVal;
2174
2175    if (Item.getKind() == analyze_os_log::OSLogBufferItem::MaskKind) {
2176      uint64_t Val = 0;
2177      for (unsigned I = 0, E = Item.getMaskType().size(); I < E; ++I)
2178        Val |= ((uint64_t)Item.getMaskType()[I]) << I * 8;
2179      ArgVal = llvm::Constant::getIntegerValue(Int64Ty, llvm::APInt(64, Val));
2180    } else if (const Expr *TheExpr = Item.getExpr()) {
2181      ArgVal = EmitScalarExpr(TheExpr, /*Ignore*/ false);
2182
2183      // If a temporary object that requires destruction after the full
2184      // expression is passed, push a lifetime-extended cleanup to extend its
2185      // lifetime to the end of the enclosing block scope.
2186      auto LifetimeExtendObject = [&](const Expr *E) {
2187        E = E->IgnoreParenCasts();
2188        // Extend lifetimes of objects returned by function calls and message
2189        // sends.
2190
2191        // FIXME: We should do this in other cases in which temporaries are
2192        //        created including arguments of non-ARC types (e.g., C++
2193        //        temporaries).
2194        if (isa<CallExpr>(E) || isa<ObjCMessageExpr>(E))
2195          return true;
2196        return false;
2197      };
2198
2199      if (TheExpr->getType()->isObjCRetainableType() &&
2200          getLangOpts().ObjCAutoRefCount && LifetimeExtendObject(TheExpr)) {
2201        assert(getEvaluationKind(TheExpr->getType()) == TEK_Scalar &&
2202               "Only a scalar can be an ObjC retainable type");
2203        if (!isa<Constant>(ArgVal)) {
2204          CleanupKind Cleanup = getARCCleanupKind();
2205          QualType Ty = TheExpr->getType();
2206          Address Alloca = Address::invalid();
2207          Address Addr = CreateMemTemp(Ty, "os.log.arg", &Alloca);
2208          ArgVal = EmitARCRetain(Ty, ArgVal);
2209          Builder.CreateStore(ArgVal, Addr);
2210          pushLifetimeExtendedDestroy(Cleanup, Alloca, Ty,
2211                                      CodeGenFunction::destroyARCStrongPrecise,
2212                                      Cleanup & EHCleanup);
2213
2214          // Push a clang.arc.use call to ensure the ARC optimizer knows that
2215          // the argument has to be kept alive.
2216          if (CGM.getCodeGenOpts().OptimizationLevel != 0)
2217            pushCleanupAfterFullExpr<CallObjCArcUse>(Cleanup, ArgVal);
2218        }
2219      }
2220    } else {
2221      ArgVal = Builder.getInt32(Item.getConstValue().getQuantity());
2222    }
2223
2224    unsigned ArgValSize =
2225        CGM.getDataLayout().getTypeSizeInBits(ArgVal->getType());
2226    llvm::IntegerType *IntTy = llvm::Type::getIntNTy(getLLVMContext(),
2227                                                     ArgValSize);
2228    ArgVal = Builder.CreateBitOrPointerCast(ArgVal, IntTy);
2229    CanQualType ArgTy = getOSLogArgType(Ctx, Size);
2230    // If ArgVal has type x86_fp80, zero-extend ArgVal.
2231    ArgVal = Builder.CreateZExtOrBitCast(ArgVal, ConvertType(ArgTy));
2232    Args.add(RValue::get(ArgVal), ArgTy);
2233  }
2234
2235  const CGFunctionInfo &FI =
2236      CGM.getTypes().arrangeBuiltinFunctionCall(Ctx.VoidTy, Args);
2237  llvm::Function *F = CodeGenFunction(CGM).generateBuiltinOSLogHelperFunction(
2238      Layout, BufAddr.getAlignment());
2239  EmitCall(FI, CGCallee::forDirect(F), ReturnValueSlot(), Args);
2240  return RValue::get(BufAddr.getPointer());
2241}
2242
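/// Determine if this is an unsigned multiply whose operands and signed result
/// all have the same width, e.g. __builtin_mul_overflow(unsigned, unsigned,
/// int *), which can be specialized below.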
2243static bool isSpecialUnsignedMultiplySignedResult(
2244    unsigned BuiltinID, WidthAndSignedness Op1Info, WidthAndSignedness Op2Info,
2245    WidthAndSignedness ResultInfo) {
2246  return BuiltinID == Builtin::BI__builtin_mul_overflow &&
2247         Op1Info.Width == Op2Info.Width && Op2Info.Width == ResultInfo.Width &&
2248         !Op1Info.Signed && !Op2Info.Signed && ResultInfo.Signed;
2249}
2250
2251static RValue EmitCheckedUnsignedMultiplySignedResult(
2252    CodeGenFunction &CGF, const clang::Expr *Op1, WidthAndSignedness Op1Info,
2253    const clang::Expr *Op2, WidthAndSignedness Op2Info,
2254    const clang::Expr *ResultArg, QualType ResultQTy,
2255    WidthAndSignedness ResultInfo) {
2256  assert(isSpecialUnsignedMultiplySignedResult(
2257             Builtin::BI__builtin_mul_overflow, Op1Info, Op2Info, ResultInfo) &&
2258         "Cannot specialize this multiply");
2259
2260  llvm::Value *V1 = CGF.EmitScalarExpr(Op1);
2261  llvm::Value *V2 = CGF.EmitScalarExpr(Op2);
2262
2263  llvm::Value *HasOverflow;
2264  llvm::Value *Result = EmitOverflowIntrinsic(
2265      CGF, llvm::Intrinsic::umul_with_overflow, V1, V2, HasOverflow);
2266
2267  // The intrinsic call will detect overflow when the value is > UINT_MAX;
2268  // however, since the original builtin had a signed result, we also need to
2269  // report an overflow when the result is greater than INT_MAX.
2270  auto IntMax = llvm::APInt::getSignedMaxValue(ResultInfo.Width);
2271  llvm::Value *IntMaxValue = llvm::ConstantInt::get(Result->getType(), IntMax);
2272
2273  llvm::Value *IntMaxOverflow = CGF.Builder.CreateICmpUGT(Result, IntMaxValue);
2274  HasOverflow = CGF.Builder.CreateOr(HasOverflow, IntMaxOverflow);
2275
2276  bool isVolatile =
2277      ResultArg->getType()->getPointeeType().isVolatileQualified();
2278  Address ResultPtr = CGF.EmitPointerWithAlignment(ResultArg);
2279  CGF.Builder.CreateStore(CGF.EmitToMemory(Result, ResultQTy), ResultPtr,
2280                          isVolatile);
2281  return RValue::get(HasOverflow);
2282}
2283
2284/// Determine if a binop is a checked mixed-sign multiply we can specialize.
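/// For example, __builtin_mul_overflow(int64_t, uint64_t, int *) multiplies
/// operands of mixed signedness whose width is at least the result's width.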
2285static bool isSpecialMixedSignMultiply(unsigned BuiltinID,
2286                                       WidthAndSignedness Op1Info,
2287                                       WidthAndSignedness Op2Info,
2288                                       WidthAndSignedness ResultInfo) {
2289  return BuiltinID == Builtin::BI__builtin_mul_overflow &&
2290         std::max(Op1Info.Width, Op2Info.Width) >= ResultInfo.Width &&
2291         Op1Info.Signed != Op2Info.Signed;
2292}
2293
2294/// Emit a checked mixed-sign multiply. This is a cheaper specialization of
2295/// the generic checked-binop irgen.
2296static RValue
2297EmitCheckedMixedSignMultiply(CodeGenFunction &CGF, const clang::Expr *Op1,
2298                             WidthAndSignedness Op1Info, const clang::Expr *Op2,
2299                             WidthAndSignedness Op2Info,
2300                             const clang::Expr *ResultArg, QualType ResultQTy,
2301                             WidthAndSignedness ResultInfo) {
2302  assert(isSpecialMixedSignMultiply(Builtin::BI__builtin_mul_overflow, Op1Info,
2303                                    Op2Info, ResultInfo) &&
2304         "Not a mixed-sign multiplication we can specialize");
2305
2306  // Emit the signed and unsigned operands.
2307  const clang::Expr *SignedOp = Op1Info.Signed ? Op1 : Op2;
2308  const clang::Expr *UnsignedOp = Op1Info.Signed ? Op2 : Op1;
2309  llvm::Value *Signed = CGF.EmitScalarExpr(SignedOp);
2310  llvm::Value *Unsigned = CGF.EmitScalarExpr(UnsignedOp);
2311  unsigned SignedOpWidth = Op1Info.Signed ? Op1Info.Width : Op2Info.Width;
2312  unsigned UnsignedOpWidth = Op1Info.Signed ? Op2Info.Width : Op1Info.Width;
2313
2314  // One of the operands may be smaller than the other. If so, [s|z]ext it.
2315  if (SignedOpWidth < UnsignedOpWidth)
2316    Signed = CGF.Builder.CreateSExt(Signed, Unsigned->getType(), "op.sext");
2317  if (UnsignedOpWidth < SignedOpWidth)
2318    Unsigned = CGF.Builder.CreateZExt(Unsigned, Signed->getType(), "op.zext");
2319
2320  llvm::Type *OpTy = Signed->getType();
2321  llvm::Value *Zero = llvm::Constant::getNullValue(OpTy);
2322  Address ResultPtr = CGF.EmitPointerWithAlignment(ResultArg);
2323  llvm::Type *ResTy = ResultPtr.getElementType();
2324  unsigned OpWidth = std::max(Op1Info.Width, Op2Info.Width);
2325
2326  // Take the absolute value of the signed operand.
2327  llvm::Value *IsNegative = CGF.Builder.CreateICmpSLT(Signed, Zero);
2328  llvm::Value *AbsOfNegative = CGF.Builder.CreateSub(Zero, Signed);
2329  llvm::Value *AbsSigned =
2330      CGF.Builder.CreateSelect(IsNegative, AbsOfNegative, Signed);
2331
2332  // Perform a checked unsigned multiplication.
2333  llvm::Value *UnsignedOverflow;
2334  llvm::Value *UnsignedResult =
2335      EmitOverflowIntrinsic(CGF, llvm::Intrinsic::umul_with_overflow, AbsSigned,
2336                            Unsigned, UnsignedOverflow);
2337
2338  llvm::Value *Overflow, *Result;
2339  if (ResultInfo.Signed) {
2340    // Signed overflow occurs if the result is greater than INT_MAX or less
2341    // than INT_MIN, i.e. when |Result| > (INT_MAX + IsNegative).
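    // For instance, with a 32-bit signed result a positive product may be at
    // most 2^31 - 1, while a negative product may reach magnitude 2^31.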
2342    auto IntMax =
2343        llvm::APInt::getSignedMaxValue(ResultInfo.Width).zext(OpWidth);
2344    llvm::Value *MaxResult =
2345        CGF.Builder.CreateAdd(llvm::ConstantInt::get(OpTy, IntMax),
2346                              CGF.Builder.CreateZExt(IsNegative, OpTy));
2347    llvm::Value *SignedOverflow =
2348        CGF.Builder.CreateICmpUGT(UnsignedResult, MaxResult);
2349    Overflow = CGF.Builder.CreateOr(UnsignedOverflow, SignedOverflow);
2350
2351    // Prepare the signed result (possibly by negating it).
2352    llvm::Value *NegativeResult = CGF.Builder.CreateNeg(UnsignedResult);
2353    llvm::Value *SignedResult =
2354        CGF.Builder.CreateSelect(IsNegative, NegativeResult, UnsignedResult);
2355    Result = CGF.Builder.CreateTrunc(SignedResult, ResTy);
2356  } else {
2357    // Unsigned overflow occurs if the result is < 0 or greater than UINT_MAX.
2358    llvm::Value *Underflow = CGF.Builder.CreateAnd(
2359        IsNegative, CGF.Builder.CreateIsNotNull(UnsignedResult));
2360    Overflow = CGF.Builder.CreateOr(UnsignedOverflow, Underflow);
2361    if (ResultInfo.Width < OpWidth) {
2362      auto IntMax =
2363          llvm::APInt::getMaxValue(ResultInfo.Width).zext(OpWidth);
2364      llvm::Value *TruncOverflow = CGF.Builder.CreateICmpUGT(
2365          UnsignedResult, llvm::ConstantInt::get(OpTy, IntMax));
2366      Overflow = CGF.Builder.CreateOr(Overflow, TruncOverflow);
2367    }
2368
2369    // Negate the product if it would be negative in infinite precision.
2370    Result = CGF.Builder.CreateSelect(
2371        IsNegative, CGF.Builder.CreateNeg(UnsignedResult), UnsignedResult);
2372
2373    Result = CGF.Builder.CreateTrunc(Result, ResTy);
2374  }
2375  assert(Overflow && Result && "Missing overflow or result");
2376
2377  bool isVolatile =
2378      ResultArg->getType()->getPointeeType().isVolatileQualified();
2379  CGF.Builder.CreateStore(CGF.EmitToMemory(Result, ResultQTy), ResultPtr,
2380                          isVolatile);
2381  return RValue::get(Overflow);
2382}
2383
2384static bool
2385TypeRequiresBuiltinLaunderImp(const ASTContext &Ctx, QualType Ty,
2386                              llvm::SmallPtrSetImpl<const Decl *> &Seen) {
2387  if (const auto *Arr = Ctx.getAsArrayType(Ty))
2388    Ty = Ctx.getBaseElementType(Arr);
2389
2390  const auto *Record = Ty->getAsCXXRecordDecl();
2391  if (!Record)
2392    return false;
2393
2394  // We've already checked this type, or are in the process of checking it.
2395  if (!Seen.insert(Record).second)
2396    return false;
2397
2398  assert(Record->hasDefinition() &&
2399         "Incomplete types should already be diagnosed");
2400
2401  if (Record->isDynamicClass())
2402    return true;
2403
2404  for (FieldDecl *F : Record->fields()) {
2405    if (TypeRequiresBuiltinLaunderImp(Ctx, F->getType(), Seen))
2406      return true;
2407  }
2408  return false;
2409}
2410
2411/// Determine if the specified type requires laundering by checking if it is a
2412/// dynamic class type or contains a subobject which is a dynamic class type.
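/// For example, under -fstrict-vtable-pointers a class with virtual functions
/// (or any type containing such a class as a subobject) requires laundering.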
2413static bool TypeRequiresBuiltinLaunder(CodeGenModule &CGM, QualType Ty) {
2414  if (!CGM.getCodeGenOpts().StrictVTablePointers)
2415    return false;
2416  llvm::SmallPtrSet<const Decl *, 16> Seen;
2417  return TypeRequiresBuiltinLaunderImp(CGM.getContext(), Ty, Seen);
2418}
2419
2420RValue CodeGenFunction::emitRotate(const CallExpr *E, bool IsRotateRight) {
2421  llvm::Value *Src = EmitScalarExpr(E->getArg(0));
2422  llvm::Value *ShiftAmt = EmitScalarExpr(E->getArg(1));
2423
2424  // The builtin's shift arg may have a different type than the source arg and
2425  // result, but the LLVM intrinsic uses the same type for all values.
2426  llvm::Type *Ty = Src->getType();
2427  ShiftAmt = Builder.CreateIntCast(ShiftAmt, Ty, false);
2428
2429  // Rotate is a special case of LLVM funnel shift - 1st 2 args are the same.
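  // For example, a left rotate lowers to llvm.fshl(Src, Src, ShiftAmt) and a
  // right rotate lowers to llvm.fshr(Src, Src, ShiftAmt).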
2430  unsigned IID = IsRotateRight ? Intrinsic::fshr : Intrinsic::fshl;
2431  Function *F = CGM.getIntrinsic(IID, Ty);
2432  return RValue::get(Builder.CreateCall(F, { Src, Src, ShiftAmt }));
2433}
2434
2435// Map math builtins for long-double to f128 version.
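// For example, __builtin_sqrtl becomes __builtin_sqrtf128 when the target's
// long double uses the IEEE 128-bit (f128) format.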
2436static unsigned mutateLongDoubleBuiltin(unsigned BuiltinID) {
2437  switch (BuiltinID) {
2438#define MUTATE_LDBL(func) \
2439  case Builtin::BI__builtin_##func##l: \
2440    return Builtin::BI__builtin_##func##f128;
2441  MUTATE_LDBL(sqrt)
2442  MUTATE_LDBL(cbrt)
2443  MUTATE_LDBL(fabs)
2444  MUTATE_LDBL(log)
2445  MUTATE_LDBL(log2)
2446  MUTATE_LDBL(log10)
2447  MUTATE_LDBL(log1p)
2448  MUTATE_LDBL(logb)
2449  MUTATE_LDBL(exp)
2450  MUTATE_LDBL(exp2)
2451  MUTATE_LDBL(expm1)
2452  MUTATE_LDBL(fdim)
2453  MUTATE_LDBL(hypot)
2454  MUTATE_LDBL(ilogb)
2455  MUTATE_LDBL(pow)
2456  MUTATE_LDBL(fmin)
2457  MUTATE_LDBL(fmax)
2458  MUTATE_LDBL(ceil)
2459  MUTATE_LDBL(trunc)
2460  MUTATE_LDBL(rint)
2461  MUTATE_LDBL(nearbyint)
2462  MUTATE_LDBL(round)
2463  MUTATE_LDBL(floor)
2464  MUTATE_LDBL(lround)
2465  MUTATE_LDBL(llround)
2466  MUTATE_LDBL(lrint)
2467  MUTATE_LDBL(llrint)
2468  MUTATE_LDBL(fmod)
2469  MUTATE_LDBL(modf)
2470  MUTATE_LDBL(nan)
2471  MUTATE_LDBL(nans)
2472  MUTATE_LDBL(inf)
2473  MUTATE_LDBL(fma)
2474  MUTATE_LDBL(sin)
2475  MUTATE_LDBL(cos)
2476  MUTATE_LDBL(tan)
2477  MUTATE_LDBL(sinh)
2478  MUTATE_LDBL(cosh)
2479  MUTATE_LDBL(tanh)
2480  MUTATE_LDBL(asin)
2481  MUTATE_LDBL(acos)
2482  MUTATE_LDBL(atan)
2483  MUTATE_LDBL(asinh)
2484  MUTATE_LDBL(acosh)
2485  MUTATE_LDBL(atanh)
2486  MUTATE_LDBL(atan2)
2487  MUTATE_LDBL(erf)
2488  MUTATE_LDBL(erfc)
2489  MUTATE_LDBL(ldexp)
2490  MUTATE_LDBL(frexp)
2491  MUTATE_LDBL(huge_val)
2492  MUTATE_LDBL(copysign)
2493  MUTATE_LDBL(nextafter)
2494  MUTATE_LDBL(nexttoward)
2495  MUTATE_LDBL(remainder)
2496  MUTATE_LDBL(remquo)
2497  MUTATE_LDBL(scalbln)
2498  MUTATE_LDBL(scalbn)
2499  MUTATE_LDBL(tgamma)
2500  MUTATE_LDBL(lgamma)
2501#undef MUTATE_LDBL
2502  default:
2503    return BuiltinID;
2504  }
2505}
2506
2507static Value *tryUseTestFPKind(CodeGenFunction &CGF, unsigned BuiltinID,
2508                               Value *V) {
2509  if (CGF.Builder.getIsFPConstrained() &&
2510      CGF.Builder.getDefaultConstrainedExcept() != fp::ebIgnore) {
2511    if (Value *Result =
2512            CGF.getTargetHooks().testFPKind(V, BuiltinID, CGF.Builder, CGF.CGM))
2513      return Result;
2514  }
2515  return nullptr;
2516}
2517
2518static RValue EmitHipStdParUnsupportedBuiltin(CodeGenFunction *CGF,
2519                                              const FunctionDecl *FD) {
2520  auto Name = FD->getNameAsString() + "__hipstdpar_unsupported";
2521  auto FnTy = CGF->CGM.getTypes().GetFunctionType(FD);
2522  auto UBF = CGF->CGM.getModule().getOrInsertFunction(Name, FnTy);
2523
2524  SmallVector<Value *, 16> Args;
2525  for (auto &&FormalTy : FnTy->params())
2526    Args.push_back(llvm::PoisonValue::get(FormalTy));
2527
2528  return RValue::get(CGF->Builder.CreateCall(UBF, Args));
2529}
2530
2531RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
2532                                        const CallExpr *E,
2533                                        ReturnValueSlot ReturnValue) {
2534  const FunctionDecl *FD = GD.getDecl()->getAsFunction();
2535  // See if we can constant fold this builtin.  If so, don't emit it at all.
2536  // TODO: Extend this handling to all builtin calls that we can constant-fold.
2537  Expr::EvalResult Result;
2538  if (E->isPRValue() && E->EvaluateAsRValue(Result, CGM.getContext()) &&
2539      !Result.hasSideEffects()) {
2540    if (Result.Val.isInt())
2541      return RValue::get(llvm::ConstantInt::get(getLLVMContext(),
2542                                                Result.Val.getInt()));
2543    if (Result.Val.isFloat())
2544      return RValue::get(llvm::ConstantFP::get(getLLVMContext(),
2545                                               Result.Val.getFloat()));
2546  }
2547
2548  // If the current long-double semantics are IEEE 128-bit, replace long-double
2549  // math builtins with their f128 equivalents.
2550  // TODO: This mutation should also be applied to targets other than PPC once
2551  // their backends support IEEE 128-bit style libcalls.
2552  if (getTarget().getTriple().isPPC64() &&
2553      &getTarget().getLongDoubleFormat() == &llvm::APFloat::IEEEquad())
2554    BuiltinID = mutateLongDoubleBuiltin(BuiltinID);
2555
2556  // If the builtin has been declared explicitly with an assembler label,
2557  // disable the specialized emitting below. Ideally we should communicate the
2558  // rename in IR, or at least avoid generating the intrinsic calls that are
2559  // likely to get lowered to the renamed library functions.
2560  const unsigned BuiltinIDIfNoAsmLabel =
2561      FD->hasAttr<AsmLabelAttr>() ? 0 : BuiltinID;
2562
2563  std::optional<bool> ErrnoOverriden;
2564  // ErrnoOverriden is true if math-errno is overridden via the
2565  // '#pragma float_control(precise, on)'. This pragma disables fast-math,
2566  // which implies math-errno.
2567  if (E->hasStoredFPFeatures()) {
2568    FPOptionsOverride OP = E->getFPFeatures();
2569    if (OP.hasMathErrnoOverride())
2570      ErrnoOverriden = OP.getMathErrnoOverride();
2571  }
2572  // True if '__attribute__((optnone))' is used. This attribute overrides
2573  // fast-math, which implies math-errno.
2574  bool OptNone = CurFuncDecl && CurFuncDecl->hasAttr<OptimizeNoneAttr>();
2575
2576  // True if we are compiling with optimization and errno has been disabled
2577  // using '#pragma float_control(precise, off)', and
2578  // __attribute__((optnone)) hasn't been seen.
2579  bool ErrnoOverridenToFalseWithOpt =
2580       ErrnoOverriden.has_value() && !ErrnoOverriden.value() && !OptNone &&
2581       CGM.getCodeGenOpts().OptimizationLevel != 0;
2582
2583  // There are LLVM math intrinsics/instructions corresponding to math library
2584  // functions, except that the LLVM op never sets errno while the math library
2585  // might. Also, math builtins have the same semantics as their math library
2586  // twins. Thus, we can transform math library and builtin calls to their
2587  // LLVM counterparts if the call is marked 'const' (known to never set errno).
2588  // In case FP exceptions are enabled, the experimental (constrained) versions
2589  // of the intrinsics model those.
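  // For example, a 'const' ceil() call can be emitted as llvm.ceil, or, when
  // FP exceptions are not ignored, as llvm.experimental.constrained.ceil.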
2590  bool ConstAlways =
2591      getContext().BuiltinInfo.isConst(BuiltinID);
2592
2593  // There's a special case with the fma builtins where they are always const
2594  // if the target environment is GNU or the target OS is Windows and we're
2595  // targeting the MSVCRT.dll environment.
2596  // FIXME: This list can become outdated. Need to find a way to get it some
2597  // other way.
2598  switch (BuiltinID) {
2599  case Builtin::BI__builtin_fma:
2600  case Builtin::BI__builtin_fmaf:
2601  case Builtin::BI__builtin_fmal:
2602  case Builtin::BIfma:
2603  case Builtin::BIfmaf:
2604  case Builtin::BIfmal: {
2605    auto &Trip = CGM.getTriple();
2606    if (Trip.isGNUEnvironment() || Trip.isOSMSVCRT())
2607      ConstAlways = true;
2608    break;
2609  }
2610  default:
2611    break;
2612  }
2613
2614  bool ConstWithoutErrnoAndExceptions =
2615      getContext().BuiltinInfo.isConstWithoutErrnoAndExceptions(BuiltinID);
2616  bool ConstWithoutExceptions =
2617      getContext().BuiltinInfo.isConstWithoutExceptions(BuiltinID);
2618
2619  // ConstAttr is enabled in fast-math mode. In fast-math mode, math-errno is
2620  // disabled.
2621  // Math intrinsics are generated only when math-errno is disabled. Any pragmas
2622  // or attributes that affect math-errno should prevent or allow math
2623  // intrinsics to be generated. Intrinsics are generated:
2624  //   1- In fast-math mode, unless math-errno is overridden
2625  //      via '#pragma float_control(precise, on)', or via
2626  //      '__attribute__((optnone))'.
2627  //   2- If math-errno was enabled on the command line but overridden
2628  //      to false via '#pragma float_control(precise, off)' and
2629  //      '__attribute__((optnone))' hasn't been used.
2630  //   3- If we are compiling with optimization and errno has been disabled
2631  //      via '#pragma float_control(precise, off)', and
2632  //      '__attribute__((optnone))' hasn't been used.
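  // Illustrative example: at '-O2 -ffast-math', sqrt(x) is emitted as the
  // llvm.sqrt intrinsic, whereas wrapping the call in
  // '#pragma float_control(precise, on)' re-enables math-errno and keeps the
  // libm call to sqrt instead.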
2633
2634  bool ConstWithoutErrnoOrExceptions =
2635      ConstWithoutErrnoAndExceptions || ConstWithoutExceptions;
2636  bool GenerateIntrinsics =
2637      (ConstAlways && !OptNone) ||
2638      (!getLangOpts().MathErrno &&
2639       !(ErrnoOverriden.has_value() && ErrnoOverriden.value()) && !OptNone);
2640  if (!GenerateIntrinsics) {
2641    GenerateIntrinsics =
2642        ConstWithoutErrnoOrExceptions && !ConstWithoutErrnoAndExceptions;
2643    if (!GenerateIntrinsics)
2644      GenerateIntrinsics =
2645          ConstWithoutErrnoOrExceptions &&
2646          (!getLangOpts().MathErrno &&
2647           !(ErrnoOverriden.has_value() && ErrnoOverriden.value()) && !OptNone);
2648    if (!GenerateIntrinsics)
2649      GenerateIntrinsics =
2650          ConstWithoutErrnoOrExceptions && ErrnoOverridenToFalseWithOpt;
2651  }
2652  if (GenerateIntrinsics) {
2653    switch (BuiltinIDIfNoAsmLabel) {
2654    case Builtin::BIceil:
2655    case Builtin::BIceilf:
2656    case Builtin::BIceill:
2657    case Builtin::BI__builtin_ceil:
2658    case Builtin::BI__builtin_ceilf:
2659    case Builtin::BI__builtin_ceilf16:
2660    case Builtin::BI__builtin_ceill:
2661    case Builtin::BI__builtin_ceilf128:
2662      return RValue::get(emitUnaryMaybeConstrainedFPBuiltin(*this, E,
2663                                   Intrinsic::ceil,
2664                                   Intrinsic::experimental_constrained_ceil));
2665
2666    case Builtin::BIcopysign:
2667    case Builtin::BIcopysignf:
2668    case Builtin::BIcopysignl:
2669    case Builtin::BI__builtin_copysign:
2670    case Builtin::BI__builtin_copysignf:
2671    case Builtin::BI__builtin_copysignf16:
2672    case Builtin::BI__builtin_copysignl:
2673    case Builtin::BI__builtin_copysignf128:
2674      return RValue::get(emitBinaryBuiltin(*this, E, Intrinsic::copysign));
2675
2676    case Builtin::BIcos:
2677    case Builtin::BIcosf:
2678    case Builtin::BIcosl:
2679    case Builtin::BI__builtin_cos:
2680    case Builtin::BI__builtin_cosf:
2681    case Builtin::BI__builtin_cosf16:
2682    case Builtin::BI__builtin_cosl:
2683    case Builtin::BI__builtin_cosf128:
2684      return RValue::get(emitUnaryMaybeConstrainedFPBuiltin(*this, E,
2685                                   Intrinsic::cos,
2686                                   Intrinsic::experimental_constrained_cos));
2687
2688    case Builtin::BIexp:
2689    case Builtin::BIexpf:
2690    case Builtin::BIexpl:
2691    case Builtin::BI__builtin_exp:
2692    case Builtin::BI__builtin_expf:
2693    case Builtin::BI__builtin_expf16:
2694    case Builtin::BI__builtin_expl:
2695    case Builtin::BI__builtin_expf128:
2696      return RValue::get(emitUnaryMaybeConstrainedFPBuiltin(*this, E,
2697                                   Intrinsic::exp,
2698                                   Intrinsic::experimental_constrained_exp));
2699
2700    case Builtin::BIexp2:
2701    case Builtin::BIexp2f:
2702    case Builtin::BIexp2l:
2703    case Builtin::BI__builtin_exp2:
2704    case Builtin::BI__builtin_exp2f:
2705    case Builtin::BI__builtin_exp2f16:
2706    case Builtin::BI__builtin_exp2l:
2707    case Builtin::BI__builtin_exp2f128:
2708      return RValue::get(emitUnaryMaybeConstrainedFPBuiltin(*this, E,
2709                                   Intrinsic::exp2,
2710                                   Intrinsic::experimental_constrained_exp2));
2711    case Builtin::BI__builtin_exp10:
2712    case Builtin::BI__builtin_exp10f:
2713    case Builtin::BI__builtin_exp10f16:
2714    case Builtin::BI__builtin_exp10l:
2715    case Builtin::BI__builtin_exp10f128: {
2716      // TODO: strictfp support
2717      if (Builder.getIsFPConstrained())
2718        break;
2719      return RValue::get(emitUnaryBuiltin(*this, E, Intrinsic::exp10));
2720    }
2721    case Builtin::BIfabs:
2722    case Builtin::BIfabsf:
2723    case Builtin::BIfabsl:
2724    case Builtin::BI__builtin_fabs:
2725    case Builtin::BI__builtin_fabsf:
2726    case Builtin::BI__builtin_fabsf16:
2727    case Builtin::BI__builtin_fabsl:
2728    case Builtin::BI__builtin_fabsf128:
2729      return RValue::get(emitUnaryBuiltin(*this, E, Intrinsic::fabs));
2730
2731    case Builtin::BIfloor:
2732    case Builtin::BIfloorf:
2733    case Builtin::BIfloorl:
2734    case Builtin::BI__builtin_floor:
2735    case Builtin::BI__builtin_floorf:
2736    case Builtin::BI__builtin_floorf16:
2737    case Builtin::BI__builtin_floorl:
2738    case Builtin::BI__builtin_floorf128:
2739      return RValue::get(emitUnaryMaybeConstrainedFPBuiltin(*this, E,
2740                                   Intrinsic::floor,
2741                                   Intrinsic::experimental_constrained_floor));
2742
2743    case Builtin::BIfma:
2744    case Builtin::BIfmaf:
2745    case Builtin::BIfmal:
2746    case Builtin::BI__builtin_fma:
2747    case Builtin::BI__builtin_fmaf:
2748    case Builtin::BI__builtin_fmaf16:
2749    case Builtin::BI__builtin_fmal:
2750    case Builtin::BI__builtin_fmaf128:
2751      return RValue::get(emitTernaryMaybeConstrainedFPBuiltin(*this, E,
2752                                   Intrinsic::fma,
2753                                   Intrinsic::experimental_constrained_fma));
2754
2755    case Builtin::BIfmax:
2756    case Builtin::BIfmaxf:
2757    case Builtin::BIfmaxl:
2758    case Builtin::BI__builtin_fmax:
2759    case Builtin::BI__builtin_fmaxf:
2760    case Builtin::BI__builtin_fmaxf16:
2761    case Builtin::BI__builtin_fmaxl:
2762    case Builtin::BI__builtin_fmaxf128:
2763      return RValue::get(emitBinaryMaybeConstrainedFPBuiltin(*this, E,
2764                                   Intrinsic::maxnum,
2765                                   Intrinsic::experimental_constrained_maxnum));
2766
2767    case Builtin::BIfmin:
2768    case Builtin::BIfminf:
2769    case Builtin::BIfminl:
2770    case Builtin::BI__builtin_fmin:
2771    case Builtin::BI__builtin_fminf:
2772    case Builtin::BI__builtin_fminf16:
2773    case Builtin::BI__builtin_fminl:
2774    case Builtin::BI__builtin_fminf128:
2775      return RValue::get(emitBinaryMaybeConstrainedFPBuiltin(*this, E,
2776                                   Intrinsic::minnum,
2777                                   Intrinsic::experimental_constrained_minnum));
2778
2779    // fmod() is a special-case. It maps to the frem instruction rather than an
2780    // LLVM intrinsic.
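    // For example, fmod(x, y) on doubles is emitted as
    // '%fmod = frem double %x, %y'.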
2781    case Builtin::BIfmod:
2782    case Builtin::BIfmodf:
2783    case Builtin::BIfmodl:
2784    case Builtin::BI__builtin_fmod:
2785    case Builtin::BI__builtin_fmodf:
2786    case Builtin::BI__builtin_fmodf16:
2787    case Builtin::BI__builtin_fmodl:
2788    case Builtin::BI__builtin_fmodf128: {
2789      CodeGenFunction::CGFPOptionsRAII FPOptsRAII(*this, E);
2790      Value *Arg1 = EmitScalarExpr(E->getArg(0));
2791      Value *Arg2 = EmitScalarExpr(E->getArg(1));
2792      return RValue::get(Builder.CreateFRem(Arg1, Arg2, "fmod"));
2793    }
2794
2795    case Builtin::BIlog:
2796    case Builtin::BIlogf:
2797    case Builtin::BIlogl:
2798    case Builtin::BI__builtin_log:
2799    case Builtin::BI__builtin_logf:
2800    case Builtin::BI__builtin_logf16:
2801    case Builtin::BI__builtin_logl:
2802    case Builtin::BI__builtin_logf128:
2803      return RValue::get(emitUnaryMaybeConstrainedFPBuiltin(*this, E,
2804                                   Intrinsic::log,
2805                                   Intrinsic::experimental_constrained_log));
2806
2807    case Builtin::BIlog10:
2808    case Builtin::BIlog10f:
2809    case Builtin::BIlog10l:
2810    case Builtin::BI__builtin_log10:
2811    case Builtin::BI__builtin_log10f:
2812    case Builtin::BI__builtin_log10f16:
2813    case Builtin::BI__builtin_log10l:
2814    case Builtin::BI__builtin_log10f128:
2815      return RValue::get(emitUnaryMaybeConstrainedFPBuiltin(*this, E,
2816                                   Intrinsic::log10,
2817                                   Intrinsic::experimental_constrained_log10));
2818
2819    case Builtin::BIlog2:
2820    case Builtin::BIlog2f:
2821    case Builtin::BIlog2l:
2822    case Builtin::BI__builtin_log2:
2823    case Builtin::BI__builtin_log2f:
2824    case Builtin::BI__builtin_log2f16:
2825    case Builtin::BI__builtin_log2l:
2826    case Builtin::BI__builtin_log2f128:
2827      return RValue::get(emitUnaryMaybeConstrainedFPBuiltin(*this, E,
2828                                   Intrinsic::log2,
2829                                   Intrinsic::experimental_constrained_log2));
2830
2831    case Builtin::BInearbyint:
2832    case Builtin::BInearbyintf:
2833    case Builtin::BInearbyintl:
2834    case Builtin::BI__builtin_nearbyint:
2835    case Builtin::BI__builtin_nearbyintf:
2836    case Builtin::BI__builtin_nearbyintl:
2837    case Builtin::BI__builtin_nearbyintf128:
2838      return RValue::get(emitUnaryMaybeConstrainedFPBuiltin(*this, E,
2839                                Intrinsic::nearbyint,
2840                                Intrinsic::experimental_constrained_nearbyint));
2841
2842    case Builtin::BIpow:
2843    case Builtin::BIpowf:
2844    case Builtin::BIpowl:
2845    case Builtin::BI__builtin_pow:
2846    case Builtin::BI__builtin_powf:
2847    case Builtin::BI__builtin_powf16:
2848    case Builtin::BI__builtin_powl:
2849    case Builtin::BI__builtin_powf128:
2850      return RValue::get(emitBinaryMaybeConstrainedFPBuiltin(*this, E,
2851                                   Intrinsic::pow,
2852                                   Intrinsic::experimental_constrained_pow));
2853
2854    case Builtin::BIrint:
2855    case Builtin::BIrintf:
2856    case Builtin::BIrintl:
2857    case Builtin::BI__builtin_rint:
2858    case Builtin::BI__builtin_rintf:
2859    case Builtin::BI__builtin_rintf16:
2860    case Builtin::BI__builtin_rintl:
2861    case Builtin::BI__builtin_rintf128:
2862      return RValue::get(emitUnaryMaybeConstrainedFPBuiltin(*this, E,
2863                                   Intrinsic::rint,
2864                                   Intrinsic::experimental_constrained_rint));
2865
2866    case Builtin::BIround:
2867    case Builtin::BIroundf:
2868    case Builtin::BIroundl:
2869    case Builtin::BI__builtin_round:
2870    case Builtin::BI__builtin_roundf:
2871    case Builtin::BI__builtin_roundf16:
2872    case Builtin::BI__builtin_roundl:
2873    case Builtin::BI__builtin_roundf128:
2874      return RValue::get(emitUnaryMaybeConstrainedFPBuiltin(*this, E,
2875                                   Intrinsic::round,
2876                                   Intrinsic::experimental_constrained_round));
2877
2878    case Builtin::BIroundeven:
2879    case Builtin::BIroundevenf:
2880    case Builtin::BIroundevenl:
2881    case Builtin::BI__builtin_roundeven:
2882    case Builtin::BI__builtin_roundevenf:
2883    case Builtin::BI__builtin_roundevenf16:
2884    case Builtin::BI__builtin_roundevenl:
2885    case Builtin::BI__builtin_roundevenf128:
2886      return RValue::get(emitUnaryMaybeConstrainedFPBuiltin(*this, E,
2887                                   Intrinsic::roundeven,
2888                                   Intrinsic::experimental_constrained_roundeven));
2889
2890    case Builtin::BIsin:
2891    case Builtin::BIsinf:
2892    case Builtin::BIsinl:
2893    case Builtin::BI__builtin_sin:
2894    case Builtin::BI__builtin_sinf:
2895    case Builtin::BI__builtin_sinf16:
2896    case Builtin::BI__builtin_sinl:
2897    case Builtin::BI__builtin_sinf128:
2898      return RValue::get(emitUnaryMaybeConstrainedFPBuiltin(*this, E,
2899                                   Intrinsic::sin,
2900                                   Intrinsic::experimental_constrained_sin));
2901
2902    case Builtin::BIsqrt:
2903    case Builtin::BIsqrtf:
2904    case Builtin::BIsqrtl:
2905    case Builtin::BI__builtin_sqrt:
2906    case Builtin::BI__builtin_sqrtf:
2907    case Builtin::BI__builtin_sqrtf16:
2908    case Builtin::BI__builtin_sqrtl:
2909    case Builtin::BI__builtin_sqrtf128:
2910    case Builtin::BI__builtin_elementwise_sqrt: {
2911      llvm::Value *Call = emitUnaryMaybeConstrainedFPBuiltin(
2912          *this, E, Intrinsic::sqrt, Intrinsic::experimental_constrained_sqrt);
2913      SetSqrtFPAccuracy(Call);
2914      return RValue::get(Call);
2915    }
2916    case Builtin::BItrunc:
2917    case Builtin::BItruncf:
2918    case Builtin::BItruncl:
2919    case Builtin::BI__builtin_trunc:
2920    case Builtin::BI__builtin_truncf:
2921    case Builtin::BI__builtin_truncf16:
2922    case Builtin::BI__builtin_truncl:
2923    case Builtin::BI__builtin_truncf128:
2924      return RValue::get(emitUnaryMaybeConstrainedFPBuiltin(*this, E,
2925                                   Intrinsic::trunc,
2926                                   Intrinsic::experimental_constrained_trunc));
2927
2928    case Builtin::BIlround:
2929    case Builtin::BIlroundf:
2930    case Builtin::BIlroundl:
2931    case Builtin::BI__builtin_lround:
2932    case Builtin::BI__builtin_lroundf:
2933    case Builtin::BI__builtin_lroundl:
2934    case Builtin::BI__builtin_lroundf128:
2935      return RValue::get(emitMaybeConstrainedFPToIntRoundBuiltin(
2936          *this, E, Intrinsic::lround,
2937          Intrinsic::experimental_constrained_lround));
2938
2939    case Builtin::BIllround:
2940    case Builtin::BIllroundf:
2941    case Builtin::BIllroundl:
2942    case Builtin::BI__builtin_llround:
2943    case Builtin::BI__builtin_llroundf:
2944    case Builtin::BI__builtin_llroundl:
2945    case Builtin::BI__builtin_llroundf128:
2946      return RValue::get(emitMaybeConstrainedFPToIntRoundBuiltin(
2947          *this, E, Intrinsic::llround,
2948          Intrinsic::experimental_constrained_llround));
2949
2950    case Builtin::BIlrint:
2951    case Builtin::BIlrintf:
2952    case Builtin::BIlrintl:
2953    case Builtin::BI__builtin_lrint:
2954    case Builtin::BI__builtin_lrintf:
2955    case Builtin::BI__builtin_lrintl:
2956    case Builtin::BI__builtin_lrintf128:
2957      return RValue::get(emitMaybeConstrainedFPToIntRoundBuiltin(
2958          *this, E, Intrinsic::lrint,
2959          Intrinsic::experimental_constrained_lrint));
2960
2961    case Builtin::BIllrint:
2962    case Builtin::BIllrintf:
2963    case Builtin::BIllrintl:
2964    case Builtin::BI__builtin_llrint:
2965    case Builtin::BI__builtin_llrintf:
2966    case Builtin::BI__builtin_llrintl:
2967    case Builtin::BI__builtin_llrintf128:
2968      return RValue::get(emitMaybeConstrainedFPToIntRoundBuiltin(
2969          *this, E, Intrinsic::llrint,
2970          Intrinsic::experimental_constrained_llrint));
2971    case Builtin::BI__builtin_ldexp:
2972    case Builtin::BI__builtin_ldexpf:
2973    case Builtin::BI__builtin_ldexpl:
2974    case Builtin::BI__builtin_ldexpf16:
2975    case Builtin::BI__builtin_ldexpf128: {
2976      return RValue::get(emitBinaryExpMaybeConstrainedFPBuiltin(
2977          *this, E, Intrinsic::ldexp,
2978          Intrinsic::experimental_constrained_ldexp));
2979    }
2980    default:
2981      break;
2982    }
2983  }
2984
2985  // Check NonnullAttribute/NullabilityArg and Alignment.
2986  auto EmitArgCheck = [&](TypeCheckKind Kind, Address A, const Expr *Arg,
2987                          unsigned ParmNum) {
2988    Value *Val = A.getPointer();
2989    EmitNonNullArgCheck(RValue::get(Val), Arg->getType(), Arg->getExprLoc(), FD,
2990                        ParmNum);
2991
2992    if (SanOpts.has(SanitizerKind::Alignment)) {
2993      SanitizerSet SkippedChecks;
2994      SkippedChecks.set(SanitizerKind::All);
2995      SkippedChecks.clear(SanitizerKind::Alignment);
2996      SourceLocation Loc = Arg->getExprLoc();
2997      // Strip an implicit cast.
2998      if (auto *CE = dyn_cast<ImplicitCastExpr>(Arg))
2999        if (CE->getCastKind() == CK_BitCast)
3000          Arg = CE->getSubExpr();
3001      EmitTypeCheck(Kind, Loc, Val, Arg->getType(), A.getAlignment(),
3002                    SkippedChecks);
3003    }
3004  };
3005
3006  switch (BuiltinIDIfNoAsmLabel) {
3007  default: break;
3008  case Builtin::BI__builtin___CFStringMakeConstantString:
3009  case Builtin::BI__builtin___NSStringMakeConstantString:
3010    return RValue::get(ConstantEmitter(*this).emitAbstract(E, E->getType()));
3011  case Builtin::BI__builtin_stdarg_start:
3012  case Builtin::BI__builtin_va_start:
3013  case Builtin::BI__va_start:
3014  case Builtin::BI__builtin_va_end:
3015    EmitVAStartEnd(BuiltinID == Builtin::BI__va_start
3016                       ? EmitScalarExpr(E->getArg(0))
3017                       : EmitVAListRef(E->getArg(0)).getPointer(),
3018                   BuiltinID != Builtin::BI__builtin_va_end);
3019    return RValue::get(nullptr);
3020  case Builtin::BI__builtin_va_copy: {
3021    Value *DstPtr = EmitVAListRef(E->getArg(0)).getPointer();
3022    Value *SrcPtr = EmitVAListRef(E->getArg(1)).getPointer();
3023    Builder.CreateCall(CGM.getIntrinsic(Intrinsic::vacopy), {DstPtr, SrcPtr});
3024    return RValue::get(nullptr);
3025  }
3026  case Builtin::BIabs:
3027  case Builtin::BIlabs:
3028  case Builtin::BIllabs:
3029  case Builtin::BI__builtin_abs:
3030  case Builtin::BI__builtin_labs:
3031  case Builtin::BI__builtin_llabs: {
3032    bool SanitizeOverflow = SanOpts.has(SanitizerKind::SignedIntegerOverflow);
3033
3034    Value *Result;
3035    switch (getLangOpts().getSignedOverflowBehavior()) {
3036    case LangOptions::SOB_Defined:
3037      Result = EmitAbs(*this, EmitScalarExpr(E->getArg(0)), false);
3038      break;
3039    case LangOptions::SOB_Undefined:
3040      if (!SanitizeOverflow) {
3041        Result = EmitAbs(*this, EmitScalarExpr(E->getArg(0)), true);
3042        break;
3043      }
3044      [[fallthrough]];
3045    case LangOptions::SOB_Trapping:
3046      // TODO: Somehow handle the corner case when the address of abs is taken.
3047      Result = EmitOverflowCheckedAbs(*this, E, SanitizeOverflow);
3048      break;
3049    }
3050    return RValue::get(Result);
3051  }
3052  case Builtin::BI__builtin_complex: {
3053    Value *Real = EmitScalarExpr(E->getArg(0));
3054    Value *Imag = EmitScalarExpr(E->getArg(1));
3055    return RValue::getComplex({Real, Imag});
3056  }
3057  case Builtin::BI__builtin_conj:
3058  case Builtin::BI__builtin_conjf:
3059  case Builtin::BI__builtin_conjl:
3060  case Builtin::BIconj:
3061  case Builtin::BIconjf:
3062  case Builtin::BIconjl: {
3063    ComplexPairTy ComplexVal = EmitComplexExpr(E->getArg(0));
3064    Value *Real = ComplexVal.first;
3065    Value *Imag = ComplexVal.second;
3066    Imag = Builder.CreateFNeg(Imag, "neg");
3067    return RValue::getComplex(std::make_pair(Real, Imag));
3068  }
3069  case Builtin::BI__builtin_creal:
3070  case Builtin::BI__builtin_crealf:
3071  case Builtin::BI__builtin_creall:
3072  case Builtin::BIcreal:
3073  case Builtin::BIcrealf:
3074  case Builtin::BIcreall: {
3075    ComplexPairTy ComplexVal = EmitComplexExpr(E->getArg(0));
3076    return RValue::get(ComplexVal.first);
3077  }
3078
3079  case Builtin::BI__builtin_preserve_access_index: {
3080    // Only enable the preserved access index region when debuginfo
3081    // is available, as debuginfo is needed to preserve the user-level
3082    // access pattern.
3083    if (!getDebugInfo()) {
3084      CGM.Error(E->getExprLoc(), "using builtin_preserve_access_index() without -g");
3085      return RValue::get(EmitScalarExpr(E->getArg(0)));
3086    }
3087
3088    // Nested builtin_preserve_access_index() not supported
3089    if (IsInPreservedAIRegion) {
3090      CGM.Error(E->getExprLoc(), "nested builtin_preserve_access_index() not supported");
3091      return RValue::get(EmitScalarExpr(E->getArg(0)));
3092    }
3093
3094    IsInPreservedAIRegion = true;
3095    Value *Res = EmitScalarExpr(E->getArg(0));
3096    IsInPreservedAIRegion = false;
3097    return RValue::get(Res);
3098  }
3099
3100  case Builtin::BI__builtin_cimag:
3101  case Builtin::BI__builtin_cimagf:
3102  case Builtin::BI__builtin_cimagl:
3103  case Builtin::BIcimag:
3104  case Builtin::BIcimagf:
3105  case Builtin::BIcimagl: {
3106    ComplexPairTy ComplexVal = EmitComplexExpr(E->getArg(0));
3107    return RValue::get(ComplexVal.second);
3108  }
3109
3110  case Builtin::BI__builtin_clrsb:
3111  case Builtin::BI__builtin_clrsbl:
3112  case Builtin::BI__builtin_clrsbll: {
3113    // clrsb(x) -> clz(x < 0 ? ~x : x) - 1
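    // For example, with a 32-bit int: clrsb(15) = clz(15) - 1 = 28 - 1 = 27,
    // and clrsb(-1) = clz(~(-1)) - 1 = clz(0) - 1 = 32 - 1 = 31.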
3114    Value *ArgValue = EmitScalarExpr(E->getArg(0));
3115
3116    llvm::Type *ArgType = ArgValue->getType();
3117    Function *F = CGM.getIntrinsic(Intrinsic::ctlz, ArgType);
3118
3119    llvm::Type *ResultType = ConvertType(E->getType());
3120    Value *Zero = llvm::Constant::getNullValue(ArgType);
3121    Value *IsNeg = Builder.CreateICmpSLT(ArgValue, Zero, "isneg");
3122    Value *Inverse = Builder.CreateNot(ArgValue, "not");
3123    Value *Tmp = Builder.CreateSelect(IsNeg, Inverse, ArgValue);
3124    Value *Ctlz = Builder.CreateCall(F, {Tmp, Builder.getFalse()});
3125    Value *Result = Builder.CreateSub(Ctlz, llvm::ConstantInt::get(ArgType, 1));
3126    Result = Builder.CreateIntCast(Result, ResultType, /*isSigned*/true,
3127                                   "cast");
3128    return RValue::get(Result);
3129  }
3130  case Builtin::BI__builtin_ctzs:
3131  case Builtin::BI__builtin_ctz:
3132  case Builtin::BI__builtin_ctzl:
3133  case Builtin::BI__builtin_ctzll: {
3134    Value *ArgValue = EmitCheckedArgForBuiltin(E->getArg(0), BCK_CTZPassedZero);
3135
3136    llvm::Type *ArgType = ArgValue->getType();
3137    Function *F = CGM.getIntrinsic(Intrinsic::cttz, ArgType);
3138
3139    llvm::Type *ResultType = ConvertType(E->getType());
3140    Value *ZeroUndef = Builder.getInt1(getTarget().isCLZForZeroUndef());
3141    Value *Result = Builder.CreateCall(F, {ArgValue, ZeroUndef});
3142    if (Result->getType() != ResultType)
3143      Result = Builder.CreateIntCast(Result, ResultType, /*isSigned*/true,
3144                                     "cast");
3145    return RValue::get(Result);
3146  }
3147  case Builtin::BI__builtin_clzs:
3148  case Builtin::BI__builtin_clz:
3149  case Builtin::BI__builtin_clzl:
3150  case Builtin::BI__builtin_clzll: {
3151    Value *ArgValue = EmitCheckedArgForBuiltin(E->getArg(0), BCK_CLZPassedZero);
3152
3153    llvm::Type *ArgType = ArgValue->getType();
3154    Function *F = CGM.getIntrinsic(Intrinsic::ctlz, ArgType);
3155
3156    llvm::Type *ResultType = ConvertType(E->getType());
3157    Value *ZeroUndef = Builder.getInt1(getTarget().isCLZForZeroUndef());
3158    Value *Result = Builder.CreateCall(F, {ArgValue, ZeroUndef});
3159    if (Result->getType() != ResultType)
3160      Result = Builder.CreateIntCast(Result, ResultType, /*isSigned*/true,
3161                                     "cast");
3162    return RValue::get(Result);
3163  }
3164  case Builtin::BI__builtin_ffs:
3165  case Builtin::BI__builtin_ffsl:
3166  case Builtin::BI__builtin_ffsll: {
3167    // ffs(x) -> x ? cttz(x) + 1 : 0
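    // For example, ffs(8) = cttz(8) + 1 = 3 + 1 = 4, and ffs(0) = 0.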
3168    Value *ArgValue = EmitScalarExpr(E->getArg(0));
3169
3170    llvm::Type *ArgType = ArgValue->getType();
3171    Function *F = CGM.getIntrinsic(Intrinsic::cttz, ArgType);
3172
3173    llvm::Type *ResultType = ConvertType(E->getType());
3174    Value *Tmp =
3175        Builder.CreateAdd(Builder.CreateCall(F, {ArgValue, Builder.getTrue()}),
3176                          llvm::ConstantInt::get(ArgType, 1));
3177    Value *Zero = llvm::Constant::getNullValue(ArgType);
3178    Value *IsZero = Builder.CreateICmpEQ(ArgValue, Zero, "iszero");
3179    Value *Result = Builder.CreateSelect(IsZero, Zero, Tmp, "ffs");
3180    if (Result->getType() != ResultType)
3181      Result = Builder.CreateIntCast(Result, ResultType, /*isSigned*/true,
3182                                     "cast");
3183    return RValue::get(Result);
3184  }
3185  case Builtin::BI__builtin_parity:
3186  case Builtin::BI__builtin_parityl:
3187  case Builtin::BI__builtin_parityll: {
3188    // parity(x) -> ctpop(x) & 1
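    // For example, parity(0b1011) = ctpop(0b1011) & 1 = 3 & 1 = 1.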
3189    Value *ArgValue = EmitScalarExpr(E->getArg(0));
3190
3191    llvm::Type *ArgType = ArgValue->getType();
3192    Function *F = CGM.getIntrinsic(Intrinsic::ctpop, ArgType);
3193
3194    llvm::Type *ResultType = ConvertType(E->getType());
3195    Value *Tmp = Builder.CreateCall(F, ArgValue);
3196    Value *Result = Builder.CreateAnd(Tmp, llvm::ConstantInt::get(ArgType, 1));
3197    if (Result->getType() != ResultType)
3198      Result = Builder.CreateIntCast(Result, ResultType, /*isSigned*/true,
3199                                     "cast");
3200    return RValue::get(Result);
3201  }
3202  case Builtin::BI__lzcnt16:
3203  case Builtin::BI__lzcnt:
3204  case Builtin::BI__lzcnt64: {
3205    Value *ArgValue = EmitScalarExpr(E->getArg(0));
3206
3207    llvm::Type *ArgType = ArgValue->getType();
3208    Function *F = CGM.getIntrinsic(Intrinsic::ctlz, ArgType);
3209
3210    llvm::Type *ResultType = ConvertType(E->getType());
3211    Value *Result = Builder.CreateCall(F, {ArgValue, Builder.getFalse()});
3212    if (Result->getType() != ResultType)
3213      Result = Builder.CreateIntCast(Result, ResultType, /*isSigned*/true,
3214                                     "cast");
3215    return RValue::get(Result);
3216  }
3217  case Builtin::BI__popcnt16:
3218  case Builtin::BI__popcnt:
3219  case Builtin::BI__popcnt64:
3220  case Builtin::BI__builtin_popcount:
3221  case Builtin::BI__builtin_popcountl:
3222  case Builtin::BI__builtin_popcountll: {
3223    Value *ArgValue = EmitScalarExpr(E->getArg(0));
3224
3225    llvm::Type *ArgType = ArgValue->getType();
3226    Function *F = CGM.getIntrinsic(Intrinsic::ctpop, ArgType);
3227
3228    llvm::Type *ResultType = ConvertType(E->getType());
3229    Value *Result = Builder.CreateCall(F, ArgValue);
3230    if (Result->getType() != ResultType)
3231      Result = Builder.CreateIntCast(Result, ResultType, /*isSigned*/true,
3232                                     "cast");
3233    return RValue::get(Result);
3234  }
3235  case Builtin::BI__builtin_unpredictable: {
3236    // Always return the argument of __builtin_unpredictable. LLVM does not
3237    // handle this builtin. Metadata for this builtin should be added directly
3238    // to instructions such as branches or switches that use it.
3239    return RValue::get(EmitScalarExpr(E->getArg(0)));
3240  }
3241  case Builtin::BI__builtin_expect: {
3242    Value *ArgValue = EmitScalarExpr(E->getArg(0));
3243    llvm::Type *ArgType = ArgValue->getType();
3244
3245    Value *ExpectedValue = EmitScalarExpr(E->getArg(1));
3246    // Don't generate llvm.expect on -O0 as the backend won't use it for
3247    // anything.
3248    // Note, we still IRGen ExpectedValue because it could have side-effects.
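    // At -O1 and above, for example, '__builtin_expect(x, 1)' for a 64-bit
    // 'long' x lowers to 'call i64 @llvm.expect.i64(i64 %x, i64 1)'.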
3249    if (CGM.getCodeGenOpts().OptimizationLevel == 0)
3250      return RValue::get(ArgValue);
3251
3252    Function *FnExpect = CGM.getIntrinsic(Intrinsic::expect, ArgType);
3253    Value *Result =
3254        Builder.CreateCall(FnExpect, {ArgValue, ExpectedValue}, "expval");
3255    return RValue::get(Result);
3256  }
3257  case Builtin::BI__builtin_expect_with_probability: {
3258    Value *ArgValue = EmitScalarExpr(E->getArg(0));
3259    llvm::Type *ArgType = ArgValue->getType();
3260
3261    Value *ExpectedValue = EmitScalarExpr(E->getArg(1));
3262    llvm::APFloat Probability(0.0);
3263    const Expr *ProbArg = E->getArg(2);
3264    bool EvalSucceed = ProbArg->EvaluateAsFloat(Probability, CGM.getContext());
3265    assert(EvalSucceed && "probability should be able to evaluate as float");
3266    (void)EvalSucceed;
3267    bool LoseInfo = false;
3268    Probability.convert(llvm::APFloat::IEEEdouble(),
3269                        llvm::RoundingMode::Dynamic, &LoseInfo);
3270    llvm::Type *Ty = ConvertType(ProbArg->getType());
3271    Constant *Confidence = ConstantFP::get(Ty, Probability);
3272    // Don't generate llvm.expect.with.probability on -O0 as the backend
3273    // won't use it for anything.
3274    // Note, we still IRGen ExpectedValue because it could have side-effects.
3275    if (CGM.getCodeGenOpts().OptimizationLevel == 0)
3276      return RValue::get(ArgValue);
3277
3278    Function *FnExpect =
3279        CGM.getIntrinsic(Intrinsic::expect_with_probability, ArgType);
3280    Value *Result = Builder.CreateCall(
3281        FnExpect, {ArgValue, ExpectedValue, Confidence}, "expval");
3282    return RValue::get(Result);
3283  }
3284  case Builtin::BI__builtin_assume_aligned: {
3285    const Expr *Ptr = E->getArg(0);
3286    Value *PtrValue = EmitScalarExpr(Ptr);
3287    Value *OffsetValue =
3288      (E->getNumArgs() > 2) ? EmitScalarExpr(E->getArg(2)) : nullptr;
3289
3290    Value *AlignmentValue = EmitScalarExpr(E->getArg(1));
3291    ConstantInt *AlignmentCI = cast<ConstantInt>(AlignmentValue);
3292    if (AlignmentCI->getValue().ugt(llvm::Value::MaximumAlignment))
3293      AlignmentCI = ConstantInt::get(AlignmentCI->getIntegerType(),
3294                                     llvm::Value::MaximumAlignment);
3295
3296    emitAlignmentAssumption(PtrValue, Ptr,
3297                            /*The expr loc is sufficient.*/ SourceLocation(),
3298                            AlignmentCI, OffsetValue);
3299    return RValue::get(PtrValue);
3300  }
3301  case Builtin::BI__assume:
3302  case Builtin::BI__builtin_assume: {
3303    if (E->getArg(0)->HasSideEffects(getContext()))
3304      return RValue::get(nullptr);
3305
3306    Value *ArgValue = EmitScalarExpr(E->getArg(0));
3307    Function *FnAssume = CGM.getIntrinsic(Intrinsic::assume);
3308    Builder.CreateCall(FnAssume, ArgValue);
3309    return RValue::get(nullptr);
3310  }
3311  case Builtin::BI__builtin_assume_separate_storage: {
3312    const Expr *Arg0 = E->getArg(0);
3313    const Expr *Arg1 = E->getArg(1);
3314
3315    Value *Value0 = EmitScalarExpr(Arg0);
3316    Value *Value1 = EmitScalarExpr(Arg1);
3317
3318    Value *Values[] = {Value0, Value1};
3319    OperandBundleDefT<Value *> OBD("separate_storage", Values);
3320    Builder.CreateAssumption(ConstantInt::getTrue(getLLVMContext()), {OBD});
3321    return RValue::get(nullptr);
3322  }
3323  case Builtin::BI__arithmetic_fence: {
3324    // Create the builtin call if fast-math is enabled and the target
3325    // supports the builtin; otherwise just return the argument.
3326    CodeGenFunction::CGFPOptionsRAII FPOptsRAII(*this, E);
3327    llvm::FastMathFlags FMF = Builder.getFastMathFlags();
3328    bool isArithmeticFenceEnabled =
3329        FMF.allowReassoc() &&
3330        getContext().getTargetInfo().checkArithmeticFenceSupported();
3331    QualType ArgType = E->getArg(0)->getType();
3332    if (ArgType->isComplexType()) {
3333      if (isArithmeticFenceEnabled) {
3334        QualType ElementType = ArgType->castAs<ComplexType>()->getElementType();
3335        ComplexPairTy ComplexVal = EmitComplexExpr(E->getArg(0));
3336        Value *Real = Builder.CreateArithmeticFence(ComplexVal.first,
3337                                                    ConvertType(ElementType));
3338        Value *Imag = Builder.CreateArithmeticFence(ComplexVal.second,
3339                                                    ConvertType(ElementType));
3340        return RValue::getComplex(std::make_pair(Real, Imag));
3341      }
3342      ComplexPairTy ComplexVal = EmitComplexExpr(E->getArg(0));
3343      Value *Real = ComplexVal.first;
3344      Value *Imag = ComplexVal.second;
3345      return RValue::getComplex(std::make_pair(Real, Imag));
3346    }
3347    Value *ArgValue = EmitScalarExpr(E->getArg(0));
3348    if (isArithmeticFenceEnabled)
3349      return RValue::get(
3350          Builder.CreateArithmeticFence(ArgValue, ConvertType(ArgType)));
3351    return RValue::get(ArgValue);
3352  }
3353  case Builtin::BI__builtin_bswap16:
3354  case Builtin::BI__builtin_bswap32:
3355  case Builtin::BI__builtin_bswap64:
3356  case Builtin::BI_byteswap_ushort:
3357  case Builtin::BI_byteswap_ulong:
3358  case Builtin::BI_byteswap_uint64: {
3359    return RValue::get(emitUnaryBuiltin(*this, E, Intrinsic::bswap));
3360  }
3361  case Builtin::BI__builtin_bitreverse8:
3362  case Builtin::BI__builtin_bitreverse16:
3363  case Builtin::BI__builtin_bitreverse32:
3364  case Builtin::BI__builtin_bitreverse64: {
3365    return RValue::get(emitUnaryBuiltin(*this, E, Intrinsic::bitreverse));
3366  }
3367  case Builtin::BI__builtin_rotateleft8:
3368  case Builtin::BI__builtin_rotateleft16:
3369  case Builtin::BI__builtin_rotateleft32:
3370  case Builtin::BI__builtin_rotateleft64:
3371  case Builtin::BI_rotl8: // Microsoft variants of rotate left
3372  case Builtin::BI_rotl16:
3373  case Builtin::BI_rotl:
3374  case Builtin::BI_lrotl:
3375  case Builtin::BI_rotl64:
3376    return emitRotate(E, false);
3377
3378  case Builtin::BI__builtin_rotateright8:
3379  case Builtin::BI__builtin_rotateright16:
3380  case Builtin::BI__builtin_rotateright32:
3381  case Builtin::BI__builtin_rotateright64:
3382  case Builtin::BI_rotr8: // Microsoft variants of rotate right
3383  case Builtin::BI_rotr16:
3384  case Builtin::BI_rotr:
3385  case Builtin::BI_lrotr:
3386  case Builtin::BI_rotr64:
3387    return emitRotate(E, true);
3388
3389  case Builtin::BI__builtin_constant_p: {
3390    llvm::Type *ResultType = ConvertType(E->getType());
3391
3392    const Expr *Arg = E->getArg(0);
3393    QualType ArgType = Arg->getType();
3394    // FIXME: The allowance for Obj-C pointers and block pointers is historical
3395    // and likely a mistake.
3396    if (!ArgType->isIntegralOrEnumerationType() && !ArgType->isFloatingType() &&
3397        !ArgType->isObjCObjectPointerType() && !ArgType->isBlockPointerType())
3398      // Per the GCC documentation, only numeric constants are recognized after
3399      // inlining.
3400      return RValue::get(ConstantInt::get(ResultType, 0));
3401
3402    if (Arg->HasSideEffects(getContext()))
3403      // The argument is unevaluated, so be conservative if it might have
3404      // side-effects.
3405      return RValue::get(ConstantInt::get(ResultType, 0));
3406
3407    Value *ArgValue = EmitScalarExpr(Arg);
3408    if (ArgType->isObjCObjectPointerType()) {
3409      // Convert Objective-C objects to id because we cannot distinguish between
3410      // the LLVM types for different Obj-C classes; they are opaque.
3411      ArgType = CGM.getContext().getObjCIdType();
3412      ArgValue = Builder.CreateBitCast(ArgValue, ConvertType(ArgType));
3413    }
3414    Function *F =
3415        CGM.getIntrinsic(Intrinsic::is_constant, ConvertType(ArgType));
3416    Value *Result = Builder.CreateCall(F, ArgValue);
3417    if (Result->getType() != ResultType)
3418      Result = Builder.CreateIntCast(Result, ResultType, /*isSigned*/false);
3419    return RValue::get(Result);
3420  }
3421  case Builtin::BI__builtin_dynamic_object_size:
3422  case Builtin::BI__builtin_object_size: {
3423    unsigned Type =
3424        E->getArg(1)->EvaluateKnownConstInt(getContext()).getZExtValue();
3425    auto *ResType = cast<llvm::IntegerType>(ConvertType(E->getType()));
3426
3427    // We pass this builtin onto the optimizer so that it can figure out the
3428    // object size in more complex cases.
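    // Illustrative: when the size cannot be computed here, this lowers to the
    // llvm.objectsize intrinsic, which later passes either resolve or fold to
    // the failure value implied by 'Type'.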
3429    bool IsDynamic = BuiltinID == Builtin::BI__builtin_dynamic_object_size;
3430    return RValue::get(emitBuiltinObjectSize(E->getArg(0), Type, ResType,
3431                                             /*EmittedE=*/nullptr, IsDynamic));
3432  }
3433  case Builtin::BI__builtin_prefetch: {
3434    Value *Locality, *RW, *Address = EmitScalarExpr(E->getArg(0));
3435    // FIXME: Technically these constants should be of type 'int', yes?
3436    RW = (E->getNumArgs() > 1) ? EmitScalarExpr(E->getArg(1)) :
3437      llvm::ConstantInt::get(Int32Ty, 0);
3438    Locality = (E->getNumArgs() > 2) ? EmitScalarExpr(E->getArg(2)) :
3439      llvm::ConstantInt::get(Int32Ty, 3);
3440    Value *Data = llvm::ConstantInt::get(Int32Ty, 1);
3441    Function *F = CGM.getIntrinsic(Intrinsic::prefetch, Address->getType());
3442    Builder.CreateCall(F, {Address, RW, Locality, Data});
3443    return RValue::get(nullptr);
3444  }
3445  case Builtin::BI__builtin_readcyclecounter: {
3446    Function *F = CGM.getIntrinsic(Intrinsic::readcyclecounter);
3447    return RValue::get(Builder.CreateCall(F));
3448  }
3449  case Builtin::BI__builtin___clear_cache: {
3450    Value *Begin = EmitScalarExpr(E->getArg(0));
3451    Value *End = EmitScalarExpr(E->getArg(1));
3452    Function *F = CGM.getIntrinsic(Intrinsic::clear_cache);
3453    return RValue::get(Builder.CreateCall(F, {Begin, End}));
3454  }
3455  case Builtin::BI__builtin_trap:
3456    EmitTrapCall(Intrinsic::trap);
3457    return RValue::get(nullptr);
3458  case Builtin::BI__debugbreak:
3459    EmitTrapCall(Intrinsic::debugtrap);
3460    return RValue::get(nullptr);
3461  case Builtin::BI__builtin_unreachable: {
3462    EmitUnreachable(E->getExprLoc());
3463
3464    // We do need to preserve an insertion point.
3465    EmitBlock(createBasicBlock("unreachable.cont"));
3466
3467    return RValue::get(nullptr);
3468  }
3469
3470  case Builtin::BI__builtin_powi:
3471  case Builtin::BI__builtin_powif:
3472  case Builtin::BI__builtin_powil: {
3473    llvm::Value *Src0 = EmitScalarExpr(E->getArg(0));
3474    llvm::Value *Src1 = EmitScalarExpr(E->getArg(1));
3475
3476    if (Builder.getIsFPConstrained()) {
3477      // FIXME: llvm.powi has 2 mangling types,
3478      // llvm.experimental.constrained.powi has one.
3479      CodeGenFunction::CGFPOptionsRAII FPOptsRAII(*this, E);
3480      Function *F = CGM.getIntrinsic(Intrinsic::experimental_constrained_powi,
3481                                     Src0->getType());
3482      return RValue::get(Builder.CreateConstrainedFPCall(F, { Src0, Src1 }));
3483    }
3484
3485    Function *F = CGM.getIntrinsic(Intrinsic::powi,
3486                                   { Src0->getType(), Src1->getType() });
3487    return RValue::get(Builder.CreateCall(F, { Src0, Src1 }));
3488  }
3489  case Builtin::BI__builtin_frexpl: {
3490    // Linux PPC will not be adding additional PPCDoubleDouble support.
3491    // WIP to switch default to IEEE long double. Will emit libcall for
3492    // frexpl instead of legalizing this type in the BE.
3493    if (&getTarget().getLongDoubleFormat() == &llvm::APFloat::PPCDoubleDouble())
3494      break;
3495    LLVM_FALLTHROUGH;
3496  }
3497  case Builtin::BI__builtin_frexp:
3498  case Builtin::BI__builtin_frexpf:
3499  case Builtin::BI__builtin_frexpf128:
3500  case Builtin::BI__builtin_frexpf16:
3501    return RValue::get(emitFrexpBuiltin(*this, E, Intrinsic::frexp));
3502  case Builtin::BI__builtin_isgreater:
3503  case Builtin::BI__builtin_isgreaterequal:
3504  case Builtin::BI__builtin_isless:
3505  case Builtin::BI__builtin_islessequal:
3506  case Builtin::BI__builtin_islessgreater:
3507  case Builtin::BI__builtin_isunordered: {
3508    // Ordered comparisons: we know the arguments to these are matching scalar
3509    // floating point values.
3510    CodeGenFunction::CGFPOptionsRAII FPOptsRAII(*this, E);
3511    Value *LHS = EmitScalarExpr(E->getArg(0));
3512    Value *RHS = EmitScalarExpr(E->getArg(1));
3513
3514    switch (BuiltinID) {
3515    default: llvm_unreachable("Unknown ordered comparison");
3516    case Builtin::BI__builtin_isgreater:
3517      LHS = Builder.CreateFCmpOGT(LHS, RHS, "cmp");
3518      break;
3519    case Builtin::BI__builtin_isgreaterequal:
3520      LHS = Builder.CreateFCmpOGE(LHS, RHS, "cmp");
3521      break;
3522    case Builtin::BI__builtin_isless:
3523      LHS = Builder.CreateFCmpOLT(LHS, RHS, "cmp");
3524      break;
3525    case Builtin::BI__builtin_islessequal:
3526      LHS = Builder.CreateFCmpOLE(LHS, RHS, "cmp");
3527      break;
3528    case Builtin::BI__builtin_islessgreater:
3529      LHS = Builder.CreateFCmpONE(LHS, RHS, "cmp");
3530      break;
3531    case Builtin::BI__builtin_isunordered:
3532      LHS = Builder.CreateFCmpUNO(LHS, RHS, "cmp");
3533      break;
3534    }
3535    // ZExt bool to int type.
3536    return RValue::get(Builder.CreateZExt(LHS, ConvertType(E->getType())));
3537  }
3538
3539  case Builtin::BI__builtin_isnan: {
3540    CodeGenFunction::CGFPOptionsRAII FPOptsRAII(*this, E);
3541    Value *V = EmitScalarExpr(E->getArg(0));
3542    if (Value *Result = tryUseTestFPKind(*this, BuiltinID, V))
3543      return RValue::get(Result);
3544    return RValue::get(
3545        Builder.CreateZExt(Builder.createIsFPClass(V, FPClassTest::fcNan),
3546                           ConvertType(E->getType())));
3547  }
3548
3549  case Builtin::BI__builtin_issignaling: {
3550    CodeGenFunction::CGFPOptionsRAII FPOptsRAII(*this, E);
3551    Value *V = EmitScalarExpr(E->getArg(0));
3552    return RValue::get(
3553        Builder.CreateZExt(Builder.createIsFPClass(V, FPClassTest::fcSNan),
3554                           ConvertType(E->getType())));
3555  }
3556
3557  case Builtin::BI__builtin_isinf: {
3558    CodeGenFunction::CGFPOptionsRAII FPOptsRAII(*this, E);
3559    Value *V = EmitScalarExpr(E->getArg(0));
3560    if (Value *Result = tryUseTestFPKind(*this, BuiltinID, V))
3561      return RValue::get(Result);
3562    return RValue::get(
3563        Builder.CreateZExt(Builder.createIsFPClass(V, FPClassTest::fcInf),
3564                           ConvertType(E->getType())));
3565  }
3566
3567  case Builtin::BIfinite:
3568  case Builtin::BI__finite:
3569  case Builtin::BIfinitef:
3570  case Builtin::BI__finitef:
3571  case Builtin::BIfinitel:
3572  case Builtin::BI__finitel:
3573  case Builtin::BI__builtin_isfinite: {
3574    CodeGenFunction::CGFPOptionsRAII FPOptsRAII(*this, E);
3575    Value *V = EmitScalarExpr(E->getArg(0));
3576    if (Value *Result = tryUseTestFPKind(*this, BuiltinID, V))
3577      return RValue::get(Result);
3578    return RValue::get(
3579        Builder.CreateZExt(Builder.createIsFPClass(V, FPClassTest::fcFinite),
3580                           ConvertType(E->getType())));
3581  }
3582
3583  case Builtin::BI__builtin_isnormal: {
3584    CodeGenFunction::CGFPOptionsRAII FPOptsRAII(*this, E);
3585    Value *V = EmitScalarExpr(E->getArg(0));
3586    return RValue::get(
3587        Builder.CreateZExt(Builder.createIsFPClass(V, FPClassTest::fcNormal),
3588                           ConvertType(E->getType())));
3589  }
3590
3591  case Builtin::BI__builtin_issubnormal: {
3592    CodeGenFunction::CGFPOptionsRAII FPOptsRAII(*this, E);
3593    Value *V = EmitScalarExpr(E->getArg(0));
3594    return RValue::get(
3595        Builder.CreateZExt(Builder.createIsFPClass(V, FPClassTest::fcSubnormal),
3596                           ConvertType(E->getType())));
3597  }
3598
3599  case Builtin::BI__builtin_iszero: {
3600    CodeGenFunction::CGFPOptionsRAII FPOptsRAII(*this, E);
3601    Value *V = EmitScalarExpr(E->getArg(0));
3602    return RValue::get(
3603        Builder.CreateZExt(Builder.createIsFPClass(V, FPClassTest::fcZero),
3604                           ConvertType(E->getType())));
3605  }
3606
3607  case Builtin::BI__builtin_isfpclass: {
3608    Expr::EvalResult Result;
3609    if (!E->getArg(1)->EvaluateAsInt(Result, CGM.getContext()))
3610      break;
3611    uint64_t Test = Result.Val.getInt().getLimitedValue();
3612    CodeGenFunction::CGFPOptionsRAII FPOptsRAII(*this, E);
3613    Value *V = EmitScalarExpr(E->getArg(0));
3614    return RValue::get(Builder.CreateZExt(Builder.createIsFPClass(V, Test),
3615                                          ConvertType(E->getType())));
3616  }
3617
3618  case Builtin::BI__builtin_nondeterministic_value: {
3619    llvm::Type *Ty = ConvertType(E->getArg(0)->getType());
3620
3621    Value *Result = PoisonValue::get(Ty);
3622    Result = Builder.CreateFreeze(Result);
3623
3624    return RValue::get(Result);
3625  }
3626
3627  case Builtin::BI__builtin_elementwise_abs: {
3628    Value *Result;
3629    QualType QT = E->getArg(0)->getType();
3630
3631    if (auto *VecTy = QT->getAs<VectorType>())
3632      QT = VecTy->getElementType();
3633    if (QT->isIntegerType())
3634      Result = Builder.CreateBinaryIntrinsic(
3635          llvm::Intrinsic::abs, EmitScalarExpr(E->getArg(0)),
3636          Builder.getFalse(), nullptr, "elt.abs");
3637    else
3638      Result = emitUnaryBuiltin(*this, E, llvm::Intrinsic::fabs, "elt.abs");
3639
3640    return RValue::get(Result);
3641  }
3642
3643  case Builtin::BI__builtin_elementwise_ceil:
3644    return RValue::get(
3645        emitUnaryBuiltin(*this, E, llvm::Intrinsic::ceil, "elt.ceil"));
3646  case Builtin::BI__builtin_elementwise_exp:
3647    return RValue::get(
3648        emitUnaryBuiltin(*this, E, llvm::Intrinsic::exp, "elt.exp"));
3649  case Builtin::BI__builtin_elementwise_exp2:
3650    return RValue::get(
3651        emitUnaryBuiltin(*this, E, llvm::Intrinsic::exp2, "elt.exp2"));
3652  case Builtin::BI__builtin_elementwise_log:
3653    return RValue::get(
3654        emitUnaryBuiltin(*this, E, llvm::Intrinsic::log, "elt.log"));
3655  case Builtin::BI__builtin_elementwise_log2:
3656    return RValue::get(
3657        emitUnaryBuiltin(*this, E, llvm::Intrinsic::log2, "elt.log2"));
3658  case Builtin::BI__builtin_elementwise_log10:
3659    return RValue::get(
3660        emitUnaryBuiltin(*this, E, llvm::Intrinsic::log10, "elt.log10"));
3661  case Builtin::BI__builtin_elementwise_pow: {
3662    return RValue::get(emitBinaryBuiltin(*this, E, llvm::Intrinsic::pow));
3663  }
3664  case Builtin::BI__builtin_elementwise_bitreverse:
3665    return RValue::get(emitUnaryBuiltin(*this, E, llvm::Intrinsic::bitreverse,
3666                                        "elt.bitreverse"));
3667  case Builtin::BI__builtin_elementwise_cos:
3668    return RValue::get(
3669        emitUnaryBuiltin(*this, E, llvm::Intrinsic::cos, "elt.cos"));
3670  case Builtin::BI__builtin_elementwise_floor:
3671    return RValue::get(
3672        emitUnaryBuiltin(*this, E, llvm::Intrinsic::floor, "elt.floor"));
3673  case Builtin::BI__builtin_elementwise_roundeven:
3674    return RValue::get(emitUnaryBuiltin(*this, E, llvm::Intrinsic::roundeven,
3675                                        "elt.roundeven"));
3676  case Builtin::BI__builtin_elementwise_round:
3677    return RValue::get(emitUnaryBuiltin(*this, E, llvm::Intrinsic::round,
3678                                        "elt.round"));
3679  case Builtin::BI__builtin_elementwise_rint:
3680    return RValue::get(emitUnaryBuiltin(*this, E, llvm::Intrinsic::rint,
3681                                        "elt.rint"));
3682  case Builtin::BI__builtin_elementwise_nearbyint:
3683    return RValue::get(emitUnaryBuiltin(*this, E, llvm::Intrinsic::nearbyint,
3684                                        "elt.nearbyint"));
3685  case Builtin::BI__builtin_elementwise_sin:
3686    return RValue::get(
3687        emitUnaryBuiltin(*this, E, llvm::Intrinsic::sin, "elt.sin"));
3688
3689  case Builtin::BI__builtin_elementwise_trunc:
3690    return RValue::get(
3691        emitUnaryBuiltin(*this, E, llvm::Intrinsic::trunc, "elt.trunc"));
3692  case Builtin::BI__builtin_elementwise_canonicalize:
3693    return RValue::get(
3694        emitUnaryBuiltin(*this, E, llvm::Intrinsic::canonicalize, "elt.canonicalize"));
3695  case Builtin::BI__builtin_elementwise_copysign:
3696    return RValue::get(emitBinaryBuiltin(*this, E, llvm::Intrinsic::copysign));
3697  case Builtin::BI__builtin_elementwise_fma:
3698    return RValue::get(emitTernaryBuiltin(*this, E, llvm::Intrinsic::fma));
3699  case Builtin::BI__builtin_elementwise_add_sat:
3700  case Builtin::BI__builtin_elementwise_sub_sat: {
3701    Value *Op0 = EmitScalarExpr(E->getArg(0));
3702    Value *Op1 = EmitScalarExpr(E->getArg(1));
3703    Value *Result;
3704    assert(Op0->getType()->isIntOrIntVectorTy() && "integer type expected");
3705    QualType Ty = E->getArg(0)->getType();
3706    if (auto *VecTy = Ty->getAs<VectorType>())
3707      Ty = VecTy->getElementType();
3708    bool IsSigned = Ty->isSignedIntegerType();
3709    unsigned Opc;
3710    if (BuiltinIDIfNoAsmLabel == Builtin::BI__builtin_elementwise_add_sat)
3711      Opc = IsSigned ? llvm::Intrinsic::sadd_sat : llvm::Intrinsic::uadd_sat;
3712    else
3713      Opc = IsSigned ? llvm::Intrinsic::ssub_sat : llvm::Intrinsic::usub_sat;
3714    Result = Builder.CreateBinaryIntrinsic(Opc, Op0, Op1, nullptr, "elt.sat");
3715    return RValue::get(Result);
3716  }
3717
3718  case Builtin::BI__builtin_elementwise_max: {
3719    Value *Op0 = EmitScalarExpr(E->getArg(0));
3720    Value *Op1 = EmitScalarExpr(E->getArg(1));
3721    Value *Result;
3722    if (Op0->getType()->isIntOrIntVectorTy()) {
3723      QualType Ty = E->getArg(0)->getType();
3724      if (auto *VecTy = Ty->getAs<VectorType>())
3725        Ty = VecTy->getElementType();
3726      Result = Builder.CreateBinaryIntrinsic(Ty->isSignedIntegerType()
3727                                                 ? llvm::Intrinsic::smax
3728                                                 : llvm::Intrinsic::umax,
3729                                             Op0, Op1, nullptr, "elt.max");
3730    } else
3731      Result = Builder.CreateMaxNum(Op0, Op1, "elt.max");
3732    return RValue::get(Result);
3733  }
3734  case Builtin::BI__builtin_elementwise_min: {
3735    Value *Op0 = EmitScalarExpr(E->getArg(0));
3736    Value *Op1 = EmitScalarExpr(E->getArg(1));
3737    Value *Result;
3738    if (Op0->getType()->isIntOrIntVectorTy()) {
3739      QualType Ty = E->getArg(0)->getType();
3740      if (auto *VecTy = Ty->getAs<VectorType>())
3741        Ty = VecTy->getElementType();
3742      Result = Builder.CreateBinaryIntrinsic(Ty->isSignedIntegerType()
3743                                                 ? llvm::Intrinsic::smin
3744                                                 : llvm::Intrinsic::umin,
3745                                             Op0, Op1, nullptr, "elt.min");
3746    } else
3747      Result = Builder.CreateMinNum(Op0, Op1, "elt.min");
3748    return RValue::get(Result);
3749  }
3750
3751  case Builtin::BI__builtin_reduce_max: {
3752    auto GetIntrinsicID = [](QualType QT) {
3753      if (auto *VecTy = QT->getAs<VectorType>())
3754        QT = VecTy->getElementType();
3755      if (QT->isSignedIntegerType())
3756        return llvm::Intrinsic::vector_reduce_smax;
3757      if (QT->isUnsignedIntegerType())
3758        return llvm::Intrinsic::vector_reduce_umax;
3759      assert(QT->isFloatingType() && "must have a float here");
3760      return llvm::Intrinsic::vector_reduce_fmax;
3761    };
3762    return RValue::get(emitUnaryBuiltin(
3763        *this, E, GetIntrinsicID(E->getArg(0)->getType()), "rdx.max"));
3764  }
3765
3766  case Builtin::BI__builtin_reduce_min: {
3767    auto GetIntrinsicID = [](QualType QT) {
3768      if (auto *VecTy = QT->getAs<VectorType>())
3769        QT = VecTy->getElementType();
3770      if (QT->isSignedIntegerType())
3771        return llvm::Intrinsic::vector_reduce_smin;
3772      if (QT->isUnsignedIntegerType())
3773        return llvm::Intrinsic::vector_reduce_umin;
3774      assert(QT->isFloatingType() && "must have a float here");
3775      return llvm::Intrinsic::vector_reduce_fmin;
3776    };
3777
3778    return RValue::get(emitUnaryBuiltin(
3779        *this, E, GetIntrinsicID(E->getArg(0)->getType()), "rdx.min"));
3780  }
3781
3782  case Builtin::BI__builtin_reduce_add:
3783    return RValue::get(emitUnaryBuiltin(
3784        *this, E, llvm::Intrinsic::vector_reduce_add, "rdx.add"));
3785  case Builtin::BI__builtin_reduce_mul:
3786    return RValue::get(emitUnaryBuiltin(
3787        *this, E, llvm::Intrinsic::vector_reduce_mul, "rdx.mul"));
3788  case Builtin::BI__builtin_reduce_xor:
3789    return RValue::get(emitUnaryBuiltin(
3790        *this, E, llvm::Intrinsic::vector_reduce_xor, "rdx.xor"));
3791  case Builtin::BI__builtin_reduce_or:
3792    return RValue::get(emitUnaryBuiltin(
3793        *this, E, llvm::Intrinsic::vector_reduce_or, "rdx.or"));
3794  case Builtin::BI__builtin_reduce_and:
3795    return RValue::get(emitUnaryBuiltin(
3796        *this, E, llvm::Intrinsic::vector_reduce_and, "rdx.and"));
3797
3798  case Builtin::BI__builtin_matrix_transpose: {
3799    auto *MatrixTy = E->getArg(0)->getType()->castAs<ConstantMatrixType>();
3800    Value *MatValue = EmitScalarExpr(E->getArg(0));
3801    MatrixBuilder MB(Builder);
3802    Value *Result = MB.CreateMatrixTranspose(MatValue, MatrixTy->getNumRows(),
3803                                             MatrixTy->getNumColumns());
3804    return RValue::get(Result);
3805  }
3806
3807  case Builtin::BI__builtin_matrix_column_major_load: {
3808    MatrixBuilder MB(Builder);
3809    // Emit everything that isn't dependent on the first parameter type
3810    Value *Stride = EmitScalarExpr(E->getArg(3));
3811    const auto *ResultTy = E->getType()->getAs<ConstantMatrixType>();
3812    auto *PtrTy = E->getArg(0)->getType()->getAs<PointerType>();
3813    assert(PtrTy && "arg0 must be of pointer type");
3814    bool IsVolatile = PtrTy->getPointeeType().isVolatileQualified();
3815
3816    Address Src = EmitPointerWithAlignment(E->getArg(0));
3817    EmitNonNullArgCheck(RValue::get(Src.getPointer()), E->getArg(0)->getType(),
3818                        E->getArg(0)->getExprLoc(), FD, 0);
3819    Value *Result = MB.CreateColumnMajorLoad(
3820        Src.getElementType(), Src.getPointer(),
3821        Align(Src.getAlignment().getQuantity()), Stride, IsVolatile,
3822        ResultTy->getNumRows(), ResultTy->getNumColumns(),
3823        "matrix");
3824    return RValue::get(Result);
3825  }
3826
3827  case Builtin::BI__builtin_matrix_column_major_store: {
3828    MatrixBuilder MB(Builder);
3829    Value *Matrix = EmitScalarExpr(E->getArg(0));
3830    Address Dst = EmitPointerWithAlignment(E->getArg(1));
3831    Value *Stride = EmitScalarExpr(E->getArg(2));
3832
3833    const auto *MatrixTy = E->getArg(0)->getType()->getAs<ConstantMatrixType>();
3834    auto *PtrTy = E->getArg(1)->getType()->getAs<PointerType>();
3835    assert(PtrTy && "arg1 must be of pointer type");
3836    bool IsVolatile = PtrTy->getPointeeType().isVolatileQualified();
3837
3838    EmitNonNullArgCheck(RValue::get(Dst.getPointer()), E->getArg(1)->getType(),
3839                        E->getArg(1)->getExprLoc(), FD, 0);
3840    Value *Result = MB.CreateColumnMajorStore(
3841        Matrix, Dst.getPointer(), Align(Dst.getAlignment().getQuantity()),
3842        Stride, IsVolatile, MatrixTy->getNumRows(), MatrixTy->getNumColumns());
3843    return RValue::get(Result);
3844  }
3845
3846  case Builtin::BI__builtin_isinf_sign: {
3847    // isinf_sign(x) -> fabs(x) == infinity ? (signbit(x) ? -1 : 1) : 0
3848    CodeGenFunction::CGFPOptionsRAII FPOptsRAII(*this, E);
3849    // FIXME: for strictfp/IEEE-754 we need to not trap on SNaN here.
3850    Value *Arg = EmitScalarExpr(E->getArg(0));
3851    Value *AbsArg = EmitFAbs(*this, Arg);
3852    Value *IsInf = Builder.CreateFCmpOEQ(
3853        AbsArg, ConstantFP::getInfinity(Arg->getType()), "isinf");
3854    Value *IsNeg = EmitSignBit(*this, Arg);
3855
3856    llvm::Type *IntTy = ConvertType(E->getType());
3857    Value *Zero = Constant::getNullValue(IntTy);
3858    Value *One = ConstantInt::get(IntTy, 1);
3859    Value *NegativeOne = ConstantInt::get(IntTy, -1);
3860    Value *SignResult = Builder.CreateSelect(IsNeg, NegativeOne, One);
3861    Value *Result = Builder.CreateSelect(IsInf, SignResult, Zero);
3862    return RValue::get(Result);
3863  }
3864
3865  case Builtin::BI__builtin_flt_rounds: {
3866    Function *F = CGM.getIntrinsic(Intrinsic::get_rounding);
3867
3868    llvm::Type *ResultType = ConvertType(E->getType());
3869    Value *Result = Builder.CreateCall(F);
3870    if (Result->getType() != ResultType)
3871      Result = Builder.CreateIntCast(Result, ResultType, /*isSigned*/true,
3872                                     "cast");
3873    return RValue::get(Result);
3874  }
3875
3876  case Builtin::BI__builtin_set_flt_rounds: {
3877    Function *F = CGM.getIntrinsic(Intrinsic::set_rounding);
3878
3879    Value *V = EmitScalarExpr(E->getArg(0));
3880    Builder.CreateCall(F, V);
3881    return RValue::get(nullptr);
3882  }
3883
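  // Argument layout handled below (matching the GNU __builtin_fpclassify
  // contract):
  //   __builtin_fpclassify(nan_val, inf_val, normal_val, subnormal_val,
  //                        zero_val, x)
  // i.e. arg 5 is the operand being classified and args 0-4 are the values
  // returned for NaN, infinity, normal, subnormal and zero respectively.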
3884  case Builtin::BI__builtin_fpclassify: {
3885    CodeGenFunction::CGFPOptionsRAII FPOptsRAII(*this, E);
3886    // FIXME: for strictfp/IEEE-754 we need to not trap on SNaN here.
3887    Value *V = EmitScalarExpr(E->getArg(5));
3888    llvm::Type *Ty = ConvertType(E->getArg(5)->getType());
3889
3890    // Create Result
3891    BasicBlock *Begin = Builder.GetInsertBlock();
3892    BasicBlock *End = createBasicBlock("fpclassify_end", this->CurFn);
3893    Builder.SetInsertPoint(End);
3894    PHINode *Result =
3895      Builder.CreatePHI(ConvertType(E->getArg(0)->getType()), 4,
3896                        "fpclassify_result");
3897
3898    // if (V==0) return FP_ZERO
3899    Builder.SetInsertPoint(Begin);
3900    Value *IsZero = Builder.CreateFCmpOEQ(V, Constant::getNullValue(Ty),
3901                                          "iszero");
3902    Value *ZeroLiteral = EmitScalarExpr(E->getArg(4));
3903    BasicBlock *NotZero = createBasicBlock("fpclassify_not_zero", this->CurFn);
3904    Builder.CreateCondBr(IsZero, End, NotZero);
3905    Result->addIncoming(ZeroLiteral, Begin);
3906
3907    // if (V != V) return FP_NAN
3908    Builder.SetInsertPoint(NotZero);
3909    Value *IsNan = Builder.CreateFCmpUNO(V, V, "cmp");
3910    Value *NanLiteral = EmitScalarExpr(E->getArg(0));
3911    BasicBlock *NotNan = createBasicBlock("fpclassify_not_nan", this->CurFn);
3912    Builder.CreateCondBr(IsNan, End, NotNan);
3913    Result->addIncoming(NanLiteral, NotZero);
3914
3915    // if (fabs(V) == infinity) return FP_INFINITY
3916    Builder.SetInsertPoint(NotNan);
3917    Value *VAbs = EmitFAbs(*this, V);
3918    Value *IsInf =
3919      Builder.CreateFCmpOEQ(VAbs, ConstantFP::getInfinity(V->getType()),
3920                            "isinf");
3921    Value *InfLiteral = EmitScalarExpr(E->getArg(1));
3922    BasicBlock *NotInf = createBasicBlock("fpclassify_not_inf", this->CurFn);
3923    Builder.CreateCondBr(IsInf, End, NotInf);
3924    Result->addIncoming(InfLiteral, NotNan);
3925
3926    // if (fabs(V) >= MIN_NORMAL) return FP_NORMAL else FP_SUBNORMAL
3927    Builder.SetInsertPoint(NotInf);
3928    APFloat Smallest = APFloat::getSmallestNormalized(
3929        getContext().getFloatTypeSemantics(E->getArg(5)->getType()));
3930    Value *IsNormal =
3931      Builder.CreateFCmpUGE(VAbs, ConstantFP::get(V->getContext(), Smallest),
3932                            "isnormal");
3933    Value *NormalResult =
3934      Builder.CreateSelect(IsNormal, EmitScalarExpr(E->getArg(2)),
3935                           EmitScalarExpr(E->getArg(3)));
3936    Builder.CreateBr(End);
3937    Result->addIncoming(NormalResult, NotInf);
3938
3939    // return Result
3940    Builder.SetInsertPoint(End);
3941    return RValue::get(Result);
3942  }
3943
3944  // An alloca will always return a pointer to the alloca (stack) address
3945  // space. This address space need not be the same as the AST / Language
3946  // default (e.g. in C / C++ auto vars are in the generic address space). At
3947  // the AST level this is handled within CreateTempAlloca et al., but for the
3948  // builtin / dynamic alloca we have to handle it here. We use an explicit cast
3949  // instead of passing an AS to CreateAlloca so as to not inhibit optimisation.
3950  case Builtin::BIalloca:
3951  case Builtin::BI_alloca:
3952  case Builtin::BI__builtin_alloca_uninitialized:
3953  case Builtin::BI__builtin_alloca: {
3954    Value *Size = EmitScalarExpr(E->getArg(0));
3955    const TargetInfo &TI = getContext().getTargetInfo();
3956    // The alignment of the alloca should correspond to __BIGGEST_ALIGNMENT__.
3957    const Align SuitableAlignmentInBytes =
3958        CGM.getContext()
3959            .toCharUnitsFromBits(TI.getSuitableAlign())
3960            .getAsAlign();
3961    AllocaInst *AI = Builder.CreateAlloca(Builder.getInt8Ty(), Size);
3962    AI->setAlignment(SuitableAlignmentInBytes);
3963    if (BuiltinID != Builtin::BI__builtin_alloca_uninitialized)
3964      initializeAlloca(*this, AI, Size, SuitableAlignmentInBytes);
3965    LangAS AAS = getASTAllocaAddressSpace();
3966    LangAS EAS = E->getType()->getPointeeType().getAddressSpace();
3967    if (AAS != EAS) {
3968      llvm::Type *Ty = CGM.getTypes().ConvertType(E->getType());
3969      return RValue::get(getTargetHooks().performAddrSpaceCast(*this, AI, AAS,
3970                                                               EAS, Ty));
3971    }
3972    return RValue::get(AI);
3973  }
3974
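  // Note on the variant below: the alignment operand of
  // __builtin_alloca_with_align is given in bits (hence the
  // toCharUnitsFromBits conversion), so e.g. __builtin_alloca_with_align(n, 256)
  // requests a 32-byte-aligned allocation.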
3975  case Builtin::BI__builtin_alloca_with_align_uninitialized:
3976  case Builtin::BI__builtin_alloca_with_align: {
3977    Value *Size = EmitScalarExpr(E->getArg(0));
3978    Value *AlignmentInBitsValue = EmitScalarExpr(E->getArg(1));
3979    auto *AlignmentInBitsCI = cast<ConstantInt>(AlignmentInBitsValue);
3980    unsigned AlignmentInBits = AlignmentInBitsCI->getZExtValue();
3981    const Align AlignmentInBytes =
3982        CGM.getContext().toCharUnitsFromBits(AlignmentInBits).getAsAlign();
3983    AllocaInst *AI = Builder.CreateAlloca(Builder.getInt8Ty(), Size);
3984    AI->setAlignment(AlignmentInBytes);
3985    if (BuiltinID != Builtin::BI__builtin_alloca_with_align_uninitialized)
3986      initializeAlloca(*this, AI, Size, AlignmentInBytes);
3987    LangAS AAS = getASTAllocaAddressSpace();
3988    LangAS EAS = E->getType()->getPointeeType().getAddressSpace();
3989    if (AAS != EAS) {
3990      llvm::Type *Ty = CGM.getTypes().ConvertType(E->getType());
3991      return RValue::get(getTargetHooks().performAddrSpaceCast(*this, AI, AAS,
3992                                                               EAS, Ty));
3993    }
3994    return RValue::get(AI);
3995  }
3996
3997  case Builtin::BIbzero:
3998  case Builtin::BI__builtin_bzero: {
3999    Address Dest = EmitPointerWithAlignment(E->getArg(0));
4000    Value *SizeVal = EmitScalarExpr(E->getArg(1));
4001    EmitNonNullArgCheck(RValue::get(Dest.getPointer()), E->getArg(0)->getType(),
4002                        E->getArg(0)->getExprLoc(), FD, 0);
4003    Builder.CreateMemSet(Dest, Builder.getInt8(0), SizeVal, false);
4004    return RValue::get(nullptr);
4005  }
4006
4007  case Builtin::BIbcopy:
4008  case Builtin::BI__builtin_bcopy: {
4009    Address Src = EmitPointerWithAlignment(E->getArg(0));
4010    Address Dest = EmitPointerWithAlignment(E->getArg(1));
4011    Value *SizeVal = EmitScalarExpr(E->getArg(2));
4012    EmitNonNullArgCheck(RValue::get(Src.getPointer()), E->getArg(0)->getType(),
4013                        E->getArg(0)->getExprLoc(), FD, 0);
4014    EmitNonNullArgCheck(RValue::get(Dest.getPointer()), E->getArg(1)->getType(),
4015                        E->getArg(1)->getExprLoc(), FD, 0);
4016    Builder.CreateMemMove(Dest, Src, SizeVal, false);
4017    return RValue::get(Dest.getPointer());
4018  }
4019
4020  case Builtin::BImemcpy:
4021  case Builtin::BI__builtin_memcpy:
4022  case Builtin::BImempcpy:
4023  case Builtin::BI__builtin_mempcpy: {
4024    Address Dest = EmitPointerWithAlignment(E->getArg(0));
4025    Address Src = EmitPointerWithAlignment(E->getArg(1));
4026    Value *SizeVal = EmitScalarExpr(E->getArg(2));
4027    EmitArgCheck(TCK_Store, Dest, E->getArg(0), 0);
4028    EmitArgCheck(TCK_Load, Src, E->getArg(1), 1);
4029    Builder.CreateMemCpy(Dest, Src, SizeVal, false);
4030    if (BuiltinID == Builtin::BImempcpy ||
4031        BuiltinID == Builtin::BI__builtin_mempcpy)
4032      return RValue::get(Builder.CreateInBoundsGEP(Dest.getElementType(),
4033                                                   Dest.getPointer(), SizeVal));
4034    else
4035      return RValue::get(Dest.getPointer());
4036  }
4037
4038  case Builtin::BI__builtin_memcpy_inline: {
4039    Address Dest = EmitPointerWithAlignment(E->getArg(0));
4040    Address Src = EmitPointerWithAlignment(E->getArg(1));
4041    uint64_t Size =
4042        E->getArg(2)->EvaluateKnownConstInt(getContext()).getZExtValue();
4043    EmitArgCheck(TCK_Store, Dest, E->getArg(0), 0);
4044    EmitArgCheck(TCK_Load, Src, E->getArg(1), 1);
4045    Builder.CreateMemCpyInline(Dest, Src, Size);
4046    return RValue::get(nullptr);
4047  }
4048
4049  case Builtin::BI__builtin_char_memchr:
4050    BuiltinID = Builtin::BI__builtin_memchr;
4051    break;
4052
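  // Illustrative fold for the *_chk builtins handled below: in
  //   __builtin___memcpy_chk(dst, src, 16, 32)
  // the constant copy size (16) does not exceed the constant object size (32),
  // so a plain memcpy is emitted; otherwise we break out to the generic
  // library-call path.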
4053  case Builtin::BI__builtin___memcpy_chk: {
4054    // fold __builtin_memcpy_chk(x, y, cst1, cst2) to memcpy iff cst1<=cst2.
4055    Expr::EvalResult SizeResult, DstSizeResult;
4056    if (!E->getArg(2)->EvaluateAsInt(SizeResult, CGM.getContext()) ||
4057        !E->getArg(3)->EvaluateAsInt(DstSizeResult, CGM.getContext()))
4058      break;
4059    llvm::APSInt Size = SizeResult.Val.getInt();
4060    llvm::APSInt DstSize = DstSizeResult.Val.getInt();
4061    if (Size.ugt(DstSize))
4062      break;
4063    Address Dest = EmitPointerWithAlignment(E->getArg(0));
4064    Address Src = EmitPointerWithAlignment(E->getArg(1));
4065    Value *SizeVal = llvm::ConstantInt::get(Builder.getContext(), Size);
4066    Builder.CreateMemCpy(Dest, Src, SizeVal, false);
4067    return RValue::get(Dest.getPointer());
4068  }
4069
4070  case Builtin::BI__builtin_objc_memmove_collectable: {
4071    Address DestAddr = EmitPointerWithAlignment(E->getArg(0));
4072    Address SrcAddr = EmitPointerWithAlignment(E->getArg(1));
4073    Value *SizeVal = EmitScalarExpr(E->getArg(2));
4074    CGM.getObjCRuntime().EmitGCMemmoveCollectable(*this,
4075                                                  DestAddr, SrcAddr, SizeVal);
4076    return RValue::get(DestAddr.getPointer());
4077  }
4078
4079  case Builtin::BI__builtin___memmove_chk: {
4080    // fold __builtin_memmove_chk(x, y, cst1, cst2) to memmove iff cst1<=cst2.
4081    Expr::EvalResult SizeResult, DstSizeResult;
4082    if (!E->getArg(2)->EvaluateAsInt(SizeResult, CGM.getContext()) ||
4083        !E->getArg(3)->EvaluateAsInt(DstSizeResult, CGM.getContext()))
4084      break;
4085    llvm::APSInt Size = SizeResult.Val.getInt();
4086    llvm::APSInt DstSize = DstSizeResult.Val.getInt();
4087    if (Size.ugt(DstSize))
4088      break;
4089    Address Dest = EmitPointerWithAlignment(E->getArg(0));
4090    Address Src = EmitPointerWithAlignment(E->getArg(1));
4091    Value *SizeVal = llvm::ConstantInt::get(Builder.getContext(), Size);
4092    Builder.CreateMemMove(Dest, Src, SizeVal, false);
4093    return RValue::get(Dest.getPointer());
4094  }
4095
4096  case Builtin::BImemmove:
4097  case Builtin::BI__builtin_memmove: {
4098    Address Dest = EmitPointerWithAlignment(E->getArg(0));
4099    Address Src = EmitPointerWithAlignment(E->getArg(1));
4100    Value *SizeVal = EmitScalarExpr(E->getArg(2));
4101    EmitArgCheck(TCK_Store, Dest, E->getArg(0), 0);
4102    EmitArgCheck(TCK_Load, Src, E->getArg(1), 1);
4103    Builder.CreateMemMove(Dest, Src, SizeVal, false);
4104    return RValue::get(Dest.getPointer());
4105  }
4106  case Builtin::BImemset:
4107  case Builtin::BI__builtin_memset: {
4108    Address Dest = EmitPointerWithAlignment(E->getArg(0));
4109    Value *ByteVal = Builder.CreateTrunc(EmitScalarExpr(E->getArg(1)),
4110                                         Builder.getInt8Ty());
4111    Value *SizeVal = EmitScalarExpr(E->getArg(2));
4112    EmitNonNullArgCheck(RValue::get(Dest.getPointer()), E->getArg(0)->getType(),
4113                        E->getArg(0)->getExprLoc(), FD, 0);
4114    Builder.CreateMemSet(Dest, ByteVal, SizeVal, false);
4115    return RValue::get(Dest.getPointer());
4116  }
4117  case Builtin::BI__builtin_memset_inline: {
4118    Address Dest = EmitPointerWithAlignment(E->getArg(0));
4119    Value *ByteVal =
4120        Builder.CreateTrunc(EmitScalarExpr(E->getArg(1)), Builder.getInt8Ty());
4121    uint64_t Size =
4122        E->getArg(2)->EvaluateKnownConstInt(getContext()).getZExtValue();
4123    EmitNonNullArgCheck(RValue::get(Dest.getPointer()), E->getArg(0)->getType(),
4124                        E->getArg(0)->getExprLoc(), FD, 0);
4125    Builder.CreateMemSetInline(Dest, ByteVal, Size);
4126    return RValue::get(nullptr);
4127  }
4128  case Builtin::BI__builtin___memset_chk: {
4129    // fold __builtin_memset_chk(x, y, cst1, cst2) to memset iff cst1<=cst2.
4130    Expr::EvalResult SizeResult, DstSizeResult;
4131    if (!E->getArg(2)->EvaluateAsInt(SizeResult, CGM.getContext()) ||
4132        !E->getArg(3)->EvaluateAsInt(DstSizeResult, CGM.getContext()))
4133      break;
4134    llvm::APSInt Size = SizeResult.Val.getInt();
4135    llvm::APSInt DstSize = DstSizeResult.Val.getInt();
4136    if (Size.ugt(DstSize))
4137      break;
4138    Address Dest = EmitPointerWithAlignment(E->getArg(0));
4139    Value *ByteVal = Builder.CreateTrunc(EmitScalarExpr(E->getArg(1)),
4140                                         Builder.getInt8Ty());
4141    Value *SizeVal = llvm::ConstantInt::get(Builder.getContext(), Size);
4142    Builder.CreateMemSet(Dest, ByteVal, SizeVal, false);
4143    return RValue::get(Dest.getPointer());
4144  }
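  // The wmemchr expansion below is roughly equivalent to this C loop (shown as
  // a sketch only):
  //   for (; n != 0; --n, ++s)
  //     if (*s == c)
  //       return (wchar_t *)s;
  //   return nullptr;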
4145  case Builtin::BI__builtin_wmemchr: {
4146    // The MSVC runtime library does not provide a definition of wmemchr, so we
4147    // need an inline implementation.
4148    if (!getTarget().getTriple().isOSMSVCRT())
4149      break;
4150
4151    llvm::Type *WCharTy = ConvertType(getContext().WCharTy);
4152    Value *Str = EmitScalarExpr(E->getArg(0));
4153    Value *Chr = EmitScalarExpr(E->getArg(1));
4154    Value *Size = EmitScalarExpr(E->getArg(2));
4155
4156    BasicBlock *Entry = Builder.GetInsertBlock();
4157    BasicBlock *CmpEq = createBasicBlock("wmemchr.eq");
4158    BasicBlock *Next = createBasicBlock("wmemchr.next");
4159    BasicBlock *Exit = createBasicBlock("wmemchr.exit");
4160    Value *SizeEq0 = Builder.CreateICmpEQ(Size, ConstantInt::get(SizeTy, 0));
4161    Builder.CreateCondBr(SizeEq0, Exit, CmpEq);
4162
4163    EmitBlock(CmpEq);
4164    PHINode *StrPhi = Builder.CreatePHI(Str->getType(), 2);
4165    StrPhi->addIncoming(Str, Entry);
4166    PHINode *SizePhi = Builder.CreatePHI(SizeTy, 2);
4167    SizePhi->addIncoming(Size, Entry);
4168    CharUnits WCharAlign =
4169        getContext().getTypeAlignInChars(getContext().WCharTy);
4170    Value *StrCh = Builder.CreateAlignedLoad(WCharTy, StrPhi, WCharAlign);
4171    Value *FoundChr = Builder.CreateConstInBoundsGEP1_32(WCharTy, StrPhi, 0);
4172    Value *StrEqChr = Builder.CreateICmpEQ(StrCh, Chr);
4173    Builder.CreateCondBr(StrEqChr, Exit, Next);
4174
4175    EmitBlock(Next);
4176    Value *NextStr = Builder.CreateConstInBoundsGEP1_32(WCharTy, StrPhi, 1);
4177    Value *NextSize = Builder.CreateSub(SizePhi, ConstantInt::get(SizeTy, 1));
4178    Value *NextSizeEq0 =
4179        Builder.CreateICmpEQ(NextSize, ConstantInt::get(SizeTy, 0));
4180    Builder.CreateCondBr(NextSizeEq0, Exit, CmpEq);
4181    StrPhi->addIncoming(NextStr, Next);
4182    SizePhi->addIncoming(NextSize, Next);
4183
4184    EmitBlock(Exit);
4185    PHINode *Ret = Builder.CreatePHI(Str->getType(), 3);
4186    Ret->addIncoming(llvm::Constant::getNullValue(Str->getType()), Entry);
4187    Ret->addIncoming(llvm::Constant::getNullValue(Str->getType()), Next);
4188    Ret->addIncoming(FoundChr, CmpEq);
4189    return RValue::get(Ret);
4190  }
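  // Likewise, the wmemcmp expansion below roughly mirrors (with the element
  // compares done as unsigned, as for MSVC's 16-bit wchar_t):
  //   for (; n != 0; --n, ++a, ++b) {
  //     if (*a > *b) return 1;
  //     if (*a < *b) return -1;
  //   }
  //   return 0;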
4191  case Builtin::BI__builtin_wmemcmp: {
4192    // The MSVC runtime library does not provide a definition of wmemcmp, so we
4193    // need an inline implementation.
4194    if (!getTarget().getTriple().isOSMSVCRT())
4195      break;
4196
4197    llvm::Type *WCharTy = ConvertType(getContext().WCharTy);
4198
4199    Value *Dst = EmitScalarExpr(E->getArg(0));
4200    Value *Src = EmitScalarExpr(E->getArg(1));
4201    Value *Size = EmitScalarExpr(E->getArg(2));
4202
4203    BasicBlock *Entry = Builder.GetInsertBlock();
4204    BasicBlock *CmpGT = createBasicBlock("wmemcmp.gt");
4205    BasicBlock *CmpLT = createBasicBlock("wmemcmp.lt");
4206    BasicBlock *Next = createBasicBlock("wmemcmp.next");
4207    BasicBlock *Exit = createBasicBlock("wmemcmp.exit");
4208    Value *SizeEq0 = Builder.CreateICmpEQ(Size, ConstantInt::get(SizeTy, 0));
4209    Builder.CreateCondBr(SizeEq0, Exit, CmpGT);
4210
4211    EmitBlock(CmpGT);
4212    PHINode *DstPhi = Builder.CreatePHI(Dst->getType(), 2);
4213    DstPhi->addIncoming(Dst, Entry);
4214    PHINode *SrcPhi = Builder.CreatePHI(Src->getType(), 2);
4215    SrcPhi->addIncoming(Src, Entry);
4216    PHINode *SizePhi = Builder.CreatePHI(SizeTy, 2);
4217    SizePhi->addIncoming(Size, Entry);
4218    CharUnits WCharAlign =
4219        getContext().getTypeAlignInChars(getContext().WCharTy);
4220    Value *DstCh = Builder.CreateAlignedLoad(WCharTy, DstPhi, WCharAlign);
4221    Value *SrcCh = Builder.CreateAlignedLoad(WCharTy, SrcPhi, WCharAlign);
4222    Value *DstGtSrc = Builder.CreateICmpUGT(DstCh, SrcCh);
4223    Builder.CreateCondBr(DstGtSrc, Exit, CmpLT);
4224
4225    EmitBlock(CmpLT);
4226    Value *DstLtSrc = Builder.CreateICmpULT(DstCh, SrcCh);
4227    Builder.CreateCondBr(DstLtSrc, Exit, Next);
4228
4229    EmitBlock(Next);
4230    Value *NextDst = Builder.CreateConstInBoundsGEP1_32(WCharTy, DstPhi, 1);
4231    Value *NextSrc = Builder.CreateConstInBoundsGEP1_32(WCharTy, SrcPhi, 1);
4232    Value *NextSize = Builder.CreateSub(SizePhi, ConstantInt::get(SizeTy, 1));
4233    Value *NextSizeEq0 =
4234        Builder.CreateICmpEQ(NextSize, ConstantInt::get(SizeTy, 0));
4235    Builder.CreateCondBr(NextSizeEq0, Exit, CmpGT);
4236    DstPhi->addIncoming(NextDst, Next);
4237    SrcPhi->addIncoming(NextSrc, Next);
4238    SizePhi->addIncoming(NextSize, Next);
4239
4240    EmitBlock(Exit);
4241    PHINode *Ret = Builder.CreatePHI(IntTy, 4);
4242    Ret->addIncoming(ConstantInt::get(IntTy, 0), Entry);
4243    Ret->addIncoming(ConstantInt::get(IntTy, 1), CmpGT);
4244    Ret->addIncoming(ConstantInt::get(IntTy, -1), CmpLT);
4245    Ret->addIncoming(ConstantInt::get(IntTy, 0), Next);
4246    return RValue::get(Ret);
4247  }
4248  case Builtin::BI__builtin_dwarf_cfa: {
4249    // The offset in bytes from the first argument to the CFA.
4250    //
4251    // Why on earth is this in the frontend?  Is there any reason at
4252    // all that the backend can't reasonably determine this while
4253    // lowering llvm.eh.dwarf.cfa()?
4254    //
4255    // TODO: If there's a satisfactory reason, add a target hook for
4256    // this instead of hard-coding 0, which is correct for most targets.
4257    int32_t Offset = 0;
4258
4259    Function *F = CGM.getIntrinsic(Intrinsic::eh_dwarf_cfa);
4260    return RValue::get(Builder.CreateCall(F,
4261                                      llvm::ConstantInt::get(Int32Ty, Offset)));
4262  }
4263  case Builtin::BI__builtin_return_address: {
4264    Value *Depth = ConstantEmitter(*this).emitAbstract(E->getArg(0),
4265                                                   getContext().UnsignedIntTy);
4266    Function *F = CGM.getIntrinsic(Intrinsic::returnaddress);
4267    return RValue::get(Builder.CreateCall(F, Depth));
4268  }
4269  case Builtin::BI_ReturnAddress: {
4270    Function *F = CGM.getIntrinsic(Intrinsic::returnaddress);
4271    return RValue::get(Builder.CreateCall(F, Builder.getInt32(0)));
4272  }
4273  case Builtin::BI__builtin_frame_address: {
4274    Value *Depth = ConstantEmitter(*this).emitAbstract(E->getArg(0),
4275                                                   getContext().UnsignedIntTy);
4276    Function *F = CGM.getIntrinsic(Intrinsic::frameaddress, AllocaInt8PtrTy);
4277    return RValue::get(Builder.CreateCall(F, Depth));
4278  }
4279  case Builtin::BI__builtin_extract_return_addr: {
4280    Value *Address = EmitScalarExpr(E->getArg(0));
4281    Value *Result = getTargetHooks().decodeReturnAddress(*this, Address);
4282    return RValue::get(Result);
4283  }
4284  case Builtin::BI__builtin_frob_return_addr: {
4285    Value *Address = EmitScalarExpr(E->getArg(0));
4286    Value *Result = getTargetHooks().encodeReturnAddress(*this, Address);
4287    return RValue::get(Result);
4288  }
4289  case Builtin::BI__builtin_dwarf_sp_column: {
4290    llvm::IntegerType *Ty
4291      = cast<llvm::IntegerType>(ConvertType(E->getType()));
4292    int Column = getTargetHooks().getDwarfEHStackPointer(CGM);
4293    if (Column == -1) {
4294      CGM.ErrorUnsupported(E, "__builtin_dwarf_sp_column");
4295      return RValue::get(llvm::UndefValue::get(Ty));
4296    }
4297    return RValue::get(llvm::ConstantInt::get(Ty, Column, true));
4298  }
4299  case Builtin::BI__builtin_init_dwarf_reg_size_table: {
4300    Value *Address = EmitScalarExpr(E->getArg(0));
4301    if (getTargetHooks().initDwarfEHRegSizeTable(*this, Address))
4302      CGM.ErrorUnsupported(E, "__builtin_init_dwarf_reg_size_table");
4303    return RValue::get(llvm::UndefValue::get(ConvertType(E->getType())));
4304  }
4305  case Builtin::BI__builtin_eh_return: {
4306    Value *Int = EmitScalarExpr(E->getArg(0));
4307    Value *Ptr = EmitScalarExpr(E->getArg(1));
4308
4309    llvm::IntegerType *IntTy = cast<llvm::IntegerType>(Int->getType());
4310    assert((IntTy->getBitWidth() == 32 || IntTy->getBitWidth() == 64) &&
4311           "LLVM's __builtin_eh_return only supports 32- and 64-bit variants");
4312    Function *F =
4313        CGM.getIntrinsic(IntTy->getBitWidth() == 32 ? Intrinsic::eh_return_i32
4314                                                    : Intrinsic::eh_return_i64);
4315    Builder.CreateCall(F, {Int, Ptr});
4316    Builder.CreateUnreachable();
4317
4318    // We do need to preserve an insertion point.
4319    EmitBlock(createBasicBlock("builtin_eh_return.cont"));
4320
4321    return RValue::get(nullptr);
4322  }
4323  case Builtin::BI__builtin_unwind_init: {
4324    Function *F = CGM.getIntrinsic(Intrinsic::eh_unwind_init);
4325    Builder.CreateCall(F);
4326    return RValue::get(nullptr);
4327  }
4328  case Builtin::BI__builtin_extend_pointer: {
4329    // Extends a pointer to the size of an _Unwind_Word, which is
4330    // uint64_t on all platforms.  Generally this gets poked into a
4331    // register and eventually used as an address, so if the
4332    // addressing registers are wider than pointers and the platform
4333    // doesn't implicitly ignore high-order bits when doing
4334    // addressing, we need to make sure we zext / sext based on
4335    // the platform's expectations.
4336    //
4337    // See: http://gcc.gnu.org/ml/gcc-bugs/2002-02/msg00237.html
4338
4339    // Cast the pointer to intptr_t.
4340    Value *Ptr = EmitScalarExpr(E->getArg(0));
4341    Value *Result = Builder.CreatePtrToInt(Ptr, IntPtrTy, "extend.cast");
4342
4343    // If that's 64 bits, we're done.
4344    if (IntPtrTy->getBitWidth() == 64)
4345      return RValue::get(Result);
4346
4347    // Otherwise, ask the target hooks whether to sign- or zero-extend.
4348    if (getTargetHooks().extendPointerWithSExt())
4349      return RValue::get(Builder.CreateSExt(Result, Int64Ty, "extend.sext"));
4350    else
4351      return RValue::get(Builder.CreateZExt(Result, Int64Ty, "extend.zext"));
4352  }
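  // A sketch of the jmp_buf layout assumed below (per LLVM's llvm.eh.sjlj.*
  // contract): the buffer is five pointer-sized words; word 0 receives the
  // frame address, word 2 receives the saved stack pointer, and word 1 is
  // filled in by the target lowering with the address to resume at.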
4353  case Builtin::BI__builtin_setjmp: {
4354    // Buffer is a void**.
4355    Address Buf = EmitPointerWithAlignment(E->getArg(0));
4356
4357    // Store the frame pointer to the setjmp buffer.
4358    Value *FrameAddr = Builder.CreateCall(
4359        CGM.getIntrinsic(Intrinsic::frameaddress, AllocaInt8PtrTy),
4360        ConstantInt::get(Int32Ty, 0));
4361    Builder.CreateStore(FrameAddr, Buf);
4362
4363    // Store the stack pointer to the setjmp buffer.
4364    Value *StackAddr = Builder.CreateStackSave();
4365    assert(Buf.getPointer()->getType() == StackAddr->getType());
4366
4367    Address StackSaveSlot = Builder.CreateConstInBoundsGEP(Buf, 2);
4368    Builder.CreateStore(StackAddr, StackSaveSlot);
4369
4370    // Call LLVM's EH setjmp, which is lightweight.
4371    Function *F = CGM.getIntrinsic(Intrinsic::eh_sjlj_setjmp);
4372    return RValue::get(Builder.CreateCall(F, Buf.getPointer()));
4373  }
4374  case Builtin::BI__builtin_longjmp: {
4375    Value *Buf = EmitScalarExpr(E->getArg(0));
4376
4377    // Call LLVM's EH longjmp, which is lightweight.
4378    Builder.CreateCall(CGM.getIntrinsic(Intrinsic::eh_sjlj_longjmp), Buf);
4379
4380    // longjmp doesn't return; mark this as unreachable.
4381    Builder.CreateUnreachable();
4382
4383    // We do need to preserve an insertion point.
4384    EmitBlock(createBasicBlock("longjmp.cont"));
4385
4386    return RValue::get(nullptr);
4387  }
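  // __builtin_launder only needs IR support when the pointee type may carry
  // invariant.group information (e.g. a dynamic class under
  // -fstrict-vtable-pointers); in all other cases the pointer is returned
  // unchanged, as the code below does.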
4388  case Builtin::BI__builtin_launder: {
4389    const Expr *Arg = E->getArg(0);
4390    QualType ArgTy = Arg->getType()->getPointeeType();
4391    Value *Ptr = EmitScalarExpr(Arg);
4392    if (TypeRequiresBuiltinLaunder(CGM, ArgTy))
4393      Ptr = Builder.CreateLaunderInvariantGroup(Ptr);
4394
4395    return RValue::get(Ptr);
4396  }
4397  case Builtin::BI__sync_fetch_and_add:
4398  case Builtin::BI__sync_fetch_and_sub:
4399  case Builtin::BI__sync_fetch_and_or:
4400  case Builtin::BI__sync_fetch_and_and:
4401  case Builtin::BI__sync_fetch_and_xor:
4402  case Builtin::BI__sync_fetch_and_nand:
4403  case Builtin::BI__sync_add_and_fetch:
4404  case Builtin::BI__sync_sub_and_fetch:
4405  case Builtin::BI__sync_and_and_fetch:
4406  case Builtin::BI__sync_or_and_fetch:
4407  case Builtin::BI__sync_xor_and_fetch:
4408  case Builtin::BI__sync_nand_and_fetch:
4409  case Builtin::BI__sync_val_compare_and_swap:
4410  case Builtin::BI__sync_bool_compare_and_swap:
4411  case Builtin::BI__sync_lock_test_and_set:
4412  case Builtin::BI__sync_lock_release:
4413  case Builtin::BI__sync_swap:
4414    llvm_unreachable("Shouldn't make it through sema");
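  // For the size-suffixed __sync_* builtins below, the trailing _1/_2/_4/_8/_16
  // encodes the access width in bytes; each lowers to a sequentially-consistent
  // atomicrmw, e.g. __sync_fetch_and_add_4(p, 1) becomes roughly
  //   %old = atomicrmw add ptr %p, i32 1 seq_cst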
4415  case Builtin::BI__sync_fetch_and_add_1:
4416  case Builtin::BI__sync_fetch_and_add_2:
4417  case Builtin::BI__sync_fetch_and_add_4:
4418  case Builtin::BI__sync_fetch_and_add_8:
4419  case Builtin::BI__sync_fetch_and_add_16:
4420    return EmitBinaryAtomic(*this, llvm::AtomicRMWInst::Add, E);
4421  case Builtin::BI__sync_fetch_and_sub_1:
4422  case Builtin::BI__sync_fetch_and_sub_2:
4423  case Builtin::BI__sync_fetch_and_sub_4:
4424  case Builtin::BI__sync_fetch_and_sub_8:
4425  case Builtin::BI__sync_fetch_and_sub_16:
4426    return EmitBinaryAtomic(*this, llvm::AtomicRMWInst::Sub, E);
4427  case Builtin::BI__sync_fetch_and_or_1:
4428  case Builtin::BI__sync_fetch_and_or_2:
4429  case Builtin::BI__sync_fetch_and_or_4:
4430  case Builtin::BI__sync_fetch_and_or_8:
4431  case Builtin::BI__sync_fetch_and_or_16:
4432    return EmitBinaryAtomic(*this, llvm::AtomicRMWInst::Or, E);
4433  case Builtin::BI__sync_fetch_and_and_1:
4434  case Builtin::BI__sync_fetch_and_and_2:
4435  case Builtin::BI__sync_fetch_and_and_4:
4436  case Builtin::BI__sync_fetch_and_and_8:
4437  case Builtin::BI__sync_fetch_and_and_16:
4438    return EmitBinaryAtomic(*this, llvm::AtomicRMWInst::And, E);
4439  case Builtin::BI__sync_fetch_and_xor_1:
4440  case Builtin::BI__sync_fetch_and_xor_2:
4441  case Builtin::BI__sync_fetch_and_xor_4:
4442  case Builtin::BI__sync_fetch_and_xor_8:
4443  case Builtin::BI__sync_fetch_and_xor_16:
4444    return EmitBinaryAtomic(*this, llvm::AtomicRMWInst::Xor, E);
4445  case Builtin::BI__sync_fetch_and_nand_1:
4446  case Builtin::BI__sync_fetch_and_nand_2:
4447  case Builtin::BI__sync_fetch_and_nand_4:
4448  case Builtin::BI__sync_fetch_and_nand_8:
4449  case Builtin::BI__sync_fetch_and_nand_16:
4450    return EmitBinaryAtomic(*this, llvm::AtomicRMWInst::Nand, E);
4451
4452  // Clang extensions: not overloaded yet.
4453  case Builtin::BI__sync_fetch_and_min:
4454    return EmitBinaryAtomic(*this, llvm::AtomicRMWInst::Min, E);
4455  case Builtin::BI__sync_fetch_and_max:
4456    return EmitBinaryAtomic(*this, llvm::AtomicRMWInst::Max, E);
4457  case Builtin::BI__sync_fetch_and_umin:
4458    return EmitBinaryAtomic(*this, llvm::AtomicRMWInst::UMin, E);
4459  case Builtin::BI__sync_fetch_and_umax:
4460    return EmitBinaryAtomic(*this, llvm::AtomicRMWInst::UMax, E);
4461
4462  case Builtin::BI__sync_add_and_fetch_1:
4463  case Builtin::BI__sync_add_and_fetch_2:
4464  case Builtin::BI__sync_add_and_fetch_4:
4465  case Builtin::BI__sync_add_and_fetch_8:
4466  case Builtin::BI__sync_add_and_fetch_16:
4467    return EmitBinaryAtomicPost(*this, llvm::AtomicRMWInst::Add, E,
4468                                llvm::Instruction::Add);
4469  case Builtin::BI__sync_sub_and_fetch_1:
4470  case Builtin::BI__sync_sub_and_fetch_2:
4471  case Builtin::BI__sync_sub_and_fetch_4:
4472  case Builtin::BI__sync_sub_and_fetch_8:
4473  case Builtin::BI__sync_sub_and_fetch_16:
4474    return EmitBinaryAtomicPost(*this, llvm::AtomicRMWInst::Sub, E,
4475                                llvm::Instruction::Sub);
4476  case Builtin::BI__sync_and_and_fetch_1:
4477  case Builtin::BI__sync_and_and_fetch_2:
4478  case Builtin::BI__sync_and_and_fetch_4:
4479  case Builtin::BI__sync_and_and_fetch_8:
4480  case Builtin::BI__sync_and_and_fetch_16:
4481    return EmitBinaryAtomicPost(*this, llvm::AtomicRMWInst::And, E,
4482                                llvm::Instruction::And);
4483  case Builtin::BI__sync_or_and_fetch_1:
4484  case Builtin::BI__sync_or_and_fetch_2:
4485  case Builtin::BI__sync_or_and_fetch_4:
4486  case Builtin::BI__sync_or_and_fetch_8:
4487  case Builtin::BI__sync_or_and_fetch_16:
4488    return EmitBinaryAtomicPost(*this, llvm::AtomicRMWInst::Or, E,
4489                                llvm::Instruction::Or);
4490  case Builtin::BI__sync_xor_and_fetch_1:
4491  case Builtin::BI__sync_xor_and_fetch_2:
4492  case Builtin::BI__sync_xor_and_fetch_4:
4493  case Builtin::BI__sync_xor_and_fetch_8:
4494  case Builtin::BI__sync_xor_and_fetch_16:
4495    return EmitBinaryAtomicPost(*this, llvm::AtomicRMWInst::Xor, E,
4496                                llvm::Instruction::Xor);
4497  case Builtin::BI__sync_nand_and_fetch_1:
4498  case Builtin::BI__sync_nand_and_fetch_2:
4499  case Builtin::BI__sync_nand_and_fetch_4:
4500  case Builtin::BI__sync_nand_and_fetch_8:
4501  case Builtin::BI__sync_nand_and_fetch_16:
4502    return EmitBinaryAtomicPost(*this, llvm::AtomicRMWInst::Nand, E,
4503                                llvm::Instruction::And, true);
4504
4505  case Builtin::BI__sync_val_compare_and_swap_1:
4506  case Builtin::BI__sync_val_compare_and_swap_2:
4507  case Builtin::BI__sync_val_compare_and_swap_4:
4508  case Builtin::BI__sync_val_compare_and_swap_8:
4509  case Builtin::BI__sync_val_compare_and_swap_16:
4510    return RValue::get(MakeAtomicCmpXchgValue(*this, E, false));
4511
4512  case Builtin::BI__sync_bool_compare_and_swap_1:
4513  case Builtin::BI__sync_bool_compare_and_swap_2:
4514  case Builtin::BI__sync_bool_compare_and_swap_4:
4515  case Builtin::BI__sync_bool_compare_and_swap_8:
4516  case Builtin::BI__sync_bool_compare_and_swap_16:
4517    return RValue::get(MakeAtomicCmpXchgValue(*this, E, true));
4518
4519  case Builtin::BI__sync_swap_1:
4520  case Builtin::BI__sync_swap_2:
4521  case Builtin::BI__sync_swap_4:
4522  case Builtin::BI__sync_swap_8:
4523  case Builtin::BI__sync_swap_16:
4524    return EmitBinaryAtomic(*this, llvm::AtomicRMWInst::Xchg, E);
4525
4526  case Builtin::BI__sync_lock_test_and_set_1:
4527  case Builtin::BI__sync_lock_test_and_set_2:
4528  case Builtin::BI__sync_lock_test_and_set_4:
4529  case Builtin::BI__sync_lock_test_and_set_8:
4530  case Builtin::BI__sync_lock_test_and_set_16:
4531    return EmitBinaryAtomic(*this, llvm::AtomicRMWInst::Xchg, E);
4532
4533  case Builtin::BI__sync_lock_release_1:
4534  case Builtin::BI__sync_lock_release_2:
4535  case Builtin::BI__sync_lock_release_4:
4536  case Builtin::BI__sync_lock_release_8:
4537  case Builtin::BI__sync_lock_release_16: {
4538    Address Ptr = CheckAtomicAlignment(*this, E);
4539    QualType ElTy = E->getArg(0)->getType()->getPointeeType();
4540
4541    llvm::Type *ITy = llvm::IntegerType::get(getLLVMContext(),
4542                                             getContext().getTypeSize(ElTy));
4543    llvm::StoreInst *Store =
4544        Builder.CreateStore(llvm::Constant::getNullValue(ITy), Ptr);
4545    Store->setAtomic(llvm::AtomicOrdering::Release);
4546    return RValue::get(nullptr);
4547  }
4548
4549  case Builtin::BI__sync_synchronize: {
4550    // We assume this is supposed to correspond to a C++11-style
4551    // sequentially-consistent fence (i.e. this is only usable for
4552    // synchronization, not device I/O or anything like that). This intrinsic
4553    // is really badly designed in the sense that in theory, there isn't
4554    // any way to safely use it... but in practice, it mostly works
4555    // to use it with non-atomic loads and stores to get acquire/release
4556    // semantics.
4557    Builder.CreateFence(llvm::AtomicOrdering::SequentiallyConsistent);
4558    return RValue::get(nullptr);
4559  }
4560
4561  case Builtin::BI__builtin_nontemporal_load:
4562    return RValue::get(EmitNontemporalLoad(*this, E));
4563  case Builtin::BI__builtin_nontemporal_store:
4564    return RValue::get(EmitNontemporalStore(*this, E));
4565  case Builtin::BI__c11_atomic_is_lock_free:
4566  case Builtin::BI__atomic_is_lock_free: {
4567    // Call "bool __atomic_is_lock_free(size_t size, void *ptr)". For the
4568    // __c11 builtin, ptr is 0 (indicating a properly-aligned object), since
4569    // _Atomic(T) is always properly-aligned.
4570    const char *LibCallName = "__atomic_is_lock_free";
4571    CallArgList Args;
4572    Args.add(RValue::get(EmitScalarExpr(E->getArg(0))),
4573             getContext().getSizeType());
4574    if (BuiltinID == Builtin::BI__atomic_is_lock_free)
4575      Args.add(RValue::get(EmitScalarExpr(E->getArg(1))),
4576               getContext().VoidPtrTy);
4577    else
4578      Args.add(RValue::get(llvm::Constant::getNullValue(VoidPtrTy)),
4579               getContext().VoidPtrTy);
4580    const CGFunctionInfo &FuncInfo =
4581        CGM.getTypes().arrangeBuiltinFunctionCall(E->getType(), Args);
4582    llvm::FunctionType *FTy = CGM.getTypes().GetFunctionType(FuncInfo);
4583    llvm::FunctionCallee Func = CGM.CreateRuntimeFunction(FTy, LibCallName);
4584    return EmitCall(FuncInfo, CGCallee::forDirect(Func),
4585                    ReturnValueSlot(), Args);
4586  }
4587
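  // __atomic_test_and_set is lowered as an atomic exchange of the byte value 1
  // into the target location; with a constant memory_order_seq_cst argument the
  // result is roughly
  //   %old = atomicrmw xchg ptr %p, i8 1 seq_cst
  //   %tobool = icmp ne i8 %old, 0
  // Non-constant orderings go through the switch emitted further down.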
4588  case Builtin::BI__atomic_test_and_set: {
4589    // The builtin's parameter is always 'volatile void *', so look through the
4590    // implicit casts at the argument's own type to see whether this operation is volatile.
4591    QualType PtrTy = E->getArg(0)->IgnoreImpCasts()->getType();
4592    bool Volatile =
4593        PtrTy->castAs<PointerType>()->getPointeeType().isVolatileQualified();
4594
4595    Address Ptr =
4596        EmitPointerWithAlignment(E->getArg(0)).withElementType(Int8Ty);
4597
4598    Value *NewVal = Builder.getInt8(1);
4599    Value *Order = EmitScalarExpr(E->getArg(1));
4600    if (isa<llvm::ConstantInt>(Order)) {
4601      int ord = cast<llvm::ConstantInt>(Order)->getZExtValue();
4602      AtomicRMWInst *Result = nullptr;
4603      switch (ord) {
4604      case 0:  // memory_order_relaxed
4605      default: // invalid order
4606        Result = Builder.CreateAtomicRMW(llvm::AtomicRMWInst::Xchg, Ptr, NewVal,
4607                                         llvm::AtomicOrdering::Monotonic);
4608        break;
4609      case 1: // memory_order_consume
4610      case 2: // memory_order_acquire
4611        Result = Builder.CreateAtomicRMW(llvm::AtomicRMWInst::Xchg, Ptr, NewVal,
4612                                         llvm::AtomicOrdering::Acquire);
4613        break;
4614      case 3: // memory_order_release
4615        Result = Builder.CreateAtomicRMW(llvm::AtomicRMWInst::Xchg, Ptr, NewVal,
4616                                         llvm::AtomicOrdering::Release);
4617        break;
4618      case 4: // memory_order_acq_rel
4620        Result = Builder.CreateAtomicRMW(llvm::AtomicRMWInst::Xchg, Ptr, NewVal,
4621                                         llvm::AtomicOrdering::AcquireRelease);
4622        break;
4623      case 5: // memory_order_seq_cst
4624        Result = Builder.CreateAtomicRMW(
4625            llvm::AtomicRMWInst::Xchg, Ptr, NewVal,
4626            llvm::AtomicOrdering::SequentiallyConsistent);
4627        break;
4628      }
4629      Result->setVolatile(Volatile);
4630      return RValue::get(Builder.CreateIsNotNull(Result, "tobool"));
4631    }
4632
4633    llvm::BasicBlock *ContBB = createBasicBlock("atomic.continue", CurFn);
4634
4635    llvm::BasicBlock *BBs[5] = {
4636      createBasicBlock("monotonic", CurFn),
4637      createBasicBlock("acquire", CurFn),
4638      createBasicBlock("release", CurFn),
4639      createBasicBlock("acqrel", CurFn),
4640      createBasicBlock("seqcst", CurFn)
4641    };
4642    llvm::AtomicOrdering Orders[5] = {
4643        llvm::AtomicOrdering::Monotonic, llvm::AtomicOrdering::Acquire,
4644        llvm::AtomicOrdering::Release, llvm::AtomicOrdering::AcquireRelease,
4645        llvm::AtomicOrdering::SequentiallyConsistent};
4646
4647    Order = Builder.CreateIntCast(Order, Builder.getInt32Ty(), false);
4648    llvm::SwitchInst *SI = Builder.CreateSwitch(Order, BBs[0]);
4649
4650    Builder.SetInsertPoint(ContBB);
4651    PHINode *Result = Builder.CreatePHI(Int8Ty, 5, "was_set");
4652
4653    for (unsigned i = 0; i < 5; ++i) {
4654      Builder.SetInsertPoint(BBs[i]);
4655      AtomicRMWInst *RMW = Builder.CreateAtomicRMW(llvm::AtomicRMWInst::Xchg,
4656                                                   Ptr, NewVal, Orders[i]);
4657      RMW->setVolatile(Volatile);
4658      Result->addIncoming(RMW, BBs[i]);
4659      Builder.CreateBr(ContBB);
4660    }
4661
4662    SI->addCase(Builder.getInt32(0), BBs[0]);
4663    SI->addCase(Builder.getInt32(1), BBs[1]);
4664    SI->addCase(Builder.getInt32(2), BBs[1]);
4665    SI->addCase(Builder.getInt32(3), BBs[2]);
4666    SI->addCase(Builder.getInt32(4), BBs[3]);
4667    SI->addCase(Builder.getInt32(5), BBs[4]);
4668
4669    Builder.SetInsertPoint(ContBB);
4670    return RValue::get(Builder.CreateIsNotNull(Result, "tobool"));
4671  }
4672
4673  case Builtin::BI__atomic_clear: {
4674    QualType PtrTy = E->getArg(0)->IgnoreImpCasts()->getType();
4675    bool Volatile =
4676        PtrTy->castAs<PointerType>()->getPointeeType().isVolatileQualified();
4677
4678    Address Ptr = EmitPointerWithAlignment(E->getArg(0));
4679    Ptr = Ptr.withElementType(Int8Ty);
4680    Value *NewVal = Builder.getInt8(0);
4681    Value *Order = EmitScalarExpr(E->getArg(1));
4682    if (isa<llvm::ConstantInt>(Order)) {
4683      int ord = cast<llvm::ConstantInt>(Order)->getZExtValue();
4684      StoreInst *Store = Builder.CreateStore(NewVal, Ptr, Volatile);
4685      switch (ord) {
4686      case 0:  // memory_order_relaxed
4687      default: // invalid order
4688        Store->setOrdering(llvm::AtomicOrdering::Monotonic);
4689        break;
4690      case 3:  // memory_order_release
4691        Store->setOrdering(llvm::AtomicOrdering::Release);
4692        break;
4693      case 5:  // memory_order_seq_cst
4694        Store->setOrdering(llvm::AtomicOrdering::SequentiallyConsistent);
4695        break;
4696      }
4697      return RValue::get(nullptr);
4698    }
4699
4700    llvm::BasicBlock *ContBB = createBasicBlock("atomic.continue", CurFn);
4701
4702    llvm::BasicBlock *BBs[3] = {
4703      createBasicBlock("monotonic", CurFn),
4704      createBasicBlock("release", CurFn),
4705      createBasicBlock("seqcst", CurFn)
4706    };
4707    llvm::AtomicOrdering Orders[3] = {
4708        llvm::AtomicOrdering::Monotonic, llvm::AtomicOrdering::Release,
4709        llvm::AtomicOrdering::SequentiallyConsistent};
4710
4711    Order = Builder.CreateIntCast(Order, Builder.getInt32Ty(), false);
4712    llvm::SwitchInst *SI = Builder.CreateSwitch(Order, BBs[0]);
4713
4714    for (unsigned i = 0; i < 3; ++i) {
4715      Builder.SetInsertPoint(BBs[i]);
4716      StoreInst *Store = Builder.CreateStore(NewVal, Ptr, Volatile);
4717      Store->setOrdering(Orders[i]);
4718      Builder.CreateBr(ContBB);
4719    }
4720
4721    SI->addCase(Builder.getInt32(0), BBs[0]);
4722    SI->addCase(Builder.getInt32(3), BBs[1]);
4723    SI->addCase(Builder.getInt32(5), BBs[2]);
4724
4725    Builder.SetInsertPoint(ContBB);
4726    return RValue::get(nullptr);
4727  }
4728
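  // Fence lowering sketch: __atomic_thread_fence(__ATOMIC_ACQUIRE) becomes
  // "fence acquire", and the signal-fence forms use the single-thread sync
  // scope, i.e. fence syncscope("singlethread") acquire. Non-constant orders
  // are dispatched through the switch emitted below.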
4729  case Builtin::BI__atomic_thread_fence:
4730  case Builtin::BI__atomic_signal_fence:
4731  case Builtin::BI__c11_atomic_thread_fence:
4732  case Builtin::BI__c11_atomic_signal_fence: {
4733    llvm::SyncScope::ID SSID;
4734    if (BuiltinID == Builtin::BI__atomic_signal_fence ||
4735        BuiltinID == Builtin::BI__c11_atomic_signal_fence)
4736      SSID = llvm::SyncScope::SingleThread;
4737    else
4738      SSID = llvm::SyncScope::System;
4739    Value *Order = EmitScalarExpr(E->getArg(0));
4740    if (isa<llvm::ConstantInt>(Order)) {
4741      int ord = cast<llvm::ConstantInt>(Order)->getZExtValue();
4742      switch (ord) {
4743      case 0:  // memory_order_relaxed
4744      default: // invalid order
4745        break;
4746      case 1:  // memory_order_consume
4747      case 2:  // memory_order_acquire
4748        Builder.CreateFence(llvm::AtomicOrdering::Acquire, SSID);
4749        break;
4750      case 3:  // memory_order_release
4751        Builder.CreateFence(llvm::AtomicOrdering::Release, SSID);
4752        break;
4753      case 4:  // memory_order_acq_rel
4754        Builder.CreateFence(llvm::AtomicOrdering::AcquireRelease, SSID);
4755        break;
4756      case 5:  // memory_order_seq_cst
4757        Builder.CreateFence(llvm::AtomicOrdering::SequentiallyConsistent, SSID);
4758        break;
4759      }
4760      return RValue::get(nullptr);
4761    }
4762
4763    llvm::BasicBlock *AcquireBB, *ReleaseBB, *AcqRelBB, *SeqCstBB;
4764    AcquireBB = createBasicBlock("acquire", CurFn);
4765    ReleaseBB = createBasicBlock("release", CurFn);
4766    AcqRelBB = createBasicBlock("acqrel", CurFn);
4767    SeqCstBB = createBasicBlock("seqcst", CurFn);
4768    llvm::BasicBlock *ContBB = createBasicBlock("atomic.continue", CurFn);
4769
4770    Order = Builder.CreateIntCast(Order, Builder.getInt32Ty(), false);
4771    llvm::SwitchInst *SI = Builder.CreateSwitch(Order, ContBB);
4772
4773    Builder.SetInsertPoint(AcquireBB);
4774    Builder.CreateFence(llvm::AtomicOrdering::Acquire, SSID);
4775    Builder.CreateBr(ContBB);
4776    SI->addCase(Builder.getInt32(1), AcquireBB);
4777    SI->addCase(Builder.getInt32(2), AcquireBB);
4778
4779    Builder.SetInsertPoint(ReleaseBB);
4780    Builder.CreateFence(llvm::AtomicOrdering::Release, SSID);
4781    Builder.CreateBr(ContBB);
4782    SI->addCase(Builder.getInt32(3), ReleaseBB);
4783
4784    Builder.SetInsertPoint(AcqRelBB);
4785    Builder.CreateFence(llvm::AtomicOrdering::AcquireRelease, SSID);
4786    Builder.CreateBr(ContBB);
4787    SI->addCase(Builder.getInt32(4), AcqRelBB);
4788
4789    Builder.SetInsertPoint(SeqCstBB);
4790    Builder.CreateFence(llvm::AtomicOrdering::SequentiallyConsistent, SSID);
4791    Builder.CreateBr(ContBB);
4792    SI->addCase(Builder.getInt32(5), SeqCstBB);
4793
4794    Builder.SetInsertPoint(ContBB);
4795    return RValue::get(nullptr);
4796  }
4797
4798  case Builtin::BI__builtin_signbit:
4799  case Builtin::BI__builtin_signbitf:
4800  case Builtin::BI__builtin_signbitl: {
4801    return RValue::get(
4802        Builder.CreateZExt(EmitSignBit(*this, EmitScalarExpr(E->getArg(0))),
4803                           ConvertType(E->getType())));
4804  }
4805  case Builtin::BI__warn_memset_zero_len:
4806    return RValue::getIgnored();
4807  case Builtin::BI__annotation: {
4808    // Re-encode each wide string to UTF8 and make an MDString.
4809    SmallVector<Metadata *, 1> Strings;
4810    for (const Expr *Arg : E->arguments()) {
4811      const auto *Str = cast<StringLiteral>(Arg->IgnoreParenCasts());
4812      assert(Str->getCharByteWidth() == 2);
4813      StringRef WideBytes = Str->getBytes();
4814      std::string StrUtf8;
4815      if (!convertUTF16ToUTF8String(
4816              ArrayRef(WideBytes.data(), WideBytes.size()), StrUtf8)) {
4817        CGM.ErrorUnsupported(E, "non-UTF16 __annotation argument");
4818        continue;
4819      }
4820      Strings.push_back(llvm::MDString::get(getLLVMContext(), StrUtf8));
4821    }
4822
4823    // Build an MDTuple of MDStrings and emit the intrinsic call.
4824    llvm::Function *F =
4825        CGM.getIntrinsic(llvm::Intrinsic::codeview_annotation, {});
4826    MDTuple *StrTuple = MDTuple::get(getLLVMContext(), Strings);
4827    Builder.CreateCall(F, MetadataAsValue::get(getLLVMContext(), StrTuple));
4828    return RValue::getIgnored();
4829  }
4830  case Builtin::BI__builtin_annotation: {
4831    llvm::Value *AnnVal = EmitScalarExpr(E->getArg(0));
4832    llvm::Function *F =
4833        CGM.getIntrinsic(llvm::Intrinsic::annotation,
4834                         {AnnVal->getType(), CGM.ConstGlobalsPtrTy});
4835
4836    // Get the annotation string, looking through any casts. Sema requires this
4837    // to be a non-wide string literal (possibly cast), so the cast<> is safe.
4838    const Expr *AnnotationStrExpr = E->getArg(1)->IgnoreParenCasts();
4839    StringRef Str = cast<StringLiteral>(AnnotationStrExpr)->getString();
4840    return RValue::get(
4841        EmitAnnotationCall(F, AnnVal, Str, E->getExprLoc(), nullptr));
4842  }
4843  case Builtin::BI__builtin_addcb:
4844  case Builtin::BI__builtin_addcs:
4845  case Builtin::BI__builtin_addc:
4846  case Builtin::BI__builtin_addcl:
4847  case Builtin::BI__builtin_addcll:
4848  case Builtin::BI__builtin_subcb:
4849  case Builtin::BI__builtin_subcs:
4850  case Builtin::BI__builtin_subc:
4851  case Builtin::BI__builtin_subcl:
4852  case Builtin::BI__builtin_subcll: {
4853
4854    // We translate all of these builtins from expressions of the form:
4855    //   int x = ..., y = ..., carryin = ..., carryout, result;
4856    //   result = __builtin_addc(x, y, carryin, &carryout);
4857    //
4858    // to LLVM IR of the form:
4859    //
4860    //   %tmp1 = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %x, i32 %y)
4861    //   %tmpsum1 = extractvalue {i32, i1} %tmp1, 0
4862    //   %carry1 = extractvalue {i32, i1} %tmp1, 1
4863    //   %tmp2 = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %tmpsum1,
4864    //                                                       i32 %carryin)
4865    //   %result = extractvalue {i32, i1} %tmp2, 0
4866    //   %carry2 = extractvalue {i32, i1} %tmp2, 1
4867    //   %tmp3 = or i1 %carry1, %carry2
4868    //   %tmp4 = zext i1 %tmp3 to i32
4869    //   store i32 %tmp4, i32* %carryout
4870
4871    // Scalarize our inputs.
4872    llvm::Value *X = EmitScalarExpr(E->getArg(0));
4873    llvm::Value *Y = EmitScalarExpr(E->getArg(1));
4874    llvm::Value *Carryin = EmitScalarExpr(E->getArg(2));
4875    Address CarryOutPtr = EmitPointerWithAlignment(E->getArg(3));
4876
4877    // Decide if we are lowering to a uadd.with.overflow or usub.with.overflow.
4878    llvm::Intrinsic::ID IntrinsicId;
4879    switch (BuiltinID) {
4880    default: llvm_unreachable("Unknown multiprecision builtin id.");
4881    case Builtin::BI__builtin_addcb:
4882    case Builtin::BI__builtin_addcs:
4883    case Builtin::BI__builtin_addc:
4884    case Builtin::BI__builtin_addcl:
4885    case Builtin::BI__builtin_addcll:
4886      IntrinsicId = llvm::Intrinsic::uadd_with_overflow;
4887      break;
4888    case Builtin::BI__builtin_subcb:
4889    case Builtin::BI__builtin_subcs:
4890    case Builtin::BI__builtin_subc:
4891    case Builtin::BI__builtin_subcl:
4892    case Builtin::BI__builtin_subcll:
4893      IntrinsicId = llvm::Intrinsic::usub_with_overflow;
4894      break;
4895    }
4896
4897    // Construct our resulting LLVM IR expression.
4898    llvm::Value *Carry1;
4899    llvm::Value *Sum1 = EmitOverflowIntrinsic(*this, IntrinsicId,
4900                                              X, Y, Carry1);
4901    llvm::Value *Carry2;
4902    llvm::Value *Sum2 = EmitOverflowIntrinsic(*this, IntrinsicId,
4903                                              Sum1, Carryin, Carry2);
4904    llvm::Value *CarryOut = Builder.CreateZExt(Builder.CreateOr(Carry1, Carry2),
4905                                               X->getType());
4906    Builder.CreateStore(CarryOut, CarryOutPtr);
4907    return RValue::get(Sum2);
4908  }
4909
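  // A sketch of the generic path below: for
  //   int r; bool o = __builtin_add_overflow(1u, -2, &r);
  // the operands are a 32-bit unsigned and a 32-bit signed value, so the
  // encompassing type is a 33-bit signed integer; the addition is done with
  // llvm.sadd.with.overflow.i33, the sum is truncated back to i32, and any
  // information lost by the truncation is OR'ed into the overflow flag.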
4910  case Builtin::BI__builtin_add_overflow:
4911  case Builtin::BI__builtin_sub_overflow:
4912  case Builtin::BI__builtin_mul_overflow: {
4913    const clang::Expr *LeftArg = E->getArg(0);
4914    const clang::Expr *RightArg = E->getArg(1);
4915    const clang::Expr *ResultArg = E->getArg(2);
4916
4917    clang::QualType ResultQTy =
4918        ResultArg->getType()->castAs<PointerType>()->getPointeeType();
4919
4920    WidthAndSignedness LeftInfo =
4921        getIntegerWidthAndSignedness(CGM.getContext(), LeftArg->getType());
4922    WidthAndSignedness RightInfo =
4923        getIntegerWidthAndSignedness(CGM.getContext(), RightArg->getType());
4924    WidthAndSignedness ResultInfo =
4925        getIntegerWidthAndSignedness(CGM.getContext(), ResultQTy);
4926
4927    // Handle mixed-sign multiplication as a special case, because adding
4928    // runtime or backend support for our generic irgen would be too expensive.
4929    if (isSpecialMixedSignMultiply(BuiltinID, LeftInfo, RightInfo, ResultInfo))
4930      return EmitCheckedMixedSignMultiply(*this, LeftArg, LeftInfo, RightArg,
4931                                          RightInfo, ResultArg, ResultQTy,
4932                                          ResultInfo);
4933
4934    if (isSpecialUnsignedMultiplySignedResult(BuiltinID, LeftInfo, RightInfo,
4935                                              ResultInfo))
4936      return EmitCheckedUnsignedMultiplySignedResult(
4937          *this, LeftArg, LeftInfo, RightArg, RightInfo, ResultArg, ResultQTy,
4938          ResultInfo);
4939
4940    WidthAndSignedness EncompassingInfo =
4941        EncompassingIntegerType({LeftInfo, RightInfo, ResultInfo});
4942
4943    llvm::Type *EncompassingLLVMTy =
4944        llvm::IntegerType::get(CGM.getLLVMContext(), EncompassingInfo.Width);
4945
4946    llvm::Type *ResultLLVMTy = CGM.getTypes().ConvertType(ResultQTy);
4947
4948    llvm::Intrinsic::ID IntrinsicId;
4949    switch (BuiltinID) {
4950    default:
4951      llvm_unreachable("Unknown overflow builtin id.");
4952    case Builtin::BI__builtin_add_overflow:
4953      IntrinsicId = EncompassingInfo.Signed
4954                        ? llvm::Intrinsic::sadd_with_overflow
4955                        : llvm::Intrinsic::uadd_with_overflow;
4956      break;
4957    case Builtin::BI__builtin_sub_overflow:
4958      IntrinsicId = EncompassingInfo.Signed
4959                        ? llvm::Intrinsic::ssub_with_overflow
4960                        : llvm::Intrinsic::usub_with_overflow;
4961      break;
4962    case Builtin::BI__builtin_mul_overflow:
4963      IntrinsicId = EncompassingInfo.Signed
4964                        ? llvm::Intrinsic::smul_with_overflow
4965                        : llvm::Intrinsic::umul_with_overflow;
4966      break;
4967    }
4968
4969    llvm::Value *Left = EmitScalarExpr(LeftArg);
4970    llvm::Value *Right = EmitScalarExpr(RightArg);
4971    Address ResultPtr = EmitPointerWithAlignment(ResultArg);
4972
4973    // Extend each operand to the encompassing type.
4974    Left = Builder.CreateIntCast(Left, EncompassingLLVMTy, LeftInfo.Signed);
4975    Right = Builder.CreateIntCast(Right, EncompassingLLVMTy, RightInfo.Signed);
4976
4977    // Perform the operation on the extended values.
4978    llvm::Value *Overflow, *Result;
4979    Result = EmitOverflowIntrinsic(*this, IntrinsicId, Left, Right, Overflow);
4980
4981    if (EncompassingInfo.Width > ResultInfo.Width) {
4982      // The encompassing type is wider than the result type, so we need to
4983      // truncate it.
4984      llvm::Value *ResultTrunc = Builder.CreateTrunc(Result, ResultLLVMTy);
4985
4986      // To see if the truncation caused an overflow, we will extend
4987      // the result and then compare it to the original result.
4988      llvm::Value *ResultTruncExt = Builder.CreateIntCast(
4989          ResultTrunc, EncompassingLLVMTy, ResultInfo.Signed);
4990      llvm::Value *TruncationOverflow =
4991          Builder.CreateICmpNE(Result, ResultTruncExt);
4992
4993      Overflow = Builder.CreateOr(Overflow, TruncationOverflow);
4994      Result = ResultTrunc;
4995    }
4996
4997    // Finally, store the result using the pointer.
4998    bool isVolatile =
4999      ResultArg->getType()->getPointeeType().isVolatileQualified();
5000    Builder.CreateStore(EmitToMemory(Result, ResultQTy), ResultPtr, isVolatile);
5001
5002    return RValue::get(Overflow);
5003  }
5004
5005  case Builtin::BI__builtin_uadd_overflow:
5006  case Builtin::BI__builtin_uaddl_overflow:
5007  case Builtin::BI__builtin_uaddll_overflow:
5008  case Builtin::BI__builtin_usub_overflow:
5009  case Builtin::BI__builtin_usubl_overflow:
5010  case Builtin::BI__builtin_usubll_overflow:
5011  case Builtin::BI__builtin_umul_overflow:
5012  case Builtin::BI__builtin_umull_overflow:
5013  case Builtin::BI__builtin_umulll_overflow:
5014  case Builtin::BI__builtin_sadd_overflow:
5015  case Builtin::BI__builtin_saddl_overflow:
5016  case Builtin::BI__builtin_saddll_overflow:
5017  case Builtin::BI__builtin_ssub_overflow:
5018  case Builtin::BI__builtin_ssubl_overflow:
5019  case Builtin::BI__builtin_ssubll_overflow:
5020  case Builtin::BI__builtin_smul_overflow:
5021  case Builtin::BI__builtin_smull_overflow:
5022  case Builtin::BI__builtin_smulll_overflow: {
5023
5024    // We translate all of these builtins directly to the relevant LLVM IR node.
5025
5026    // Scalarize our inputs.
5027    llvm::Value *X = EmitScalarExpr(E->getArg(0));
5028    llvm::Value *Y = EmitScalarExpr(E->getArg(1));
5029    Address SumOutPtr = EmitPointerWithAlignment(E->getArg(2));
5030
5031    // Decide which of the overflow intrinsics we are lowering to:
5032    llvm::Intrinsic::ID IntrinsicId;
5033    switch (BuiltinID) {
5034    default: llvm_unreachable("Unknown overflow builtin id.");
5035    case Builtin::BI__builtin_uadd_overflow:
5036    case Builtin::BI__builtin_uaddl_overflow:
5037    case Builtin::BI__builtin_uaddll_overflow:
5038      IntrinsicId = llvm::Intrinsic::uadd_with_overflow;
5039      break;
5040    case Builtin::BI__builtin_usub_overflow:
5041    case Builtin::BI__builtin_usubl_overflow:
5042    case Builtin::BI__builtin_usubll_overflow:
5043      IntrinsicId = llvm::Intrinsic::usub_with_overflow;
5044      break;
5045    case Builtin::BI__builtin_umul_overflow:
5046    case Builtin::BI__builtin_umull_overflow:
5047    case Builtin::BI__builtin_umulll_overflow:
5048      IntrinsicId = llvm::Intrinsic::umul_with_overflow;
5049      break;
5050    case Builtin::BI__builtin_sadd_overflow:
5051    case Builtin::BI__builtin_saddl_overflow:
5052    case Builtin::BI__builtin_saddll_overflow:
5053      IntrinsicId = llvm::Intrinsic::sadd_with_overflow;
5054      break;
5055    case Builtin::BI__builtin_ssub_overflow:
5056    case Builtin::BI__builtin_ssubl_overflow:
5057    case Builtin::BI__builtin_ssubll_overflow:
5058      IntrinsicId = llvm::Intrinsic::ssub_with_overflow;
5059      break;
5060    case Builtin::BI__builtin_smul_overflow:
5061    case Builtin::BI__builtin_smull_overflow:
5062    case Builtin::BI__builtin_smulll_overflow:
5063      IntrinsicId = llvm::Intrinsic::smul_with_overflow;
5064      break;
5065    }
5066
5067
5068    llvm::Value *Carry;
5069    llvm::Value *Sum = EmitOverflowIntrinsic(*this, IntrinsicId, X, Y, Carry);
5070    Builder.CreateStore(Sum, SumOutPtr);
5071
5072    return RValue::get(Carry);
5073  }
5074  case Builtin::BIaddressof:
5075  case Builtin::BI__addressof:
5076  case Builtin::BI__builtin_addressof:
5077    return RValue::get(EmitLValue(E->getArg(0)).getPointer(*this));
5078  case Builtin::BI__builtin_function_start:
5079    return RValue::get(CGM.GetFunctionStart(
5080        E->getArg(0)->getAsBuiltinConstantDeclRef(CGM.getContext())));
5081  case Builtin::BI__builtin_operator_new:
5082    return EmitBuiltinNewDeleteCall(
5083        E->getCallee()->getType()->castAs<FunctionProtoType>(), E, false);
5084  case Builtin::BI__builtin_operator_delete:
5085    EmitBuiltinNewDeleteCall(
5086        E->getCallee()->getType()->castAs<FunctionProtoType>(), E, true);
5087    return RValue::get(nullptr);
5088
5089  case Builtin::BI__builtin_is_aligned:
5090    return EmitBuiltinIsAligned(E);
5091  case Builtin::BI__builtin_align_up:
5092    return EmitBuiltinAlignTo(E, true);
5093  case Builtin::BI__builtin_align_down:
5094    return EmitBuiltinAlignTo(E, false);
5095
5096  case Builtin::BI__noop:
5097    // __noop always evaluates to an integer literal zero.
5098    return RValue::get(ConstantInt::get(IntTy, 0));
5099  case Builtin::BI__builtin_call_with_static_chain: {
5100    const CallExpr *Call = cast<CallExpr>(E->getArg(0));
5101    const Expr *Chain = E->getArg(1);
5102    return EmitCall(Call->getCallee()->getType(),
5103                    EmitCallee(Call->getCallee()), Call, ReturnValue,
5104                    EmitScalarExpr(Chain));
5105  }
5106  case Builtin::BI_InterlockedExchange8:
5107  case Builtin::BI_InterlockedExchange16:
5108  case Builtin::BI_InterlockedExchange:
5109  case Builtin::BI_InterlockedExchangePointer:
5110    return RValue::get(
5111        EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedExchange, E));
5112  case Builtin::BI_InterlockedCompareExchangePointer:
5113  case Builtin::BI_InterlockedCompareExchangePointer_nf: {
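    // Lower the pointer variants of _InterlockedCompareExchange manually: the
    // exchange and comparand pointers are converted to pointer-width integers,
    // a volatile cmpxchg is emitted, and the previous value is converted back
    // to the original pointer type. The _nf ("no fence") form uses monotonic
    // ordering instead of sequential consistency.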
5114    llvm::Type *RTy;
5115    llvm::IntegerType *IntType = IntegerType::get(
5116        getLLVMContext(), getContext().getTypeSize(E->getType()));
5117
5118    Address DestAddr = CheckAtomicAlignment(*this, E);
5119
5120    llvm::Value *Exchange = EmitScalarExpr(E->getArg(1));
5121    RTy = Exchange->getType();
5122    Exchange = Builder.CreatePtrToInt(Exchange, IntType);
5123
5124    llvm::Value *Comparand =
5125      Builder.CreatePtrToInt(EmitScalarExpr(E->getArg(2)), IntType);
5126
5127    auto Ordering =
5128      BuiltinID == Builtin::BI_InterlockedCompareExchangePointer_nf ?
5129      AtomicOrdering::Monotonic : AtomicOrdering::SequentiallyConsistent;
5130
5131    auto Result = Builder.CreateAtomicCmpXchg(DestAddr, Comparand, Exchange,
5132                                              Ordering, Ordering);
5133    Result->setVolatile(true);
5134
5135    return RValue::get(Builder.CreateIntToPtr(Builder.CreateExtractValue(Result,
5136                                                                         0),
5137                                              RTy));
5138  }
5139  case Builtin::BI_InterlockedCompareExchange8:
5140  case Builtin::BI_InterlockedCompareExchange16:
5141  case Builtin::BI_InterlockedCompareExchange:
5142  case Builtin::BI_InterlockedCompareExchange64:
5143    return RValue::get(EmitAtomicCmpXchgForMSIntrin(*this, E));
5144  case Builtin::BI_InterlockedIncrement16:
5145  case Builtin::BI_InterlockedIncrement:
5146    return RValue::get(
5147        EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedIncrement, E));
5148  case Builtin::BI_InterlockedDecrement16:
5149  case Builtin::BI_InterlockedDecrement:
5150    return RValue::get(
5151        EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedDecrement, E));
5152  case Builtin::BI_InterlockedAnd8:
5153  case Builtin::BI_InterlockedAnd16:
5154  case Builtin::BI_InterlockedAnd:
5155    return RValue::get(EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedAnd, E));
5156  case Builtin::BI_InterlockedExchangeAdd8:
5157  case Builtin::BI_InterlockedExchangeAdd16:
5158  case Builtin::BI_InterlockedExchangeAdd:
5159    return RValue::get(
5160        EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedExchangeAdd, E));
5161  case Builtin::BI_InterlockedExchangeSub8:
5162  case Builtin::BI_InterlockedExchangeSub16:
5163  case Builtin::BI_InterlockedExchangeSub:
5164    return RValue::get(
5165        EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedExchangeSub, E));
5166  case Builtin::BI_InterlockedOr8:
5167  case Builtin::BI_InterlockedOr16:
5168  case Builtin::BI_InterlockedOr:
5169    return RValue::get(EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedOr, E));
5170  case Builtin::BI_InterlockedXor8:
5171  case Builtin::BI_InterlockedXor16:
5172  case Builtin::BI_InterlockedXor:
5173    return RValue::get(EmitMSVCBuiltinExpr(MSVCIntrin::_InterlockedXor, E));
5174
5175  case Builtin::BI_bittest64:
5176  case Builtin::BI_bittest:
5177  case Builtin::BI_bittestandcomplement64:
5178  case Builtin::BI_bittestandcomplement:
5179  case Builtin::BI_bittestandreset64:
5180  case Builtin::BI_bittestandreset:
5181  case Builtin::BI_bittestandset64:
5182  case Builtin::BI_bittestandset:
5183  case Builtin::BI_interlockedbittestandreset:
5184  case Builtin::BI_interlockedbittestandreset64:
5185  case Builtin::BI_interlockedbittestandset64:
5186  case Builtin::BI_interlockedbittestandset:
5187  case Builtin::BI_interlockedbittestandset_acq:
5188  case Builtin::BI_interlockedbittestandset_rel:
5189  case Builtin::BI_interlockedbittestandset_nf:
5190  case Builtin::BI_interlockedbittestandreset_acq:
5191  case Builtin::BI_interlockedbittestandreset_rel:
5192  case Builtin::BI_interlockedbittestandreset_nf:
5193    return RValue::get(EmitBitTestIntrinsic(*this, BuiltinID, E));
5194
5195    // These builtins exist to emit regular volatile loads and stores not
5196    // affected by the -fms-volatile setting.
5197  case Builtin::BI__iso_volatile_load8:
5198  case Builtin::BI__iso_volatile_load16:
5199  case Builtin::BI__iso_volatile_load32:
5200  case Builtin::BI__iso_volatile_load64:
5201    return RValue::get(EmitISOVolatileLoad(*this, E));
5202  case Builtin::BI__iso_volatile_store8:
5203  case Builtin::BI__iso_volatile_store16:
5204  case Builtin::BI__iso_volatile_store32:
5205  case Builtin::BI__iso_volatile_store64:
5206    return RValue::get(EmitISOVolatileStore(*this, E));
5207
5208  case Builtin::BI__exception_code:
5209  case Builtin::BI_exception_code:
5210    return RValue::get(EmitSEHExceptionCode());
5211  case Builtin::BI__exception_info:
5212  case Builtin::BI_exception_info:
5213    return RValue::get(EmitSEHExceptionInfo());
5214  case Builtin::BI__abnormal_termination:
5215  case Builtin::BI_abnormal_termination:
5216    return RValue::get(EmitSEHAbnormalTermination());
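  // On MSVCRT targets, _setjmp-family calls taking a single pointer argument
  // are lowered to the matching CRT entry points: _setjmpex as-is, and _setjmp
  // to _setjmp3 on x86, to _setjmpex on AArch64, and to _setjmp otherwise.
  // Other forms fall through to the generic handling below.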
5217  case Builtin::BI_setjmpex:
5218    if (getTarget().getTriple().isOSMSVCRT() && E->getNumArgs() == 1 &&
5219        E->getArg(0)->getType()->isPointerType())
5220      return EmitMSVCRTSetJmp(*this, MSVCSetJmpKind::_setjmpex, E);
5221    break;
5222  case Builtin::BI_setjmp:
5223    if (getTarget().getTriple().isOSMSVCRT() && E->getNumArgs() == 1 &&
5224        E->getArg(0)->getType()->isPointerType()) {
5225      if (getTarget().getTriple().getArch() == llvm::Triple::x86)
5226        return EmitMSVCRTSetJmp(*this, MSVCSetJmpKind::_setjmp3, E);
5227      else if (getTarget().getTriple().getArch() == llvm::Triple::aarch64)
5228        return EmitMSVCRTSetJmp(*this, MSVCSetJmpKind::_setjmpex, E);
5229      return EmitMSVCRTSetJmp(*this, MSVCSetJmpKind::_setjmp, E);
5230    }
5231    break;
5232
5233  // C++ std:: builtins.
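  // std::move, std::forward, std::as_const, etc. do not change the value
  // representation, so they are emitted simply as the address of their
  // argument lvalue.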
5234  case Builtin::BImove:
5235  case Builtin::BImove_if_noexcept:
5236  case Builtin::BIforward:
5237  case Builtin::BIforward_like:
5238  case Builtin::BIas_const:
5239    return RValue::get(EmitLValue(E->getArg(0)).getPointer(*this));
5240  case Builtin::BI__GetExceptionInfo: {
5241    if (llvm::GlobalVariable *GV =
5242            CGM.getCXXABI().getThrowInfo(FD->getParamDecl(0)->getType()))
5243      return RValue::get(GV);
5244    break;
5245  }
5246
5247  case Builtin::BI__fastfail:
5248    return RValue::get(EmitMSVCBuiltinExpr(MSVCIntrin::__fastfail, E));
5249
5250  case Builtin::BI__builtin_coro_id:
5251    return EmitCoroutineIntrinsic(E, Intrinsic::coro_id);
5252  case Builtin::BI__builtin_coro_promise:
5253    return EmitCoroutineIntrinsic(E, Intrinsic::coro_promise);
5254  case Builtin::BI__builtin_coro_resume:
5255    EmitCoroutineIntrinsic(E, Intrinsic::coro_resume);
5256    return RValue::get(nullptr);
5257  case Builtin::BI__builtin_coro_frame:
5258    return EmitCoroutineIntrinsic(E, Intrinsic::coro_frame);
5259  case Builtin::BI__builtin_coro_noop:
5260    return EmitCoroutineIntrinsic(E, Intrinsic::coro_noop);
5261  case Builtin::BI__builtin_coro_free:
5262    return EmitCoroutineIntrinsic(E, Intrinsic::coro_free);
5263  case Builtin::BI__builtin_coro_destroy:
5264    EmitCoroutineIntrinsic(E, Intrinsic::coro_destroy);
5265    return RValue::get(nullptr);
5266  case Builtin::BI__builtin_coro_done:
5267    return EmitCoroutineIntrinsic(E, Intrinsic::coro_done);
5268  case Builtin::BI__builtin_coro_alloc:
5269    return EmitCoroutineIntrinsic(E, Intrinsic::coro_alloc);
5270  case Builtin::BI__builtin_coro_begin:
5271    return EmitCoroutineIntrinsic(E, Intrinsic::coro_begin);
5272  case Builtin::BI__builtin_coro_end:
5273    return EmitCoroutineIntrinsic(E, Intrinsic::coro_end);
5274  case Builtin::BI__builtin_coro_suspend:
5275    return EmitCoroutineIntrinsic(E, Intrinsic::coro_suspend);
5276  case Builtin::BI__builtin_coro_size:
5277    return EmitCoroutineIntrinsic(E, Intrinsic::coro_size);
5278  case Builtin::BI__builtin_coro_align:
5279    return EmitCoroutineIntrinsic(E, Intrinsic::coro_align);
5280
5281  // OpenCL v2.0 s6.13.16.2, Built-in pipe read and write functions
5282  case Builtin::BIread_pipe:
5283  case Builtin::BIwrite_pipe: {
5284    Value *Arg0 = EmitScalarExpr(E->getArg(0)),
5285          *Arg1 = EmitScalarExpr(E->getArg(1));
5286    CGOpenCLRuntime OpenCLRT(CGM);
5287    Value *PacketSize = OpenCLRT.getPipeElemSize(E->getArg(0));
5288    Value *PacketAlign = OpenCLRT.getPipeElemAlign(E->getArg(0));
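    // read_pipe/write_pipe lower to calls to the __read_pipe_N/__write_pipe_N
    // runtime functions (N = 2 or 4 depending on the overload used), passing
    // the packet size and alignment derived from the pipe element type.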
5289
5290    // Type of the generic packet parameter.
5291    unsigned GenericAS =
5292        getContext().getTargetAddressSpace(LangAS::opencl_generic);
5293    llvm::Type *I8PTy = llvm::PointerType::get(getLLVMContext(), GenericAS);
5294
5295    // Test which overloaded version we should generate the call for.
5296    if (2U == E->getNumArgs()) {
5297      const char *Name = (BuiltinID == Builtin::BIread_pipe) ? "__read_pipe_2"
5298                                                             : "__write_pipe_2";
5299      // Create a generic function type so the call works with any builtin or
5300      // user-defined type.
5301      llvm::Type *ArgTys[] = {Arg0->getType(), I8PTy, Int32Ty, Int32Ty};
5302      llvm::FunctionType *FTy = llvm::FunctionType::get(
5303          Int32Ty, llvm::ArrayRef<llvm::Type *>(ArgTys), false);
5304      Value *BCast = Builder.CreatePointerCast(Arg1, I8PTy);
5305      return RValue::get(
5306          EmitRuntimeCall(CGM.CreateRuntimeFunction(FTy, Name),
5307                          {Arg0, BCast, PacketSize, PacketAlign}));
5308    } else {
5309      assert(4 == E->getNumArgs() &&
5310             "Illegal number of parameters to pipe function");
5311      const char *Name = (BuiltinID == Builtin::BIread_pipe) ? "__read_pipe_4"
5312                                                             : "__write_pipe_4";
5313
5314      llvm::Type *ArgTys[] = {Arg0->getType(), Arg1->getType(), Int32Ty, I8PTy,
5315                              Int32Ty, Int32Ty};
5316      Value *Arg2 = EmitScalarExpr(E->getArg(2)),
5317            *Arg3 = EmitScalarExpr(E->getArg(3));
5318      llvm::FunctionType *FTy = llvm::FunctionType::get(
5319          Int32Ty, llvm::ArrayRef<llvm::Type *>(ArgTys), false);
5320      Value *BCast = Builder.CreatePointerCast(Arg3, I8PTy);
5321      // We know the third argument is an integer type, but we may need to cast
5322      // it to i32.
5323      if (Arg2->getType() != Int32Ty)
5324        Arg2 = Builder.CreateZExtOrTrunc(Arg2, Int32Ty);
5325      return RValue::get(
5326          EmitRuntimeCall(CGM.CreateRuntimeFunction(FTy, Name),
5327                          {Arg0, Arg1, Arg2, BCast, PacketSize, PacketAlign}));
5328    }
5329  }
5330  // OpenCL v2.0 s6.13.16, s9.17.3.5 - Built-in pipe reserve read and write
5331  // functions
5332  case Builtin::BIreserve_read_pipe:
5333  case Builtin::BIreserve_write_pipe:
5334  case Builtin::BIwork_group_reserve_read_pipe:
5335  case Builtin::BIwork_group_reserve_write_pipe:
5336  case Builtin::BIsub_group_reserve_read_pipe:
5337  case Builtin::BIsub_group_reserve_write_pipe: {
5338    // Composing the mangled name for the function.
5339    const char *Name;
5340    if (BuiltinID == Builtin::BIreserve_read_pipe)
5341      Name = "__reserve_read_pipe";
5342    else if (BuiltinID == Builtin::BIreserve_write_pipe)
5343      Name = "__reserve_write_pipe";
5344    else if (BuiltinID == Builtin::BIwork_group_reserve_read_pipe)
5345      Name = "__work_group_reserve_read_pipe";
5346    else if (BuiltinID == Builtin::BIwork_group_reserve_write_pipe)
5347      Name = "__work_group_reserve_write_pipe";
5348    else if (BuiltinID == Builtin::BIsub_group_reserve_read_pipe)
5349      Name = "__sub_group_reserve_read_pipe";
5350    else
5351      Name = "__sub_group_reserve_write_pipe";
5352
5353    Value *Arg0 = EmitScalarExpr(E->getArg(0)),
5354          *Arg1 = EmitScalarExpr(E->getArg(1));
5355    llvm::Type *ReservedIDTy = ConvertType(getContext().OCLReserveIDTy);
5356    CGOpenCLRuntime OpenCLRT(CGM);
5357    Value *PacketSize = OpenCLRT.getPipeElemSize(E->getArg(0));
5358    Value *PacketAlign = OpenCLRT.getPipeElemAlign(E->getArg(0));
5359
5360    // Building the generic function prototype.
5361    llvm::Type *ArgTys[] = {Arg0->getType(), Int32Ty, Int32Ty, Int32Ty};
5362    llvm::FunctionType *FTy = llvm::FunctionType::get(
5363        ReservedIDTy, llvm::ArrayRef<llvm::Type *>(ArgTys), false);
5364    // We know the second argument is an integer type, but we may need to cast
5365    // it to i32.
5366    if (Arg1->getType() != Int32Ty)
5367      Arg1 = Builder.CreateZExtOrTrunc(Arg1, Int32Ty);
5368    return RValue::get(EmitRuntimeCall(CGM.CreateRuntimeFunction(FTy, Name),
5369                                       {Arg0, Arg1, PacketSize, PacketAlign}));
5370  }
5371  // OpenCL v2.0 s6.13.16, s9.17.3.5 - Built-in pipe commit read and write
5372  // functions
5373  case Builtin::BIcommit_read_pipe:
5374  case Builtin::BIcommit_write_pipe:
5375  case Builtin::BIwork_group_commit_read_pipe:
5376  case Builtin::BIwork_group_commit_write_pipe:
5377  case Builtin::BIsub_group_commit_read_pipe:
5378  case Builtin::BIsub_group_commit_write_pipe: {
5379    const char *Name;
5380    if (BuiltinID == Builtin::BIcommit_read_pipe)
5381      Name = "__commit_read_pipe";
5382    else if (BuiltinID == Builtin::BIcommit_write_pipe)
5383      Name = "__commit_write_pipe";
5384    else if (BuiltinID == Builtin::BIwork_group_commit_read_pipe)
5385      Name = "__work_group_commit_read_pipe";
5386    else if (BuiltinID == Builtin::BIwork_group_commit_write_pipe)
5387      Name = "__work_group_commit_write_pipe";
5388    else if (BuiltinID == Builtin::BIsub_group_commit_read_pipe)
5389      Name = "__sub_group_commit_read_pipe";
5390    else
5391      Name = "__sub_group_commit_write_pipe";
5392
5393    Value *Arg0 = EmitScalarExpr(E->getArg(0)),
5394          *Arg1 = EmitScalarExpr(E->getArg(1));
5395    CGOpenCLRuntime OpenCLRT(CGM);
5396    Value *PacketSize = OpenCLRT.getPipeElemSize(E->getArg(0));
5397    Value *PacketAlign = OpenCLRT.getPipeElemAlign(E->getArg(0));
5398
5399    // Building the generic function prototype.
5400    llvm::Type *ArgTys[] = {Arg0->getType(), Arg1->getType(), Int32Ty, Int32Ty};
5401    llvm::FunctionType *FTy =
5402        llvm::FunctionType::get(llvm::Type::getVoidTy(getLLVMContext()),
5403                                llvm::ArrayRef<llvm::Type *>(ArgTys), false);
5404
5405    return RValue::get(EmitRuntimeCall(CGM.CreateRuntimeFunction(FTy, Name),
5406                                       {Arg0, Arg1, PacketSize, PacketAlign}));
5407  }
5408  // OpenCL v2.0 s6.13.16.4 Built-in pipe query functions
5409  case Builtin::BIget_pipe_num_packets:
5410  case Builtin::BIget_pipe_max_packets: {
5411    const char *BaseName;
5412    const auto *PipeTy = E->getArg(0)->getType()->castAs<PipeType>();
5413    if (BuiltinID == Builtin::BIget_pipe_num_packets)
5414      BaseName = "__get_pipe_num_packets";
5415    else
5416      BaseName = "__get_pipe_max_packets";
5417    std::string Name = std::string(BaseName) +
5418                       std::string(PipeTy->isReadOnly() ? "_ro" : "_wo");
5419
5420    // Building the generic function prototype.
5421    Value *Arg0 = EmitScalarExpr(E->getArg(0));
5422    CGOpenCLRuntime OpenCLRT(CGM);
5423    Value *PacketSize = OpenCLRT.getPipeElemSize(E->getArg(0));
5424    Value *PacketAlign = OpenCLRT.getPipeElemAlign(E->getArg(0));
5425    llvm::Type *ArgTys[] = {Arg0->getType(), Int32Ty, Int32Ty};
5426    llvm::FunctionType *FTy = llvm::FunctionType::get(
5427        Int32Ty, llvm::ArrayRef<llvm::Type *>(ArgTys), false);
5428
5429    return RValue::get(EmitRuntimeCall(CGM.CreateRuntimeFunction(FTy, Name),
5430                                       {Arg0, PacketSize, PacketAlign}));
5431  }
5432
5433  // OpenCL v2.0 s6.13.9 - Address space qualifier functions.
5434  case Builtin::BIto_global:
5435  case Builtin::BIto_local:
5436  case Builtin::BIto_private: {
5437    auto Arg0 = EmitScalarExpr(E->getArg(0));
5438    auto NewArgT = llvm::PointerType::get(
5439        getLLVMContext(),
5440        CGM.getContext().getTargetAddressSpace(LangAS::opencl_generic));
5441    auto NewRetT = llvm::PointerType::get(
5442        getLLVMContext(),
5443        CGM.getContext().getTargetAddressSpace(
5444            E->getType()->getPointeeType().getAddressSpace()));
5445    auto FTy = llvm::FunctionType::get(NewRetT, {NewArgT}, false);
5446    llvm::Value *NewArg;
5447    if (Arg0->getType()->getPointerAddressSpace() !=
5448        NewArgT->getPointerAddressSpace())
5449      NewArg = Builder.CreateAddrSpaceCast(Arg0, NewArgT);
5450    else
5451      NewArg = Builder.CreateBitOrPointerCast(Arg0, NewArgT);
5452    auto NewName = std::string("__") + E->getDirectCallee()->getName().str();
5453    auto NewCall =
5454        EmitRuntimeCall(CGM.CreateRuntimeFunction(FTy, NewName), {NewArg});
5455    return RValue::get(Builder.CreateBitOrPointerCast(NewCall,
5456      ConvertType(E->getType())));
5457  }
5458
5459  // OpenCL v2.0, s6.13.17 - Enqueue kernel function.
5460  // It contains four different overload formats specified in Table 6.13.17.1.
5461  case Builtin::BIenqueue_kernel: {
5462    StringRef Name; // Generated function call name
5463    unsigned NumArgs = E->getNumArgs();
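    // Depending on the overload, the call is lowered to one of the
    // __enqueue_kernel_basic, __enqueue_kernel_varargs,
    // __enqueue_kernel_basic_events or __enqueue_kernel_events_varargs
    // runtime entry points.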
5464
5465    llvm::Type *QueueTy = ConvertType(getContext().OCLQueueTy);
5466    llvm::Type *GenericVoidPtrTy = Builder.getPtrTy(
5467        getContext().getTargetAddressSpace(LangAS::opencl_generic));
5468
5469    llvm::Value *Queue = EmitScalarExpr(E->getArg(0));
5470    llvm::Value *Flags = EmitScalarExpr(E->getArg(1));
5471    LValue NDRangeL = EmitAggExprToLValue(E->getArg(2));
5472    llvm::Value *Range = NDRangeL.getAddress(*this).getPointer();
5473    llvm::Type *RangeTy = NDRangeL.getAddress(*this).getType();
5474
5475    if (NumArgs == 4) {
5476      // The most basic form of the call with parameters:
5477      // queue_t, kernel_enqueue_flags_t, ndrange_t, block(void)
5478      Name = "__enqueue_kernel_basic";
5479      llvm::Type *ArgTys[] = {QueueTy, Int32Ty, RangeTy, GenericVoidPtrTy,
5480                              GenericVoidPtrTy};
5481      llvm::FunctionType *FTy = llvm::FunctionType::get(
5482          Int32Ty, llvm::ArrayRef<llvm::Type *>(ArgTys), false);
5483
5484      auto Info =
5485          CGM.getOpenCLRuntime().emitOpenCLEnqueuedBlock(*this, E->getArg(3));
5486      llvm::Value *Kernel =
5487          Builder.CreatePointerCast(Info.KernelHandle, GenericVoidPtrTy);
5488      llvm::Value *Block =
5489          Builder.CreatePointerCast(Info.BlockArg, GenericVoidPtrTy);
5490
5491      AttrBuilder B(Builder.getContext());
5492      B.addByValAttr(NDRangeL.getAddress(*this).getElementType());
5493      llvm::AttributeList ByValAttrSet =
5494          llvm::AttributeList::get(CGM.getModule().getContext(), 3U, B);
5495
5496      auto RTCall =
5497          EmitRuntimeCall(CGM.CreateRuntimeFunction(FTy, Name, ByValAttrSet),
5498                          {Queue, Flags, Range, Kernel, Block});
5499      RTCall->setAttributes(ByValAttrSet);
5500      return RValue::get(RTCall);
5501    }
5502    assert(NumArgs >= 5 && "Invalid enqueue_kernel signature");
5503
5504    // Create a temporary array to hold the sizes of local pointer arguments
5505    // for the block. \p First is the position of the first size argument.
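    // It returns the address of the first size element, together with the
    // size value and pointer needed to emit the matching lifetime end.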
5506    auto CreateArrayForSizeVar = [=](unsigned First)
5507        -> std::tuple<llvm::Value *, llvm::Value *, llvm::Value *> {
5508      llvm::APInt ArraySize(32, NumArgs - First);
5509      QualType SizeArrayTy = getContext().getConstantArrayType(
5510          getContext().getSizeType(), ArraySize, nullptr,
5511          ArraySizeModifier::Normal,
5512          /*IndexTypeQuals=*/0);
5513      auto Tmp = CreateMemTemp(SizeArrayTy, "block_sizes");
5514      llvm::Value *TmpPtr = Tmp.getPointer();
5515      llvm::Value *TmpSize = EmitLifetimeStart(
5516          CGM.getDataLayout().getTypeAllocSize(Tmp.getElementType()), TmpPtr);
5517      llvm::Value *ElemPtr;
5518      // Each of the following arguments specifies the size of the corresponding
5519      // argument passed to the enqueued block.
5520      auto *Zero = llvm::ConstantInt::get(IntTy, 0);
5521      for (unsigned I = First; I < NumArgs; ++I) {
5522        auto *Index = llvm::ConstantInt::get(IntTy, I - First);
5523        auto *GEP = Builder.CreateGEP(Tmp.getElementType(), TmpPtr,
5524                                      {Zero, Index});
5525        if (I == First)
5526          ElemPtr = GEP;
5527        auto *V =
5528            Builder.CreateZExtOrTrunc(EmitScalarExpr(E->getArg(I)), SizeTy);
5529        Builder.CreateAlignedStore(
5530            V, GEP, CGM.getDataLayout().getPrefTypeAlign(SizeTy));
5531      }
5532      return std::tie(ElemPtr, TmpSize, TmpPtr);
5533    };
5534
5535    // Could have events and/or varargs.
5536    if (E->getArg(3)->getType()->isBlockPointerType()) {
5537      // No events passed, but has variadic arguments.
5538      Name = "__enqueue_kernel_varargs";
5539      auto Info =
5540          CGM.getOpenCLRuntime().emitOpenCLEnqueuedBlock(*this, E->getArg(3));
5541      llvm::Value *Kernel =
5542          Builder.CreatePointerCast(Info.KernelHandle, GenericVoidPtrTy);
5543      auto *Block = Builder.CreatePointerCast(Info.BlockArg, GenericVoidPtrTy);
5544      llvm::Value *ElemPtr, *TmpSize, *TmpPtr;
5545      std::tie(ElemPtr, TmpSize, TmpPtr) = CreateArrayForSizeVar(4);
5546
5547      // Create a vector of the arguments, as well as a constant value to
5548      // express to the runtime the number of variadic arguments.
5549      llvm::Value *const Args[] = {Queue,  Flags,
5550                                   Range,  Kernel,
5551                                   Block,  ConstantInt::get(IntTy, NumArgs - 4),
5552                                   ElemPtr};
5553      llvm::Type *const ArgTys[] = {
5554          QueueTy,          IntTy, RangeTy,           GenericVoidPtrTy,
5555          GenericVoidPtrTy, IntTy, ElemPtr->getType()};
5556
5557      llvm::FunctionType *FTy = llvm::FunctionType::get(Int32Ty, ArgTys, false);
5558      auto Call = RValue::get(
5559          EmitRuntimeCall(CGM.CreateRuntimeFunction(FTy, Name), Args));
5560      if (TmpSize)
5561        EmitLifetimeEnd(TmpSize, TmpPtr);
5562      return Call;
5563    }
5564    // Any calls now have event arguments passed.
5565    if (NumArgs >= 7) {
5566      llvm::PointerType *PtrTy = llvm::PointerType::get(
5567          CGM.getLLVMContext(),
5568          CGM.getContext().getTargetAddressSpace(LangAS::opencl_generic));
5569
5570      llvm::Value *NumEvents =
5571          Builder.CreateZExtOrTrunc(EmitScalarExpr(E->getArg(3)), Int32Ty);
5572
5573      // Since SemaOpenCLBuiltinEnqueueKernel allows the fifth and sixth
5574      // arguments to be null pointer constants (including a literal `0`), we
5575      // can take that into account and emit a null pointer directly.
5576      llvm::Value *EventWaitList = nullptr;
5577      if (E->getArg(4)->isNullPointerConstant(
5578              getContext(), Expr::NPC_ValueDependentIsNotNull)) {
5579        EventWaitList = llvm::ConstantPointerNull::get(PtrTy);
5580      } else {
5581        EventWaitList = E->getArg(4)->getType()->isArrayType()
5582                        ? EmitArrayToPointerDecay(E->getArg(4)).getPointer()
5583                        : EmitScalarExpr(E->getArg(4));
5584        // Convert to generic address space.
5585        EventWaitList = Builder.CreatePointerCast(EventWaitList, PtrTy);
5586      }
5587      llvm::Value *EventRet = nullptr;
5588      if (E->getArg(5)->isNullPointerConstant(
5589              getContext(), Expr::NPC_ValueDependentIsNotNull)) {
5590        EventRet = llvm::ConstantPointerNull::get(PtrTy);
5591      } else {
5592        EventRet =
5593            Builder.CreatePointerCast(EmitScalarExpr(E->getArg(5)), PtrTy);
5594      }
5595
5596      auto Info =
5597          CGM.getOpenCLRuntime().emitOpenCLEnqueuedBlock(*this, E->getArg(6));
5598      llvm::Value *Kernel =
5599          Builder.CreatePointerCast(Info.KernelHandle, GenericVoidPtrTy);
5600      llvm::Value *Block =
5601          Builder.CreatePointerCast(Info.BlockArg, GenericVoidPtrTy);
5602
5603      std::vector<llvm::Type *> ArgTys = {
5604          QueueTy, Int32Ty, RangeTy,          Int32Ty,
5605          PtrTy,   PtrTy,   GenericVoidPtrTy, GenericVoidPtrTy};
5606
5607      std::vector<llvm::Value *> Args = {Queue,     Flags,         Range,
5608                                         NumEvents, EventWaitList, EventRet,
5609                                         Kernel,    Block};
5610
5611      if (NumArgs == 7) {
5612        // Has events but no variadics.
5613        Name = "__enqueue_kernel_basic_events";
5614        llvm::FunctionType *FTy = llvm::FunctionType::get(
5615            Int32Ty, llvm::ArrayRef<llvm::Type *>(ArgTys), false);
5616        return RValue::get(
5617            EmitRuntimeCall(CGM.CreateRuntimeFunction(FTy, Name),
5618                            llvm::ArrayRef<llvm::Value *>(Args)));
5619      }
5620      // Has event info and variadics
5621      // Pass the number of variadics to the runtime function too.
5622      Args.push_back(ConstantInt::get(Int32Ty, NumArgs - 7));
5623      ArgTys.push_back(Int32Ty);
5624      Name = "__enqueue_kernel_events_varargs";
5625
5626      llvm::Value *ElemPtr, *TmpSize, *TmpPtr;
5627      std::tie(ElemPtr, TmpSize, TmpPtr) = CreateArrayForSizeVar(7);
5628      Args.push_back(ElemPtr);
5629      ArgTys.push_back(ElemPtr->getType());
5630
5631      llvm::FunctionType *FTy = llvm::FunctionType::get(
5632          Int32Ty, llvm::ArrayRef<llvm::Type *>(ArgTys), false);
5633      auto Call =
5634          RValue::get(EmitRuntimeCall(CGM.CreateRuntimeFunction(FTy, Name),
5635                                      llvm::ArrayRef<llvm::Value *>(Args)));
5636      if (TmpSize)
5637        EmitLifetimeEnd(TmpSize, TmpPtr);
5638      return Call;
5639    }
5640    [[fallthrough]];
5641  }
5642  // OpenCL v2.0 s6.13.17.6 - Kernel query functions need bitcast of block
5643  // parameter.
5644  case Builtin::BIget_kernel_work_group_size: {
5645    llvm::Type *GenericVoidPtrTy = Builder.getPtrTy(
5646        getContext().getTargetAddressSpace(LangAS::opencl_generic));
5647    auto Info =
5648        CGM.getOpenCLRuntime().emitOpenCLEnqueuedBlock(*this, E->getArg(0));
5649    Value *Kernel =
5650        Builder.CreatePointerCast(Info.KernelHandle, GenericVoidPtrTy);
5651    Value *Arg = Builder.CreatePointerCast(Info.BlockArg, GenericVoidPtrTy);
5652    return RValue::get(EmitRuntimeCall(
5653        CGM.CreateRuntimeFunction(
5654            llvm::FunctionType::get(IntTy, {GenericVoidPtrTy, GenericVoidPtrTy},
5655                                    false),
5656            "__get_kernel_work_group_size_impl"),
5657        {Kernel, Arg}));
5658  }
5659  case Builtin::BIget_kernel_preferred_work_group_size_multiple: {
5660    llvm::Type *GenericVoidPtrTy = Builder.getPtrTy(
5661        getContext().getTargetAddressSpace(LangAS::opencl_generic));
5662    auto Info =
5663        CGM.getOpenCLRuntime().emitOpenCLEnqueuedBlock(*this, E->getArg(0));
5664    Value *Kernel =
5665        Builder.CreatePointerCast(Info.KernelHandle, GenericVoidPtrTy);
5666    Value *Arg = Builder.CreatePointerCast(Info.BlockArg, GenericVoidPtrTy);
5667    return RValue::get(EmitRuntimeCall(
5668        CGM.CreateRuntimeFunction(
5669            llvm::FunctionType::get(IntTy, {GenericVoidPtrTy, GenericVoidPtrTy},
5670                                    false),
5671            "__get_kernel_preferred_work_group_size_multiple_impl"),
5672        {Kernel, Arg}));
5673  }
5674  case Builtin::BIget_kernel_max_sub_group_size_for_ndrange:
5675  case Builtin::BIget_kernel_sub_group_count_for_ndrange: {
5676    llvm::Type *GenericVoidPtrTy = Builder.getPtrTy(
5677        getContext().getTargetAddressSpace(LangAS::opencl_generic));
5678    LValue NDRangeL = EmitAggExprToLValue(E->getArg(0));
5679    llvm::Value *NDRange = NDRangeL.getAddress(*this).getPointer();
5680    auto Info =
5681        CGM.getOpenCLRuntime().emitOpenCLEnqueuedBlock(*this, E->getArg(1));
5682    Value *Kernel =
5683        Builder.CreatePointerCast(Info.KernelHandle, GenericVoidPtrTy);
5684    Value *Block = Builder.CreatePointerCast(Info.BlockArg, GenericVoidPtrTy);
5685    const char *Name =
5686        BuiltinID == Builtin::BIget_kernel_max_sub_group_size_for_ndrange
5687            ? "__get_kernel_max_sub_group_size_for_ndrange_impl"
5688            : "__get_kernel_sub_group_count_for_ndrange_impl";
5689    return RValue::get(EmitRuntimeCall(
5690        CGM.CreateRuntimeFunction(
5691            llvm::FunctionType::get(
5692                IntTy, {NDRange->getType(), GenericVoidPtrTy, GenericVoidPtrTy},
5693                false),
5694            Name),
5695        {NDRange, Kernel, Block}));
5696  }
5697
5698  case Builtin::BI__builtin_store_half:
5699  case Builtin::BI__builtin_store_halff: {
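    // Truncate the float/double argument to half and store it through the
    // given pointer; the builtin itself produces no value.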
5700    Value *Val = EmitScalarExpr(E->getArg(0));
5701    Address Address = EmitPointerWithAlignment(E->getArg(1));
5702    Value *HalfVal = Builder.CreateFPTrunc(Val, Builder.getHalfTy());
5703    Builder.CreateStore(HalfVal, Address);
5704    return RValue::get(nullptr);
5705  }
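  // The load forms widen the stored half back to double (__builtin_load_half)
  // or float (__builtin_load_halff).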
5706  case Builtin::BI__builtin_load_half: {
5707    Address Address = EmitPointerWithAlignment(E->getArg(0));
5708    Value *HalfVal = Builder.CreateLoad(Address);
5709    return RValue::get(Builder.CreateFPExt(HalfVal, Builder.getDoubleTy()));
5710  }
5711  case Builtin::BI__builtin_load_halff: {
5712    Address Address = EmitPointerWithAlignment(E->getArg(0));
5713    Value *HalfVal = Builder.CreateLoad(Address);
5714    return RValue::get(Builder.CreateFPExt(HalfVal, Builder.getFloatTy()));
5715  }
5716  case Builtin::BIprintf:
5717    if (getTarget().getTriple().isNVPTX() ||
5718        getTarget().getTriple().isAMDGCN()) {
5719      if (getLangOpts().OpenMPIsTargetDevice)
5720        return EmitOpenMPDevicePrintfCallExpr(E);
5721      if (getTarget().getTriple().isNVPTX())
5722        return EmitNVPTXDevicePrintfCallExpr(E);
5723      if (getTarget().getTriple().isAMDGCN() && getLangOpts().HIP)
5724        return EmitAMDGPUDevicePrintfCallExpr(E);
5725    }
5726
5727    break;
5728  case Builtin::BI__builtin_canonicalize:
5729  case Builtin::BI__builtin_canonicalizef:
5730  case Builtin::BI__builtin_canonicalizef16:
5731  case Builtin::BI__builtin_canonicalizel:
5732    return RValue::get(emitUnaryBuiltin(*this, E, Intrinsic::canonicalize));
5733
5734  case Builtin::BI__builtin_thread_pointer: {
5735    if (!getContext().getTargetInfo().isTLSSupported())
5736      CGM.ErrorUnsupported(E, "__builtin_thread_pointer");
5737    // Fall through - it's already mapped to the intrinsic by ClangBuiltin.
5738    break;
5739  }
5740  case Builtin::BI__builtin_os_log_format:
5741    return emitBuiltinOSLogFormat(*E);
5742
5743  case Builtin::BI__xray_customevent: {
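    // Only emit the XRay custom event intrinsic when this function is XRay
    // instrumented, the Custom instrumentation bundle is enabled, and the
    // function is not attributed as never-instrument (unless custom events are
    // forced on); otherwise the call is dropped.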
5744    if (!ShouldXRayInstrumentFunction())
5745      return RValue::getIgnored();
5746
5747    if (!CGM.getCodeGenOpts().XRayInstrumentationBundle.has(
5748            XRayInstrKind::Custom))
5749      return RValue::getIgnored();
5750
5751    if (const auto *XRayAttr = CurFuncDecl->getAttr<XRayInstrumentAttr>())
5752      if (XRayAttr->neverXRayInstrument() && !AlwaysEmitXRayCustomEvents())
5753        return RValue::getIgnored();
5754
5755    Function *F = CGM.getIntrinsic(Intrinsic::xray_customevent);
5756    auto FTy = F->getFunctionType();
5757    auto Arg0 = E->getArg(0);
5758    auto Arg0Val = EmitScalarExpr(Arg0);
5759    auto Arg0Ty = Arg0->getType();
5760    auto PTy0 = FTy->getParamType(0);
5761    if (PTy0 != Arg0Val->getType()) {
5762      if (Arg0Ty->isArrayType())
5763        Arg0Val = EmitArrayToPointerDecay(Arg0).getPointer();
5764      else
5765        Arg0Val = Builder.CreatePointerCast(Arg0Val, PTy0);
5766    }
5767    auto Arg1 = EmitScalarExpr(E->getArg(1));
5768    auto PTy1 = FTy->getParamType(1);
5769    if (PTy1 != Arg1->getType())
5770      Arg1 = Builder.CreateTruncOrBitCast(Arg1, PTy1);
5771    return RValue::get(Builder.CreateCall(F, {Arg0Val, Arg1}));
5772  }
5773
5774  case Builtin::BI__xray_typedevent: {
5775    // TODO: There should be a way to always emit events even if the current
5776    // function is not instrumented. Losing events in a stream can cripple
5777    // a trace.
5778    if (!ShouldXRayInstrumentFunction())
5779      return RValue::getIgnored();
5780
5781    if (!CGM.getCodeGenOpts().XRayInstrumentationBundle.has(
5782            XRayInstrKind::Typed))
5783      return RValue::getIgnored();
5784
5785    if (const auto *XRayAttr = CurFuncDecl->getAttr<XRayInstrumentAttr>())
5786      if (XRayAttr->neverXRayInstrument() && !AlwaysEmitXRayTypedEvents())
5787        return RValue::getIgnored();
5788
5789    Function *F = CGM.getIntrinsic(Intrinsic::xray_typedevent);
5790    auto FTy = F->getFunctionType();
5791    auto Arg0 = EmitScalarExpr(E->getArg(0));
5792    auto PTy0 = FTy->getParamType(0);
5793    if (PTy0 != Arg0->getType())
5794      Arg0 = Builder.CreateTruncOrBitCast(Arg0, PTy0);
5795    auto Arg1 = E->getArg(1);
5796    auto Arg1Val = EmitScalarExpr(Arg1);
5797    auto Arg1Ty = Arg1->getType();
5798    auto PTy1 = FTy->getParamType(1);
5799    if (PTy1 != Arg1Val->getType()) {
5800      if (Arg1Ty->isArrayType())
5801        Arg1Val = EmitArrayToPointerDecay(Arg1).getPointer();
5802      else
5803        Arg1Val = Builder.CreatePointerCast(Arg1Val, PTy1);
5804    }
5805    auto Arg2 = EmitScalarExpr(E->getArg(2));
5806    auto PTy2 = FTy->getParamType(2);
5807    if (PTy2 != Arg2->getType())
5808      Arg2 = Builder.CreateTruncOrBitCast(Arg2, PTy2);
5809    return RValue::get(Builder.CreateCall(F, {Arg0, Arg1Val, Arg2}));
5810  }
5811
5812  case Builtin::BI__builtin_ms_va_start:
5813  case Builtin::BI__builtin_ms_va_end:
5814    return RValue::get(
5815        EmitVAStartEnd(EmitMSVAListRef(E->getArg(0)).getPointer(),
5816                       BuiltinID == Builtin::BI__builtin_ms_va_start));
5817
5818  case Builtin::BI__builtin_ms_va_copy: {
5819    // Lower this manually. We can't reliably determine whether or not any
5820    // given va_copy() is for a Win64 va_list from the calling convention
5821    // alone, because it's legal to do this from a System V ABI function.
5822    // With opaque pointer types, we won't have enough information in LLVM
5823    // IR to determine this from the argument types, either. Best to do it
5824    // now, while we have enough information.
5825    Address DestAddr = EmitMSVAListRef(E->getArg(0));
5826    Address SrcAddr = EmitMSVAListRef(E->getArg(1));
5827
5828    DestAddr = DestAddr.withElementType(Int8PtrTy);
5829    SrcAddr = SrcAddr.withElementType(Int8PtrTy);
5830
5831    Value *ArgPtr = Builder.CreateLoad(SrcAddr, "ap.val");
5832    return RValue::get(Builder.CreateStore(ArgPtr, DestAddr));
5833  }
5834
5835  case Builtin::BI__builtin_get_device_side_mangled_name: {
5836    auto Name = CGM.getCUDARuntime().getDeviceSideName(
5837        cast<DeclRefExpr>(E->getArg(0)->IgnoreImpCasts())->getDecl());
5838    auto Str = CGM.GetAddrOfConstantCString(Name, "");
5839    llvm::Constant *Zeros[] = {llvm::ConstantInt::get(SizeTy, 0),
5840                               llvm::ConstantInt::get(SizeTy, 0)};
5841    auto *Ptr = llvm::ConstantExpr::getGetElementPtr(Str.getElementType(),
5842                                                     Str.getPointer(), Zeros);
5843    return RValue::get(Ptr);
5844  }
5845  }
5846
5847  // If this is an alias for a lib function (e.g. __builtin_sin), emit
5848  // the call using the normal call path, but using the unmangled
5849  // version of the function name.
5850  if (getContext().BuiltinInfo.isLibFunction(BuiltinID))
5851    return emitLibraryCall(*this, FD, E,
5852                           CGM.getBuiltinLibFunction(FD, BuiltinID));
5853
5854  // If this is a predefined lib function (e.g. malloc), emit the call
5855  // using exactly the normal call path.
5856  if (getContext().BuiltinInfo.isPredefinedLibFunction(BuiltinID))
5857    return emitLibraryCall(*this, FD, E,
5858                      cast<llvm::Constant>(EmitScalarExpr(E->getCallee())));
5859
5860  // Check that a call to a target specific builtin has the correct target
5861  // features.
5862  // This is done down here so that non-target-specific builtins are not
5863  // affected; if generic builtins ever start to require generic target
5864  // features, this check can move up to the beginning of the function.
5865  checkTargetFeatures(E, FD);
5866
5867  if (unsigned VectorWidth = getContext().BuiltinInfo.getRequiredVectorWidth(BuiltinID))
5868    LargestVectorWidth = std::max(LargestVectorWidth, VectorWidth);
5869
5870  // See if we have a target specific intrinsic.
5871  StringRef Name = getContext().BuiltinInfo.getName(BuiltinID);
5872  Intrinsic::ID IntrinsicID = Intrinsic::not_intrinsic;
5873  StringRef Prefix =
5874      llvm::Triple::getArchTypePrefix(getTarget().getTriple().getArch());
5875  if (!Prefix.empty()) {
5876    IntrinsicID = Intrinsic::getIntrinsicForClangBuiltin(Prefix.data(), Name);
5877    // NOTE: we don't need to perform a compatibility flag check here since the
5878    // intrinsics are declared in Builtins*.def via LANGBUILTIN, which filters
5879    // the MS builtins via ALL_MS_LANGUAGES, so they are filtered out earlier.
5880    if (IntrinsicID == Intrinsic::not_intrinsic)
5881      IntrinsicID = Intrinsic::getIntrinsicForMSBuiltin(Prefix.data(), Name);
5882  }
5883
5884  if (IntrinsicID != Intrinsic::not_intrinsic) {
5885    SmallVector<Value*, 16> Args;
5886
5887    // Find out if any arguments are required to be integer constant
5888    // expressions.
5889    unsigned ICEArguments = 0;
5890    ASTContext::GetBuiltinTypeError Error;
5891    getContext().GetBuiltinType(BuiltinID, Error, &ICEArguments);
5892    assert(Error == ASTContext::GE_None && "Should not codegen an error");
5893
5894    Function *F = CGM.getIntrinsic(IntrinsicID);
5895    llvm::FunctionType *FTy = F->getFunctionType();
5896
5897    for (unsigned i = 0, e = E->getNumArgs(); i != e; ++i) {
5898      Value *ArgValue = EmitScalarOrConstFoldImmArg(ICEArguments, i, E);
5899      // If the intrinsic arg type is different from the builtin arg type,
5900      // we need to do a bit cast.
5901      llvm::Type *PTy = FTy->getParamType(i);
5902      if (PTy != ArgValue->getType()) {
5903        // XXX - vector of pointers?
5904        if (auto *PtrTy = dyn_cast<llvm::PointerType>(PTy)) {
5905          if (PtrTy->getAddressSpace() !=
5906              ArgValue->getType()->getPointerAddressSpace()) {
5907            ArgValue = Builder.CreateAddrSpaceCast(
5908                ArgValue, llvm::PointerType::get(getLLVMContext(),
5909                                                 PtrTy->getAddressSpace()));
5910          }
5911        }
5912
5913        assert(ArgValue->getType()->canLosslesslyBitCastTo(PTy) &&
5914               "Must be able to losslessly bit cast to param");
5915        // Cast vector type (e.g., v256i32) to x86_amx; this only happens
5916        // in AMX intrinsics.
5917        if (PTy->isX86_AMXTy())
5918          ArgValue = Builder.CreateIntrinsic(Intrinsic::x86_cast_vector_to_tile,
5919                                             {ArgValue->getType()}, {ArgValue});
5920        else
5921          ArgValue = Builder.CreateBitCast(ArgValue, PTy);
5922      }
5923
5924      Args.push_back(ArgValue);
5925    }
5926
5927    Value *V = Builder.CreateCall(F, Args);
5928    QualType BuiltinRetType = E->getType();
5929
5930    llvm::Type *RetTy = VoidTy;
5931    if (!BuiltinRetType->isVoidType())
5932      RetTy = ConvertType(BuiltinRetType);
5933
5934    if (RetTy != V->getType()) {
5935      // XXX - vector of pointers?
5936      if (auto *PtrTy = dyn_cast<llvm::PointerType>(RetTy)) {
5937        if (PtrTy->getAddressSpace() != V->getType()->getPointerAddressSpace()) {
5938          V = Builder.CreateAddrSpaceCast(
5939              V, llvm::PointerType::get(getLLVMContext(),
5940                                        PtrTy->getAddressSpace()));
5941        }
5942      }
5943
5944      assert(V->getType()->canLosslesslyBitCastTo(RetTy) &&
5945             "Must be able to losslessly bit cast result type");
5946      // Cast x86_amx to vector type (e.g., v256i32); this only happens
5947      // in AMX intrinsics.
5948      if (V->getType()->isX86_AMXTy())
5949        V = Builder.CreateIntrinsic(Intrinsic::x86_cast_tile_to_vector, {RetTy},
5950                                    {V});
5951      else
5952        V = Builder.CreateBitCast(V, RetTy);
5953    }
5954
5955    if (RetTy->isVoidTy())
5956      return RValue::get(nullptr);
5957
5958    return RValue::get(V);
5959  }
5960
5961  // Some target-specific builtins can have aggregate return values, e.g.
5962  // __builtin_arm_mve_vld2q_u32. So if the result is an aggregate, force
5963  // ReturnValue to be non-null, so that the target-specific emission code can
5964  // always just emit into it.
5965  TypeEvaluationKind EvalKind = getEvaluationKind(E->getType());
5966  if (EvalKind == TEK_Aggregate && ReturnValue.isNull()) {
5967    Address DestPtr = CreateMemTemp(E->getType(), "agg.tmp");
5968    ReturnValue = ReturnValueSlot(DestPtr, false);
5969  }
5970
5971  // Now see if we can emit a target-specific builtin.
5972  if (Value *V = EmitTargetBuiltinExpr(BuiltinID, E, ReturnValue)) {
5973    switch (EvalKind) {
5974    case TEK_Scalar:
5975      if (V->getType()->isVoidTy())
5976        return RValue::get(nullptr);
5977      return RValue::get(V);
5978    case TEK_Aggregate:
5979      return RValue::getAggregate(ReturnValue.getValue(),
5980                                  ReturnValue.isVolatile());
5981    case TEK_Complex:
5982      llvm_unreachable("No current target builtin returns complex");
5983    }
5984    llvm_unreachable("Bad evaluation kind in EmitBuiltinExpr");
5985  }
5986
5987  if (getLangOpts().HIPStdPar && getLangOpts().CUDAIsDevice)
5988    return EmitHipStdParUnsupportedBuiltin(this, FD);
5989
5990  ErrorUnsupported(E, "builtin function");
5991
5992  // Unknown builtin, for now just dump it out and return undef.
5993  return GetUndefRValue(E->getType());
5994}
5995
5996static Value *EmitTargetArchBuiltinExpr(CodeGenFunction *CGF,
5997                                        unsigned BuiltinID, const CallExpr *E,
5998                                        ReturnValueSlot ReturnValue,
5999                                        llvm::Triple::ArchType Arch) {
6000  // When compiling in HipStdPar mode we have to be conservative about rejecting
6001  // target-specific features in the FE, and defer the possible error to the
6002  // AcceleratorCodeSelection pass, which emits an error only if an unsupported
6003  // target builtin is referenced by an accelerator-executable function.
6004  // Returning nullptr here leads to the builtin being handled in
6005  // EmitHipStdParUnsupportedBuiltin.
6006  if (CGF->getLangOpts().HIPStdPar && CGF->getLangOpts().CUDAIsDevice &&
6007      Arch != CGF->getTarget().getTriple().getArch())
6008    return nullptr;
6009
6010  switch (Arch) {
6011  case llvm::Triple::arm:
6012  case llvm::Triple::armeb:
6013  case llvm::Triple::thumb:
6014  case llvm::Triple::thumbeb:
6015    return CGF->EmitARMBuiltinExpr(BuiltinID, E, ReturnValue, Arch);
6016  case llvm::Triple::aarch64:
6017  case llvm::Triple::aarch64_32:
6018  case llvm::Triple::aarch64_be:
6019    return CGF->EmitAArch64BuiltinExpr(BuiltinID, E, Arch);
6020  case llvm::Triple::bpfeb:
6021  case llvm::Triple::bpfel:
6022    return CGF->EmitBPFBuiltinExpr(BuiltinID, E);
6023  case llvm::Triple::x86:
6024  case llvm::Triple::x86_64:
6025    return CGF->EmitX86BuiltinExpr(BuiltinID, E);
6026  case llvm::Triple::ppc:
6027  case llvm::Triple::ppcle:
6028  case llvm::Triple::ppc64:
6029  case llvm::Triple::ppc64le:
6030    return CGF->EmitPPCBuiltinExpr(BuiltinID, E);
6031  case llvm::Triple::r600:
6032  case llvm::Triple::amdgcn:
6033    return CGF->EmitAMDGPUBuiltinExpr(BuiltinID, E);
6034  case llvm::Triple::systemz:
6035    return CGF->EmitSystemZBuiltinExpr(BuiltinID, E);
6036  case llvm::Triple::nvptx:
6037  case llvm::Triple::nvptx64:
6038    return CGF->EmitNVPTXBuiltinExpr(BuiltinID, E);
6039  case llvm::Triple::wasm32:
6040  case llvm::Triple::wasm64:
6041    return CGF->EmitWebAssemblyBuiltinExpr(BuiltinID, E);
6042  case llvm::Triple::hexagon:
6043    return CGF->EmitHexagonBuiltinExpr(BuiltinID, E);
6044  case llvm::Triple::riscv32:
6045  case llvm::Triple::riscv64:
6046    return CGF->EmitRISCVBuiltinExpr(BuiltinID, E, ReturnValue);
6047  default:
6048    return nullptr;
6049  }
6050}
6051
6052Value *CodeGenFunction::EmitTargetBuiltinExpr(unsigned BuiltinID,
6053                                              const CallExpr *E,
6054                                              ReturnValueSlot ReturnValue) {
6055  if (getContext().BuiltinInfo.isAuxBuiltinID(BuiltinID)) {
6056    assert(getContext().getAuxTargetInfo() && "Missing aux target info");
6057    return EmitTargetArchBuiltinExpr(
6058        this, getContext().BuiltinInfo.getAuxBuiltinID(BuiltinID), E,
6059        ReturnValue, getContext().getAuxTargetInfo()->getTriple().getArch());
6060  }
6061
6062  return EmitTargetArchBuiltinExpr(this, BuiltinID, E, ReturnValue,
6063                                   getTarget().getTriple().getArch());
6064}
6065
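// Map a NeonTypeFlags descriptor to the corresponding fixed LLVM vector type.
// V1Ty selects a single-element vector, and the Float16/BFloat16 cases fall
// back to i16 vectors when the target lacks legal half/bfloat support.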
6066static llvm::FixedVectorType *GetNeonType(CodeGenFunction *CGF,
6067                                          NeonTypeFlags TypeFlags,
6068                                          bool HasLegalHalfType = true,
6069                                          bool V1Ty = false,
6070                                          bool AllowBFloatArgsAndRet = true) {
6071  int IsQuad = TypeFlags.isQuad();
6072  switch (TypeFlags.getEltType()) {
6073  case NeonTypeFlags::Int8:
6074  case NeonTypeFlags::Poly8:
6075    return llvm::FixedVectorType::get(CGF->Int8Ty, V1Ty ? 1 : (8 << IsQuad));
6076  case NeonTypeFlags::Int16:
6077  case NeonTypeFlags::Poly16:
6078    return llvm::FixedVectorType::get(CGF->Int16Ty, V1Ty ? 1 : (4 << IsQuad));
6079  case NeonTypeFlags::BFloat16:
6080    if (AllowBFloatArgsAndRet)
6081      return llvm::FixedVectorType::get(CGF->BFloatTy, V1Ty ? 1 : (4 << IsQuad));
6082    else
6083      return llvm::FixedVectorType::get(CGF->Int16Ty, V1Ty ? 1 : (4 << IsQuad));
6084  case NeonTypeFlags::Float16:
6085    if (HasLegalHalfType)
6086      return llvm::FixedVectorType::get(CGF->HalfTy, V1Ty ? 1 : (4 << IsQuad));
6087    else
6088      return llvm::FixedVectorType::get(CGF->Int16Ty, V1Ty ? 1 : (4 << IsQuad));
6089  case NeonTypeFlags::Int32:
6090    return llvm::FixedVectorType::get(CGF->Int32Ty, V1Ty ? 1 : (2 << IsQuad));
6091  case NeonTypeFlags::Int64:
6092  case NeonTypeFlags::Poly64:
6093    return llvm::FixedVectorType::get(CGF->Int64Ty, V1Ty ? 1 : (1 << IsQuad));
6094  case NeonTypeFlags::Poly128:
6095    // FIXME: i128 and f128 are not fully supported in Clang and LLVM;
6096    // a lot of the i128 and f128 API is still missing,
6097    // so we use v16i8 to represent poly128 and rely on pattern matching.
6098    return llvm::FixedVectorType::get(CGF->Int8Ty, 16);
6099  case NeonTypeFlags::Float32:
6100    return llvm::FixedVectorType::get(CGF->FloatTy, V1Ty ? 1 : (2 << IsQuad));
6101  case NeonTypeFlags::Float64:
6102    return llvm::FixedVectorType::get(CGF->DoubleTy, V1Ty ? 1 : (1 << IsQuad));
6103  }
6104  llvm_unreachable("Unknown vector element type!");
6105}
6106
6107static llvm::VectorType *GetFloatNeonType(CodeGenFunction *CGF,
6108                                          NeonTypeFlags IntTypeFlags) {
6109  int IsQuad = IntTypeFlags.isQuad();
6110  switch (IntTypeFlags.getEltType()) {
6111  case NeonTypeFlags::Int16:
6112    return llvm::FixedVectorType::get(CGF->HalfTy, (4 << IsQuad));
6113  case NeonTypeFlags::Int32:
6114    return llvm::FixedVectorType::get(CGF->FloatTy, (2 << IsQuad));
6115  case NeonTypeFlags::Int64:
6116    return llvm::FixedVectorType::get(CGF->DoubleTy, (1 << IsQuad));
6117  default:
6118    llvm_unreachable("Type can't be converted to floating-point!");
6119  }
6120}
6121
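// Broadcast the lane selected by the constant C across a vector with the given
// element count by emitting a shufflevector with a splat mask.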
6122Value *CodeGenFunction::EmitNeonSplat(Value *V, Constant *C,
6123                                      const ElementCount &Count) {
6124  Value *SV = llvm::ConstantVector::getSplat(Count, C);
6125  return Builder.CreateShuffleVector(V, V, SV, "lane");
6126}
6127
6128Value *CodeGenFunction::EmitNeonSplat(Value *V, Constant *C) {
6129  ElementCount EC = cast<llvm::VectorType>(V->getType())->getElementCount();
6130  return EmitNeonSplat(V, C, EC);
6131}
6132
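// Emit a call to a NEON intrinsic, bitcasting each operand to the parameter
// type the callee expects. If `shift` is non-zero, that operand is instead
// turned into a constant shift-amount vector (negated for right shifts).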
6133Value *CodeGenFunction::EmitNeonCall(Function *F, SmallVectorImpl<Value*> &Ops,
6134                                     const char *name,
6135                                     unsigned shift, bool rightshift) {
6136  unsigned j = 0;
6137  for (Function::const_arg_iterator ai = F->arg_begin(), ae = F->arg_end();
6138       ai != ae; ++ai, ++j) {
6139    if (F->isConstrainedFPIntrinsic())
6140      if (ai->getType()->isMetadataTy())
6141        continue;
6142    if (shift > 0 && shift == j)
6143      Ops[j] = EmitNeonShiftVector(Ops[j], ai->getType(), rightshift);
6144    else
6145      Ops[j] = Builder.CreateBitCast(Ops[j], ai->getType(), name);
6146  }
6147
6148  if (F->isConstrainedFPIntrinsic())
6149    return Builder.CreateConstrainedFPCall(F, Ops, name);
6150  else
6151    return Builder.CreateCall(F, Ops, name);
6152}
6153
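// Materialize a constant shift amount as a vector splat of type Ty, negating
// it when `neg` is set (used for right shifts).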
6154Value *CodeGenFunction::EmitNeonShiftVector(Value *V, llvm::Type *Ty,
6155                                            bool neg) {
6156  int SV = cast<ConstantInt>(V)->getSExtValue();
6157  return ConstantInt::get(Ty, neg ? -SV : SV);
6158}
6159
6160// Right-shift a vector by a constant.
6161Value *CodeGenFunction::EmitNeonRShiftImm(Value *Vec, Value *Shift,
6162                                          llvm::Type *Ty, bool usgn,
6163                                          const char *name) {
6164  llvm::VectorType *VTy = cast<llvm::VectorType>(Ty);
6165
6166  int ShiftAmt = cast<ConstantInt>(Shift)->getSExtValue();
6167  int EltSize = VTy->getScalarSizeInBits();
6168
6169  Vec = Builder.CreateBitCast(Vec, Ty);
6170
6171  // lshr/ashr are undefined when the shift amount is equal to the vector
6172  // element size.
6173  if (ShiftAmt == EltSize) {
6174    if (usgn) {
6175      // Right-shifting an unsigned value by its size yields 0.
6176      return llvm::ConstantAggregateZero::get(VTy);
6177    } else {
6178      // Right-shifting a signed value by its size is equivalent
6179      // to a shift of size-1.
6180      --ShiftAmt;
6181      Shift = ConstantInt::get(VTy->getElementType(), ShiftAmt);
6182    }
6183  }
6184
6185  Shift = EmitNeonShiftVector(Shift, Ty, false);
6186  if (usgn)
6187    return Builder.CreateLShr(Vec, Shift, name);
6188  else
6189    return Builder.CreateAShr(Vec, Shift, name);
6190}
6191
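// Type-modifier flags used by the ARMVectorIntrinsicInfo tables below to
// describe how an intrinsic's overloaded types are derived from the builtin
// call.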
6192enum {
6193  AddRetType = (1 << 0),
6194  Add1ArgType = (1 << 1),
6195  Add2ArgTypes = (1 << 2),
6196
6197  VectorizeRetType = (1 << 3),
6198  VectorizeArgTypes = (1 << 4),
6199
6200  InventFloatType = (1 << 5),
6201  UnsignedAlts = (1 << 6),
6202
6203  Use64BitVectors = (1 << 7),
6204  Use128BitVectors = (1 << 8),
6205
6206  Vectorize1ArgType = Add1ArgType | VectorizeArgTypes,
6207  VectorRet = AddRetType | VectorizeRetType,
6208  VectorRetGetArgs01 =
6209      AddRetType | Add2ArgTypes | VectorizeRetType | VectorizeArgTypes,
6210  FpCmpzModifiers =
6211      AddRetType | VectorizeRetType | Add1ArgType | InventFloatType
6212};
6213
6214namespace {
6215struct ARMVectorIntrinsicInfo {
6216  const char *NameHint;
6217  unsigned BuiltinID;
6218  unsigned LLVMIntrinsic;
6219  unsigned AltLLVMIntrinsic;
6220  uint64_t TypeModifier;
6221
6222  bool operator<(unsigned RHSBuiltinID) const {
6223    return BuiltinID < RHSBuiltinID;
6224  }
6225  bool operator<(const ARMVectorIntrinsicInfo &TE) const {
6226    return BuiltinID < TE.BuiltinID;
6227  }
6228};
6229} // end anonymous namespace
6230
6231#define NEONMAP0(NameBase) \
6232  { #NameBase, NEON::BI__builtin_neon_ ## NameBase, 0, 0, 0 }
6233
6234#define NEONMAP1(NameBase, LLVMIntrinsic, TypeModifier) \
6235  { #NameBase, NEON:: BI__builtin_neon_ ## NameBase, \
6236      Intrinsic::LLVMIntrinsic, 0, TypeModifier }
6237
6238#define NEONMAP2(NameBase, LLVMIntrinsic, AltLLVMIntrinsic, TypeModifier) \
6239  { #NameBase, NEON:: BI__builtin_neon_ ## NameBase, \
6240      Intrinsic::LLVMIntrinsic, Intrinsic::AltLLVMIntrinsic, \
6241      TypeModifier }
6242
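// Mapping from AArch32 NEON builtins to ARM intrinsics, kept sorted by builtin
// ID so that entries can be found with a binary search.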
6243static const ARMVectorIntrinsicInfo ARMSIMDIntrinsicMap [] = {
6244  NEONMAP1(__a32_vcvt_bf16_f32, arm_neon_vcvtfp2bf, 0),
6245  NEONMAP0(splat_lane_v),
6246  NEONMAP0(splat_laneq_v),
6247  NEONMAP0(splatq_lane_v),
6248  NEONMAP0(splatq_laneq_v),
6249  NEONMAP2(vabd_v, arm_neon_vabdu, arm_neon_vabds, Add1ArgType | UnsignedAlts),
6250  NEONMAP2(vabdq_v, arm_neon_vabdu, arm_neon_vabds, Add1ArgType | UnsignedAlts),
6251  NEONMAP1(vabs_v, arm_neon_vabs, 0),
6252  NEONMAP1(vabsq_v, arm_neon_vabs, 0),
6253  NEONMAP0(vadd_v),
6254  NEONMAP0(vaddhn_v),
6255  NEONMAP0(vaddq_v),
6256  NEONMAP1(vaesdq_u8, arm_neon_aesd, 0),
6257  NEONMAP1(vaeseq_u8, arm_neon_aese, 0),
6258  NEONMAP1(vaesimcq_u8, arm_neon_aesimc, 0),
6259  NEONMAP1(vaesmcq_u8, arm_neon_aesmc, 0),
6260  NEONMAP1(vbfdot_f32, arm_neon_bfdot, 0),
6261  NEONMAP1(vbfdotq_f32, arm_neon_bfdot, 0),
6262  NEONMAP1(vbfmlalbq_f32, arm_neon_bfmlalb, 0),
6263  NEONMAP1(vbfmlaltq_f32, arm_neon_bfmlalt, 0),
6264  NEONMAP1(vbfmmlaq_f32, arm_neon_bfmmla, 0),
6265  NEONMAP1(vbsl_v, arm_neon_vbsl, AddRetType),
6266  NEONMAP1(vbslq_v, arm_neon_vbsl, AddRetType),
6267  NEONMAP1(vcadd_rot270_f16, arm_neon_vcadd_rot270, Add1ArgType),
6268  NEONMAP1(vcadd_rot270_f32, arm_neon_vcadd_rot270, Add1ArgType),
6269  NEONMAP1(vcadd_rot90_f16, arm_neon_vcadd_rot90, Add1ArgType),
6270  NEONMAP1(vcadd_rot90_f32, arm_neon_vcadd_rot90, Add1ArgType),
6271  NEONMAP1(vcaddq_rot270_f16, arm_neon_vcadd_rot270, Add1ArgType),
6272  NEONMAP1(vcaddq_rot270_f32, arm_neon_vcadd_rot270, Add1ArgType),
6273  NEONMAP1(vcaddq_rot270_f64, arm_neon_vcadd_rot270, Add1ArgType),
6274  NEONMAP1(vcaddq_rot90_f16, arm_neon_vcadd_rot90, Add1ArgType),
6275  NEONMAP1(vcaddq_rot90_f32, arm_neon_vcadd_rot90, Add1ArgType),
6276  NEONMAP1(vcaddq_rot90_f64, arm_neon_vcadd_rot90, Add1ArgType),
6277  NEONMAP1(vcage_v, arm_neon_vacge, 0),
6278  NEONMAP1(vcageq_v, arm_neon_vacge, 0),
6279  NEONMAP1(vcagt_v, arm_neon_vacgt, 0),
6280  NEONMAP1(vcagtq_v, arm_neon_vacgt, 0),
6281  NEONMAP1(vcale_v, arm_neon_vacge, 0),
6282  NEONMAP1(vcaleq_v, arm_neon_vacge, 0),
6283  NEONMAP1(vcalt_v, arm_neon_vacgt, 0),
6284  NEONMAP1(vcaltq_v, arm_neon_vacgt, 0),
6285  NEONMAP0(vceqz_v),
6286  NEONMAP0(vceqzq_v),
6287  NEONMAP0(vcgez_v),
6288  NEONMAP0(vcgezq_v),
6289  NEONMAP0(vcgtz_v),
6290  NEONMAP0(vcgtzq_v),
6291  NEONMAP0(vclez_v),
6292  NEONMAP0(vclezq_v),
6293  NEONMAP1(vcls_v, arm_neon_vcls, Add1ArgType),
6294  NEONMAP1(vclsq_v, arm_neon_vcls, Add1ArgType),
6295  NEONMAP0(vcltz_v),
6296  NEONMAP0(vcltzq_v),
6297  NEONMAP1(vclz_v, ctlz, Add1ArgType),
6298  NEONMAP1(vclzq_v, ctlz, Add1ArgType),
6299  NEONMAP1(vcnt_v, ctpop, Add1ArgType),
6300  NEONMAP1(vcntq_v, ctpop, Add1ArgType),
6301  NEONMAP1(vcvt_f16_f32, arm_neon_vcvtfp2hf, 0),
6302  NEONMAP0(vcvt_f16_s16),
6303  NEONMAP0(vcvt_f16_u16),
6304  NEONMAP1(vcvt_f32_f16, arm_neon_vcvthf2fp, 0),
6305  NEONMAP0(vcvt_f32_v),
6306  NEONMAP1(vcvt_n_f16_s16, arm_neon_vcvtfxs2fp, 0),
6307  NEONMAP1(vcvt_n_f16_u16, arm_neon_vcvtfxu2fp, 0),
6308  NEONMAP2(vcvt_n_f32_v, arm_neon_vcvtfxu2fp, arm_neon_vcvtfxs2fp, 0),
6309  NEONMAP1(vcvt_n_s16_f16, arm_neon_vcvtfp2fxs, 0),
6310  NEONMAP1(vcvt_n_s32_v, arm_neon_vcvtfp2fxs, 0),
6311  NEONMAP1(vcvt_n_s64_v, arm_neon_vcvtfp2fxs, 0),
6312  NEONMAP1(vcvt_n_u16_f16, arm_neon_vcvtfp2fxu, 0),
6313  NEONMAP1(vcvt_n_u32_v, arm_neon_vcvtfp2fxu, 0),
6314  NEONMAP1(vcvt_n_u64_v, arm_neon_vcvtfp2fxu, 0),
6315  NEONMAP0(vcvt_s16_f16),
6316  NEONMAP0(vcvt_s32_v),
6317  NEONMAP0(vcvt_s64_v),
6318  NEONMAP0(vcvt_u16_f16),
6319  NEONMAP0(vcvt_u32_v),
6320  NEONMAP0(vcvt_u64_v),
6321  NEONMAP1(vcvta_s16_f16, arm_neon_vcvtas, 0),
6322  NEONMAP1(vcvta_s32_v, arm_neon_vcvtas, 0),
6323  NEONMAP1(vcvta_s64_v, arm_neon_vcvtas, 0),
6324  NEONMAP1(vcvta_u16_f16, arm_neon_vcvtau, 0),
6325  NEONMAP1(vcvta_u32_v, arm_neon_vcvtau, 0),
6326  NEONMAP1(vcvta_u64_v, arm_neon_vcvtau, 0),
6327  NEONMAP1(vcvtaq_s16_f16, arm_neon_vcvtas, 0),
6328  NEONMAP1(vcvtaq_s32_v, arm_neon_vcvtas, 0),
6329  NEONMAP1(vcvtaq_s64_v, arm_neon_vcvtas, 0),
6330  NEONMAP1(vcvtaq_u16_f16, arm_neon_vcvtau, 0),
6331  NEONMAP1(vcvtaq_u32_v, arm_neon_vcvtau, 0),
6332  NEONMAP1(vcvtaq_u64_v, arm_neon_vcvtau, 0),
6333  NEONMAP1(vcvth_bf16_f32, arm_neon_vcvtbfp2bf, 0),
6334  NEONMAP1(vcvtm_s16_f16, arm_neon_vcvtms, 0),
6335  NEONMAP1(vcvtm_s32_v, arm_neon_vcvtms, 0),
6336  NEONMAP1(vcvtm_s64_v, arm_neon_vcvtms, 0),
6337  NEONMAP1(vcvtm_u16_f16, arm_neon_vcvtmu, 0),
6338  NEONMAP1(vcvtm_u32_v, arm_neon_vcvtmu, 0),
6339  NEONMAP1(vcvtm_u64_v, arm_neon_vcvtmu, 0),
6340  NEONMAP1(vcvtmq_s16_f16, arm_neon_vcvtms, 0),
6341  NEONMAP1(vcvtmq_s32_v, arm_neon_vcvtms, 0),
6342  NEONMAP1(vcvtmq_s64_v, arm_neon_vcvtms, 0),
6343  NEONMAP1(vcvtmq_u16_f16, arm_neon_vcvtmu, 0),
6344  NEONMAP1(vcvtmq_u32_v, arm_neon_vcvtmu, 0),
6345  NEONMAP1(vcvtmq_u64_v, arm_neon_vcvtmu, 0),
6346  NEONMAP1(vcvtn_s16_f16, arm_neon_vcvtns, 0),
6347  NEONMAP1(vcvtn_s32_v, arm_neon_vcvtns, 0),
6348  NEONMAP1(vcvtn_s64_v, arm_neon_vcvtns, 0),
6349  NEONMAP1(vcvtn_u16_f16, arm_neon_vcvtnu, 0),
6350  NEONMAP1(vcvtn_u32_v, arm_neon_vcvtnu, 0),
6351  NEONMAP1(vcvtn_u64_v, arm_neon_vcvtnu, 0),
6352  NEONMAP1(vcvtnq_s16_f16, arm_neon_vcvtns, 0),
6353  NEONMAP1(vcvtnq_s32_v, arm_neon_vcvtns, 0),
6354  NEONMAP1(vcvtnq_s64_v, arm_neon_vcvtns, 0),
6355  NEONMAP1(vcvtnq_u16_f16, arm_neon_vcvtnu, 0),
6356  NEONMAP1(vcvtnq_u32_v, arm_neon_vcvtnu, 0),
6357  NEONMAP1(vcvtnq_u64_v, arm_neon_vcvtnu, 0),
6358  NEONMAP1(vcvtp_s16_f16, arm_neon_vcvtps, 0),
6359  NEONMAP1(vcvtp_s32_v, arm_neon_vcvtps, 0),
6360  NEONMAP1(vcvtp_s64_v, arm_neon_vcvtps, 0),
6361  NEONMAP1(vcvtp_u16_f16, arm_neon_vcvtpu, 0),
6362  NEONMAP1(vcvtp_u32_v, arm_neon_vcvtpu, 0),
6363  NEONMAP1(vcvtp_u64_v, arm_neon_vcvtpu, 0),
6364  NEONMAP1(vcvtpq_s16_f16, arm_neon_vcvtps, 0),
6365  NEONMAP1(vcvtpq_s32_v, arm_neon_vcvtps, 0),
6366  NEONMAP1(vcvtpq_s64_v, arm_neon_vcvtps, 0),
6367  NEONMAP1(vcvtpq_u16_f16, arm_neon_vcvtpu, 0),
6368  NEONMAP1(vcvtpq_u32_v, arm_neon_vcvtpu, 0),
6369  NEONMAP1(vcvtpq_u64_v, arm_neon_vcvtpu, 0),
6370  NEONMAP0(vcvtq_f16_s16),
6371  NEONMAP0(vcvtq_f16_u16),
6372  NEONMAP0(vcvtq_f32_v),
6373  NEONMAP1(vcvtq_n_f16_s16, arm_neon_vcvtfxs2fp, 0),
6374  NEONMAP1(vcvtq_n_f16_u16, arm_neon_vcvtfxu2fp, 0),
6375  NEONMAP2(vcvtq_n_f32_v, arm_neon_vcvtfxu2fp, arm_neon_vcvtfxs2fp, 0),
6376  NEONMAP1(vcvtq_n_s16_f16, arm_neon_vcvtfp2fxs, 0),
6377  NEONMAP1(vcvtq_n_s32_v, arm_neon_vcvtfp2fxs, 0),
6378  NEONMAP1(vcvtq_n_s64_v, arm_neon_vcvtfp2fxs, 0),
6379  NEONMAP1(vcvtq_n_u16_f16, arm_neon_vcvtfp2fxu, 0),
6380  NEONMAP1(vcvtq_n_u32_v, arm_neon_vcvtfp2fxu, 0),
6381  NEONMAP1(vcvtq_n_u64_v, arm_neon_vcvtfp2fxu, 0),
6382  NEONMAP0(vcvtq_s16_f16),
6383  NEONMAP0(vcvtq_s32_v),
6384  NEONMAP0(vcvtq_s64_v),
6385  NEONMAP0(vcvtq_u16_f16),
6386  NEONMAP0(vcvtq_u32_v),
6387  NEONMAP0(vcvtq_u64_v),
6388  NEONMAP1(vdot_s32, arm_neon_sdot, 0),
6389  NEONMAP1(vdot_u32, arm_neon_udot, 0),
6390  NEONMAP1(vdotq_s32, arm_neon_sdot, 0),
6391  NEONMAP1(vdotq_u32, arm_neon_udot, 0),
6392  NEONMAP0(vext_v),
6393  NEONMAP0(vextq_v),
6394  NEONMAP0(vfma_v),
6395  NEONMAP0(vfmaq_v),
6396  NEONMAP2(vhadd_v, arm_neon_vhaddu, arm_neon_vhadds, Add1ArgType | UnsignedAlts),
6397  NEONMAP2(vhaddq_v, arm_neon_vhaddu, arm_neon_vhadds, Add1ArgType | UnsignedAlts),
6398  NEONMAP2(vhsub_v, arm_neon_vhsubu, arm_neon_vhsubs, Add1ArgType | UnsignedAlts),
6399  NEONMAP2(vhsubq_v, arm_neon_vhsubu, arm_neon_vhsubs, Add1ArgType | UnsignedAlts),
6400  NEONMAP0(vld1_dup_v),
6401  NEONMAP1(vld1_v, arm_neon_vld1, 0),
6402  NEONMAP1(vld1_x2_v, arm_neon_vld1x2, 0),
6403  NEONMAP1(vld1_x3_v, arm_neon_vld1x3, 0),
6404  NEONMAP1(vld1_x4_v, arm_neon_vld1x4, 0),
6405  NEONMAP0(vld1q_dup_v),
6406  NEONMAP1(vld1q_v, arm_neon_vld1, 0),
6407  NEONMAP1(vld1q_x2_v, arm_neon_vld1x2, 0),
6408  NEONMAP1(vld1q_x3_v, arm_neon_vld1x3, 0),
6409  NEONMAP1(vld1q_x4_v, arm_neon_vld1x4, 0),
6410  NEONMAP1(vld2_dup_v, arm_neon_vld2dup, 0),
6411  NEONMAP1(vld2_lane_v, arm_neon_vld2lane, 0),
6412  NEONMAP1(vld2_v, arm_neon_vld2, 0),
6413  NEONMAP1(vld2q_dup_v, arm_neon_vld2dup, 0),
6414  NEONMAP1(vld2q_lane_v, arm_neon_vld2lane, 0),
6415  NEONMAP1(vld2q_v, arm_neon_vld2, 0),
6416  NEONMAP1(vld3_dup_v, arm_neon_vld3dup, 0),
6417  NEONMAP1(vld3_lane_v, arm_neon_vld3lane, 0),
6418  NEONMAP1(vld3_v, arm_neon_vld3, 0),
6419  NEONMAP1(vld3q_dup_v, arm_neon_vld3dup, 0),
6420  NEONMAP1(vld3q_lane_v, arm_neon_vld3lane, 0),
6421  NEONMAP1(vld3q_v, arm_neon_vld3, 0),
6422  NEONMAP1(vld4_dup_v, arm_neon_vld4dup, 0),
6423  NEONMAP1(vld4_lane_v, arm_neon_vld4lane, 0),
6424  NEONMAP1(vld4_v, arm_neon_vld4, 0),
6425  NEONMAP1(vld4q_dup_v, arm_neon_vld4dup, 0),
6426  NEONMAP1(vld4q_lane_v, arm_neon_vld4lane, 0),
6427  NEONMAP1(vld4q_v, arm_neon_vld4, 0),
6428  NEONMAP2(vmax_v, arm_neon_vmaxu, arm_neon_vmaxs, Add1ArgType | UnsignedAlts),
6429  NEONMAP1(vmaxnm_v, arm_neon_vmaxnm, Add1ArgType),
6430  NEONMAP1(vmaxnmq_v, arm_neon_vmaxnm, Add1ArgType),
6431  NEONMAP2(vmaxq_v, arm_neon_vmaxu, arm_neon_vmaxs, Add1ArgType | UnsignedAlts),
6432  NEONMAP2(vmin_v, arm_neon_vminu, arm_neon_vmins, Add1ArgType | UnsignedAlts),
6433  NEONMAP1(vminnm_v, arm_neon_vminnm, Add1ArgType),
6434  NEONMAP1(vminnmq_v, arm_neon_vminnm, Add1ArgType),
6435  NEONMAP2(vminq_v, arm_neon_vminu, arm_neon_vmins, Add1ArgType | UnsignedAlts),
6436  NEONMAP1(vmmlaq_s32, arm_neon_smmla, 0),
6437  NEONMAP1(vmmlaq_u32, arm_neon_ummla, 0),
6438  NEONMAP0(vmovl_v),
6439  NEONMAP0(vmovn_v),
6440  NEONMAP1(vmul_v, arm_neon_vmulp, Add1ArgType),
6441  NEONMAP0(vmull_v),
6442  NEONMAP1(vmulq_v, arm_neon_vmulp, Add1ArgType),
6443  NEONMAP2(vpadal_v, arm_neon_vpadalu, arm_neon_vpadals, UnsignedAlts),
6444  NEONMAP2(vpadalq_v, arm_neon_vpadalu, arm_neon_vpadals, UnsignedAlts),
6445  NEONMAP1(vpadd_v, arm_neon_vpadd, Add1ArgType),
6446  NEONMAP2(vpaddl_v, arm_neon_vpaddlu, arm_neon_vpaddls, UnsignedAlts),
6447  NEONMAP2(vpaddlq_v, arm_neon_vpaddlu, arm_neon_vpaddls, UnsignedAlts),
6448  NEONMAP1(vpaddq_v, arm_neon_vpadd, Add1ArgType),
6449  NEONMAP2(vpmax_v, arm_neon_vpmaxu, arm_neon_vpmaxs, Add1ArgType | UnsignedAlts),
6450  NEONMAP2(vpmin_v, arm_neon_vpminu, arm_neon_vpmins, Add1ArgType | UnsignedAlts),
6451  NEONMAP1(vqabs_v, arm_neon_vqabs, Add1ArgType),
6452  NEONMAP1(vqabsq_v, arm_neon_vqabs, Add1ArgType),
6453  NEONMAP2(vqadd_v, uadd_sat, sadd_sat, Add1ArgType | UnsignedAlts),
6454  NEONMAP2(vqaddq_v, uadd_sat, sadd_sat, Add1ArgType | UnsignedAlts),
6455  NEONMAP2(vqdmlal_v, arm_neon_vqdmull, sadd_sat, 0),
6456  NEONMAP2(vqdmlsl_v, arm_neon_vqdmull, ssub_sat, 0),
6457  NEONMAP1(vqdmulh_v, arm_neon_vqdmulh, Add1ArgType),
6458  NEONMAP1(vqdmulhq_v, arm_neon_vqdmulh, Add1ArgType),
6459  NEONMAP1(vqdmull_v, arm_neon_vqdmull, Add1ArgType),
6460  NEONMAP2(vqmovn_v, arm_neon_vqmovnu, arm_neon_vqmovns, Add1ArgType | UnsignedAlts),
6461  NEONMAP1(vqmovun_v, arm_neon_vqmovnsu, Add1ArgType),
6462  NEONMAP1(vqneg_v, arm_neon_vqneg, Add1ArgType),
6463  NEONMAP1(vqnegq_v, arm_neon_vqneg, Add1ArgType),
6464  NEONMAP1(vqrdmlah_s16, arm_neon_vqrdmlah, Add1ArgType),
6465  NEONMAP1(vqrdmlah_s32, arm_neon_vqrdmlah, Add1ArgType),
6466  NEONMAP1(vqrdmlahq_s16, arm_neon_vqrdmlah, Add1ArgType),
6467  NEONMAP1(vqrdmlahq_s32, arm_neon_vqrdmlah, Add1ArgType),
6468  NEONMAP1(vqrdmlsh_s16, arm_neon_vqrdmlsh, Add1ArgType),
6469  NEONMAP1(vqrdmlsh_s32, arm_neon_vqrdmlsh, Add1ArgType),
6470  NEONMAP1(vqrdmlshq_s16, arm_neon_vqrdmlsh, Add1ArgType),
6471  NEONMAP1(vqrdmlshq_s32, arm_neon_vqrdmlsh, Add1ArgType),
6472  NEONMAP1(vqrdmulh_v, arm_neon_vqrdmulh, Add1ArgType),
6473  NEONMAP1(vqrdmulhq_v, arm_neon_vqrdmulh, Add1ArgType),
6474  NEONMAP2(vqrshl_v, arm_neon_vqrshiftu, arm_neon_vqrshifts, Add1ArgType | UnsignedAlts),
6475  NEONMAP2(vqrshlq_v, arm_neon_vqrshiftu, arm_neon_vqrshifts, Add1ArgType | UnsignedAlts),
6476  NEONMAP2(vqshl_n_v, arm_neon_vqshiftu, arm_neon_vqshifts, UnsignedAlts),
6477  NEONMAP2(vqshl_v, arm_neon_vqshiftu, arm_neon_vqshifts, Add1ArgType | UnsignedAlts),
6478  NEONMAP2(vqshlq_n_v, arm_neon_vqshiftu, arm_neon_vqshifts, UnsignedAlts),
6479  NEONMAP2(vqshlq_v, arm_neon_vqshiftu, arm_neon_vqshifts, Add1ArgType | UnsignedAlts),
6480  NEONMAP1(vqshlu_n_v, arm_neon_vqshiftsu, 0),
6481  NEONMAP1(vqshluq_n_v, arm_neon_vqshiftsu, 0),
6482  NEONMAP2(vqsub_v, usub_sat, ssub_sat, Add1ArgType | UnsignedAlts),
6483  NEONMAP2(vqsubq_v, usub_sat, ssub_sat, Add1ArgType | UnsignedAlts),
6484  NEONMAP1(vraddhn_v, arm_neon_vraddhn, Add1ArgType),
6485  NEONMAP2(vrecpe_v, arm_neon_vrecpe, arm_neon_vrecpe, 0),
6486  NEONMAP2(vrecpeq_v, arm_neon_vrecpe, arm_neon_vrecpe, 0),
6487  NEONMAP1(vrecps_v, arm_neon_vrecps, Add1ArgType),
6488  NEONMAP1(vrecpsq_v, arm_neon_vrecps, Add1ArgType),
6489  NEONMAP2(vrhadd_v, arm_neon_vrhaddu, arm_neon_vrhadds, Add1ArgType | UnsignedAlts),
6490  NEONMAP2(vrhaddq_v, arm_neon_vrhaddu, arm_neon_vrhadds, Add1ArgType | UnsignedAlts),
6491  NEONMAP1(vrnd_v, arm_neon_vrintz, Add1ArgType),
6492  NEONMAP1(vrnda_v, arm_neon_vrinta, Add1ArgType),
6493  NEONMAP1(vrndaq_v, arm_neon_vrinta, Add1ArgType),
6494  NEONMAP0(vrndi_v),
6495  NEONMAP0(vrndiq_v),
6496  NEONMAP1(vrndm_v, arm_neon_vrintm, Add1ArgType),
6497  NEONMAP1(vrndmq_v, arm_neon_vrintm, Add1ArgType),
6498  NEONMAP1(vrndn_v, arm_neon_vrintn, Add1ArgType),
6499  NEONMAP1(vrndnq_v, arm_neon_vrintn, Add1ArgType),
6500  NEONMAP1(vrndp_v, arm_neon_vrintp, Add1ArgType),
6501  NEONMAP1(vrndpq_v, arm_neon_vrintp, Add1ArgType),
6502  NEONMAP1(vrndq_v, arm_neon_vrintz, Add1ArgType),
6503  NEONMAP1(vrndx_v, arm_neon_vrintx, Add1ArgType),
6504  NEONMAP1(vrndxq_v, arm_neon_vrintx, Add1ArgType),
6505  NEONMAP2(vrshl_v, arm_neon_vrshiftu, arm_neon_vrshifts, Add1ArgType | UnsignedAlts),
6506  NEONMAP2(vrshlq_v, arm_neon_vrshiftu, arm_neon_vrshifts, Add1ArgType | UnsignedAlts),
6507  NEONMAP2(vrshr_n_v, arm_neon_vrshiftu, arm_neon_vrshifts, UnsignedAlts),
6508  NEONMAP2(vrshrq_n_v, arm_neon_vrshiftu, arm_neon_vrshifts, UnsignedAlts),
6509  NEONMAP2(vrsqrte_v, arm_neon_vrsqrte, arm_neon_vrsqrte, 0),
6510  NEONMAP2(vrsqrteq_v, arm_neon_vrsqrte, arm_neon_vrsqrte, 0),
6511  NEONMAP1(vrsqrts_v, arm_neon_vrsqrts, Add1ArgType),
6512  NEONMAP1(vrsqrtsq_v, arm_neon_vrsqrts, Add1ArgType),
6513  NEONMAP1(vrsubhn_v, arm_neon_vrsubhn, Add1ArgType),
6514  NEONMAP1(vsha1su0q_u32, arm_neon_sha1su0, 0),
6515  NEONMAP1(vsha1su1q_u32, arm_neon_sha1su1, 0),
6516  NEONMAP1(vsha256h2q_u32, arm_neon_sha256h2, 0),
6517  NEONMAP1(vsha256hq_u32, arm_neon_sha256h, 0),
6518  NEONMAP1(vsha256su0q_u32, arm_neon_sha256su0, 0),
6519  NEONMAP1(vsha256su1q_u32, arm_neon_sha256su1, 0),
6520  NEONMAP0(vshl_n_v),
6521  NEONMAP2(vshl_v, arm_neon_vshiftu, arm_neon_vshifts, Add1ArgType | UnsignedAlts),
6522  NEONMAP0(vshll_n_v),
6523  NEONMAP0(vshlq_n_v),
6524  NEONMAP2(vshlq_v, arm_neon_vshiftu, arm_neon_vshifts, Add1ArgType | UnsignedAlts),
6525  NEONMAP0(vshr_n_v),
6526  NEONMAP0(vshrn_n_v),
6527  NEONMAP0(vshrq_n_v),
6528  NEONMAP1(vst1_v, arm_neon_vst1, 0),
6529  NEONMAP1(vst1_x2_v, arm_neon_vst1x2, 0),
6530  NEONMAP1(vst1_x3_v, arm_neon_vst1x3, 0),
6531  NEONMAP1(vst1_x4_v, arm_neon_vst1x4, 0),
6532  NEONMAP1(vst1q_v, arm_neon_vst1, 0),
6533  NEONMAP1(vst1q_x2_v, arm_neon_vst1x2, 0),
6534  NEONMAP1(vst1q_x3_v, arm_neon_vst1x3, 0),
6535  NEONMAP1(vst1q_x4_v, arm_neon_vst1x4, 0),
6536  NEONMAP1(vst2_lane_v, arm_neon_vst2lane, 0),
6537  NEONMAP1(vst2_v, arm_neon_vst2, 0),
6538  NEONMAP1(vst2q_lane_v, arm_neon_vst2lane, 0),
6539  NEONMAP1(vst2q_v, arm_neon_vst2, 0),
6540  NEONMAP1(vst3_lane_v, arm_neon_vst3lane, 0),
6541  NEONMAP1(vst3_v, arm_neon_vst3, 0),
6542  NEONMAP1(vst3q_lane_v, arm_neon_vst3lane, 0),
6543  NEONMAP1(vst3q_v, arm_neon_vst3, 0),
6544  NEONMAP1(vst4_lane_v, arm_neon_vst4lane, 0),
6545  NEONMAP1(vst4_v, arm_neon_vst4, 0),
6546  NEONMAP1(vst4q_lane_v, arm_neon_vst4lane, 0),
6547  NEONMAP1(vst4q_v, arm_neon_vst4, 0),
6548  NEONMAP0(vsubhn_v),
6549  NEONMAP0(vtrn_v),
6550  NEONMAP0(vtrnq_v),
6551  NEONMAP0(vtst_v),
6552  NEONMAP0(vtstq_v),
6553  NEONMAP1(vusdot_s32, arm_neon_usdot, 0),
6554  NEONMAP1(vusdotq_s32, arm_neon_usdot, 0),
6555  NEONMAP1(vusmmlaq_s32, arm_neon_usmmla, 0),
6556  NEONMAP0(vuzp_v),
6557  NEONMAP0(vuzpq_v),
6558  NEONMAP0(vzip_v),
6559  NEONMAP0(vzipq_v)
6560};
6561
6562static const ARMVectorIntrinsicInfo AArch64SIMDIntrinsicMap[] = {
6563  NEONMAP1(__a64_vcvtq_low_bf16_f32, aarch64_neon_bfcvtn, 0),
6564  NEONMAP0(splat_lane_v),
6565  NEONMAP0(splat_laneq_v),
6566  NEONMAP0(splatq_lane_v),
6567  NEONMAP0(splatq_laneq_v),
6568  NEONMAP1(vabs_v, aarch64_neon_abs, 0),
6569  NEONMAP1(vabsq_v, aarch64_neon_abs, 0),
6570  NEONMAP0(vadd_v),
6571  NEONMAP0(vaddhn_v),
6572  NEONMAP0(vaddq_p128),
6573  NEONMAP0(vaddq_v),
6574  NEONMAP1(vaesdq_u8, aarch64_crypto_aesd, 0),
6575  NEONMAP1(vaeseq_u8, aarch64_crypto_aese, 0),
6576  NEONMAP1(vaesimcq_u8, aarch64_crypto_aesimc, 0),
6577  NEONMAP1(vaesmcq_u8, aarch64_crypto_aesmc, 0),
6578  NEONMAP2(vbcaxq_s16, aarch64_crypto_bcaxu, aarch64_crypto_bcaxs, Add1ArgType | UnsignedAlts),
6579  NEONMAP2(vbcaxq_s32, aarch64_crypto_bcaxu, aarch64_crypto_bcaxs, Add1ArgType | UnsignedAlts),
6580  NEONMAP2(vbcaxq_s64, aarch64_crypto_bcaxu, aarch64_crypto_bcaxs, Add1ArgType | UnsignedAlts),
6581  NEONMAP2(vbcaxq_s8, aarch64_crypto_bcaxu, aarch64_crypto_bcaxs, Add1ArgType | UnsignedAlts),
6582  NEONMAP2(vbcaxq_u16, aarch64_crypto_bcaxu, aarch64_crypto_bcaxs, Add1ArgType | UnsignedAlts),
6583  NEONMAP2(vbcaxq_u32, aarch64_crypto_bcaxu, aarch64_crypto_bcaxs, Add1ArgType | UnsignedAlts),
6584  NEONMAP2(vbcaxq_u64, aarch64_crypto_bcaxu, aarch64_crypto_bcaxs, Add1ArgType | UnsignedAlts),
6585  NEONMAP2(vbcaxq_u8, aarch64_crypto_bcaxu, aarch64_crypto_bcaxs, Add1ArgType | UnsignedAlts),
6586  NEONMAP1(vbfdot_f32, aarch64_neon_bfdot, 0),
6587  NEONMAP1(vbfdotq_f32, aarch64_neon_bfdot, 0),
6588  NEONMAP1(vbfmlalbq_f32, aarch64_neon_bfmlalb, 0),
6589  NEONMAP1(vbfmlaltq_f32, aarch64_neon_bfmlalt, 0),
6590  NEONMAP1(vbfmmlaq_f32, aarch64_neon_bfmmla, 0),
6591  NEONMAP1(vcadd_rot270_f16, aarch64_neon_vcadd_rot270, Add1ArgType),
6592  NEONMAP1(vcadd_rot270_f32, aarch64_neon_vcadd_rot270, Add1ArgType),
6593  NEONMAP1(vcadd_rot90_f16, aarch64_neon_vcadd_rot90, Add1ArgType),
6594  NEONMAP1(vcadd_rot90_f32, aarch64_neon_vcadd_rot90, Add1ArgType),
6595  NEONMAP1(vcaddq_rot270_f16, aarch64_neon_vcadd_rot270, Add1ArgType),
6596  NEONMAP1(vcaddq_rot270_f32, aarch64_neon_vcadd_rot270, Add1ArgType),
6597  NEONMAP1(vcaddq_rot270_f64, aarch64_neon_vcadd_rot270, Add1ArgType),
6598  NEONMAP1(vcaddq_rot90_f16, aarch64_neon_vcadd_rot90, Add1ArgType),
6599  NEONMAP1(vcaddq_rot90_f32, aarch64_neon_vcadd_rot90, Add1ArgType),
6600  NEONMAP1(vcaddq_rot90_f64, aarch64_neon_vcadd_rot90, Add1ArgType),
6601  NEONMAP1(vcage_v, aarch64_neon_facge, 0),
6602  NEONMAP1(vcageq_v, aarch64_neon_facge, 0),
6603  NEONMAP1(vcagt_v, aarch64_neon_facgt, 0),
6604  NEONMAP1(vcagtq_v, aarch64_neon_facgt, 0),
6605  NEONMAP1(vcale_v, aarch64_neon_facge, 0),
6606  NEONMAP1(vcaleq_v, aarch64_neon_facge, 0),
6607  NEONMAP1(vcalt_v, aarch64_neon_facgt, 0),
6608  NEONMAP1(vcaltq_v, aarch64_neon_facgt, 0),
6609  NEONMAP0(vceqz_v),
6610  NEONMAP0(vceqzq_v),
6611  NEONMAP0(vcgez_v),
6612  NEONMAP0(vcgezq_v),
6613  NEONMAP0(vcgtz_v),
6614  NEONMAP0(vcgtzq_v),
6615  NEONMAP0(vclez_v),
6616  NEONMAP0(vclezq_v),
6617  NEONMAP1(vcls_v, aarch64_neon_cls, Add1ArgType),
6618  NEONMAP1(vclsq_v, aarch64_neon_cls, Add1ArgType),
6619  NEONMAP0(vcltz_v),
6620  NEONMAP0(vcltzq_v),
6621  NEONMAP1(vclz_v, ctlz, Add1ArgType),
6622  NEONMAP1(vclzq_v, ctlz, Add1ArgType),
6623  NEONMAP1(vcmla_f16, aarch64_neon_vcmla_rot0, Add1ArgType),
6624  NEONMAP1(vcmla_f32, aarch64_neon_vcmla_rot0, Add1ArgType),
6625  NEONMAP1(vcmla_rot180_f16, aarch64_neon_vcmla_rot180, Add1ArgType),
6626  NEONMAP1(vcmla_rot180_f32, aarch64_neon_vcmla_rot180, Add1ArgType),
6627  NEONMAP1(vcmla_rot270_f16, aarch64_neon_vcmla_rot270, Add1ArgType),
6628  NEONMAP1(vcmla_rot270_f32, aarch64_neon_vcmla_rot270, Add1ArgType),
6629  NEONMAP1(vcmla_rot90_f16, aarch64_neon_vcmla_rot90, Add1ArgType),
6630  NEONMAP1(vcmla_rot90_f32, aarch64_neon_vcmla_rot90, Add1ArgType),
6631  NEONMAP1(vcmlaq_f16, aarch64_neon_vcmla_rot0, Add1ArgType),
6632  NEONMAP1(vcmlaq_f32, aarch64_neon_vcmla_rot0, Add1ArgType),
6633  NEONMAP1(vcmlaq_f64, aarch64_neon_vcmla_rot0, Add1ArgType),
6634  NEONMAP1(vcmlaq_rot180_f16, aarch64_neon_vcmla_rot180, Add1ArgType),
6635  NEONMAP1(vcmlaq_rot180_f32, aarch64_neon_vcmla_rot180, Add1ArgType),
6636  NEONMAP1(vcmlaq_rot180_f64, aarch64_neon_vcmla_rot180, Add1ArgType),
6637  NEONMAP1(vcmlaq_rot270_f16, aarch64_neon_vcmla_rot270, Add1ArgType),
6638  NEONMAP1(vcmlaq_rot270_f32, aarch64_neon_vcmla_rot270, Add1ArgType),
6639  NEONMAP1(vcmlaq_rot270_f64, aarch64_neon_vcmla_rot270, Add1ArgType),
6640  NEONMAP1(vcmlaq_rot90_f16, aarch64_neon_vcmla_rot90, Add1ArgType),
6641  NEONMAP1(vcmlaq_rot90_f32, aarch64_neon_vcmla_rot90, Add1ArgType),
6642  NEONMAP1(vcmlaq_rot90_f64, aarch64_neon_vcmla_rot90, Add1ArgType),
6643  NEONMAP1(vcnt_v, ctpop, Add1ArgType),
6644  NEONMAP1(vcntq_v, ctpop, Add1ArgType),
6645  NEONMAP1(vcvt_f16_f32, aarch64_neon_vcvtfp2hf, 0),
6646  NEONMAP0(vcvt_f16_s16),
6647  NEONMAP0(vcvt_f16_u16),
6648  NEONMAP1(vcvt_f32_f16, aarch64_neon_vcvthf2fp, 0),
6649  NEONMAP0(vcvt_f32_v),
6650  NEONMAP1(vcvt_n_f16_s16, aarch64_neon_vcvtfxs2fp, 0),
6651  NEONMAP1(vcvt_n_f16_u16, aarch64_neon_vcvtfxu2fp, 0),
6652  NEONMAP2(vcvt_n_f32_v, aarch64_neon_vcvtfxu2fp, aarch64_neon_vcvtfxs2fp, 0),
6653  NEONMAP2(vcvt_n_f64_v, aarch64_neon_vcvtfxu2fp, aarch64_neon_vcvtfxs2fp, 0),
6654  NEONMAP1(vcvt_n_s16_f16, aarch64_neon_vcvtfp2fxs, 0),
6655  NEONMAP1(vcvt_n_s32_v, aarch64_neon_vcvtfp2fxs, 0),
6656  NEONMAP1(vcvt_n_s64_v, aarch64_neon_vcvtfp2fxs, 0),
6657  NEONMAP1(vcvt_n_u16_f16, aarch64_neon_vcvtfp2fxu, 0),
6658  NEONMAP1(vcvt_n_u32_v, aarch64_neon_vcvtfp2fxu, 0),
6659  NEONMAP1(vcvt_n_u64_v, aarch64_neon_vcvtfp2fxu, 0),
6660  NEONMAP0(vcvtq_f16_s16),
6661  NEONMAP0(vcvtq_f16_u16),
6662  NEONMAP0(vcvtq_f32_v),
6663  NEONMAP1(vcvtq_high_bf16_f32, aarch64_neon_bfcvtn2, 0),
6664  NEONMAP1(vcvtq_n_f16_s16, aarch64_neon_vcvtfxs2fp, 0),
6665  NEONMAP1(vcvtq_n_f16_u16, aarch64_neon_vcvtfxu2fp, 0),
6666  NEONMAP2(vcvtq_n_f32_v, aarch64_neon_vcvtfxu2fp, aarch64_neon_vcvtfxs2fp, 0),
6667  NEONMAP2(vcvtq_n_f64_v, aarch64_neon_vcvtfxu2fp, aarch64_neon_vcvtfxs2fp, 0),
6668  NEONMAP1(vcvtq_n_s16_f16, aarch64_neon_vcvtfp2fxs, 0),
6669  NEONMAP1(vcvtq_n_s32_v, aarch64_neon_vcvtfp2fxs, 0),
6670  NEONMAP1(vcvtq_n_s64_v, aarch64_neon_vcvtfp2fxs, 0),
6671  NEONMAP1(vcvtq_n_u16_f16, aarch64_neon_vcvtfp2fxu, 0),
6672  NEONMAP1(vcvtq_n_u32_v, aarch64_neon_vcvtfp2fxu, 0),
6673  NEONMAP1(vcvtq_n_u64_v, aarch64_neon_vcvtfp2fxu, 0),
6674  NEONMAP1(vcvtx_f32_v, aarch64_neon_fcvtxn, AddRetType | Add1ArgType),
6675  NEONMAP1(vdot_s32, aarch64_neon_sdot, 0),
6676  NEONMAP1(vdot_u32, aarch64_neon_udot, 0),
6677  NEONMAP1(vdotq_s32, aarch64_neon_sdot, 0),
6678  NEONMAP1(vdotq_u32, aarch64_neon_udot, 0),
6679  NEONMAP2(veor3q_s16, aarch64_crypto_eor3u, aarch64_crypto_eor3s, Add1ArgType | UnsignedAlts),
6680  NEONMAP2(veor3q_s32, aarch64_crypto_eor3u, aarch64_crypto_eor3s, Add1ArgType | UnsignedAlts),
6681  NEONMAP2(veor3q_s64, aarch64_crypto_eor3u, aarch64_crypto_eor3s, Add1ArgType | UnsignedAlts),
6682  NEONMAP2(veor3q_s8, aarch64_crypto_eor3u, aarch64_crypto_eor3s, Add1ArgType | UnsignedAlts),
6683  NEONMAP2(veor3q_u16, aarch64_crypto_eor3u, aarch64_crypto_eor3s, Add1ArgType | UnsignedAlts),
6684  NEONMAP2(veor3q_u32, aarch64_crypto_eor3u, aarch64_crypto_eor3s, Add1ArgType | UnsignedAlts),
6685  NEONMAP2(veor3q_u64, aarch64_crypto_eor3u, aarch64_crypto_eor3s, Add1ArgType | UnsignedAlts),
6686  NEONMAP2(veor3q_u8, aarch64_crypto_eor3u, aarch64_crypto_eor3s, Add1ArgType | UnsignedAlts),
6687  NEONMAP0(vext_v),
6688  NEONMAP0(vextq_v),
6689  NEONMAP0(vfma_v),
6690  NEONMAP0(vfmaq_v),
6691  NEONMAP1(vfmlal_high_f16, aarch64_neon_fmlal2, 0),
6692  NEONMAP1(vfmlal_low_f16, aarch64_neon_fmlal, 0),
6693  NEONMAP1(vfmlalq_high_f16, aarch64_neon_fmlal2, 0),
6694  NEONMAP1(vfmlalq_low_f16, aarch64_neon_fmlal, 0),
6695  NEONMAP1(vfmlsl_high_f16, aarch64_neon_fmlsl2, 0),
6696  NEONMAP1(vfmlsl_low_f16, aarch64_neon_fmlsl, 0),
6697  NEONMAP1(vfmlslq_high_f16, aarch64_neon_fmlsl2, 0),
6698  NEONMAP1(vfmlslq_low_f16, aarch64_neon_fmlsl, 0),
6699  NEONMAP2(vhadd_v, aarch64_neon_uhadd, aarch64_neon_shadd, Add1ArgType | UnsignedAlts),
6700  NEONMAP2(vhaddq_v, aarch64_neon_uhadd, aarch64_neon_shadd, Add1ArgType | UnsignedAlts),
6701  NEONMAP2(vhsub_v, aarch64_neon_uhsub, aarch64_neon_shsub, Add1ArgType | UnsignedAlts),
6702  NEONMAP2(vhsubq_v, aarch64_neon_uhsub, aarch64_neon_shsub, Add1ArgType | UnsignedAlts),
6703  NEONMAP1(vld1_x2_v, aarch64_neon_ld1x2, 0),
6704  NEONMAP1(vld1_x3_v, aarch64_neon_ld1x3, 0),
6705  NEONMAP1(vld1_x4_v, aarch64_neon_ld1x4, 0),
6706  NEONMAP1(vld1q_x2_v, aarch64_neon_ld1x2, 0),
6707  NEONMAP1(vld1q_x3_v, aarch64_neon_ld1x3, 0),
6708  NEONMAP1(vld1q_x4_v, aarch64_neon_ld1x4, 0),
6709  NEONMAP1(vmmlaq_s32, aarch64_neon_smmla, 0),
6710  NEONMAP1(vmmlaq_u32, aarch64_neon_ummla, 0),
6711  NEONMAP0(vmovl_v),
6712  NEONMAP0(vmovn_v),
6713  NEONMAP1(vmul_v, aarch64_neon_pmul, Add1ArgType),
6714  NEONMAP1(vmulq_v, aarch64_neon_pmul, Add1ArgType),
6715  NEONMAP1(vpadd_v, aarch64_neon_addp, Add1ArgType),
6716  NEONMAP2(vpaddl_v, aarch64_neon_uaddlp, aarch64_neon_saddlp, UnsignedAlts),
6717  NEONMAP2(vpaddlq_v, aarch64_neon_uaddlp, aarch64_neon_saddlp, UnsignedAlts),
6718  NEONMAP1(vpaddq_v, aarch64_neon_addp, Add1ArgType),
6719  NEONMAP1(vqabs_v, aarch64_neon_sqabs, Add1ArgType),
6720  NEONMAP1(vqabsq_v, aarch64_neon_sqabs, Add1ArgType),
6721  NEONMAP2(vqadd_v, aarch64_neon_uqadd, aarch64_neon_sqadd, Add1ArgType | UnsignedAlts),
6722  NEONMAP2(vqaddq_v, aarch64_neon_uqadd, aarch64_neon_sqadd, Add1ArgType | UnsignedAlts),
6723  NEONMAP2(vqdmlal_v, aarch64_neon_sqdmull, aarch64_neon_sqadd, 0),
6724  NEONMAP2(vqdmlsl_v, aarch64_neon_sqdmull, aarch64_neon_sqsub, 0),
6725  NEONMAP1(vqdmulh_lane_v, aarch64_neon_sqdmulh_lane, 0),
6726  NEONMAP1(vqdmulh_laneq_v, aarch64_neon_sqdmulh_laneq, 0),
6727  NEONMAP1(vqdmulh_v, aarch64_neon_sqdmulh, Add1ArgType),
6728  NEONMAP1(vqdmulhq_lane_v, aarch64_neon_sqdmulh_lane, 0),
6729  NEONMAP1(vqdmulhq_laneq_v, aarch64_neon_sqdmulh_laneq, 0),
6730  NEONMAP1(vqdmulhq_v, aarch64_neon_sqdmulh, Add1ArgType),
6731  NEONMAP1(vqdmull_v, aarch64_neon_sqdmull, Add1ArgType),
6732  NEONMAP2(vqmovn_v, aarch64_neon_uqxtn, aarch64_neon_sqxtn, Add1ArgType | UnsignedAlts),
6733  NEONMAP1(vqmovun_v, aarch64_neon_sqxtun, Add1ArgType),
6734  NEONMAP1(vqneg_v, aarch64_neon_sqneg, Add1ArgType),
6735  NEONMAP1(vqnegq_v, aarch64_neon_sqneg, Add1ArgType),
6736  NEONMAP1(vqrdmlah_s16, aarch64_neon_sqrdmlah, Add1ArgType),
6737  NEONMAP1(vqrdmlah_s32, aarch64_neon_sqrdmlah, Add1ArgType),
6738  NEONMAP1(vqrdmlahq_s16, aarch64_neon_sqrdmlah, Add1ArgType),
6739  NEONMAP1(vqrdmlahq_s32, aarch64_neon_sqrdmlah, Add1ArgType),
6740  NEONMAP1(vqrdmlsh_s16, aarch64_neon_sqrdmlsh, Add1ArgType),
6741  NEONMAP1(vqrdmlsh_s32, aarch64_neon_sqrdmlsh, Add1ArgType),
6742  NEONMAP1(vqrdmlshq_s16, aarch64_neon_sqrdmlsh, Add1ArgType),
6743  NEONMAP1(vqrdmlshq_s32, aarch64_neon_sqrdmlsh, Add1ArgType),
6744  NEONMAP1(vqrdmulh_lane_v, aarch64_neon_sqrdmulh_lane, 0),
6745  NEONMAP1(vqrdmulh_laneq_v, aarch64_neon_sqrdmulh_laneq, 0),
6746  NEONMAP1(vqrdmulh_v, aarch64_neon_sqrdmulh, Add1ArgType),
6747  NEONMAP1(vqrdmulhq_lane_v, aarch64_neon_sqrdmulh_lane, 0),
6748  NEONMAP1(vqrdmulhq_laneq_v, aarch64_neon_sqrdmulh_laneq, 0),
6749  NEONMAP1(vqrdmulhq_v, aarch64_neon_sqrdmulh, Add1ArgType),
6750  NEONMAP2(vqrshl_v, aarch64_neon_uqrshl, aarch64_neon_sqrshl, Add1ArgType | UnsignedAlts),
6751  NEONMAP2(vqrshlq_v, aarch64_neon_uqrshl, aarch64_neon_sqrshl, Add1ArgType | UnsignedAlts),
6752  NEONMAP2(vqshl_n_v, aarch64_neon_uqshl, aarch64_neon_sqshl, UnsignedAlts),
6753  NEONMAP2(vqshl_v, aarch64_neon_uqshl, aarch64_neon_sqshl, Add1ArgType | UnsignedAlts),
6754  NEONMAP2(vqshlq_n_v, aarch64_neon_uqshl, aarch64_neon_sqshl, UnsignedAlts),
6755  NEONMAP2(vqshlq_v, aarch64_neon_uqshl, aarch64_neon_sqshl, Add1ArgType | UnsignedAlts),
6756  NEONMAP1(vqshlu_n_v, aarch64_neon_sqshlu, 0),
6757  NEONMAP1(vqshluq_n_v, aarch64_neon_sqshlu, 0),
6758  NEONMAP2(vqsub_v, aarch64_neon_uqsub, aarch64_neon_sqsub, Add1ArgType | UnsignedAlts),
6759  NEONMAP2(vqsubq_v, aarch64_neon_uqsub, aarch64_neon_sqsub, Add1ArgType | UnsignedAlts),
6760  NEONMAP1(vraddhn_v, aarch64_neon_raddhn, Add1ArgType),
6761  NEONMAP1(vrax1q_u64, aarch64_crypto_rax1, 0),
6762  NEONMAP2(vrecpe_v, aarch64_neon_frecpe, aarch64_neon_urecpe, 0),
6763  NEONMAP2(vrecpeq_v, aarch64_neon_frecpe, aarch64_neon_urecpe, 0),
6764  NEONMAP1(vrecps_v, aarch64_neon_frecps, Add1ArgType),
6765  NEONMAP1(vrecpsq_v, aarch64_neon_frecps, Add1ArgType),
6766  NEONMAP2(vrhadd_v, aarch64_neon_urhadd, aarch64_neon_srhadd, Add1ArgType | UnsignedAlts),
6767  NEONMAP2(vrhaddq_v, aarch64_neon_urhadd, aarch64_neon_srhadd, Add1ArgType | UnsignedAlts),
6768  NEONMAP1(vrnd32x_f32, aarch64_neon_frint32x, Add1ArgType),
6769  NEONMAP1(vrnd32x_f64, aarch64_neon_frint32x, Add1ArgType),
6770  NEONMAP1(vrnd32xq_f32, aarch64_neon_frint32x, Add1ArgType),
6771  NEONMAP1(vrnd32xq_f64, aarch64_neon_frint32x, Add1ArgType),
6772  NEONMAP1(vrnd32z_f32, aarch64_neon_frint32z, Add1ArgType),
6773  NEONMAP1(vrnd32z_f64, aarch64_neon_frint32z, Add1ArgType),
6774  NEONMAP1(vrnd32zq_f32, aarch64_neon_frint32z, Add1ArgType),
6775  NEONMAP1(vrnd32zq_f64, aarch64_neon_frint32z, Add1ArgType),
6776  NEONMAP1(vrnd64x_f32, aarch64_neon_frint64x, Add1ArgType),
6777  NEONMAP1(vrnd64x_f64, aarch64_neon_frint64x, Add1ArgType),
6778  NEONMAP1(vrnd64xq_f32, aarch64_neon_frint64x, Add1ArgType),
6779  NEONMAP1(vrnd64xq_f64, aarch64_neon_frint64x, Add1ArgType),
6780  NEONMAP1(vrnd64z_f32, aarch64_neon_frint64z, Add1ArgType),
6781  NEONMAP1(vrnd64z_f64, aarch64_neon_frint64z, Add1ArgType),
6782  NEONMAP1(vrnd64zq_f32, aarch64_neon_frint64z, Add1ArgType),
6783  NEONMAP1(vrnd64zq_f64, aarch64_neon_frint64z, Add1ArgType),
6784  NEONMAP0(vrndi_v),
6785  NEONMAP0(vrndiq_v),
6786  NEONMAP2(vrshl_v, aarch64_neon_urshl, aarch64_neon_srshl, Add1ArgType | UnsignedAlts),
6787  NEONMAP2(vrshlq_v, aarch64_neon_urshl, aarch64_neon_srshl, Add1ArgType | UnsignedAlts),
6788  NEONMAP2(vrshr_n_v, aarch64_neon_urshl, aarch64_neon_srshl, UnsignedAlts),
6789  NEONMAP2(vrshrq_n_v, aarch64_neon_urshl, aarch64_neon_srshl, UnsignedAlts),
6790  NEONMAP2(vrsqrte_v, aarch64_neon_frsqrte, aarch64_neon_ursqrte, 0),
6791  NEONMAP2(vrsqrteq_v, aarch64_neon_frsqrte, aarch64_neon_ursqrte, 0),
6792  NEONMAP1(vrsqrts_v, aarch64_neon_frsqrts, Add1ArgType),
6793  NEONMAP1(vrsqrtsq_v, aarch64_neon_frsqrts, Add1ArgType),
6794  NEONMAP1(vrsubhn_v, aarch64_neon_rsubhn, Add1ArgType),
6795  NEONMAP1(vsha1su0q_u32, aarch64_crypto_sha1su0, 0),
6796  NEONMAP1(vsha1su1q_u32, aarch64_crypto_sha1su1, 0),
6797  NEONMAP1(vsha256h2q_u32, aarch64_crypto_sha256h2, 0),
6798  NEONMAP1(vsha256hq_u32, aarch64_crypto_sha256h, 0),
6799  NEONMAP1(vsha256su0q_u32, aarch64_crypto_sha256su0, 0),
6800  NEONMAP1(vsha256su1q_u32, aarch64_crypto_sha256su1, 0),
6801  NEONMAP1(vsha512h2q_u64, aarch64_crypto_sha512h2, 0),
6802  NEONMAP1(vsha512hq_u64, aarch64_crypto_sha512h, 0),
6803  NEONMAP1(vsha512su0q_u64, aarch64_crypto_sha512su0, 0),
6804  NEONMAP1(vsha512su1q_u64, aarch64_crypto_sha512su1, 0),
6805  NEONMAP0(vshl_n_v),
6806  NEONMAP2(vshl_v, aarch64_neon_ushl, aarch64_neon_sshl, Add1ArgType | UnsignedAlts),
6807  NEONMAP0(vshll_n_v),
6808  NEONMAP0(vshlq_n_v),
6809  NEONMAP2(vshlq_v, aarch64_neon_ushl, aarch64_neon_sshl, Add1ArgType | UnsignedAlts),
6810  NEONMAP0(vshr_n_v),
6811  NEONMAP0(vshrn_n_v),
6812  NEONMAP0(vshrq_n_v),
6813  NEONMAP1(vsm3partw1q_u32, aarch64_crypto_sm3partw1, 0),
6814  NEONMAP1(vsm3partw2q_u32, aarch64_crypto_sm3partw2, 0),
6815  NEONMAP1(vsm3ss1q_u32, aarch64_crypto_sm3ss1, 0),
6816  NEONMAP1(vsm3tt1aq_u32, aarch64_crypto_sm3tt1a, 0),
6817  NEONMAP1(vsm3tt1bq_u32, aarch64_crypto_sm3tt1b, 0),
6818  NEONMAP1(vsm3tt2aq_u32, aarch64_crypto_sm3tt2a, 0),
6819  NEONMAP1(vsm3tt2bq_u32, aarch64_crypto_sm3tt2b, 0),
6820  NEONMAP1(vsm4ekeyq_u32, aarch64_crypto_sm4ekey, 0),
6821  NEONMAP1(vsm4eq_u32, aarch64_crypto_sm4e, 0),
6822  NEONMAP1(vst1_x2_v, aarch64_neon_st1x2, 0),
6823  NEONMAP1(vst1_x3_v, aarch64_neon_st1x3, 0),
6824  NEONMAP1(vst1_x4_v, aarch64_neon_st1x4, 0),
6825  NEONMAP1(vst1q_x2_v, aarch64_neon_st1x2, 0),
6826  NEONMAP1(vst1q_x3_v, aarch64_neon_st1x3, 0),
6827  NEONMAP1(vst1q_x4_v, aarch64_neon_st1x4, 0),
6828  NEONMAP0(vsubhn_v),
6829  NEONMAP0(vtst_v),
6830  NEONMAP0(vtstq_v),
6831  NEONMAP1(vusdot_s32, aarch64_neon_usdot, 0),
6832  NEONMAP1(vusdotq_s32, aarch64_neon_usdot, 0),
6833  NEONMAP1(vusmmlaq_s32, aarch64_neon_usmmla, 0),
6834  NEONMAP1(vxarq_u64, aarch64_crypto_xar, 0),
6835};
6836
6837static const ARMVectorIntrinsicInfo AArch64SISDIntrinsicMap[] = {
6838  NEONMAP1(vabdd_f64, aarch64_sisd_fabd, Add1ArgType),
6839  NEONMAP1(vabds_f32, aarch64_sisd_fabd, Add1ArgType),
6840  NEONMAP1(vabsd_s64, aarch64_neon_abs, Add1ArgType),
6841  NEONMAP1(vaddlv_s32, aarch64_neon_saddlv, AddRetType | Add1ArgType),
6842  NEONMAP1(vaddlv_u32, aarch64_neon_uaddlv, AddRetType | Add1ArgType),
6843  NEONMAP1(vaddlvq_s32, aarch64_neon_saddlv, AddRetType | Add1ArgType),
6844  NEONMAP1(vaddlvq_u32, aarch64_neon_uaddlv, AddRetType | Add1ArgType),
6845  NEONMAP1(vaddv_f32, aarch64_neon_faddv, AddRetType | Add1ArgType),
6846  NEONMAP1(vaddv_s32, aarch64_neon_saddv, AddRetType | Add1ArgType),
6847  NEONMAP1(vaddv_u32, aarch64_neon_uaddv, AddRetType | Add1ArgType),
6848  NEONMAP1(vaddvq_f32, aarch64_neon_faddv, AddRetType | Add1ArgType),
6849  NEONMAP1(vaddvq_f64, aarch64_neon_faddv, AddRetType | Add1ArgType),
6850  NEONMAP1(vaddvq_s32, aarch64_neon_saddv, AddRetType | Add1ArgType),
6851  NEONMAP1(vaddvq_s64, aarch64_neon_saddv, AddRetType | Add1ArgType),
6852  NEONMAP1(vaddvq_u32, aarch64_neon_uaddv, AddRetType | Add1ArgType),
6853  NEONMAP1(vaddvq_u64, aarch64_neon_uaddv, AddRetType | Add1ArgType),
6854  NEONMAP1(vcaged_f64, aarch64_neon_facge, AddRetType | Add1ArgType),
6855  NEONMAP1(vcages_f32, aarch64_neon_facge, AddRetType | Add1ArgType),
6856  NEONMAP1(vcagtd_f64, aarch64_neon_facgt, AddRetType | Add1ArgType),
6857  NEONMAP1(vcagts_f32, aarch64_neon_facgt, AddRetType | Add1ArgType),
6858  NEONMAP1(vcaled_f64, aarch64_neon_facge, AddRetType | Add1ArgType),
6859  NEONMAP1(vcales_f32, aarch64_neon_facge, AddRetType | Add1ArgType),
6860  NEONMAP1(vcaltd_f64, aarch64_neon_facgt, AddRetType | Add1ArgType),
6861  NEONMAP1(vcalts_f32, aarch64_neon_facgt, AddRetType | Add1ArgType),
6862  NEONMAP1(vcvtad_s64_f64, aarch64_neon_fcvtas, AddRetType | Add1ArgType),
6863  NEONMAP1(vcvtad_u64_f64, aarch64_neon_fcvtau, AddRetType | Add1ArgType),
6864  NEONMAP1(vcvtas_s32_f32, aarch64_neon_fcvtas, AddRetType | Add1ArgType),
6865  NEONMAP1(vcvtas_u32_f32, aarch64_neon_fcvtau, AddRetType | Add1ArgType),
6866  NEONMAP1(vcvtd_n_f64_s64, aarch64_neon_vcvtfxs2fp, AddRetType | Add1ArgType),
6867  NEONMAP1(vcvtd_n_f64_u64, aarch64_neon_vcvtfxu2fp, AddRetType | Add1ArgType),
6868  NEONMAP1(vcvtd_n_s64_f64, aarch64_neon_vcvtfp2fxs, AddRetType | Add1ArgType),
6869  NEONMAP1(vcvtd_n_u64_f64, aarch64_neon_vcvtfp2fxu, AddRetType | Add1ArgType),
6870  NEONMAP1(vcvtd_s64_f64, aarch64_neon_fcvtzs, AddRetType | Add1ArgType),
6871  NEONMAP1(vcvtd_u64_f64, aarch64_neon_fcvtzu, AddRetType | Add1ArgType),
6872  NEONMAP1(vcvth_bf16_f32, aarch64_neon_bfcvt, 0),
6873  NEONMAP1(vcvtmd_s64_f64, aarch64_neon_fcvtms, AddRetType | Add1ArgType),
6874  NEONMAP1(vcvtmd_u64_f64, aarch64_neon_fcvtmu, AddRetType | Add1ArgType),
6875  NEONMAP1(vcvtms_s32_f32, aarch64_neon_fcvtms, AddRetType | Add1ArgType),
6876  NEONMAP1(vcvtms_u32_f32, aarch64_neon_fcvtmu, AddRetType | Add1ArgType),
6877  NEONMAP1(vcvtnd_s64_f64, aarch64_neon_fcvtns, AddRetType | Add1ArgType),
6878  NEONMAP1(vcvtnd_u64_f64, aarch64_neon_fcvtnu, AddRetType | Add1ArgType),
6879  NEONMAP1(vcvtns_s32_f32, aarch64_neon_fcvtns, AddRetType | Add1ArgType),
6880  NEONMAP1(vcvtns_u32_f32, aarch64_neon_fcvtnu, AddRetType | Add1ArgType),
6881  NEONMAP1(vcvtpd_s64_f64, aarch64_neon_fcvtps, AddRetType | Add1ArgType),
6882  NEONMAP1(vcvtpd_u64_f64, aarch64_neon_fcvtpu, AddRetType | Add1ArgType),
6883  NEONMAP1(vcvtps_s32_f32, aarch64_neon_fcvtps, AddRetType | Add1ArgType),
6884  NEONMAP1(vcvtps_u32_f32, aarch64_neon_fcvtpu, AddRetType | Add1ArgType),
6885  NEONMAP1(vcvts_n_f32_s32, aarch64_neon_vcvtfxs2fp, AddRetType | Add1ArgType),
6886  NEONMAP1(vcvts_n_f32_u32, aarch64_neon_vcvtfxu2fp, AddRetType | Add1ArgType),
6887  NEONMAP1(vcvts_n_s32_f32, aarch64_neon_vcvtfp2fxs, AddRetType | Add1ArgType),
6888  NEONMAP1(vcvts_n_u32_f32, aarch64_neon_vcvtfp2fxu, AddRetType | Add1ArgType),
6889  NEONMAP1(vcvts_s32_f32, aarch64_neon_fcvtzs, AddRetType | Add1ArgType),
6890  NEONMAP1(vcvts_u32_f32, aarch64_neon_fcvtzu, AddRetType | Add1ArgType),
6891  NEONMAP1(vcvtxd_f32_f64, aarch64_sisd_fcvtxn, 0),
6892  NEONMAP1(vmaxnmv_f32, aarch64_neon_fmaxnmv, AddRetType | Add1ArgType),
6893  NEONMAP1(vmaxnmvq_f32, aarch64_neon_fmaxnmv, AddRetType | Add1ArgType),
6894  NEONMAP1(vmaxnmvq_f64, aarch64_neon_fmaxnmv, AddRetType | Add1ArgType),
6895  NEONMAP1(vmaxv_f32, aarch64_neon_fmaxv, AddRetType | Add1ArgType),
6896  NEONMAP1(vmaxv_s32, aarch64_neon_smaxv, AddRetType | Add1ArgType),
6897  NEONMAP1(vmaxv_u32, aarch64_neon_umaxv, AddRetType | Add1ArgType),
6898  NEONMAP1(vmaxvq_f32, aarch64_neon_fmaxv, AddRetType | Add1ArgType),
6899  NEONMAP1(vmaxvq_f64, aarch64_neon_fmaxv, AddRetType | Add1ArgType),
6900  NEONMAP1(vmaxvq_s32, aarch64_neon_smaxv, AddRetType | Add1ArgType),
6901  NEONMAP1(vmaxvq_u32, aarch64_neon_umaxv, AddRetType | Add1ArgType),
6902  NEONMAP1(vminnmv_f32, aarch64_neon_fminnmv, AddRetType | Add1ArgType),
6903  NEONMAP1(vminnmvq_f32, aarch64_neon_fminnmv, AddRetType | Add1ArgType),
6904  NEONMAP1(vminnmvq_f64, aarch64_neon_fminnmv, AddRetType | Add1ArgType),
6905  NEONMAP1(vminv_f32, aarch64_neon_fminv, AddRetType | Add1ArgType),
6906  NEONMAP1(vminv_s32, aarch64_neon_sminv, AddRetType | Add1ArgType),
6907  NEONMAP1(vminv_u32, aarch64_neon_uminv, AddRetType | Add1ArgType),
6908  NEONMAP1(vminvq_f32, aarch64_neon_fminv, AddRetType | Add1ArgType),
6909  NEONMAP1(vminvq_f64, aarch64_neon_fminv, AddRetType | Add1ArgType),
6910  NEONMAP1(vminvq_s32, aarch64_neon_sminv, AddRetType | Add1ArgType),
6911  NEONMAP1(vminvq_u32, aarch64_neon_uminv, AddRetType | Add1ArgType),
6912  NEONMAP1(vmull_p64, aarch64_neon_pmull64, 0),
6913  NEONMAP1(vmulxd_f64, aarch64_neon_fmulx, Add1ArgType),
6914  NEONMAP1(vmulxs_f32, aarch64_neon_fmulx, Add1ArgType),
6915  NEONMAP1(vpaddd_s64, aarch64_neon_uaddv, AddRetType | Add1ArgType),
6916  NEONMAP1(vpaddd_u64, aarch64_neon_uaddv, AddRetType | Add1ArgType),
6917  NEONMAP1(vpmaxnmqd_f64, aarch64_neon_fmaxnmv, AddRetType | Add1ArgType),
6918  NEONMAP1(vpmaxnms_f32, aarch64_neon_fmaxnmv, AddRetType | Add1ArgType),
6919  NEONMAP1(vpmaxqd_f64, aarch64_neon_fmaxv, AddRetType | Add1ArgType),
6920  NEONMAP1(vpmaxs_f32, aarch64_neon_fmaxv, AddRetType | Add1ArgType),
6921  NEONMAP1(vpminnmqd_f64, aarch64_neon_fminnmv, AddRetType | Add1ArgType),
6922  NEONMAP1(vpminnms_f32, aarch64_neon_fminnmv, AddRetType | Add1ArgType),
6923  NEONMAP1(vpminqd_f64, aarch64_neon_fminv, AddRetType | Add1ArgType),
6924  NEONMAP1(vpmins_f32, aarch64_neon_fminv, AddRetType | Add1ArgType),
6925  NEONMAP1(vqabsb_s8, aarch64_neon_sqabs, Vectorize1ArgType | Use64BitVectors),
6926  NEONMAP1(vqabsd_s64, aarch64_neon_sqabs, Add1ArgType),
6927  NEONMAP1(vqabsh_s16, aarch64_neon_sqabs, Vectorize1ArgType | Use64BitVectors),
6928  NEONMAP1(vqabss_s32, aarch64_neon_sqabs, Add1ArgType),
6929  NEONMAP1(vqaddb_s8, aarch64_neon_sqadd, Vectorize1ArgType | Use64BitVectors),
6930  NEONMAP1(vqaddb_u8, aarch64_neon_uqadd, Vectorize1ArgType | Use64BitVectors),
6931  NEONMAP1(vqaddd_s64, aarch64_neon_sqadd, Add1ArgType),
6932  NEONMAP1(vqaddd_u64, aarch64_neon_uqadd, Add1ArgType),
6933  NEONMAP1(vqaddh_s16, aarch64_neon_sqadd, Vectorize1ArgType | Use64BitVectors),
6934  NEONMAP1(vqaddh_u16, aarch64_neon_uqadd, Vectorize1ArgType | Use64BitVectors),
6935  NEONMAP1(vqadds_s32, aarch64_neon_sqadd, Add1ArgType),
6936  NEONMAP1(vqadds_u32, aarch64_neon_uqadd, Add1ArgType),
6937  NEONMAP1(vqdmulhh_s16, aarch64_neon_sqdmulh, Vectorize1ArgType | Use64BitVectors),
6938  NEONMAP1(vqdmulhs_s32, aarch64_neon_sqdmulh, Add1ArgType),
6939  NEONMAP1(vqdmullh_s16, aarch64_neon_sqdmull, VectorRet | Use128BitVectors),
6940  NEONMAP1(vqdmulls_s32, aarch64_neon_sqdmulls_scalar, 0),
6941  NEONMAP1(vqmovnd_s64, aarch64_neon_scalar_sqxtn, AddRetType | Add1ArgType),
6942  NEONMAP1(vqmovnd_u64, aarch64_neon_scalar_uqxtn, AddRetType | Add1ArgType),
6943  NEONMAP1(vqmovnh_s16, aarch64_neon_sqxtn, VectorRet | Use64BitVectors),
6944  NEONMAP1(vqmovnh_u16, aarch64_neon_uqxtn, VectorRet | Use64BitVectors),
6945  NEONMAP1(vqmovns_s32, aarch64_neon_sqxtn, VectorRet | Use64BitVectors),
6946  NEONMAP1(vqmovns_u32, aarch64_neon_uqxtn, VectorRet | Use64BitVectors),
6947  NEONMAP1(vqmovund_s64, aarch64_neon_scalar_sqxtun, AddRetType | Add1ArgType),
6948  NEONMAP1(vqmovunh_s16, aarch64_neon_sqxtun, VectorRet | Use64BitVectors),
6949  NEONMAP1(vqmovuns_s32, aarch64_neon_sqxtun, VectorRet | Use64BitVectors),
6950  NEONMAP1(vqnegb_s8, aarch64_neon_sqneg, Vectorize1ArgType | Use64BitVectors),
6951  NEONMAP1(vqnegd_s64, aarch64_neon_sqneg, Add1ArgType),
6952  NEONMAP1(vqnegh_s16, aarch64_neon_sqneg, Vectorize1ArgType | Use64BitVectors),
6953  NEONMAP1(vqnegs_s32, aarch64_neon_sqneg, Add1ArgType),
6954  NEONMAP1(vqrdmlahh_s16, aarch64_neon_sqrdmlah, Vectorize1ArgType | Use64BitVectors),
6955  NEONMAP1(vqrdmlahs_s32, aarch64_neon_sqrdmlah, Add1ArgType),
6956  NEONMAP1(vqrdmlshh_s16, aarch64_neon_sqrdmlsh, Vectorize1ArgType | Use64BitVectors),
6957  NEONMAP1(vqrdmlshs_s32, aarch64_neon_sqrdmlsh, Add1ArgType),
6958  NEONMAP1(vqrdmulhh_s16, aarch64_neon_sqrdmulh, Vectorize1ArgType | Use64BitVectors),
6959  NEONMAP1(vqrdmulhs_s32, aarch64_neon_sqrdmulh, Add1ArgType),
6960  NEONMAP1(vqrshlb_s8, aarch64_neon_sqrshl, Vectorize1ArgType | Use64BitVectors),
6961  NEONMAP1(vqrshlb_u8, aarch64_neon_uqrshl, Vectorize1ArgType | Use64BitVectors),
6962  NEONMAP1(vqrshld_s64, aarch64_neon_sqrshl, Add1ArgType),
6963  NEONMAP1(vqrshld_u64, aarch64_neon_uqrshl, Add1ArgType),
6964  NEONMAP1(vqrshlh_s16, aarch64_neon_sqrshl, Vectorize1ArgType | Use64BitVectors),
6965  NEONMAP1(vqrshlh_u16, aarch64_neon_uqrshl, Vectorize1ArgType | Use64BitVectors),
6966  NEONMAP1(vqrshls_s32, aarch64_neon_sqrshl, Add1ArgType),
6967  NEONMAP1(vqrshls_u32, aarch64_neon_uqrshl, Add1ArgType),
6968  NEONMAP1(vqrshrnd_n_s64, aarch64_neon_sqrshrn, AddRetType),
6969  NEONMAP1(vqrshrnd_n_u64, aarch64_neon_uqrshrn, AddRetType),
6970  NEONMAP1(vqrshrnh_n_s16, aarch64_neon_sqrshrn, VectorRet | Use64BitVectors),
6971  NEONMAP1(vqrshrnh_n_u16, aarch64_neon_uqrshrn, VectorRet | Use64BitVectors),
6972  NEONMAP1(vqrshrns_n_s32, aarch64_neon_sqrshrn, VectorRet | Use64BitVectors),
6973  NEONMAP1(vqrshrns_n_u32, aarch64_neon_uqrshrn, VectorRet | Use64BitVectors),
6974  NEONMAP1(vqrshrund_n_s64, aarch64_neon_sqrshrun, AddRetType),
6975  NEONMAP1(vqrshrunh_n_s16, aarch64_neon_sqrshrun, VectorRet | Use64BitVectors),
6976  NEONMAP1(vqrshruns_n_s32, aarch64_neon_sqrshrun, VectorRet | Use64BitVectors),
6977  NEONMAP1(vqshlb_n_s8, aarch64_neon_sqshl, Vectorize1ArgType | Use64BitVectors),
6978  NEONMAP1(vqshlb_n_u8, aarch64_neon_uqshl, Vectorize1ArgType | Use64BitVectors),
6979  NEONMAP1(vqshlb_s8, aarch64_neon_sqshl, Vectorize1ArgType | Use64BitVectors),
6980  NEONMAP1(vqshlb_u8, aarch64_neon_uqshl, Vectorize1ArgType | Use64BitVectors),
6981  NEONMAP1(vqshld_s64, aarch64_neon_sqshl, Add1ArgType),
6982  NEONMAP1(vqshld_u64, aarch64_neon_uqshl, Add1ArgType),
6983  NEONMAP1(vqshlh_n_s16, aarch64_neon_sqshl, Vectorize1ArgType | Use64BitVectors),
6984  NEONMAP1(vqshlh_n_u16, aarch64_neon_uqshl, Vectorize1ArgType | Use64BitVectors),
6985  NEONMAP1(vqshlh_s16, aarch64_neon_sqshl, Vectorize1ArgType | Use64BitVectors),
6986  NEONMAP1(vqshlh_u16, aarch64_neon_uqshl, Vectorize1ArgType | Use64BitVectors),
6987  NEONMAP1(vqshls_n_s32, aarch64_neon_sqshl, Add1ArgType),
6988  NEONMAP1(vqshls_n_u32, aarch64_neon_uqshl, Add1ArgType),
6989  NEONMAP1(vqshls_s32, aarch64_neon_sqshl, Add1ArgType),
6990  NEONMAP1(vqshls_u32, aarch64_neon_uqshl, Add1ArgType),
6991  NEONMAP1(vqshlub_n_s8, aarch64_neon_sqshlu, Vectorize1ArgType | Use64BitVectors),
6992  NEONMAP1(vqshluh_n_s16, aarch64_neon_sqshlu, Vectorize1ArgType | Use64BitVectors),
6993  NEONMAP1(vqshlus_n_s32, aarch64_neon_sqshlu, Add1ArgType),
6994  NEONMAP1(vqshrnd_n_s64, aarch64_neon_sqshrn, AddRetType),
6995  NEONMAP1(vqshrnd_n_u64, aarch64_neon_uqshrn, AddRetType),
6996  NEONMAP1(vqshrnh_n_s16, aarch64_neon_sqshrn, VectorRet | Use64BitVectors),
6997  NEONMAP1(vqshrnh_n_u16, aarch64_neon_uqshrn, VectorRet | Use64BitVectors),
6998  NEONMAP1(vqshrns_n_s32, aarch64_neon_sqshrn, VectorRet | Use64BitVectors),
6999  NEONMAP1(vqshrns_n_u32, aarch64_neon_uqshrn, VectorRet | Use64BitVectors),
7000  NEONMAP1(vqshrund_n_s64, aarch64_neon_sqshrun, AddRetType),
7001  NEONMAP1(vqshrunh_n_s16, aarch64_neon_sqshrun, VectorRet | Use64BitVectors),
7002  NEONMAP1(vqshruns_n_s32, aarch64_neon_sqshrun, VectorRet | Use64BitVectors),
7003  NEONMAP1(vqsubb_s8, aarch64_neon_sqsub, Vectorize1ArgType | Use64BitVectors),
7004  NEONMAP1(vqsubb_u8, aarch64_neon_uqsub, Vectorize1ArgType | Use64BitVectors),
7005  NEONMAP1(vqsubd_s64, aarch64_neon_sqsub, Add1ArgType),
7006  NEONMAP1(vqsubd_u64, aarch64_neon_uqsub, Add1ArgType),
7007  NEONMAP1(vqsubh_s16, aarch64_neon_sqsub, Vectorize1ArgType | Use64BitVectors),
7008  NEONMAP1(vqsubh_u16, aarch64_neon_uqsub, Vectorize1ArgType | Use64BitVectors),
7009  NEONMAP1(vqsubs_s32, aarch64_neon_sqsub, Add1ArgType),
7010  NEONMAP1(vqsubs_u32, aarch64_neon_uqsub, Add1ArgType),
7011  NEONMAP1(vrecped_f64, aarch64_neon_frecpe, Add1ArgType),
7012  NEONMAP1(vrecpes_f32, aarch64_neon_frecpe, Add1ArgType),
7013  NEONMAP1(vrecpxd_f64, aarch64_neon_frecpx, Add1ArgType),
7014  NEONMAP1(vrecpxs_f32, aarch64_neon_frecpx, Add1ArgType),
7015  NEONMAP1(vrshld_s64, aarch64_neon_srshl, Add1ArgType),
7016  NEONMAP1(vrshld_u64, aarch64_neon_urshl, Add1ArgType),
7017  NEONMAP1(vrsqrted_f64, aarch64_neon_frsqrte, Add1ArgType),
7018  NEONMAP1(vrsqrtes_f32, aarch64_neon_frsqrte, Add1ArgType),
7019  NEONMAP1(vrsqrtsd_f64, aarch64_neon_frsqrts, Add1ArgType),
7020  NEONMAP1(vrsqrtss_f32, aarch64_neon_frsqrts, Add1ArgType),
7021  NEONMAP1(vsha1cq_u32, aarch64_crypto_sha1c, 0),
7022  NEONMAP1(vsha1h_u32, aarch64_crypto_sha1h, 0),
7023  NEONMAP1(vsha1mq_u32, aarch64_crypto_sha1m, 0),
7024  NEONMAP1(vsha1pq_u32, aarch64_crypto_sha1p, 0),
7025  NEONMAP1(vshld_s64, aarch64_neon_sshl, Add1ArgType),
7026  NEONMAP1(vshld_u64, aarch64_neon_ushl, Add1ArgType),
7027  NEONMAP1(vslid_n_s64, aarch64_neon_vsli, Vectorize1ArgType),
7028  NEONMAP1(vslid_n_u64, aarch64_neon_vsli, Vectorize1ArgType),
7029  NEONMAP1(vsqaddb_u8, aarch64_neon_usqadd, Vectorize1ArgType | Use64BitVectors),
7030  NEONMAP1(vsqaddd_u64, aarch64_neon_usqadd, Add1ArgType),
7031  NEONMAP1(vsqaddh_u16, aarch64_neon_usqadd, Vectorize1ArgType | Use64BitVectors),
7032  NEONMAP1(vsqadds_u32, aarch64_neon_usqadd, Add1ArgType),
7033  NEONMAP1(vsrid_n_s64, aarch64_neon_vsri, Vectorize1ArgType),
7034  NEONMAP1(vsrid_n_u64, aarch64_neon_vsri, Vectorize1ArgType),
7035  NEONMAP1(vuqaddb_s8, aarch64_neon_suqadd, Vectorize1ArgType | Use64BitVectors),
7036  NEONMAP1(vuqaddd_s64, aarch64_neon_suqadd, Add1ArgType),
7037  NEONMAP1(vuqaddh_s16, aarch64_neon_suqadd, Vectorize1ArgType | Use64BitVectors),
7038  NEONMAP1(vuqadds_s32, aarch64_neon_suqadd, Add1ArgType),
7039  // FP16 scalar intrinsics go here.
7040  NEONMAP1(vabdh_f16, aarch64_sisd_fabd, Add1ArgType),
7041  NEONMAP1(vcvtah_s32_f16, aarch64_neon_fcvtas, AddRetType | Add1ArgType),
7042  NEONMAP1(vcvtah_s64_f16, aarch64_neon_fcvtas, AddRetType | Add1ArgType),
7043  NEONMAP1(vcvtah_u32_f16, aarch64_neon_fcvtau, AddRetType | Add1ArgType),
7044  NEONMAP1(vcvtah_u64_f16, aarch64_neon_fcvtau, AddRetType | Add1ArgType),
7045  NEONMAP1(vcvth_n_f16_s32, aarch64_neon_vcvtfxs2fp, AddRetType | Add1ArgType),
7046  NEONMAP1(vcvth_n_f16_s64, aarch64_neon_vcvtfxs2fp, AddRetType | Add1ArgType),
7047  NEONMAP1(vcvth_n_f16_u32, aarch64_neon_vcvtfxu2fp, AddRetType | Add1ArgType),
7048  NEONMAP1(vcvth_n_f16_u64, aarch64_neon_vcvtfxu2fp, AddRetType | Add1ArgType),
7049  NEONMAP1(vcvth_n_s32_f16, aarch64_neon_vcvtfp2fxs, AddRetType | Add1ArgType),
7050  NEONMAP1(vcvth_n_s64_f16, aarch64_neon_vcvtfp2fxs, AddRetType | Add1ArgType),
7051  NEONMAP1(vcvth_n_u32_f16, aarch64_neon_vcvtfp2fxu, AddRetType | Add1ArgType),
7052  NEONMAP1(vcvth_n_u64_f16, aarch64_neon_vcvtfp2fxu, AddRetType | Add1ArgType),
7053  NEONMAP1(vcvth_s32_f16, aarch64_neon_fcvtzs, AddRetType | Add1ArgType),
7054  NEONMAP1(vcvth_s64_f16, aarch64_neon_fcvtzs, AddRetType | Add1ArgType),
7055  NEONMAP1(vcvth_u32_f16, aarch64_neon_fcvtzu, AddRetType | Add1ArgType),
7056  NEONMAP1(vcvth_u64_f16, aarch64_neon_fcvtzu, AddRetType | Add1ArgType),
7057  NEONMAP1(vcvtmh_s32_f16, aarch64_neon_fcvtms, AddRetType | Add1ArgType),
7058  NEONMAP1(vcvtmh_s64_f16, aarch64_neon_fcvtms, AddRetType | Add1ArgType),
7059  NEONMAP1(vcvtmh_u32_f16, aarch64_neon_fcvtmu, AddRetType | Add1ArgType),
7060  NEONMAP1(vcvtmh_u64_f16, aarch64_neon_fcvtmu, AddRetType | Add1ArgType),
7061  NEONMAP1(vcvtnh_s32_f16, aarch64_neon_fcvtns, AddRetType | Add1ArgType),
7062  NEONMAP1(vcvtnh_s64_f16, aarch64_neon_fcvtns, AddRetType | Add1ArgType),
7063  NEONMAP1(vcvtnh_u32_f16, aarch64_neon_fcvtnu, AddRetType | Add1ArgType),
7064  NEONMAP1(vcvtnh_u64_f16, aarch64_neon_fcvtnu, AddRetType | Add1ArgType),
7065  NEONMAP1(vcvtph_s32_f16, aarch64_neon_fcvtps, AddRetType | Add1ArgType),
7066  NEONMAP1(vcvtph_s64_f16, aarch64_neon_fcvtps, AddRetType | Add1ArgType),
7067  NEONMAP1(vcvtph_u32_f16, aarch64_neon_fcvtpu, AddRetType | Add1ArgType),
7068  NEONMAP1(vcvtph_u64_f16, aarch64_neon_fcvtpu, AddRetType | Add1ArgType),
7069  NEONMAP1(vmulxh_f16, aarch64_neon_fmulx, Add1ArgType),
7070  NEONMAP1(vrecpeh_f16, aarch64_neon_frecpe, Add1ArgType),
7071  NEONMAP1(vrecpxh_f16, aarch64_neon_frecpx, Add1ArgType),
7072  NEONMAP1(vrsqrteh_f16, aarch64_neon_frsqrte, Add1ArgType),
7073  NEONMAP1(vrsqrtsh_f16, aarch64_neon_frsqrts, Add1ArgType),
7074};
7075
7076// Some intrinsics are equivalent for codegen.
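// When one of these builtin IDs is seen, it is rewritten to its partner
// before the intrinsic maps above are consulted.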
7077static const std::pair<unsigned, unsigned> NEONEquivalentIntrinsicMap[] = {
7078  { NEON::BI__builtin_neon_splat_lane_bf16, NEON::BI__builtin_neon_splat_lane_v, },
7079  { NEON::BI__builtin_neon_splat_laneq_bf16, NEON::BI__builtin_neon_splat_laneq_v, },
7080  { NEON::BI__builtin_neon_splatq_lane_bf16, NEON::BI__builtin_neon_splatq_lane_v, },
7081  { NEON::BI__builtin_neon_splatq_laneq_bf16, NEON::BI__builtin_neon_splatq_laneq_v, },
7082  { NEON::BI__builtin_neon_vabd_f16, NEON::BI__builtin_neon_vabd_v, },
7083  { NEON::BI__builtin_neon_vabdq_f16, NEON::BI__builtin_neon_vabdq_v, },
7084  { NEON::BI__builtin_neon_vabs_f16, NEON::BI__builtin_neon_vabs_v, },
7085  { NEON::BI__builtin_neon_vabsq_f16, NEON::BI__builtin_neon_vabsq_v, },
7086  { NEON::BI__builtin_neon_vbsl_f16, NEON::BI__builtin_neon_vbsl_v, },
7087  { NEON::BI__builtin_neon_vbslq_f16, NEON::BI__builtin_neon_vbslq_v, },
7088  { NEON::BI__builtin_neon_vcage_f16, NEON::BI__builtin_neon_vcage_v, },
7089  { NEON::BI__builtin_neon_vcageq_f16, NEON::BI__builtin_neon_vcageq_v, },
7090  { NEON::BI__builtin_neon_vcagt_f16, NEON::BI__builtin_neon_vcagt_v, },
7091  { NEON::BI__builtin_neon_vcagtq_f16, NEON::BI__builtin_neon_vcagtq_v, },
7092  { NEON::BI__builtin_neon_vcale_f16, NEON::BI__builtin_neon_vcale_v, },
7093  { NEON::BI__builtin_neon_vcaleq_f16, NEON::BI__builtin_neon_vcaleq_v, },
7094  { NEON::BI__builtin_neon_vcalt_f16, NEON::BI__builtin_neon_vcalt_v, },
7095  { NEON::BI__builtin_neon_vcaltq_f16, NEON::BI__builtin_neon_vcaltq_v, },
7096  { NEON::BI__builtin_neon_vceqz_f16, NEON::BI__builtin_neon_vceqz_v, },
7097  { NEON::BI__builtin_neon_vceqzq_f16, NEON::BI__builtin_neon_vceqzq_v, },
7098  { NEON::BI__builtin_neon_vcgez_f16, NEON::BI__builtin_neon_vcgez_v, },
7099  { NEON::BI__builtin_neon_vcgezq_f16, NEON::BI__builtin_neon_vcgezq_v, },
7100  { NEON::BI__builtin_neon_vcgtz_f16, NEON::BI__builtin_neon_vcgtz_v, },
7101  { NEON::BI__builtin_neon_vcgtzq_f16, NEON::BI__builtin_neon_vcgtzq_v, },
7102  { NEON::BI__builtin_neon_vclez_f16, NEON::BI__builtin_neon_vclez_v, },
7103  { NEON::BI__builtin_neon_vclezq_f16, NEON::BI__builtin_neon_vclezq_v, },
7104  { NEON::BI__builtin_neon_vcltz_f16, NEON::BI__builtin_neon_vcltz_v, },
7105  { NEON::BI__builtin_neon_vcltzq_f16, NEON::BI__builtin_neon_vcltzq_v, },
7106  { NEON::BI__builtin_neon_vext_f16, NEON::BI__builtin_neon_vext_v, },
7107  { NEON::BI__builtin_neon_vextq_f16, NEON::BI__builtin_neon_vextq_v, },
7108  { NEON::BI__builtin_neon_vfma_f16, NEON::BI__builtin_neon_vfma_v, },
7109  { NEON::BI__builtin_neon_vfma_lane_f16, NEON::BI__builtin_neon_vfma_lane_v, },
7110  { NEON::BI__builtin_neon_vfma_laneq_f16, NEON::BI__builtin_neon_vfma_laneq_v, },
7111  { NEON::BI__builtin_neon_vfmaq_f16, NEON::BI__builtin_neon_vfmaq_v, },
7112  { NEON::BI__builtin_neon_vfmaq_lane_f16, NEON::BI__builtin_neon_vfmaq_lane_v, },
7113  { NEON::BI__builtin_neon_vfmaq_laneq_f16, NEON::BI__builtin_neon_vfmaq_laneq_v, },
7114  { NEON::BI__builtin_neon_vld1_bf16_x2, NEON::BI__builtin_neon_vld1_x2_v },
7115  { NEON::BI__builtin_neon_vld1_bf16_x3, NEON::BI__builtin_neon_vld1_x3_v },
7116  { NEON::BI__builtin_neon_vld1_bf16_x4, NEON::BI__builtin_neon_vld1_x4_v },
7117  { NEON::BI__builtin_neon_vld1_bf16, NEON::BI__builtin_neon_vld1_v },
7118  { NEON::BI__builtin_neon_vld1_dup_bf16, NEON::BI__builtin_neon_vld1_dup_v },
7119  { NEON::BI__builtin_neon_vld1_lane_bf16, NEON::BI__builtin_neon_vld1_lane_v },
7120  { NEON::BI__builtin_neon_vld1q_bf16_x2, NEON::BI__builtin_neon_vld1q_x2_v },
7121  { NEON::BI__builtin_neon_vld1q_bf16_x3, NEON::BI__builtin_neon_vld1q_x3_v },
7122  { NEON::BI__builtin_neon_vld1q_bf16_x4, NEON::BI__builtin_neon_vld1q_x4_v },
7123  { NEON::BI__builtin_neon_vld1q_bf16, NEON::BI__builtin_neon_vld1q_v },
7124  { NEON::BI__builtin_neon_vld1q_dup_bf16, NEON::BI__builtin_neon_vld1q_dup_v },
7125  { NEON::BI__builtin_neon_vld1q_lane_bf16, NEON::BI__builtin_neon_vld1q_lane_v },
7126  { NEON::BI__builtin_neon_vld2_bf16, NEON::BI__builtin_neon_vld2_v },
7127  { NEON::BI__builtin_neon_vld2_dup_bf16, NEON::BI__builtin_neon_vld2_dup_v },
7128  { NEON::BI__builtin_neon_vld2_lane_bf16, NEON::BI__builtin_neon_vld2_lane_v },
7129  { NEON::BI__builtin_neon_vld2q_bf16, NEON::BI__builtin_neon_vld2q_v },
7130  { NEON::BI__builtin_neon_vld2q_dup_bf16, NEON::BI__builtin_neon_vld2q_dup_v },
7131  { NEON::BI__builtin_neon_vld2q_lane_bf16, NEON::BI__builtin_neon_vld2q_lane_v },
7132  { NEON::BI__builtin_neon_vld3_bf16, NEON::BI__builtin_neon_vld3_v },
7133  { NEON::BI__builtin_neon_vld3_dup_bf16, NEON::BI__builtin_neon_vld3_dup_v },
7134  { NEON::BI__builtin_neon_vld3_lane_bf16, NEON::BI__builtin_neon_vld3_lane_v },
7135  { NEON::BI__builtin_neon_vld3q_bf16, NEON::BI__builtin_neon_vld3q_v },
7136  { NEON::BI__builtin_neon_vld3q_dup_bf16, NEON::BI__builtin_neon_vld3q_dup_v },
7137  { NEON::BI__builtin_neon_vld3q_lane_bf16, NEON::BI__builtin_neon_vld3q_lane_v },
7138  { NEON::BI__builtin_neon_vld4_bf16, NEON::BI__builtin_neon_vld4_v },
7139  { NEON::BI__builtin_neon_vld4_dup_bf16, NEON::BI__builtin_neon_vld4_dup_v },
7140  { NEON::BI__builtin_neon_vld4_lane_bf16, NEON::BI__builtin_neon_vld4_lane_v },
7141  { NEON::BI__builtin_neon_vld4q_bf16, NEON::BI__builtin_neon_vld4q_v },
7142  { NEON::BI__builtin_neon_vld4q_dup_bf16, NEON::BI__builtin_neon_vld4q_dup_v },
7143  { NEON::BI__builtin_neon_vld4q_lane_bf16, NEON::BI__builtin_neon_vld4q_lane_v },
7144  { NEON::BI__builtin_neon_vmax_f16, NEON::BI__builtin_neon_vmax_v, },
7145  { NEON::BI__builtin_neon_vmaxnm_f16, NEON::BI__builtin_neon_vmaxnm_v, },
7146  { NEON::BI__builtin_neon_vmaxnmq_f16, NEON::BI__builtin_neon_vmaxnmq_v, },
7147  { NEON::BI__builtin_neon_vmaxq_f16, NEON::BI__builtin_neon_vmaxq_v, },
7148  { NEON::BI__builtin_neon_vmin_f16, NEON::BI__builtin_neon_vmin_v, },
7149  { NEON::BI__builtin_neon_vminnm_f16, NEON::BI__builtin_neon_vminnm_v, },
7150  { NEON::BI__builtin_neon_vminnmq_f16, NEON::BI__builtin_neon_vminnmq_v, },
7151  { NEON::BI__builtin_neon_vminq_f16, NEON::BI__builtin_neon_vminq_v, },
7152  { NEON::BI__builtin_neon_vmulx_f16, NEON::BI__builtin_neon_vmulx_v, },
7153  { NEON::BI__builtin_neon_vmulxq_f16, NEON::BI__builtin_neon_vmulxq_v, },
7154  { NEON::BI__builtin_neon_vpadd_f16, NEON::BI__builtin_neon_vpadd_v, },
7155  { NEON::BI__builtin_neon_vpaddq_f16, NEON::BI__builtin_neon_vpaddq_v, },
7156  { NEON::BI__builtin_neon_vpmax_f16, NEON::BI__builtin_neon_vpmax_v, },
7157  { NEON::BI__builtin_neon_vpmaxnm_f16, NEON::BI__builtin_neon_vpmaxnm_v, },
7158  { NEON::BI__builtin_neon_vpmaxnmq_f16, NEON::BI__builtin_neon_vpmaxnmq_v, },
7159  { NEON::BI__builtin_neon_vpmaxq_f16, NEON::BI__builtin_neon_vpmaxq_v, },
7160  { NEON::BI__builtin_neon_vpmin_f16, NEON::BI__builtin_neon_vpmin_v, },
7161  { NEON::BI__builtin_neon_vpminnm_f16, NEON::BI__builtin_neon_vpminnm_v, },
7162  { NEON::BI__builtin_neon_vpminnmq_f16, NEON::BI__builtin_neon_vpminnmq_v, },
7163  { NEON::BI__builtin_neon_vpminq_f16, NEON::BI__builtin_neon_vpminq_v, },
7164  { NEON::BI__builtin_neon_vrecpe_f16, NEON::BI__builtin_neon_vrecpe_v, },
7165  { NEON::BI__builtin_neon_vrecpeq_f16, NEON::BI__builtin_neon_vrecpeq_v, },
7166  { NEON::BI__builtin_neon_vrecps_f16, NEON::BI__builtin_neon_vrecps_v, },
7167  { NEON::BI__builtin_neon_vrecpsq_f16, NEON::BI__builtin_neon_vrecpsq_v, },
7168  { NEON::BI__builtin_neon_vrnd_f16, NEON::BI__builtin_neon_vrnd_v, },
7169  { NEON::BI__builtin_neon_vrnda_f16, NEON::BI__builtin_neon_vrnda_v, },
7170  { NEON::BI__builtin_neon_vrndaq_f16, NEON::BI__builtin_neon_vrndaq_v, },
7171  { NEON::BI__builtin_neon_vrndi_f16, NEON::BI__builtin_neon_vrndi_v, },
7172  { NEON::BI__builtin_neon_vrndiq_f16, NEON::BI__builtin_neon_vrndiq_v, },
7173  { NEON::BI__builtin_neon_vrndm_f16, NEON::BI__builtin_neon_vrndm_v, },
7174  { NEON::BI__builtin_neon_vrndmq_f16, NEON::BI__builtin_neon_vrndmq_v, },
7175  { NEON::BI__builtin_neon_vrndn_f16, NEON::BI__builtin_neon_vrndn_v, },
7176  { NEON::BI__builtin_neon_vrndnq_f16, NEON::BI__builtin_neon_vrndnq_v, },
7177  { NEON::BI__builtin_neon_vrndp_f16, NEON::BI__builtin_neon_vrndp_v, },
7178  { NEON::BI__builtin_neon_vrndpq_f16, NEON::BI__builtin_neon_vrndpq_v, },
7179  { NEON::BI__builtin_neon_vrndq_f16, NEON::BI__builtin_neon_vrndq_v, },
7180  { NEON::BI__builtin_neon_vrndx_f16, NEON::BI__builtin_neon_vrndx_v, },
7181  { NEON::BI__builtin_neon_vrndxq_f16, NEON::BI__builtin_neon_vrndxq_v, },
7182  { NEON::BI__builtin_neon_vrsqrte_f16, NEON::BI__builtin_neon_vrsqrte_v, },
7183  { NEON::BI__builtin_neon_vrsqrteq_f16, NEON::BI__builtin_neon_vrsqrteq_v, },
7184  { NEON::BI__builtin_neon_vrsqrts_f16, NEON::BI__builtin_neon_vrsqrts_v, },
7185  { NEON::BI__builtin_neon_vrsqrtsq_f16, NEON::BI__builtin_neon_vrsqrtsq_v, },
7186  { NEON::BI__builtin_neon_vsqrt_f16, NEON::BI__builtin_neon_vsqrt_v, },
7187  { NEON::BI__builtin_neon_vsqrtq_f16, NEON::BI__builtin_neon_vsqrtq_v, },
7188  { NEON::BI__builtin_neon_vst1_bf16_x2, NEON::BI__builtin_neon_vst1_x2_v },
7189  { NEON::BI__builtin_neon_vst1_bf16_x3, NEON::BI__builtin_neon_vst1_x3_v },
7190  { NEON::BI__builtin_neon_vst1_bf16_x4, NEON::BI__builtin_neon_vst1_x4_v },
7191  { NEON::BI__builtin_neon_vst1_bf16, NEON::BI__builtin_neon_vst1_v },
7192  { NEON::BI__builtin_neon_vst1_lane_bf16, NEON::BI__builtin_neon_vst1_lane_v },
7193  { NEON::BI__builtin_neon_vst1q_bf16_x2, NEON::BI__builtin_neon_vst1q_x2_v },
7194  { NEON::BI__builtin_neon_vst1q_bf16_x3, NEON::BI__builtin_neon_vst1q_x3_v },
7195  { NEON::BI__builtin_neon_vst1q_bf16_x4, NEON::BI__builtin_neon_vst1q_x4_v },
7196  { NEON::BI__builtin_neon_vst1q_bf16, NEON::BI__builtin_neon_vst1q_v },
7197  { NEON::BI__builtin_neon_vst1q_lane_bf16, NEON::BI__builtin_neon_vst1q_lane_v },
7198  { NEON::BI__builtin_neon_vst2_bf16, NEON::BI__builtin_neon_vst2_v },
7199  { NEON::BI__builtin_neon_vst2_lane_bf16, NEON::BI__builtin_neon_vst2_lane_v },
7200  { NEON::BI__builtin_neon_vst2q_bf16, NEON::BI__builtin_neon_vst2q_v },
7201  { NEON::BI__builtin_neon_vst2q_lane_bf16, NEON::BI__builtin_neon_vst2q_lane_v },
7202  { NEON::BI__builtin_neon_vst3_bf16, NEON::BI__builtin_neon_vst3_v },
7203  { NEON::BI__builtin_neon_vst3_lane_bf16, NEON::BI__builtin_neon_vst3_lane_v },
7204  { NEON::BI__builtin_neon_vst3q_bf16, NEON::BI__builtin_neon_vst3q_v },
7205  { NEON::BI__builtin_neon_vst3q_lane_bf16, NEON::BI__builtin_neon_vst3q_lane_v },
7206  { NEON::BI__builtin_neon_vst4_bf16, NEON::BI__builtin_neon_vst4_v },
7207  { NEON::BI__builtin_neon_vst4_lane_bf16, NEON::BI__builtin_neon_vst4_lane_v },
7208  { NEON::BI__builtin_neon_vst4q_bf16, NEON::BI__builtin_neon_vst4q_v },
7209  { NEON::BI__builtin_neon_vst4q_lane_bf16, NEON::BI__builtin_neon_vst4q_lane_v },
7210  { NEON::BI__builtin_neon_vtrn_f16, NEON::BI__builtin_neon_vtrn_v, },
7211  { NEON::BI__builtin_neon_vtrnq_f16, NEON::BI__builtin_neon_vtrnq_v, },
7212  { NEON::BI__builtin_neon_vuzp_f16, NEON::BI__builtin_neon_vuzp_v, },
7213  { NEON::BI__builtin_neon_vuzpq_f16, NEON::BI__builtin_neon_vuzpq_v, },
7214  { NEON::BI__builtin_neon_vzip_f16, NEON::BI__builtin_neon_vzip_v, },
7215  { NEON::BI__builtin_neon_vzipq_f16, NEON::BI__builtin_neon_vzipq_v, },
7216  // The mangling rules cause us to have one ID for each type for vldap1(q)_lane
7217  // and vstl1(q)_lane, but codegen is equivalent for all of them. Choose an
7218  // arbitrary one to be handled as the canonical variant.
7219  { NEON::BI__builtin_neon_vldap1_lane_u64, NEON::BI__builtin_neon_vldap1_lane_s64 },
7220  { NEON::BI__builtin_neon_vldap1_lane_f64, NEON::BI__builtin_neon_vldap1_lane_s64 },
7221  { NEON::BI__builtin_neon_vldap1_lane_p64, NEON::BI__builtin_neon_vldap1_lane_s64 },
7222  { NEON::BI__builtin_neon_vldap1q_lane_u64, NEON::BI__builtin_neon_vldap1q_lane_s64 },
7223  { NEON::BI__builtin_neon_vldap1q_lane_f64, NEON::BI__builtin_neon_vldap1q_lane_s64 },
7224  { NEON::BI__builtin_neon_vldap1q_lane_p64, NEON::BI__builtin_neon_vldap1q_lane_s64 },
7225  { NEON::BI__builtin_neon_vstl1_lane_u64, NEON::BI__builtin_neon_vstl1_lane_s64 },
7226  { NEON::BI__builtin_neon_vstl1_lane_f64, NEON::BI__builtin_neon_vstl1_lane_s64 },
7227  { NEON::BI__builtin_neon_vstl1_lane_p64, NEON::BI__builtin_neon_vstl1_lane_s64 },
7228  { NEON::BI__builtin_neon_vstl1q_lane_u64, NEON::BI__builtin_neon_vstl1q_lane_s64 },
7229  { NEON::BI__builtin_neon_vstl1q_lane_f64, NEON::BI__builtin_neon_vstl1q_lane_s64 },
7230  { NEON::BI__builtin_neon_vstl1q_lane_p64, NEON::BI__builtin_neon_vstl1q_lane_s64 },
7231};
7232
7233#undef NEONMAP0
7234#undef NEONMAP1
7235#undef NEONMAP2
7236
7237#define SVEMAP1(NameBase, LLVMIntrinsic, TypeModifier)                         \
7238  {                                                                            \
7239    #NameBase, SVE::BI__builtin_sve_##NameBase, Intrinsic::LLVMIntrinsic, 0,   \
7240        TypeModifier                                                           \
7241  }
7242
7243#define SVEMAP2(NameBase, TypeModifier)                                        \
7244  { #NameBase, SVE::BI__builtin_sve_##NameBase, 0, 0, TypeModifier }
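// Map from SVE builtins to LLVM intrinsics, generated from the .inc/.def files
// below. SVEMAP2 entries carry no LLVM intrinsic (the field is 0); those
// builtins are presumably handled by bespoke codegen rather than a direct
// intrinsic call.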
7245static const ARMVectorIntrinsicInfo AArch64SVEIntrinsicMap[] = {
7246#define GET_SVE_LLVM_INTRINSIC_MAP
7247#include "clang/Basic/arm_sve_builtin_cg.inc"
7248#include "clang/Basic/BuiltinsAArch64NeonSVEBridge_cg.def"
7249#undef GET_SVE_LLVM_INTRINSIC_MAP
7250};
7251
7252#undef SVEMAP1
7253#undef SVEMAP2
7254
7255#define SMEMAP1(NameBase, LLVMIntrinsic, TypeModifier)                         \
7256  {                                                                            \
7257    #NameBase, SME::BI__builtin_sme_##NameBase, Intrinsic::LLVMIntrinsic, 0,   \
7258        TypeModifier                                                           \
7259  }
7260
7261#define SMEMAP2(NameBase, TypeModifier)                                        \
7262  { #NameBase, SME::BI__builtin_sme_##NameBase, 0, 0, TypeModifier }
7263static const ARMVectorIntrinsicInfo AArch64SMEIntrinsicMap[] = {
7264#define GET_SME_LLVM_INTRINSIC_MAP
7265#include "clang/Basic/arm_sme_builtin_cg.inc"
7266#undef GET_SME_LLVM_INTRINSIC_MAP
7267};
7268
7269#undef SMEMAP1
7270#undef SMEMAP2
7271
7272static bool NEONSIMDIntrinsicsProvenSorted = false;
7273
7274static bool AArch64SIMDIntrinsicsProvenSorted = false;
7275static bool AArch64SISDIntrinsicsProvenSorted = false;
7276static bool AArch64SVEIntrinsicsProvenSorted = false;
7277static bool AArch64SMEIntrinsicsProvenSorted = false;
7278
7279static const ARMVectorIntrinsicInfo *
7280findARMVectorIntrinsicInMap(ArrayRef<ARMVectorIntrinsicInfo> IntrinsicMap,
7281                            unsigned BuiltinID, bool &MapProvenSorted) {
7282
7283#ifndef NDEBUG
7284  if (!MapProvenSorted) {
7285    assert(llvm::is_sorted(IntrinsicMap));
7286    MapProvenSorted = true;
7287  }
7288#endif
7289
7290  const ARMVectorIntrinsicInfo *Builtin =
7291      llvm::lower_bound(IntrinsicMap, BuiltinID);
7292
7293  if (Builtin != IntrinsicMap.end() && Builtin->BuiltinID == BuiltinID)
7294    return Builtin;
7295
7296  return nullptr;
7297}
7298
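// Build the overloaded-type list for a NEON LLVM intrinsic from the Modifier
// bits (optionally the return type, one or two copies of the argument type,
// and an "invented" float type), then fetch the intrinsic declaration.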
7299Function *CodeGenFunction::LookupNeonLLVMIntrinsic(unsigned IntrinsicID,
7300                                                   unsigned Modifier,
7301                                                   llvm::Type *ArgType,
7302                                                   const CallExpr *E) {
7303  int VectorSize = 0;
7304  if (Modifier & Use64BitVectors)
7305    VectorSize = 64;
7306  else if (Modifier & Use128BitVectors)
7307    VectorSize = 128;
7308
7309  // Return type.
7310  SmallVector<llvm::Type *, 3> Tys;
7311  if (Modifier & AddRetType) {
7312    llvm::Type *Ty = ConvertType(E->getCallReturnType(getContext()));
7313    if (Modifier & VectorizeRetType)
7314      Ty = llvm::FixedVectorType::get(
7315          Ty, VectorSize ? VectorSize / Ty->getPrimitiveSizeInBits() : 1);
7316
7317    Tys.push_back(Ty);
7318  }
7319
7320  // Arguments.
7321  if (Modifier & VectorizeArgTypes) {
7322    int Elts = VectorSize ? VectorSize / ArgType->getPrimitiveSizeInBits() : 1;
7323    ArgType = llvm::FixedVectorType::get(ArgType, Elts);
7324  }
7325
7326  if (Modifier & (Add1ArgType | Add2ArgTypes))
7327    Tys.push_back(ArgType);
7328
7329  if (Modifier & Add2ArgTypes)
7330    Tys.push_back(ArgType);
7331
7332  if (Modifier & InventFloatType)
7333    Tys.push_back(FloatTy);
7334
7335  return CGM.getIntrinsic(IntrinsicID, Tys);
7336}
7337
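// Shared codegen for scalar (SISD) NEON builtins: scalar operands are widened
// to one-element vectors where the intrinsic expects vectors, the call is
// emitted, and a scalar result is extracted from lane 0 when the builtin's
// return type is narrower than the intrinsic's.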
7338static Value *EmitCommonNeonSISDBuiltinExpr(
7339    CodeGenFunction &CGF, const ARMVectorIntrinsicInfo &SISDInfo,
7340    SmallVectorImpl<Value *> &Ops, const CallExpr *E) {
7341  unsigned BuiltinID = SISDInfo.BuiltinID;
7342  unsigned int Int = SISDInfo.LLVMIntrinsic;
7343  unsigned Modifier = SISDInfo.TypeModifier;
7344  const char *s = SISDInfo.NameHint;
7345
7346  switch (BuiltinID) {
7347  case NEON::BI__builtin_neon_vcled_s64:
7348  case NEON::BI__builtin_neon_vcled_u64:
7349  case NEON::BI__builtin_neon_vcles_f32:
7350  case NEON::BI__builtin_neon_vcled_f64:
7351  case NEON::BI__builtin_neon_vcltd_s64:
7352  case NEON::BI__builtin_neon_vcltd_u64:
7353  case NEON::BI__builtin_neon_vclts_f32:
7354  case NEON::BI__builtin_neon_vcltd_f64:
7355  case NEON::BI__builtin_neon_vcales_f32:
7356  case NEON::BI__builtin_neon_vcaled_f64:
7357  case NEON::BI__builtin_neon_vcalts_f32:
7358  case NEON::BI__builtin_neon_vcaltd_f64:
7359    // Only one direction of comparisons actually exists; cmle is really a cmge
7360    // with swapped operands. The table gives us the right intrinsic, but we
7361    // still need to do the swap.
7362    std::swap(Ops[0], Ops[1]);
7363    break;
7364  }
7365
7366  assert(Int && "Generic code assumes a valid intrinsic");
7367
7368  // Determine the type(s) of this overloaded AArch64 intrinsic.
7369  const Expr *Arg = E->getArg(0);
7370  llvm::Type *ArgTy = CGF.ConvertType(Arg->getType());
7371  Function *F = CGF.LookupNeonLLVMIntrinsic(Int, Modifier, ArgTy, E);
7372
7373  int j = 0;
7374  ConstantInt *C0 = ConstantInt::get(CGF.SizeTy, 0);
7375  for (Function::const_arg_iterator ai = F->arg_begin(), ae = F->arg_end();
7376       ai != ae; ++ai, ++j) {
7377    llvm::Type *ArgTy = ai->getType();
7378    if (Ops[j]->getType()->getPrimitiveSizeInBits() ==
7379             ArgTy->getPrimitiveSizeInBits())
7380      continue;
7381
7382    assert(ArgTy->isVectorTy() && !Ops[j]->getType()->isVectorTy());
7383    // The constant argument to an _n_ intrinsic always has Int32Ty, so truncate
7384    // it before inserting.
7385    Ops[j] = CGF.Builder.CreateTruncOrBitCast(
7386        Ops[j], cast<llvm::VectorType>(ArgTy)->getElementType());
7387    Ops[j] =
7388        CGF.Builder.CreateInsertElement(PoisonValue::get(ArgTy), Ops[j], C0);
7389  }
7390
7391  Value *Result = CGF.EmitNeonCall(F, Ops, s);
7392  llvm::Type *ResultType = CGF.ConvertType(E->getType());
7393  if (ResultType->getPrimitiveSizeInBits().getFixedValue() <
7394      Result->getType()->getPrimitiveSizeInBits().getFixedValue())
7395    return CGF.Builder.CreateExtractElement(Result, C0);
7396
7397  return CGF.Builder.CreateBitCast(Result, ResultType, s);
7398}
7399
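// Codegen path shared by the AArch32 and AArch64 NEON builtins. The last
// argument of each builtin is an integer constant encoding the NeonTypeFlags,
// which selects the concrete vector type of this overload.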
7400Value *CodeGenFunction::EmitCommonNeonBuiltinExpr(
7401    unsigned BuiltinID, unsigned LLVMIntrinsic, unsigned AltLLVMIntrinsic,
7402    const char *NameHint, unsigned Modifier, const CallExpr *E,
7403    SmallVectorImpl<llvm::Value *> &Ops, Address PtrOp0, Address PtrOp1,
7404    llvm::Triple::ArchType Arch) {
7405  // Get the last argument, which specifies the vector type.
7406  const Expr *Arg = E->getArg(E->getNumArgs() - 1);
7407  std::optional<llvm::APSInt> NeonTypeConst =
7408      Arg->getIntegerConstantExpr(getContext());
7409  if (!NeonTypeConst)
7410    return nullptr;
7411
7412  // Determine the type of this overloaded NEON intrinsic.
7413  NeonTypeFlags Type(NeonTypeConst->getZExtValue());
7414  bool Usgn = Type.isUnsigned();
7415  bool Quad = Type.isQuad();
7416  const bool HasLegalHalfType = getTarget().hasLegalHalfType();
7417  const bool AllowBFloatArgsAndRet =
7418      getTargetHooks().getABIInfo().allowBFloatArgsAndRet();
7419
7420  llvm::FixedVectorType *VTy =
7421      GetNeonType(this, Type, HasLegalHalfType, false, AllowBFloatArgsAndRet);
7422  llvm::Type *Ty = VTy;
7423  if (!Ty)
7424    return nullptr;
7425
7426  auto getAlignmentValue32 = [&](Address addr) -> Value* {
7427    return Builder.getInt32(addr.getAlignment().getQuantity());
7428  };
7429
7430  unsigned Int = LLVMIntrinsic;
7431  if ((Modifier & UnsignedAlts) && !Usgn)
7432    Int = AltLLVMIntrinsic;
7433
7434  switch (BuiltinID) {
7435  default: break;
7436  case NEON::BI__builtin_neon_splat_lane_v:
7437  case NEON::BI__builtin_neon_splat_laneq_v:
7438  case NEON::BI__builtin_neon_splatq_lane_v:
7439  case NEON::BI__builtin_neon_splatq_laneq_v: {
7440    auto NumElements = VTy->getElementCount();
7441    if (BuiltinID == NEON::BI__builtin_neon_splatq_lane_v)
7442      NumElements = NumElements * 2;
7443    if (BuiltinID == NEON::BI__builtin_neon_splat_laneq_v)
7444      NumElements = NumElements.divideCoefficientBy(2);
7445
7446    Ops[0] = Builder.CreateBitCast(Ops[0], VTy);
7447    return EmitNeonSplat(Ops[0], cast<ConstantInt>(Ops[1]), NumElements);
7448  }
7449  case NEON::BI__builtin_neon_vpadd_v:
7450  case NEON::BI__builtin_neon_vpaddq_v:
7451    // We don't allow fp/int overloading of intrinsics.
7452    if (VTy->getElementType()->isFloatingPointTy() &&
7453        Int == Intrinsic::aarch64_neon_addp)
7454      Int = Intrinsic::aarch64_neon_faddp;
7455    break;
7456  case NEON::BI__builtin_neon_vabs_v:
7457  case NEON::BI__builtin_neon_vabsq_v:
7458    if (VTy->getElementType()->isFloatingPointTy())
7459      return EmitNeonCall(CGM.getIntrinsic(Intrinsic::fabs, Ty), Ops, "vabs");
7460    return EmitNeonCall(CGM.getIntrinsic(LLVMIntrinsic, Ty), Ops, "vabs");
7461  case NEON::BI__builtin_neon_vadd_v:
7462  case NEON::BI__builtin_neon_vaddq_v: {
7463    llvm::Type *VTy = llvm::FixedVectorType::get(Int8Ty, Quad ? 16 : 8);
7464    Ops[0] = Builder.CreateBitCast(Ops[0], VTy);
7465    Ops[1] = Builder.CreateBitCast(Ops[1], VTy);
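    // Lower the add to a bitwise XOR on byte vectors; for the polynomial
    // overloads that use this builtin form, addition is carry-less, i.e. XOR.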
7466    Ops[0] = Builder.CreateXor(Ops[0], Ops[1]);
7467    return Builder.CreateBitCast(Ops[0], Ty);
7468  }
7469  case NEON::BI__builtin_neon_vaddhn_v: {
7470    llvm::FixedVectorType *SrcTy =
7471        llvm::FixedVectorType::getExtendedElementVectorType(VTy);
7472
7473    // %sum = add <4 x i32> %lhs, %rhs
7474    Ops[0] = Builder.CreateBitCast(Ops[0], SrcTy);
7475    Ops[1] = Builder.CreateBitCast(Ops[1], SrcTy);
7476    Ops[0] = Builder.CreateAdd(Ops[0], Ops[1], "vaddhn");
7477
7478    // %high = lshr <4 x i32> %sum, <i32 16, i32 16, i32 16, i32 16>
7479    Constant *ShiftAmt =
7480        ConstantInt::get(SrcTy, SrcTy->getScalarSizeInBits() / 2);
7481    Ops[0] = Builder.CreateLShr(Ops[0], ShiftAmt, "vaddhn");
7482
7483    // %res = trunc <4 x i32> %high to <4 x i16>
7484    return Builder.CreateTrunc(Ops[0], VTy, "vaddhn");
7485  }
7486  case NEON::BI__builtin_neon_vcale_v:
7487  case NEON::BI__builtin_neon_vcaleq_v:
7488  case NEON::BI__builtin_neon_vcalt_v:
7489  case NEON::BI__builtin_neon_vcaltq_v:
7490    std::swap(Ops[0], Ops[1]);
7491    [[fallthrough]];
7492  case NEON::BI__builtin_neon_vcage_v:
7493  case NEON::BI__builtin_neon_vcageq_v:
7494  case NEON::BI__builtin_neon_vcagt_v:
7495  case NEON::BI__builtin_neon_vcagtq_v: {
7496    llvm::Type *Ty;
7497    switch (VTy->getScalarSizeInBits()) {
7498    default: llvm_unreachable("unexpected type");
7499    case 32:
7500      Ty = FloatTy;
7501      break;
7502    case 64:
7503      Ty = DoubleTy;
7504      break;
7505    case 16:
7506      Ty = HalfTy;
7507      break;
7508    }
7509    auto *VecFlt = llvm::FixedVectorType::get(Ty, VTy->getNumElements());
7510    llvm::Type *Tys[] = { VTy, VecFlt };
7511    Function *F = CGM.getIntrinsic(LLVMIntrinsic, Tys);
7512    return EmitNeonCall(F, Ops, NameHint);
7513  }
7514  case NEON::BI__builtin_neon_vceqz_v:
7515  case NEON::BI__builtin_neon_vceqzq_v:
7516    return EmitAArch64CompareBuiltinExpr(Ops[0], Ty, ICmpInst::FCMP_OEQ,
7517                                         ICmpInst::ICMP_EQ, "vceqz");
7518  case NEON::BI__builtin_neon_vcgez_v:
7519  case NEON::BI__builtin_neon_vcgezq_v:
7520    return EmitAArch64CompareBuiltinExpr(Ops[0], Ty, ICmpInst::FCMP_OGE,
7521                                         ICmpInst::ICMP_SGE, "vcgez");
7522  case NEON::BI__builtin_neon_vclez_v:
7523  case NEON::BI__builtin_neon_vclezq_v:
7524    return EmitAArch64CompareBuiltinExpr(Ops[0], Ty, ICmpInst::FCMP_OLE,
7525                                         ICmpInst::ICMP_SLE, "vclez");
7526  case NEON::BI__builtin_neon_vcgtz_v:
7527  case NEON::BI__builtin_neon_vcgtzq_v:
7528    return EmitAArch64CompareBuiltinExpr(Ops[0], Ty, ICmpInst::FCMP_OGT,
7529                                         ICmpInst::ICMP_SGT, "vcgtz");
7530  case NEON::BI__builtin_neon_vcltz_v:
7531  case NEON::BI__builtin_neon_vcltzq_v:
7532    return EmitAArch64CompareBuiltinExpr(Ops[0], Ty, ICmpInst::FCMP_OLT,
7533                                         ICmpInst::ICMP_SLT, "vcltz");
7534  case NEON::BI__builtin_neon_vclz_v:
7535  case NEON::BI__builtin_neon_vclzq_v:
7536    // We generate a target-independent intrinsic, which takes a second argument
7537    // indicating whether or not clz of zero is undefined; on ARM it isn't.
7538    Ops.push_back(Builder.getInt1(getTarget().isCLZForZeroUndef()));
7539    break;
7540  case NEON::BI__builtin_neon_vcvt_f32_v:
7541  case NEON::BI__builtin_neon_vcvtq_f32_v:
7542    Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
7543    Ty = GetNeonType(this, NeonTypeFlags(NeonTypeFlags::Float32, false, Quad),
7544                     HasLegalHalfType);
7545    return Usgn ? Builder.CreateUIToFP(Ops[0], Ty, "vcvt")
7546                : Builder.CreateSIToFP(Ops[0], Ty, "vcvt");
7547  case NEON::BI__builtin_neon_vcvt_f16_s16:
7548  case NEON::BI__builtin_neon_vcvt_f16_u16:
7549  case NEON::BI__builtin_neon_vcvtq_f16_s16:
7550  case NEON::BI__builtin_neon_vcvtq_f16_u16:
7551    Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
7552    Ty = GetNeonType(this, NeonTypeFlags(NeonTypeFlags::Float16, false, Quad),
7553                     HasLegalHalfType);
7554    return Usgn ? Builder.CreateUIToFP(Ops[0], Ty, "vcvt")
7555                : Builder.CreateSIToFP(Ops[0], Ty, "vcvt");
7556  case NEON::BI__builtin_neon_vcvt_n_f16_s16:
7557  case NEON::BI__builtin_neon_vcvt_n_f16_u16:
7558  case NEON::BI__builtin_neon_vcvtq_n_f16_s16:
7559  case NEON::BI__builtin_neon_vcvtq_n_f16_u16: {
7560    llvm::Type *Tys[2] = { GetFloatNeonType(this, Type), Ty };
7561    Function *F = CGM.getIntrinsic(Int, Tys);
7562    return EmitNeonCall(F, Ops, "vcvt_n");
7563  }
7564  case NEON::BI__builtin_neon_vcvt_n_f32_v:
7565  case NEON::BI__builtin_neon_vcvt_n_f64_v:
7566  case NEON::BI__builtin_neon_vcvtq_n_f32_v:
7567  case NEON::BI__builtin_neon_vcvtq_n_f64_v: {
7568    llvm::Type *Tys[2] = { GetFloatNeonType(this, Type), Ty };
7569    Int = Usgn ? LLVMIntrinsic : AltLLVMIntrinsic;
7570    Function *F = CGM.getIntrinsic(Int, Tys);
7571    return EmitNeonCall(F, Ops, "vcvt_n");
7572  }
7573  case NEON::BI__builtin_neon_vcvt_n_s16_f16:
7574  case NEON::BI__builtin_neon_vcvt_n_s32_v:
7575  case NEON::BI__builtin_neon_vcvt_n_u16_f16:
7576  case NEON::BI__builtin_neon_vcvt_n_u32_v:
7577  case NEON::BI__builtin_neon_vcvt_n_s64_v:
7578  case NEON::BI__builtin_neon_vcvt_n_u64_v:
7579  case NEON::BI__builtin_neon_vcvtq_n_s16_f16:
7580  case NEON::BI__builtin_neon_vcvtq_n_s32_v:
7581  case NEON::BI__builtin_neon_vcvtq_n_u16_f16:
7582  case NEON::BI__builtin_neon_vcvtq_n_u32_v:
7583  case NEON::BI__builtin_neon_vcvtq_n_s64_v:
7584  case NEON::BI__builtin_neon_vcvtq_n_u64_v: {
7585    llvm::Type *Tys[2] = { Ty, GetFloatNeonType(this, Type) };
7586    Function *F = CGM.getIntrinsic(LLVMIntrinsic, Tys);
7587    return EmitNeonCall(F, Ops, "vcvt_n");
7588  }
7589  case NEON::BI__builtin_neon_vcvt_s32_v:
7590  case NEON::BI__builtin_neon_vcvt_u32_v:
7591  case NEON::BI__builtin_neon_vcvt_s64_v:
7592  case NEON::BI__builtin_neon_vcvt_u64_v:
7593  case NEON::BI__builtin_neon_vcvt_s16_f16:
7594  case NEON::BI__builtin_neon_vcvt_u16_f16:
7595  case NEON::BI__builtin_neon_vcvtq_s32_v:
7596  case NEON::BI__builtin_neon_vcvtq_u32_v:
7597  case NEON::BI__builtin_neon_vcvtq_s64_v:
7598  case NEON::BI__builtin_neon_vcvtq_u64_v:
7599  case NEON::BI__builtin_neon_vcvtq_s16_f16:
7600  case NEON::BI__builtin_neon_vcvtq_u16_f16: {
7601    Ops[0] = Builder.CreateBitCast(Ops[0], GetFloatNeonType(this, Type));
7602    return Usgn ? Builder.CreateFPToUI(Ops[0], Ty, "vcvt")
7603                : Builder.CreateFPToSI(Ops[0], Ty, "vcvt");
7604  }
7605  case NEON::BI__builtin_neon_vcvta_s16_f16:
7606  case NEON::BI__builtin_neon_vcvta_s32_v:
7607  case NEON::BI__builtin_neon_vcvta_s64_v:
7608  case NEON::BI__builtin_neon_vcvta_u16_f16:
7609  case NEON::BI__builtin_neon_vcvta_u32_v:
7610  case NEON::BI__builtin_neon_vcvta_u64_v:
7611  case NEON::BI__builtin_neon_vcvtaq_s16_f16:
7612  case NEON::BI__builtin_neon_vcvtaq_s32_v:
7613  case NEON::BI__builtin_neon_vcvtaq_s64_v:
7614  case NEON::BI__builtin_neon_vcvtaq_u16_f16:
7615  case NEON::BI__builtin_neon_vcvtaq_u32_v:
7616  case NEON::BI__builtin_neon_vcvtaq_u64_v:
7617  case NEON::BI__builtin_neon_vcvtn_s16_f16:
7618  case NEON::BI__builtin_neon_vcvtn_s32_v:
7619  case NEON::BI__builtin_neon_vcvtn_s64_v:
7620  case NEON::BI__builtin_neon_vcvtn_u16_f16:
7621  case NEON::BI__builtin_neon_vcvtn_u32_v:
7622  case NEON::BI__builtin_neon_vcvtn_u64_v:
7623  case NEON::BI__builtin_neon_vcvtnq_s16_f16:
7624  case NEON::BI__builtin_neon_vcvtnq_s32_v:
7625  case NEON::BI__builtin_neon_vcvtnq_s64_v:
7626  case NEON::BI__builtin_neon_vcvtnq_u16_f16:
7627  case NEON::BI__builtin_neon_vcvtnq_u32_v:
7628  case NEON::BI__builtin_neon_vcvtnq_u64_v:
7629  case NEON::BI__builtin_neon_vcvtp_s16_f16:
7630  case NEON::BI__builtin_neon_vcvtp_s32_v:
7631  case NEON::BI__builtin_neon_vcvtp_s64_v:
7632  case NEON::BI__builtin_neon_vcvtp_u16_f16:
7633  case NEON::BI__builtin_neon_vcvtp_u32_v:
7634  case NEON::BI__builtin_neon_vcvtp_u64_v:
7635  case NEON::BI__builtin_neon_vcvtpq_s16_f16:
7636  case NEON::BI__builtin_neon_vcvtpq_s32_v:
7637  case NEON::BI__builtin_neon_vcvtpq_s64_v:
7638  case NEON::BI__builtin_neon_vcvtpq_u16_f16:
7639  case NEON::BI__builtin_neon_vcvtpq_u32_v:
7640  case NEON::BI__builtin_neon_vcvtpq_u64_v:
7641  case NEON::BI__builtin_neon_vcvtm_s16_f16:
7642  case NEON::BI__builtin_neon_vcvtm_s32_v:
7643  case NEON::BI__builtin_neon_vcvtm_s64_v:
7644  case NEON::BI__builtin_neon_vcvtm_u16_f16:
7645  case NEON::BI__builtin_neon_vcvtm_u32_v:
7646  case NEON::BI__builtin_neon_vcvtm_u64_v:
7647  case NEON::BI__builtin_neon_vcvtmq_s16_f16:
7648  case NEON::BI__builtin_neon_vcvtmq_s32_v:
7649  case NEON::BI__builtin_neon_vcvtmq_s64_v:
7650  case NEON::BI__builtin_neon_vcvtmq_u16_f16:
7651  case NEON::BI__builtin_neon_vcvtmq_u32_v:
7652  case NEON::BI__builtin_neon_vcvtmq_u64_v: {
7653    llvm::Type *Tys[2] = { Ty, GetFloatNeonType(this, Type) };
7654    return EmitNeonCall(CGM.getIntrinsic(LLVMIntrinsic, Tys), Ops, NameHint);
7655  }
7656  case NEON::BI__builtin_neon_vcvtx_f32_v: {
7657    llvm::Type *Tys[2] = { VTy->getTruncatedElementVectorType(VTy), Ty};
7658    return EmitNeonCall(CGM.getIntrinsic(LLVMIntrinsic, Tys), Ops, NameHint);
7659
7660  }
7661  case NEON::BI__builtin_neon_vext_v:
7662  case NEON::BI__builtin_neon_vextq_v: {
7663    int CV = cast<ConstantInt>(Ops[2])->getSExtValue();
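    // vext concatenates the two inputs and extracts VTy elements starting at
    // lane CV, e.g. for <8 x i8> with CV == 3 the shuffle mask is <3, ..., 10>.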
7664    SmallVector<int, 16> Indices;
7665    for (unsigned i = 0, e = VTy->getNumElements(); i != e; ++i)
7666      Indices.push_back(i+CV);
7667
7668    Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
7669    Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
7670    return Builder.CreateShuffleVector(Ops[0], Ops[1], Indices, "vext");
7671  }
7672  case NEON::BI__builtin_neon_vfma_v:
7673  case NEON::BI__builtin_neon_vfmaq_v: {
7674    Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
7675    Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
7676    Ops[2] = Builder.CreateBitCast(Ops[2], Ty);
7677
7678    // The NEON intrinsic puts the accumulator first, unlike the LLVM fma intrinsic.
7679    return emitCallMaybeConstrainedFPBuiltin(
7680        *this, Intrinsic::fma, Intrinsic::experimental_constrained_fma, Ty,
7681        {Ops[1], Ops[2], Ops[0]});
7682  }
7683  case NEON::BI__builtin_neon_vld1_v:
7684  case NEON::BI__builtin_neon_vld1q_v: {
7685    llvm::Type *Tys[] = {Ty, Int8PtrTy};
7686    Ops.push_back(getAlignmentValue32(PtrOp0));
7687    return EmitNeonCall(CGM.getIntrinsic(LLVMIntrinsic, Tys), Ops, "vld1");
7688  }
7689  case NEON::BI__builtin_neon_vld1_x2_v:
7690  case NEON::BI__builtin_neon_vld1q_x2_v:
7691  case NEON::BI__builtin_neon_vld1_x3_v:
7692  case NEON::BI__builtin_neon_vld1q_x3_v:
7693  case NEON::BI__builtin_neon_vld1_x4_v:
7694  case NEON::BI__builtin_neon_vld1q_x4_v: {
7695    llvm::Type *Tys[2] = {VTy, UnqualPtrTy};
7696    Function *F = CGM.getIntrinsic(LLVMIntrinsic, Tys);
7697    Ops[1] = Builder.CreateCall(F, Ops[1], "vld1xN");
7698    return Builder.CreateDefaultAlignedStore(Ops[1], Ops[0]);
7699  }
7700  case NEON::BI__builtin_neon_vld2_v:
7701  case NEON::BI__builtin_neon_vld2q_v:
7702  case NEON::BI__builtin_neon_vld3_v:
7703  case NEON::BI__builtin_neon_vld3q_v:
7704  case NEON::BI__builtin_neon_vld4_v:
7705  case NEON::BI__builtin_neon_vld4q_v:
7706  case NEON::BI__builtin_neon_vld2_dup_v:
7707  case NEON::BI__builtin_neon_vld2q_dup_v:
7708  case NEON::BI__builtin_neon_vld3_dup_v:
7709  case NEON::BI__builtin_neon_vld3q_dup_v:
7710  case NEON::BI__builtin_neon_vld4_dup_v:
7711  case NEON::BI__builtin_neon_vld4q_dup_v: {
7712    llvm::Type *Tys[] = {Ty, Int8PtrTy};
7713    Function *F = CGM.getIntrinsic(LLVMIntrinsic, Tys);
7714    Value *Align = getAlignmentValue32(PtrOp1);
7715    Ops[1] = Builder.CreateCall(F, {Ops[1], Align}, NameHint);
7716    return Builder.CreateDefaultAlignedStore(Ops[1], Ops[0]);
7717  }
7718  case NEON::BI__builtin_neon_vld1_dup_v:
7719  case NEON::BI__builtin_neon_vld1q_dup_v: {
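    // Load a single element, insert it into lane 0 of a poison vector, and
    // splat it across all lanes.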
7720    Value *V = PoisonValue::get(Ty);
7721    PtrOp0 = PtrOp0.withElementType(VTy->getElementType());
7722    LoadInst *Ld = Builder.CreateLoad(PtrOp0);
7723    llvm::Constant *CI = ConstantInt::get(SizeTy, 0);
7724    Ops[0] = Builder.CreateInsertElement(V, Ld, CI);
7725    return EmitNeonSplat(Ops[0], CI);
7726  }
7727  case NEON::BI__builtin_neon_vld2_lane_v:
7728  case NEON::BI__builtin_neon_vld2q_lane_v:
7729  case NEON::BI__builtin_neon_vld3_lane_v:
7730  case NEON::BI__builtin_neon_vld3q_lane_v:
7731  case NEON::BI__builtin_neon_vld4_lane_v:
7732  case NEON::BI__builtin_neon_vld4q_lane_v: {
7733    llvm::Type *Tys[] = {Ty, Int8PtrTy};
7734    Function *F = CGM.getIntrinsic(LLVMIntrinsic, Tys);
7735    for (unsigned I = 2; I < Ops.size() - 1; ++I)
7736      Ops[I] = Builder.CreateBitCast(Ops[I], Ty);
7737    Ops.push_back(getAlignmentValue32(PtrOp1));
7738    Ops[1] = Builder.CreateCall(F, ArrayRef(Ops).slice(1), NameHint);
7739    return Builder.CreateDefaultAlignedStore(Ops[1], Ops[0]);
7740  }
7741  case NEON::BI__builtin_neon_vmovl_v: {
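    // vmovl widens each lane: reinterpret the operand as the narrow vector
    // type, then sign- or zero-extend it to the full-width result.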
7742    llvm::FixedVectorType *DTy =
7743        llvm::FixedVectorType::getTruncatedElementVectorType(VTy);
7744    Ops[0] = Builder.CreateBitCast(Ops[0], DTy);
7745    if (Usgn)
7746      return Builder.CreateZExt(Ops[0], Ty, "vmovl");
7747    return Builder.CreateSExt(Ops[0], Ty, "vmovl");
7748  }
7749  case NEON::BI__builtin_neon_vmovn_v: {
7750    llvm::FixedVectorType *QTy =
7751        llvm::FixedVectorType::getExtendedElementVectorType(VTy);
7752    Ops[0] = Builder.CreateBitCast(Ops[0], QTy);
7753    return Builder.CreateTrunc(Ops[0], Ty, "vmovn");
7754  }
7755  case NEON::BI__builtin_neon_vmull_v:
7756    // FIXME: the integer vmull operations could be emitted in terms of pure
7757    // LLVM IR (2 exts followed by a mul). Unfortunately LLVM has a habit of
7758    // hoisting the exts outside loops. Until global ISel comes along that can
7759    // see through such movement this leads to bad CodeGen. So we need an
7760    // intrinsic for now.
7761    Int = Usgn ? Intrinsic::arm_neon_vmullu : Intrinsic::arm_neon_vmulls;
7762    Int = Type.isPoly() ? (unsigned)Intrinsic::arm_neon_vmullp : Int;
7763    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vmull");
7764  case NEON::BI__builtin_neon_vpadal_v:
7765  case NEON::BI__builtin_neon_vpadalq_v: {
7766    // The source operand type has twice as many elements of half the size.
7767    unsigned EltBits = VTy->getElementType()->getPrimitiveSizeInBits();
7768    llvm::Type *EltTy =
7769      llvm::IntegerType::get(getLLVMContext(), EltBits / 2);
7770    auto *NarrowTy =
7771        llvm::FixedVectorType::get(EltTy, VTy->getNumElements() * 2);
7772    llvm::Type *Tys[2] = { Ty, NarrowTy };
7773    return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, NameHint);
7774  }
7775  case NEON::BI__builtin_neon_vpaddl_v:
7776  case NEON::BI__builtin_neon_vpaddlq_v: {
7777    // The source operand type has twice as many elements of half the size.
7778    unsigned EltBits = VTy->getElementType()->getPrimitiveSizeInBits();
7779    llvm::Type *EltTy = llvm::IntegerType::get(getLLVMContext(), EltBits / 2);
7780    auto *NarrowTy =
7781        llvm::FixedVectorType::get(EltTy, VTy->getNumElements() * 2);
7782    llvm::Type *Tys[2] = { Ty, NarrowTy };
7783    return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vpaddl");
7784  }
7785  case NEON::BI__builtin_neon_vqdmlal_v:
7786  case NEON::BI__builtin_neon_vqdmlsl_v: {
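    // Emit the saturating doubling multiply-long on operands 1 and 2 first,
    // then fold the product into the accumulator with the alternate
    // (saturating add/subtract) intrinsic.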
7787    SmallVector<Value *, 2> MulOps(Ops.begin() + 1, Ops.end());
7788    Ops[1] =
7789        EmitNeonCall(CGM.getIntrinsic(LLVMIntrinsic, Ty), MulOps, "vqdmlal");
7790    Ops.resize(2);
7791    return EmitNeonCall(CGM.getIntrinsic(AltLLVMIntrinsic, Ty), Ops, NameHint);
7792  }
7793  case NEON::BI__builtin_neon_vqdmulhq_lane_v:
7794  case NEON::BI__builtin_neon_vqdmulh_lane_v:
7795  case NEON::BI__builtin_neon_vqrdmulhq_lane_v:
7796  case NEON::BI__builtin_neon_vqrdmulh_lane_v: {
7797    auto *RTy = cast<llvm::FixedVectorType>(Ty);
7798    if (BuiltinID == NEON::BI__builtin_neon_vqdmulhq_lane_v ||
7799        BuiltinID == NEON::BI__builtin_neon_vqrdmulhq_lane_v)
7800      RTy = llvm::FixedVectorType::get(RTy->getElementType(),
7801                                       RTy->getNumElements() * 2);
7802    llvm::Type *Tys[2] = {
7803        RTy, GetNeonType(this, NeonTypeFlags(Type.getEltType(), false,
7804                                             /*isQuad*/ false))};
7805    return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, NameHint);
7806  }
7807  case NEON::BI__builtin_neon_vqdmulhq_laneq_v:
7808  case NEON::BI__builtin_neon_vqdmulh_laneq_v:
7809  case NEON::BI__builtin_neon_vqrdmulhq_laneq_v:
7810  case NEON::BI__builtin_neon_vqrdmulh_laneq_v: {
7811    llvm::Type *Tys[2] = {
7812        Ty, GetNeonType(this, NeonTypeFlags(Type.getEltType(), false,
7813                                            /*isQuad*/ true))};
7814    return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, NameHint);
7815  }
7816  case NEON::BI__builtin_neon_vqshl_n_v:
7817  case NEON::BI__builtin_neon_vqshlq_n_v:
7818    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vqshl_n",
7819                        1, false);
7820  case NEON::BI__builtin_neon_vqshlu_n_v:
7821  case NEON::BI__builtin_neon_vqshluq_n_v:
7822    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vqshlu_n",
7823                        1, false);
7824  case NEON::BI__builtin_neon_vrecpe_v:
7825  case NEON::BI__builtin_neon_vrecpeq_v:
7826  case NEON::BI__builtin_neon_vrsqrte_v:
7827  case NEON::BI__builtin_neon_vrsqrteq_v:
7828    Int = Ty->isFPOrFPVectorTy() ? LLVMIntrinsic : AltLLVMIntrinsic;
7829    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, NameHint);
7830  case NEON::BI__builtin_neon_vrndi_v:
7831  case NEON::BI__builtin_neon_vrndiq_v:
7832    Int = Builder.getIsFPConstrained()
7833              ? Intrinsic::experimental_constrained_nearbyint
7834              : Intrinsic::nearbyint;
7835    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, NameHint);
7836  case NEON::BI__builtin_neon_vrshr_n_v:
7837  case NEON::BI__builtin_neon_vrshrq_n_v:
7838    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrshr_n",
7839                        1, true);
7840  case NEON::BI__builtin_neon_vsha512hq_u64:
7841  case NEON::BI__builtin_neon_vsha512h2q_u64:
7842  case NEON::BI__builtin_neon_vsha512su0q_u64:
7843  case NEON::BI__builtin_neon_vsha512su1q_u64: {
7844    Function *F = CGM.getIntrinsic(Int);
7845    return EmitNeonCall(F, Ops, "");
7846  }
7847  case NEON::BI__builtin_neon_vshl_n_v:
7848  case NEON::BI__builtin_neon_vshlq_n_v:
7849    Ops[1] = EmitNeonShiftVector(Ops[1], Ty, false);
7850    return Builder.CreateShl(Builder.CreateBitCast(Ops[0],Ty), Ops[1],
7851                             "vshl_n");
7852  case NEON::BI__builtin_neon_vshll_n_v: {
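    // Widening shift left: extend the narrow source to the wide result type,
    // then shift left by the immediate vector.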
7853    llvm::FixedVectorType *SrcTy =
7854        llvm::FixedVectorType::getTruncatedElementVectorType(VTy);
7855    Ops[0] = Builder.CreateBitCast(Ops[0], SrcTy);
7856    if (Usgn)
7857      Ops[0] = Builder.CreateZExt(Ops[0], VTy);
7858    else
7859      Ops[0] = Builder.CreateSExt(Ops[0], VTy);
7860    Ops[1] = EmitNeonShiftVector(Ops[1], VTy, false);
7861    return Builder.CreateShl(Ops[0], Ops[1], "vshll_n");
7862  }
7863  case NEON::BI__builtin_neon_vshrn_n_v: {
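    // Narrowing shift right: shift the wide source right by the immediate
    // (logical for unsigned, arithmetic for signed), then truncate.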
7864    llvm::FixedVectorType *SrcTy =
7865        llvm::FixedVectorType::getExtendedElementVectorType(VTy);
7866    Ops[0] = Builder.CreateBitCast(Ops[0], SrcTy);
7867    Ops[1] = EmitNeonShiftVector(Ops[1], SrcTy, false);
7868    if (Usgn)
7869      Ops[0] = Builder.CreateLShr(Ops[0], Ops[1]);
7870    else
7871      Ops[0] = Builder.CreateAShr(Ops[0], Ops[1]);
7872    return Builder.CreateTrunc(Ops[0], Ty, "vshrn_n");
7873  }
7874  case NEON::BI__builtin_neon_vshr_n_v:
7875  case NEON::BI__builtin_neon_vshrq_n_v:
7876    return EmitNeonRShiftImm(Ops[0], Ops[1], Ty, Usgn, "vshr_n");
7877  case NEON::BI__builtin_neon_vst1_v:
7878  case NEON::BI__builtin_neon_vst1q_v:
7879  case NEON::BI__builtin_neon_vst2_v:
7880  case NEON::BI__builtin_neon_vst2q_v:
7881  case NEON::BI__builtin_neon_vst3_v:
7882  case NEON::BI__builtin_neon_vst3q_v:
7883  case NEON::BI__builtin_neon_vst4_v:
7884  case NEON::BI__builtin_neon_vst4q_v:
7885  case NEON::BI__builtin_neon_vst2_lane_v:
7886  case NEON::BI__builtin_neon_vst2q_lane_v:
7887  case NEON::BI__builtin_neon_vst3_lane_v:
7888  case NEON::BI__builtin_neon_vst3q_lane_v:
7889  case NEON::BI__builtin_neon_vst4_lane_v:
7890  case NEON::BI__builtin_neon_vst4q_lane_v: {
7891    llvm::Type *Tys[] = {Int8PtrTy, Ty};
7892    Ops.push_back(getAlignmentValue32(PtrOp0));
7893    return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "");
7894  }
7895  case NEON::BI__builtin_neon_vsm3partw1q_u32:
7896  case NEON::BI__builtin_neon_vsm3partw2q_u32:
7897  case NEON::BI__builtin_neon_vsm3ss1q_u32:
7898  case NEON::BI__builtin_neon_vsm4ekeyq_u32:
7899  case NEON::BI__builtin_neon_vsm4eq_u32: {
7900    Function *F = CGM.getIntrinsic(Int);
7901    return EmitNeonCall(F, Ops, "");
7902  }
7903  case NEON::BI__builtin_neon_vsm3tt1aq_u32:
7904  case NEON::BI__builtin_neon_vsm3tt1bq_u32:
7905  case NEON::BI__builtin_neon_vsm3tt2aq_u32:
7906  case NEON::BI__builtin_neon_vsm3tt2bq_u32: {
7907    Function *F = CGM.getIntrinsic(Int);
7908    Ops[3] = Builder.CreateZExt(Ops[3], Int64Ty);
7909    return EmitNeonCall(F, Ops, "");
7910  }
7911  case NEON::BI__builtin_neon_vst1_x2_v:
7912  case NEON::BI__builtin_neon_vst1q_x2_v:
7913  case NEON::BI__builtin_neon_vst1_x3_v:
7914  case NEON::BI__builtin_neon_vst1q_x3_v:
7915  case NEON::BI__builtin_neon_vst1_x4_v:
7916  case NEON::BI__builtin_neon_vst1q_x4_v: {
7917    // TODO: Currently in AArch32 mode the pointer operand comes first, whereas
7918    // in AArch64 it comes last. We may want to standardise on one or the other.
7919    if (Arch == llvm::Triple::aarch64 || Arch == llvm::Triple::aarch64_be ||
7920        Arch == llvm::Triple::aarch64_32) {
7921      llvm::Type *Tys[2] = {VTy, UnqualPtrTy};
7922      std::rotate(Ops.begin(), Ops.begin() + 1, Ops.end());
7923      return EmitNeonCall(CGM.getIntrinsic(LLVMIntrinsic, Tys), Ops, "");
7924    }
7925    llvm::Type *Tys[2] = {UnqualPtrTy, VTy};
7926    return EmitNeonCall(CGM.getIntrinsic(LLVMIntrinsic, Tys), Ops, "");
7927  }
7928  case NEON::BI__builtin_neon_vsubhn_v: {
7929    llvm::FixedVectorType *SrcTy =
7930        llvm::FixedVectorType::getExtendedElementVectorType(VTy);
7931
7932    // %diff = sub <4 x i32> %lhs, %rhs
7933    Ops[0] = Builder.CreateBitCast(Ops[0], SrcTy);
7934    Ops[1] = Builder.CreateBitCast(Ops[1], SrcTy);
7935    Ops[0] = Builder.CreateSub(Ops[0], Ops[1], "vsubhn");
7936
7937    // %high = lshr <4 x i32> %diff, <i32 16, i32 16, i32 16, i32 16>
7938    Constant *ShiftAmt =
7939        ConstantInt::get(SrcTy, SrcTy->getScalarSizeInBits() / 2);
7940    Ops[0] = Builder.CreateLShr(Ops[0], ShiftAmt, "vsubhn");
7941
7942    // %res = trunc <4 x i32> %high to <4 x i16>
7943    return Builder.CreateTrunc(Ops[0], VTy, "vsubhn");
7944  }
7945  case NEON::BI__builtin_neon_vtrn_v:
7946  case NEON::BI__builtin_neon_vtrnq_v: {
7947    Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
7948    Ops[2] = Builder.CreateBitCast(Ops[2], Ty);
7949    Value *SV = nullptr;
7950
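    // Two shuffles build the transposed pair, e.g. for <4 x i16> the masks are
    // <0, 4, 2, 6> (vi == 0) and <1, 5, 3, 7> (vi == 1); each result is stored
    // to its slot of the two-vector result pointed to by Ops[0].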
7951    for (unsigned vi = 0; vi != 2; ++vi) {
7952      SmallVector<int, 16> Indices;
7953      for (unsigned i = 0, e = VTy->getNumElements(); i != e; i += 2) {
7954        Indices.push_back(i+vi);
7955        Indices.push_back(i+e+vi);
7956      }
7957      Value *Addr = Builder.CreateConstInBoundsGEP1_32(Ty, Ops[0], vi);
7958      SV = Builder.CreateShuffleVector(Ops[1], Ops[2], Indices, "vtrn");
7959      SV = Builder.CreateDefaultAlignedStore(SV, Addr);
7960    }
7961    return SV;
7962  }
7963  case NEON::BI__builtin_neon_vtst_v:
7964  case NEON::BI__builtin_neon_vtstq_v: {
7965    Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
7966    Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
7967    Ops[0] = Builder.CreateAnd(Ops[0], Ops[1]);
7968    Ops[0] = Builder.CreateICmp(ICmpInst::ICMP_NE, Ops[0],
7969                                ConstantAggregateZero::get(Ty));
7970    return Builder.CreateSExt(Ops[0], Ty, "vtst");
7971  }
7972  case NEON::BI__builtin_neon_vuzp_v:
7973  case NEON::BI__builtin_neon_vuzpq_v: {
7974    Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
7975    Ops[2] = Builder.CreateBitCast(Ops[2], Ty);
7976    Value *SV = nullptr;
7977
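    // vuzp de-interleaves: mask element 2*i+vi selects the even lanes of the
    // concatenated inputs for vi == 0 and the odd lanes for vi == 1.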
7978    for (unsigned vi = 0; vi != 2; ++vi) {
7979      SmallVector<int, 16> Indices;
7980      for (unsigned i = 0, e = VTy->getNumElements(); i != e; ++i)
7981        Indices.push_back(2*i+vi);
7982
7983      Value *Addr = Builder.CreateConstInBoundsGEP1_32(Ty, Ops[0], vi);
7984      SV = Builder.CreateShuffleVector(Ops[1], Ops[2], Indices, "vuzp");
7985      SV = Builder.CreateDefaultAlignedStore(SV, Addr);
7986    }
7987    return SV;
7988  }
7989  case NEON::BI__builtin_neon_vxarq_u64: {
7990    Function *F = CGM.getIntrinsic(Int);
7991    Ops[2] = Builder.CreateZExt(Ops[2], Int64Ty);
7992    return EmitNeonCall(F, Ops, "");
7993  }
7994  case NEON::BI__builtin_neon_vzip_v:
7995  case NEON::BI__builtin_neon_vzipq_v: {
7996    Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
7997    Ops[2] = Builder.CreateBitCast(Ops[2], Ty);
7998    Value *SV = nullptr;
7999
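    // vzip interleaves the two inputs, e.g. for <4 x i16> the masks are
    // <0, 4, 1, 5> (vi == 0) and <2, 6, 3, 7> (vi == 1).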
8000    for (unsigned vi = 0; vi != 2; ++vi) {
8001      SmallVector<int, 16> Indices;
8002      for (unsigned i = 0, e = VTy->getNumElements(); i != e; i += 2) {
8003        Indices.push_back((i + vi*e) >> 1);
8004        Indices.push_back(((i + vi*e) >> 1)+e);
8005      }
8006      Value *Addr = Builder.CreateConstInBoundsGEP1_32(Ty, Ops[0], vi);
8007      SV = Builder.CreateShuffleVector(Ops[1], Ops[2], Indices, "vzip");
8008      SV = Builder.CreateDefaultAlignedStore(SV, Addr);
8009    }
8010    return SV;
8011  }
8012  case NEON::BI__builtin_neon_vdot_s32:
8013  case NEON::BI__builtin_neon_vdot_u32:
8014  case NEON::BI__builtin_neon_vdotq_s32:
8015  case NEON::BI__builtin_neon_vdotq_u32: {
8016    auto *InputTy =
8017        llvm::FixedVectorType::get(Int8Ty, Ty->getPrimitiveSizeInBits() / 8);
8018    llvm::Type *Tys[2] = { Ty, InputTy };
8019    return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vdot");
8020  }
8021  case NEON::BI__builtin_neon_vfmlal_low_f16:
8022  case NEON::BI__builtin_neon_vfmlalq_low_f16: {
8023    auto *InputTy =
8024        llvm::FixedVectorType::get(HalfTy, Ty->getPrimitiveSizeInBits() / 16);
8025    llvm::Type *Tys[2] = { Ty, InputTy };
8026    return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vfmlal_low");
8027  }
8028  case NEON::BI__builtin_neon_vfmlsl_low_f16:
8029  case NEON::BI__builtin_neon_vfmlslq_low_f16: {
8030    auto *InputTy =
8031        llvm::FixedVectorType::get(HalfTy, Ty->getPrimitiveSizeInBits() / 16);
8032    llvm::Type *Tys[2] = { Ty, InputTy };
8033    return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vfmlsl_low");
8034  }
8035  case NEON::BI__builtin_neon_vfmlal_high_f16:
8036  case NEON::BI__builtin_neon_vfmlalq_high_f16: {
8037    auto *InputTy =
8038        llvm::FixedVectorType::get(HalfTy, Ty->getPrimitiveSizeInBits() / 16);
8039    llvm::Type *Tys[2] = { Ty, InputTy };
8040    return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vfmlal_high");
8041  }
8042  case NEON::BI__builtin_neon_vfmlsl_high_f16:
8043  case NEON::BI__builtin_neon_vfmlslq_high_f16: {
8044    auto *InputTy =
8045        llvm::FixedVectorType::get(HalfTy, Ty->getPrimitiveSizeInBits() / 16);
8046    llvm::Type *Tys[2] = { Ty, InputTy };
8047    return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vfmlsl_high");
8048  }
8049  case NEON::BI__builtin_neon_vmmlaq_s32:
8050  case NEON::BI__builtin_neon_vmmlaq_u32: {
8051    auto *InputTy =
8052        llvm::FixedVectorType::get(Int8Ty, Ty->getPrimitiveSizeInBits() / 8);
8053    llvm::Type *Tys[2] = { Ty, InputTy };
8054    return EmitNeonCall(CGM.getIntrinsic(LLVMIntrinsic, Tys), Ops, "vmmla");
8055  }
8056  case NEON::BI__builtin_neon_vusmmlaq_s32: {
8057    auto *InputTy =
8058        llvm::FixedVectorType::get(Int8Ty, Ty->getPrimitiveSizeInBits() / 8);
8059    llvm::Type *Tys[2] = { Ty, InputTy };
8060    return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vusmmla");
8061  }
8062  case NEON::BI__builtin_neon_vusdot_s32:
8063  case NEON::BI__builtin_neon_vusdotq_s32: {
8064    auto *InputTy =
8065        llvm::FixedVectorType::get(Int8Ty, Ty->getPrimitiveSizeInBits() / 8);
8066    llvm::Type *Tys[2] = { Ty, InputTy };
8067    return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vusdot");
8068  }
8069  case NEON::BI__builtin_neon_vbfdot_f32:
8070  case NEON::BI__builtin_neon_vbfdotq_f32: {
8071    llvm::Type *InputTy =
8072        llvm::FixedVectorType::get(BFloatTy, Ty->getPrimitiveSizeInBits() / 16);
8073    llvm::Type *Tys[2] = { Ty, InputTy };
8074    return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vbfdot");
8075  }
8076  case NEON::BI__builtin_neon___a32_vcvt_bf16_f32: {
8077    llvm::Type *Tys[1] = { Ty };
8078    Function *F = CGM.getIntrinsic(Int, Tys);
8079    return EmitNeonCall(F, Ops, "vcvtfp2bf");
8080  }
8081
8082  }
8083
8084  assert(Int && "Expected valid intrinsic number");
8085
8086  // Determine the type(s) of this overloaded AArch64 intrinsic.
8087  Function *F = LookupNeonLLVMIntrinsic(Int, Modifier, Ty, E);
8088
8089  Value *Result = EmitNeonCall(F, Ops, NameHint);
8090  llvm::Type *ResultType = ConvertType(E->getType());
8091  // The AArch64 intrinsic may return a one-element vector; bitcast it to the
8092  // scalar type expected by the builtin.
8093  return Builder.CreateBitCast(Result, ResultType, NameHint);
8094}
8095
8096Value *CodeGenFunction::EmitAArch64CompareBuiltinExpr(
8097    Value *Op, llvm::Type *Ty, const CmpInst::Predicate Fp,
8098    const CmpInst::Predicate Ip, const Twine &Name) {
8099  llvm::Type *OTy = Op->getType();
8100
8101  // FIXME: this is utterly horrific. We should not be looking at previous
8102  // codegen context to find out what needs doing. Unfortunately TableGen
8103  // currently gives us exactly the same calls for vceqz_f32 and vceqz_s32
8104  // (etc).
8105  if (BitCastInst *BI = dyn_cast<BitCastInst>(Op))
8106    OTy = BI->getOperand(0)->getType();
8107
8108  Op = Builder.CreateBitCast(Op, OTy);
8109  if (OTy->getScalarType()->isFloatingPointTy()) {
8110    if (Fp == CmpInst::FCMP_OEQ)
8111      Op = Builder.CreateFCmp(Fp, Op, Constant::getNullValue(OTy));
8112    else
8113      Op = Builder.CreateFCmpS(Fp, Op, Constant::getNullValue(OTy));
8114  } else {
8115    Op = Builder.CreateICmp(Ip, Op, Constant::getNullValue(OTy));
8116  }
8117  return Builder.CreateSExt(Op, Ty, Name);
8118}
8119
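// Pack a list of 64-bit NEON table registers into the operands of a TBL/TBX
// intrinsic: adjacent pairs are concatenated with a shuffle, an odd trailing
// register is padded with zeroes, and the index operand is appended last.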
8120static Value *packTBLDVectorList(CodeGenFunction &CGF, ArrayRef<Value *> Ops,
8121                                 Value *ExtOp, Value *IndexOp,
8122                                 llvm::Type *ResTy, unsigned IntID,
8123                                 const char *Name) {
8124  SmallVector<Value *, 2> TblOps;
8125  if (ExtOp)
8126    TblOps.push_back(ExtOp);
8127
8128  // Build a vector containing sequential numbers like (0, 1, 2, ..., 15).
8129  SmallVector<int, 16> Indices;
8130  auto *TblTy = cast<llvm::FixedVectorType>(Ops[0]->getType());
8131  for (unsigned i = 0, e = TblTy->getNumElements(); i != e; ++i) {
8132    Indices.push_back(2*i);
8133    Indices.push_back(2*i+1);
8134  }
8135
8136  int PairPos = 0, End = Ops.size() - 1;
8137  while (PairPos < End) {
8138    TblOps.push_back(CGF.Builder.CreateShuffleVector(Ops[PairPos],
8139                                                     Ops[PairPos+1], Indices,
8140                                                     Name));
8141    PairPos += 2;
8142  }
8143
8144  // If there's an odd number of 64-bit lookup tables, fill the high 64 bits
8145  // of the final 128-bit lookup table with zero.
8146  if (PairPos == End) {
8147    Value *ZeroTbl = ConstantAggregateZero::get(TblTy);
8148    TblOps.push_back(CGF.Builder.CreateShuffleVector(Ops[PairPos],
8149                                                     ZeroTbl, Indices, Name));
8150  }
8151
8152  Function *TblF;
8153  TblOps.push_back(IndexOp);
8154  TblF = CGF.CGM.getIntrinsic(IntID, ResTy);
8155
8156  return CGF.EmitNeonCall(TblF, TblOps, Name);
8157}
8158
8159Value *CodeGenFunction::GetValueForARMHint(unsigned BuiltinID) {
8160  unsigned Value;
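  // The value below is the immediate operand of the ARM hint instruction
  // (NOP = 0, YIELD = 1, WFE = 2, WFI = 3, SEV = 4, SEVL = 5).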
8161  switch (BuiltinID) {
8162  default:
8163    return nullptr;
8164  case clang::ARM::BI__builtin_arm_nop:
8165    Value = 0;
8166    break;
8167  case clang::ARM::BI__builtin_arm_yield:
8168  case clang::ARM::BI__yield:
8169    Value = 1;
8170    break;
8171  case clang::ARM::BI__builtin_arm_wfe:
8172  case clang::ARM::BI__wfe:
8173    Value = 2;
8174    break;
8175  case clang::ARM::BI__builtin_arm_wfi:
8176  case clang::ARM::BI__wfi:
8177    Value = 3;
8178    break;
8179  case clang::ARM::BI__builtin_arm_sev:
8180  case clang::ARM::BI__sev:
8181    Value = 4;
8182    break;
8183  case clang::ARM::BI__builtin_arm_sevl:
8184  case clang::ARM::BI__sevl:
8185    Value = 5;
8186    break;
8187  }
8188
8189  return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::arm_hint),
8190                            llvm::ConstantInt::get(Int32Ty, Value));
8191}
8192
8193enum SpecialRegisterAccessKind {
8194  NormalRead,
8195  VolatileRead,
8196  Write,
8197};
8198
8199// Generates the IR for __builtin_read_exec_*.
8200// Lowers the builtin to the amdgcn_ballot intrinsic.
8201static Value *EmitAMDGCNBallotForExec(CodeGenFunction &CGF, const CallExpr *E,
8202                                      llvm::Type *RegisterType,
8203                                      llvm::Type *ValueType, bool isExecHi) {
8204  CodeGen::CGBuilderTy &Builder = CGF.Builder;
8205  CodeGen::CodeGenModule &CGM = CGF.CGM;
8206
8207  Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_ballot, {RegisterType});
8208  llvm::Value *Call = Builder.CreateCall(F, {Builder.getInt1(true)});
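  // A ballot over a constant-true predicate yields the current exec mask; the
  // *_hi variant then extracts the upper 32 bits.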
8209
8210  if (isExecHi) {
8211    Value *Rt2 = Builder.CreateLShr(Call, 32);
8212    Rt2 = Builder.CreateTrunc(Rt2, CGF.Int32Ty);
8213    return Rt2;
8214  }
8215
8216  return Call;
8217}
8218
8219// Generates the IR for the read/write special register builtin.
8220// ValueType is the type of the value to be written or read;
8221// RegisterType is the type of the register being written to or read from.
8222static Value *EmitSpecialRegisterBuiltin(CodeGenFunction &CGF,
8223                                         const CallExpr *E,
8224                                         llvm::Type *RegisterType,
8225                                         llvm::Type *ValueType,
8226                                         SpecialRegisterAccessKind AccessKind,
8227                                         StringRef SysReg = "") {
8228  // The read/write register intrinsics only support 32-, 64- and 128-bit operations.
8229  assert((RegisterType->isIntegerTy(32) || RegisterType->isIntegerTy(64) ||
8230          RegisterType->isIntegerTy(128)) &&
8231         "Unsupported size for register.");
8232
8233  CodeGen::CGBuilderTy &Builder = CGF.Builder;
8234  CodeGen::CodeGenModule &CGM = CGF.CGM;
8235  LLVMContext &Context = CGM.getLLVMContext();
8236
8237  if (SysReg.empty()) {
8238    const Expr *SysRegStrExpr = E->getArg(0)->IgnoreParenCasts();
8239    SysReg = cast<clang::StringLiteral>(SysRegStrExpr)->getString();
8240  }
8241
8242  llvm::Metadata *Ops[] = { llvm::MDString::get(Context, SysReg) };
8243  llvm::MDNode *RegName = llvm::MDNode::get(Context, Ops);
8244  llvm::Value *Metadata = llvm::MetadataAsValue::get(Context, RegName);
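  // The register is identified by a metadata string, so e.g. a 64-bit read of
  // "sp" becomes roughly: call i64 @llvm.read_register.i64(metadata !{!"sp"})
  // (illustrative IR; the actual types follow RegisterType).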
8245
8246  llvm::Type *Types[] = { RegisterType };
8247
8248  bool MixedTypes = RegisterType->isIntegerTy(64) && ValueType->isIntegerTy(32);
8249  assert(!(RegisterType->isIntegerTy(32) && ValueType->isIntegerTy(64))
8250            && "Can't fit 64-bit value in 32-bit register");
8251
8252  if (AccessKind != Write) {
8253    assert(AccessKind == NormalRead || AccessKind == VolatileRead);
8254    llvm::Function *F = CGM.getIntrinsic(
8255        AccessKind == VolatileRead ? llvm::Intrinsic::read_volatile_register
8256                                   : llvm::Intrinsic::read_register,
8257        Types);
8258    llvm::Value *Call = Builder.CreateCall(F, Metadata);
8259
8260    if (MixedTypes)
8261      // Read the 64-bit register, then truncate the result to 32 bits.
8262      return Builder.CreateTrunc(Call, ValueType);
8263
8264    if (ValueType->isPointerTy())
8265      // Have i32/i64 result (Call) but want to return a VoidPtrTy (i8*).
8266      return Builder.CreateIntToPtr(Call, ValueType);
8267
8268    return Call;
8269  }
8270
8271  llvm::Function *F = CGM.getIntrinsic(llvm::Intrinsic::write_register, Types);
8272  llvm::Value *ArgValue = CGF.EmitScalarExpr(E->getArg(1));
8273  if (MixedTypes) {
8274    // Extend the 32-bit write value to 64 bits before passing it to write_register.
8275    ArgValue = Builder.CreateZExt(ArgValue, RegisterType);
8276    return Builder.CreateCall(F, { Metadata, ArgValue });
8277  }
8278
8279  if (ValueType->isPointerTy()) {
8280    // Have a VoidPtrTy ArgValue but the register write expects an i32/i64.
8281    ArgValue = Builder.CreatePtrToInt(ArgValue, RegisterType);
8282    return Builder.CreateCall(F, { Metadata, ArgValue });
8283  }
8284
8285  return Builder.CreateCall(F, { Metadata, ArgValue });
8286}
8287
8288/// Return true if BuiltinID is an overloaded Neon intrinsic with an extra
8289/// argument that specifies the vector type.
8290static bool HasExtraNeonArgument(unsigned BuiltinID) {
8291  switch (BuiltinID) {
8292  default: break;
8293  case NEON::BI__builtin_neon_vget_lane_i8:
8294  case NEON::BI__builtin_neon_vget_lane_i16:
8295  case NEON::BI__builtin_neon_vget_lane_bf16:
8296  case NEON::BI__builtin_neon_vget_lane_i32:
8297  case NEON::BI__builtin_neon_vget_lane_i64:
8298  case NEON::BI__builtin_neon_vget_lane_f32:
8299  case NEON::BI__builtin_neon_vgetq_lane_i8:
8300  case NEON::BI__builtin_neon_vgetq_lane_i16:
8301  case NEON::BI__builtin_neon_vgetq_lane_bf16:
8302  case NEON::BI__builtin_neon_vgetq_lane_i32:
8303  case NEON::BI__builtin_neon_vgetq_lane_i64:
8304  case NEON::BI__builtin_neon_vgetq_lane_f32:
8305  case NEON::BI__builtin_neon_vduph_lane_bf16:
8306  case NEON::BI__builtin_neon_vduph_laneq_bf16:
8307  case NEON::BI__builtin_neon_vset_lane_i8:
8308  case NEON::BI__builtin_neon_vset_lane_i16:
8309  case NEON::BI__builtin_neon_vset_lane_bf16:
8310  case NEON::BI__builtin_neon_vset_lane_i32:
8311  case NEON::BI__builtin_neon_vset_lane_i64:
8312  case NEON::BI__builtin_neon_vset_lane_f32:
8313  case NEON::BI__builtin_neon_vsetq_lane_i8:
8314  case NEON::BI__builtin_neon_vsetq_lane_i16:
8315  case NEON::BI__builtin_neon_vsetq_lane_bf16:
8316  case NEON::BI__builtin_neon_vsetq_lane_i32:
8317  case NEON::BI__builtin_neon_vsetq_lane_i64:
8318  case NEON::BI__builtin_neon_vsetq_lane_f32:
8319  case NEON::BI__builtin_neon_vsha1h_u32:
8320  case NEON::BI__builtin_neon_vsha1cq_u32:
8321  case NEON::BI__builtin_neon_vsha1pq_u32:
8322  case NEON::BI__builtin_neon_vsha1mq_u32:
8323  case NEON::BI__builtin_neon_vcvth_bf16_f32:
8324  case clang::ARM::BI_MoveToCoprocessor:
8325  case clang::ARM::BI_MoveToCoprocessor2:
8326    return false;
8327  }
8328  return true;
8329}
8330
8331Value *CodeGenFunction::EmitARMBuiltinExpr(unsigned BuiltinID,
8332                                           const CallExpr *E,
8333                                           ReturnValueSlot ReturnValue,
8334                                           llvm::Triple::ArchType Arch) {
8335  if (auto Hint = GetValueForARMHint(BuiltinID))
8336    return Hint;
8337
8338  if (BuiltinID == clang::ARM::BI__emit) {
8339    bool IsThumb = getTarget().getTriple().getArch() == llvm::Triple::thumb;
8340    llvm::FunctionType *FTy =
8341        llvm::FunctionType::get(VoidTy, /*Variadic=*/false);
8342
8343    Expr::EvalResult Result;
8344    if (!E->getArg(0)->EvaluateAsInt(Result, CGM.getContext()))
8345      llvm_unreachable("Sema will ensure that the parameter is constant");
8346
8347    llvm::APSInt Value = Result.Val.getInt();
8348    uint64_t ZExtValue = Value.zextOrTrunc(IsThumb ? 16 : 32).getZExtValue();
8349
8350    llvm::InlineAsm *Emit =
8351        IsThumb ? InlineAsm::get(FTy, ".inst.n 0x" + utohexstr(ZExtValue), "",
8352                                 /*hasSideEffects=*/true)
8353                : InlineAsm::get(FTy, ".inst 0x" + utohexstr(ZExtValue), "",
8354                                 /*hasSideEffects=*/true);
8355
8356    return Builder.CreateCall(Emit);
8357  }
8358
8359  if (BuiltinID == clang::ARM::BI__builtin_arm_dbg) {
8360    Value *Option = EmitScalarExpr(E->getArg(0));
8361    return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::arm_dbg), Option);
8362  }
8363
8364  if (BuiltinID == clang::ARM::BI__builtin_arm_prefetch) {
8365    Value *Address = EmitScalarExpr(E->getArg(0));
8366    Value *RW      = EmitScalarExpr(E->getArg(1));
8367    Value *IsData  = EmitScalarExpr(E->getArg(2));
8368
8369    // Locality is not supported on the ARM target, so pass the maximum locality (3).
8370    Value *Locality = llvm::ConstantInt::get(Int32Ty, 3);
8371
8372    Function *F = CGM.getIntrinsic(Intrinsic::prefetch, Address->getType());
8373    return Builder.CreateCall(F, {Address, RW, Locality, IsData});
8374  }
8375
8376  if (BuiltinID == clang::ARM::BI__builtin_arm_rbit) {
8377    llvm::Value *Arg = EmitScalarExpr(E->getArg(0));
8378    return Builder.CreateCall(
8379        CGM.getIntrinsic(Intrinsic::bitreverse, Arg->getType()), Arg, "rbit");
8380  }
8381
8382  if (BuiltinID == clang::ARM::BI__builtin_arm_clz ||
8383      BuiltinID == clang::ARM::BI__builtin_arm_clz64) {
8384    llvm::Value *Arg = EmitScalarExpr(E->getArg(0));
8385    Function *F = CGM.getIntrinsic(Intrinsic::ctlz, Arg->getType());
8386    Value *Res = Builder.CreateCall(F, {Arg, Builder.getInt1(false)});
8387    if (BuiltinID == clang::ARM::BI__builtin_arm_clz64)
8388      Res = Builder.CreateTrunc(Res, Builder.getInt32Ty());
8389    return Res;
8390  }
8391
8393  if (BuiltinID == clang::ARM::BI__builtin_arm_cls) {
8394    llvm::Value *Arg = EmitScalarExpr(E->getArg(0));
8395    return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::arm_cls), Arg, "cls");
8396  }
8397  if (BuiltinID == clang::ARM::BI__builtin_arm_cls64) {
8398    llvm::Value *Arg = EmitScalarExpr(E->getArg(0));
8399    return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::arm_cls64), Arg,
8400                              "cls");
8401  }
8402
8403  if (BuiltinID == clang::ARM::BI__clear_cache) {
8404    assert(E->getNumArgs() == 2 && "__clear_cache takes 2 arguments");
8405    const FunctionDecl *FD = E->getDirectCallee();
8406    Value *Ops[2];
8407    for (unsigned i = 0; i < 2; i++)
8408      Ops[i] = EmitScalarExpr(E->getArg(i));
8409    llvm::Type *Ty = CGM.getTypes().ConvertType(FD->getType());
8410    llvm::FunctionType *FTy = cast<llvm::FunctionType>(Ty);
8411    StringRef Name = FD->getName();
8412    return EmitNounwindRuntimeCall(CGM.CreateRuntimeFunction(FTy, Name), Ops);
8413  }
8414
8415  if (BuiltinID == clang::ARM::BI__builtin_arm_mcrr ||
8416      BuiltinID == clang::ARM::BI__builtin_arm_mcrr2) {
8417    Function *F;
8418
8419    switch (BuiltinID) {
8420    default: llvm_unreachable("unexpected builtin");
8421    case clang::ARM::BI__builtin_arm_mcrr:
8422      F = CGM.getIntrinsic(Intrinsic::arm_mcrr);
8423      break;
8424    case clang::ARM::BI__builtin_arm_mcrr2:
8425      F = CGM.getIntrinsic(Intrinsic::arm_mcrr2);
8426      break;
8427    }
8428
    // The MCRR{2} instruction has 5 operands, but the builtin takes only 4
    // because Rt and Rt2 are packed into a single unsigned 64-bit integer.
    // The LLVM intrinsic expects Rt and Rt2 as two separate 32-bit values,
    // so split the 64-bit argument before the call.
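    //
    // Illustrative example: a 64-bit argument 0x1111222233334444 is split
    // into Rt = 0x33334444 (low half) and Rt2 = 0x11112222 (high half).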
8435
8436    Value *Coproc = EmitScalarExpr(E->getArg(0));
8437    Value *Opc1 = EmitScalarExpr(E->getArg(1));
8438    Value *RtAndRt2 = EmitScalarExpr(E->getArg(2));
8439    Value *CRm = EmitScalarExpr(E->getArg(3));
8440
8441    Value *C1 = llvm::ConstantInt::get(Int64Ty, 32);
8442    Value *Rt = Builder.CreateTruncOrBitCast(RtAndRt2, Int32Ty);
8443    Value *Rt2 = Builder.CreateLShr(RtAndRt2, C1);
8444    Rt2 = Builder.CreateTruncOrBitCast(Rt2, Int32Ty);
8445
8446    return Builder.CreateCall(F, {Coproc, Opc1, Rt, Rt2, CRm});
8447  }
8448
8449  if (BuiltinID == clang::ARM::BI__builtin_arm_mrrc ||
8450      BuiltinID == clang::ARM::BI__builtin_arm_mrrc2) {
8451    Function *F;
8452
8453    switch (BuiltinID) {
8454    default: llvm_unreachable("unexpected builtin");
8455    case clang::ARM::BI__builtin_arm_mrrc:
8456      F = CGM.getIntrinsic(Intrinsic::arm_mrrc);
8457      break;
8458    case clang::ARM::BI__builtin_arm_mrrc2:
8459      F = CGM.getIntrinsic(Intrinsic::arm_mrrc2);
8460      break;
8461    }
8462
8463    Value *Coproc = EmitScalarExpr(E->getArg(0));
8464    Value *Opc1 = EmitScalarExpr(E->getArg(1));
8465    Value *CRm  = EmitScalarExpr(E->getArg(2));
8466    Value *RtAndRt2 = Builder.CreateCall(F, {Coproc, Opc1, CRm});
8467
    // The intrinsic returns an unsigned 64-bit value represented as two
    // 32-bit integers; recombine them into a single 64-bit result below.
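    // The struct's first element supplies the low 32 bits and the second
    // element the high 32 bits.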
8470
8471    Value *Rt = Builder.CreateExtractValue(RtAndRt2, 1);
8472    Value *Rt1 = Builder.CreateExtractValue(RtAndRt2, 0);
8473    Rt = Builder.CreateZExt(Rt, Int64Ty);
8474    Rt1 = Builder.CreateZExt(Rt1, Int64Ty);
8475
8476    Value *ShiftCast = llvm::ConstantInt::get(Int64Ty, 32);
8477    RtAndRt2 = Builder.CreateShl(Rt, ShiftCast, "shl", true);
8478    RtAndRt2 = Builder.CreateOr(RtAndRt2, Rt1);
8479
8480    return Builder.CreateBitCast(RtAndRt2, ConvertType(E->getType()));
8481  }
8482
8483  if (BuiltinID == clang::ARM::BI__builtin_arm_ldrexd ||
8484      ((BuiltinID == clang::ARM::BI__builtin_arm_ldrex ||
8485        BuiltinID == clang::ARM::BI__builtin_arm_ldaex) &&
8486       getContext().getTypeSize(E->getType()) == 64) ||
8487      BuiltinID == clang::ARM::BI__ldrexd) {
8488    Function *F;
8489
8490    switch (BuiltinID) {
8491    default: llvm_unreachable("unexpected builtin");
8492    case clang::ARM::BI__builtin_arm_ldaex:
8493      F = CGM.getIntrinsic(Intrinsic::arm_ldaexd);
8494      break;
8495    case clang::ARM::BI__builtin_arm_ldrexd:
8496    case clang::ARM::BI__builtin_arm_ldrex:
8497    case clang::ARM::BI__ldrexd:
8498      F = CGM.getIntrinsic(Intrinsic::arm_ldrexd);
8499      break;
8500    }
8501
8502    Value *LdPtr = EmitScalarExpr(E->getArg(0));
8503    Value *Val = Builder.CreateCall(F, LdPtr, "ldrexd");
8504
8505    Value *Val0 = Builder.CreateExtractValue(Val, 1);
8506    Value *Val1 = Builder.CreateExtractValue(Val, 0);
8507    Val0 = Builder.CreateZExt(Val0, Int64Ty);
8508    Val1 = Builder.CreateZExt(Val1, Int64Ty);
8509
8510    Value *ShiftCst = llvm::ConstantInt::get(Int64Ty, 32);
8511    Val = Builder.CreateShl(Val0, ShiftCst, "shl", true /* nuw */);
8512    Val = Builder.CreateOr(Val, Val1);
8513    return Builder.CreateBitCast(Val, ConvertType(E->getType()));
8514  }
8515
8516  if (BuiltinID == clang::ARM::BI__builtin_arm_ldrex ||
8517      BuiltinID == clang::ARM::BI__builtin_arm_ldaex) {
8518    Value *LoadAddr = EmitScalarExpr(E->getArg(0));
8519
8520    QualType Ty = E->getType();
8521    llvm::Type *RealResTy = ConvertType(Ty);
8522    llvm::Type *IntTy =
8523        llvm::IntegerType::get(getLLVMContext(), getContext().getTypeSize(Ty));
8524
8525    Function *F = CGM.getIntrinsic(
8526        BuiltinID == clang::ARM::BI__builtin_arm_ldaex ? Intrinsic::arm_ldaex
8527                                                       : Intrinsic::arm_ldrex,
8528        UnqualPtrTy);
8529    CallInst *Val = Builder.CreateCall(F, LoadAddr, "ldrex");
8530    Val->addParamAttr(
8531        0, Attribute::get(getLLVMContext(), Attribute::ElementType, IntTy));
8532
8533    if (RealResTy->isPointerTy())
8534      return Builder.CreateIntToPtr(Val, RealResTy);
8535    else {
8536      llvm::Type *IntResTy = llvm::IntegerType::get(
8537          getLLVMContext(), CGM.getDataLayout().getTypeSizeInBits(RealResTy));
8538      return Builder.CreateBitCast(Builder.CreateTruncOrBitCast(Val, IntResTy),
8539                                   RealResTy);
8540    }
8541  }
8542
8543  if (BuiltinID == clang::ARM::BI__builtin_arm_strexd ||
8544      ((BuiltinID == clang::ARM::BI__builtin_arm_stlex ||
8545        BuiltinID == clang::ARM::BI__builtin_arm_strex) &&
8546       getContext().getTypeSize(E->getArg(0)->getType()) == 64)) {
8547    Function *F = CGM.getIntrinsic(
8548        BuiltinID == clang::ARM::BI__builtin_arm_stlex ? Intrinsic::arm_stlexd
8549                                                       : Intrinsic::arm_strexd);
8550    llvm::Type *STy = llvm::StructType::get(Int32Ty, Int32Ty);
8551
8552    Address Tmp = CreateMemTemp(E->getArg(0)->getType());
8553    Value *Val = EmitScalarExpr(E->getArg(0));
8554    Builder.CreateStore(Val, Tmp);
8555
8556    Address LdPtr = Tmp.withElementType(STy);
8557    Val = Builder.CreateLoad(LdPtr);
8558
8559    Value *Arg0 = Builder.CreateExtractValue(Val, 0);
8560    Value *Arg1 = Builder.CreateExtractValue(Val, 1);
8561    Value *StPtr = EmitScalarExpr(E->getArg(1));
8562    return Builder.CreateCall(F, {Arg0, Arg1, StPtr}, "strexd");
8563  }
8564
8565  if (BuiltinID == clang::ARM::BI__builtin_arm_strex ||
8566      BuiltinID == clang::ARM::BI__builtin_arm_stlex) {
8567    Value *StoreVal = EmitScalarExpr(E->getArg(0));
8568    Value *StoreAddr = EmitScalarExpr(E->getArg(1));
8569
8570    QualType Ty = E->getArg(0)->getType();
8571    llvm::Type *StoreTy =
8572        llvm::IntegerType::get(getLLVMContext(), getContext().getTypeSize(Ty));
8573
8574    if (StoreVal->getType()->isPointerTy())
8575      StoreVal = Builder.CreatePtrToInt(StoreVal, Int32Ty);
8576    else {
8577      llvm::Type *IntTy = llvm::IntegerType::get(
8578          getLLVMContext(),
8579          CGM.getDataLayout().getTypeSizeInBits(StoreVal->getType()));
8580      StoreVal = Builder.CreateBitCast(StoreVal, IntTy);
8581      StoreVal = Builder.CreateZExtOrBitCast(StoreVal, Int32Ty);
8582    }
8583
8584    Function *F = CGM.getIntrinsic(
8585        BuiltinID == clang::ARM::BI__builtin_arm_stlex ? Intrinsic::arm_stlex
8586                                                       : Intrinsic::arm_strex,
8587        StoreAddr->getType());
8588
8589    CallInst *CI = Builder.CreateCall(F, {StoreVal, StoreAddr}, "strex");
8590    CI->addParamAttr(
8591        1, Attribute::get(getLLVMContext(), Attribute::ElementType, StoreTy));
8592    return CI;
8593  }
8594
8595  if (BuiltinID == clang::ARM::BI__builtin_arm_clrex) {
8596    Function *F = CGM.getIntrinsic(Intrinsic::arm_clrex);
8597    return Builder.CreateCall(F);
8598  }
8599
8600  // CRC32
8601  Intrinsic::ID CRCIntrinsicID = Intrinsic::not_intrinsic;
8602  switch (BuiltinID) {
8603  case clang::ARM::BI__builtin_arm_crc32b:
8604    CRCIntrinsicID = Intrinsic::arm_crc32b; break;
8605  case clang::ARM::BI__builtin_arm_crc32cb:
8606    CRCIntrinsicID = Intrinsic::arm_crc32cb; break;
8607  case clang::ARM::BI__builtin_arm_crc32h:
8608    CRCIntrinsicID = Intrinsic::arm_crc32h; break;
8609  case clang::ARM::BI__builtin_arm_crc32ch:
8610    CRCIntrinsicID = Intrinsic::arm_crc32ch; break;
8611  case clang::ARM::BI__builtin_arm_crc32w:
8612  case clang::ARM::BI__builtin_arm_crc32d:
8613    CRCIntrinsicID = Intrinsic::arm_crc32w; break;
8614  case clang::ARM::BI__builtin_arm_crc32cw:
8615  case clang::ARM::BI__builtin_arm_crc32cd:
8616    CRCIntrinsicID = Intrinsic::arm_crc32cw; break;
8617  }
8618
8619  if (CRCIntrinsicID != Intrinsic::not_intrinsic) {
8620    Value *Arg0 = EmitScalarExpr(E->getArg(0));
8621    Value *Arg1 = EmitScalarExpr(E->getArg(1));
8622
8623    // crc32{c,}d intrinsics are implemented as two calls to crc32{c,}w
8624    // intrinsics, hence we need different codegen for these cases.
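    // Conceptually: crc32d(a, x) == crc32w(crc32w(a, lo32(x)), hi32(x)), and
    // likewise crc32cd is built from two crc32cw calls.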
8625    if (BuiltinID == clang::ARM::BI__builtin_arm_crc32d ||
8626        BuiltinID == clang::ARM::BI__builtin_arm_crc32cd) {
8627      Value *C1 = llvm::ConstantInt::get(Int64Ty, 32);
8628      Value *Arg1a = Builder.CreateTruncOrBitCast(Arg1, Int32Ty);
8629      Value *Arg1b = Builder.CreateLShr(Arg1, C1);
8630      Arg1b = Builder.CreateTruncOrBitCast(Arg1b, Int32Ty);
8631
8632      Function *F = CGM.getIntrinsic(CRCIntrinsicID);
8633      Value *Res = Builder.CreateCall(F, {Arg0, Arg1a});
8634      return Builder.CreateCall(F, {Res, Arg1b});
8635    } else {
8636      Arg1 = Builder.CreateZExtOrBitCast(Arg1, Int32Ty);
8637
8638      Function *F = CGM.getIntrinsic(CRCIntrinsicID);
8639      return Builder.CreateCall(F, {Arg0, Arg1});
8640    }
8641  }
8642
8643  if (BuiltinID == clang::ARM::BI__builtin_arm_rsr ||
8644      BuiltinID == clang::ARM::BI__builtin_arm_rsr64 ||
8645      BuiltinID == clang::ARM::BI__builtin_arm_rsrp ||
8646      BuiltinID == clang::ARM::BI__builtin_arm_wsr ||
8647      BuiltinID == clang::ARM::BI__builtin_arm_wsr64 ||
8648      BuiltinID == clang::ARM::BI__builtin_arm_wsrp) {
8649
8650    SpecialRegisterAccessKind AccessKind = Write;
8651    if (BuiltinID == clang::ARM::BI__builtin_arm_rsr ||
8652        BuiltinID == clang::ARM::BI__builtin_arm_rsr64 ||
8653        BuiltinID == clang::ARM::BI__builtin_arm_rsrp)
8654      AccessKind = VolatileRead;
8655
8656    bool IsPointerBuiltin = BuiltinID == clang::ARM::BI__builtin_arm_rsrp ||
8657                            BuiltinID == clang::ARM::BI__builtin_arm_wsrp;
8658
8659    bool Is64Bit = BuiltinID == clang::ARM::BI__builtin_arm_rsr64 ||
8660                   BuiltinID == clang::ARM::BI__builtin_arm_wsr64;
8661
8662    llvm::Type *ValueType;
8663    llvm::Type *RegisterType;
8664    if (IsPointerBuiltin) {
8665      ValueType = VoidPtrTy;
8666      RegisterType = Int32Ty;
8667    } else if (Is64Bit) {
8668      ValueType = RegisterType = Int64Ty;
8669    } else {
8670      ValueType = RegisterType = Int32Ty;
8671    }
8672
8673    return EmitSpecialRegisterBuiltin(*this, E, RegisterType, ValueType,
8674                                      AccessKind);
8675  }
8676
8677  if (BuiltinID == ARM::BI__builtin_sponentry) {
8678    llvm::Function *F = CGM.getIntrinsic(Intrinsic::sponentry, AllocaInt8PtrTy);
8679    return Builder.CreateCall(F);
8680  }
8681
8682  // Handle MSVC intrinsics before argument evaluation to prevent double
8683  // evaluation.
8684  if (std::optional<MSVCIntrin> MsvcIntId = translateArmToMsvcIntrin(BuiltinID))
8685    return EmitMSVCBuiltinExpr(*MsvcIntId, E);
8686
8687  // Deal with MVE builtins
8688  if (Value *Result = EmitARMMVEBuiltinExpr(BuiltinID, E, ReturnValue, Arch))
8689    return Result;
8690  // Handle CDE builtins
8691  if (Value *Result = EmitARMCDEBuiltinExpr(BuiltinID, E, ReturnValue, Arch))
8692    return Result;
8693
  // Some intrinsics are equivalent; if so, use the base intrinsic's ID.
8695  auto It = llvm::find_if(NEONEquivalentIntrinsicMap, [BuiltinID](auto &P) {
8696    return P.first == BuiltinID;
8697  });
8698  if (It != end(NEONEquivalentIntrinsicMap))
8699    BuiltinID = It->second;
8700
8701  // Find out if any arguments are required to be integer constant
8702  // expressions.
8703  unsigned ICEArguments = 0;
8704  ASTContext::GetBuiltinTypeError Error;
8705  getContext().GetBuiltinType(BuiltinID, Error, &ICEArguments);
8706  assert(Error == ASTContext::GE_None && "Should not codegen an error");
8707
8708  auto getAlignmentValue32 = [&](Address addr) -> Value* {
8709    return Builder.getInt32(addr.getAlignment().getQuantity());
8710  };
8711
8712  Address PtrOp0 = Address::invalid();
8713  Address PtrOp1 = Address::invalid();
8714  SmallVector<Value*, 4> Ops;
8715  bool HasExtraArg = HasExtraNeonArgument(BuiltinID);
8716  unsigned NumArgs = E->getNumArgs() - (HasExtraArg ? 1 : 0);
8717  for (unsigned i = 0, e = NumArgs; i != e; i++) {
8718    if (i == 0) {
8719      switch (BuiltinID) {
8720      case NEON::BI__builtin_neon_vld1_v:
8721      case NEON::BI__builtin_neon_vld1q_v:
8722      case NEON::BI__builtin_neon_vld1q_lane_v:
8723      case NEON::BI__builtin_neon_vld1_lane_v:
8724      case NEON::BI__builtin_neon_vld1_dup_v:
8725      case NEON::BI__builtin_neon_vld1q_dup_v:
8726      case NEON::BI__builtin_neon_vst1_v:
8727      case NEON::BI__builtin_neon_vst1q_v:
8728      case NEON::BI__builtin_neon_vst1q_lane_v:
8729      case NEON::BI__builtin_neon_vst1_lane_v:
8730      case NEON::BI__builtin_neon_vst2_v:
8731      case NEON::BI__builtin_neon_vst2q_v:
8732      case NEON::BI__builtin_neon_vst2_lane_v:
8733      case NEON::BI__builtin_neon_vst2q_lane_v:
8734      case NEON::BI__builtin_neon_vst3_v:
8735      case NEON::BI__builtin_neon_vst3q_v:
8736      case NEON::BI__builtin_neon_vst3_lane_v:
8737      case NEON::BI__builtin_neon_vst3q_lane_v:
8738      case NEON::BI__builtin_neon_vst4_v:
8739      case NEON::BI__builtin_neon_vst4q_v:
8740      case NEON::BI__builtin_neon_vst4_lane_v:
8741      case NEON::BI__builtin_neon_vst4q_lane_v:
8742        // Get the alignment for the argument in addition to the value;
8743        // we'll use it later.
8744        PtrOp0 = EmitPointerWithAlignment(E->getArg(0));
8745        Ops.push_back(PtrOp0.getPointer());
8746        continue;
8747      }
8748    }
8749    if (i == 1) {
8750      switch (BuiltinID) {
8751      case NEON::BI__builtin_neon_vld2_v:
8752      case NEON::BI__builtin_neon_vld2q_v:
8753      case NEON::BI__builtin_neon_vld3_v:
8754      case NEON::BI__builtin_neon_vld3q_v:
8755      case NEON::BI__builtin_neon_vld4_v:
8756      case NEON::BI__builtin_neon_vld4q_v:
8757      case NEON::BI__builtin_neon_vld2_lane_v:
8758      case NEON::BI__builtin_neon_vld2q_lane_v:
8759      case NEON::BI__builtin_neon_vld3_lane_v:
8760      case NEON::BI__builtin_neon_vld3q_lane_v:
8761      case NEON::BI__builtin_neon_vld4_lane_v:
8762      case NEON::BI__builtin_neon_vld4q_lane_v:
8763      case NEON::BI__builtin_neon_vld2_dup_v:
8764      case NEON::BI__builtin_neon_vld2q_dup_v:
8765      case NEON::BI__builtin_neon_vld3_dup_v:
8766      case NEON::BI__builtin_neon_vld3q_dup_v:
8767      case NEON::BI__builtin_neon_vld4_dup_v:
8768      case NEON::BI__builtin_neon_vld4q_dup_v:
8769        // Get the alignment for the argument in addition to the value;
8770        // we'll use it later.
8771        PtrOp1 = EmitPointerWithAlignment(E->getArg(1));
8772        Ops.push_back(PtrOp1.getPointer());
8773        continue;
8774      }
8775    }
8776
8777    Ops.push_back(EmitScalarOrConstFoldImmArg(ICEArguments, i, E));
8778  }
8779
8780  switch (BuiltinID) {
8781  default: break;
8782
8783  case NEON::BI__builtin_neon_vget_lane_i8:
8784  case NEON::BI__builtin_neon_vget_lane_i16:
8785  case NEON::BI__builtin_neon_vget_lane_i32:
8786  case NEON::BI__builtin_neon_vget_lane_i64:
8787  case NEON::BI__builtin_neon_vget_lane_bf16:
8788  case NEON::BI__builtin_neon_vget_lane_f32:
8789  case NEON::BI__builtin_neon_vgetq_lane_i8:
8790  case NEON::BI__builtin_neon_vgetq_lane_i16:
8791  case NEON::BI__builtin_neon_vgetq_lane_i32:
8792  case NEON::BI__builtin_neon_vgetq_lane_i64:
8793  case NEON::BI__builtin_neon_vgetq_lane_bf16:
8794  case NEON::BI__builtin_neon_vgetq_lane_f32:
8795  case NEON::BI__builtin_neon_vduph_lane_bf16:
8796  case NEON::BI__builtin_neon_vduph_laneq_bf16:
8797    return Builder.CreateExtractElement(Ops[0], Ops[1], "vget_lane");
8798
8799  case NEON::BI__builtin_neon_vrndns_f32: {
8800    Value *Arg = EmitScalarExpr(E->getArg(0));
8801    llvm::Type *Tys[] = {Arg->getType()};
8802    Function *F = CGM.getIntrinsic(Intrinsic::arm_neon_vrintn, Tys);
    return Builder.CreateCall(F, {Arg}, "vrndn");
  }
8804
8805  case NEON::BI__builtin_neon_vset_lane_i8:
8806  case NEON::BI__builtin_neon_vset_lane_i16:
8807  case NEON::BI__builtin_neon_vset_lane_i32:
8808  case NEON::BI__builtin_neon_vset_lane_i64:
8809  case NEON::BI__builtin_neon_vset_lane_bf16:
8810  case NEON::BI__builtin_neon_vset_lane_f32:
8811  case NEON::BI__builtin_neon_vsetq_lane_i8:
8812  case NEON::BI__builtin_neon_vsetq_lane_i16:
8813  case NEON::BI__builtin_neon_vsetq_lane_i32:
8814  case NEON::BI__builtin_neon_vsetq_lane_i64:
8815  case NEON::BI__builtin_neon_vsetq_lane_bf16:
8816  case NEON::BI__builtin_neon_vsetq_lane_f32:
8817    return Builder.CreateInsertElement(Ops[1], Ops[0], Ops[2], "vset_lane");
8818
8819  case NEON::BI__builtin_neon_vsha1h_u32:
8820    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_sha1h), Ops,
8821                        "vsha1h");
  case NEON::BI__builtin_neon_vsha1cq_u32:
    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_sha1c), Ops,
                        "vsha1c");
  case NEON::BI__builtin_neon_vsha1pq_u32:
    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_sha1p), Ops,
                        "vsha1p");
  case NEON::BI__builtin_neon_vsha1mq_u32:
    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_sha1m), Ops,
                        "vsha1m");
8831
8832  case NEON::BI__builtin_neon_vcvth_bf16_f32: {
8833    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vcvtbfp2bf), Ops,
8834                        "vcvtbfp2bf");
8835  }
8836
8837  // The ARM _MoveToCoprocessor builtins put the input register value as
8838  // the first argument, but the LLVM intrinsic expects it as the third one.
8839  case clang::ARM::BI_MoveToCoprocessor:
8840  case clang::ARM::BI_MoveToCoprocessor2: {
8841    Function *F = CGM.getIntrinsic(BuiltinID == clang::ARM::BI_MoveToCoprocessor
8842                                       ? Intrinsic::arm_mcr
8843                                       : Intrinsic::arm_mcr2);
8844    return Builder.CreateCall(F, {Ops[1], Ops[2], Ops[0],
8845                                  Ops[3], Ops[4], Ops[5]});
8846  }
8847  }
8848
8849  // Get the last argument, which specifies the vector type.
8850  assert(HasExtraArg);
8851  const Expr *Arg = E->getArg(E->getNumArgs()-1);
8852  std::optional<llvm::APSInt> Result =
8853      Arg->getIntegerConstantExpr(getContext());
8854  if (!Result)
8855    return nullptr;
8856
8857  if (BuiltinID == clang::ARM::BI__builtin_arm_vcvtr_f ||
8858      BuiltinID == clang::ARM::BI__builtin_arm_vcvtr_d) {
8859    // Determine the overloaded type of this builtin.
8860    llvm::Type *Ty;
8861    if (BuiltinID == clang::ARM::BI__builtin_arm_vcvtr_f)
8862      Ty = FloatTy;
8863    else
8864      Ty = DoubleTy;
8865
8866    // Determine whether this is an unsigned conversion or not.
8867    bool usgn = Result->getZExtValue() == 1;
8868    unsigned Int = usgn ? Intrinsic::arm_vcvtru : Intrinsic::arm_vcvtr;
8869
8870    // Call the appropriate intrinsic.
8871    Function *F = CGM.getIntrinsic(Int, Ty);
8872    return Builder.CreateCall(F, Ops, "vcvtr");
8873  }
8874
8875  // Determine the type of this overloaded NEON intrinsic.
8876  NeonTypeFlags Type = Result->getZExtValue();
8877  bool usgn = Type.isUnsigned();
8878  bool rightShift = false;
8879
8880  llvm::FixedVectorType *VTy =
8881      GetNeonType(this, Type, getTarget().hasLegalHalfType(), false,
8882                  getTarget().hasBFloat16Type());
8883  llvm::Type *Ty = VTy;
8884  if (!Ty)
8885    return nullptr;
8886
8887  // Many NEON builtins have identical semantics and uses in ARM and
8888  // AArch64. Emit these in a single function.
8889  auto IntrinsicMap = ArrayRef(ARMSIMDIntrinsicMap);
8890  const ARMVectorIntrinsicInfo *Builtin = findARMVectorIntrinsicInMap(
8891      IntrinsicMap, BuiltinID, NEONSIMDIntrinsicsProvenSorted);
8892  if (Builtin)
8893    return EmitCommonNeonBuiltinExpr(
8894        Builtin->BuiltinID, Builtin->LLVMIntrinsic, Builtin->AltLLVMIntrinsic,
8895        Builtin->NameHint, Builtin->TypeModifier, E, Ops, PtrOp0, PtrOp1, Arch);
8896
8897  unsigned Int;
8898  switch (BuiltinID) {
8899  default: return nullptr;
8900  case NEON::BI__builtin_neon_vld1q_lane_v:
8901    // Handle 64-bit integer elements as a special case.  Use shuffles of
8902    // one-element vectors to avoid poor code for i64 in the backend.
8903    if (VTy->getElementType()->isIntegerTy(64)) {
8904      // Extract the other lane.
8905      Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
8906      int Lane = cast<ConstantInt>(Ops[2])->getZExtValue();
8907      Value *SV = llvm::ConstantVector::get(ConstantInt::get(Int32Ty, 1-Lane));
8908      Ops[1] = Builder.CreateShuffleVector(Ops[1], Ops[1], SV);
8909      // Load the value as a one-element vector.
8910      Ty = llvm::FixedVectorType::get(VTy->getElementType(), 1);
8911      llvm::Type *Tys[] = {Ty, Int8PtrTy};
8912      Function *F = CGM.getIntrinsic(Intrinsic::arm_neon_vld1, Tys);
8913      Value *Align = getAlignmentValue32(PtrOp0);
8914      Value *Ld = Builder.CreateCall(F, {Ops[0], Align});
8915      // Combine them.
8916      int Indices[] = {1 - Lane, Lane};
8917      return Builder.CreateShuffleVector(Ops[1], Ld, Indices, "vld1q_lane");
8918    }
8919    [[fallthrough]];
8920  case NEON::BI__builtin_neon_vld1_lane_v: {
8921    Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
8922    PtrOp0 = PtrOp0.withElementType(VTy->getElementType());
8923    Value *Ld = Builder.CreateLoad(PtrOp0);
8924    return Builder.CreateInsertElement(Ops[1], Ld, Ops[2], "vld1_lane");
8925  }
8926  case NEON::BI__builtin_neon_vqrshrn_n_v:
8927    Int =
8928      usgn ? Intrinsic::arm_neon_vqrshiftnu : Intrinsic::arm_neon_vqrshiftns;
8929    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vqrshrn_n",
8930                        1, true);
8931  case NEON::BI__builtin_neon_vqrshrun_n_v:
8932    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vqrshiftnsu, Ty),
8933                        Ops, "vqrshrun_n", 1, true);
8934  case NEON::BI__builtin_neon_vqshrn_n_v:
8935    Int = usgn ? Intrinsic::arm_neon_vqshiftnu : Intrinsic::arm_neon_vqshiftns;
8936    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vqshrn_n",
8937                        1, true);
8938  case NEON::BI__builtin_neon_vqshrun_n_v:
8939    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vqshiftnsu, Ty),
8940                        Ops, "vqshrun_n", 1, true);
8941  case NEON::BI__builtin_neon_vrecpe_v:
8942  case NEON::BI__builtin_neon_vrecpeq_v:
8943    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vrecpe, Ty),
8944                        Ops, "vrecpe");
8945  case NEON::BI__builtin_neon_vrshrn_n_v:
8946    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vrshiftn, Ty),
8947                        Ops, "vrshrn_n", 1, true);
8948  case NEON::BI__builtin_neon_vrsra_n_v:
8949  case NEON::BI__builtin_neon_vrsraq_n_v:
8950    Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
8951    Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
8952    Ops[2] = EmitNeonShiftVector(Ops[2], Ty, true);
8953    Int = usgn ? Intrinsic::arm_neon_vrshiftu : Intrinsic::arm_neon_vrshifts;
8954    Ops[1] = Builder.CreateCall(CGM.getIntrinsic(Int, Ty), {Ops[1], Ops[2]});
8955    return Builder.CreateAdd(Ops[0], Ops[1], "vrsra_n");
8956  case NEON::BI__builtin_neon_vsri_n_v:
8957  case NEON::BI__builtin_neon_vsriq_n_v:
8958    rightShift = true;
8959    [[fallthrough]];
8960  case NEON::BI__builtin_neon_vsli_n_v:
8961  case NEON::BI__builtin_neon_vsliq_n_v:
8962    Ops[2] = EmitNeonShiftVector(Ops[2], Ty, rightShift);
8963    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vshiftins, Ty),
8964                        Ops, "vsli_n");
8965  case NEON::BI__builtin_neon_vsra_n_v:
8966  case NEON::BI__builtin_neon_vsraq_n_v:
8967    Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
8968    Ops[1] = EmitNeonRShiftImm(Ops[1], Ops[2], Ty, usgn, "vsra_n");
8969    return Builder.CreateAdd(Ops[0], Ops[1]);
8970  case NEON::BI__builtin_neon_vst1q_lane_v:
8971    // Handle 64-bit integer elements as a special case.  Use a shuffle to get
8972    // a one-element vector and avoid poor code for i64 in the backend.
8973    if (VTy->getElementType()->isIntegerTy(64)) {
8974      Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
8975      Value *SV = llvm::ConstantVector::get(cast<llvm::Constant>(Ops[2]));
8976      Ops[1] = Builder.CreateShuffleVector(Ops[1], Ops[1], SV);
8977      Ops[2] = getAlignmentValue32(PtrOp0);
8978      llvm::Type *Tys[] = {Int8PtrTy, Ops[1]->getType()};
8979      return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::arm_neon_vst1,
8980                                                 Tys), Ops);
8981    }
8982    [[fallthrough]];
8983  case NEON::BI__builtin_neon_vst1_lane_v: {
8984    Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
8985    Ops[1] = Builder.CreateExtractElement(Ops[1], Ops[2]);
8986    return Builder.CreateStore(Ops[1],
8987                               PtrOp0.withElementType(Ops[1]->getType()));
8988  }
8989  case NEON::BI__builtin_neon_vtbl1_v:
8990    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vtbl1),
8991                        Ops, "vtbl1");
8992  case NEON::BI__builtin_neon_vtbl2_v:
8993    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vtbl2),
8994                        Ops, "vtbl2");
8995  case NEON::BI__builtin_neon_vtbl3_v:
8996    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vtbl3),
8997                        Ops, "vtbl3");
8998  case NEON::BI__builtin_neon_vtbl4_v:
8999    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vtbl4),
9000                        Ops, "vtbl4");
9001  case NEON::BI__builtin_neon_vtbx1_v:
9002    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vtbx1),
9003                        Ops, "vtbx1");
9004  case NEON::BI__builtin_neon_vtbx2_v:
9005    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vtbx2),
9006                        Ops, "vtbx2");
9007  case NEON::BI__builtin_neon_vtbx3_v:
9008    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vtbx3),
9009                        Ops, "vtbx3");
9010  case NEON::BI__builtin_neon_vtbx4_v:
9011    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vtbx4),
9012                        Ops, "vtbx4");
9013  }
9014}
9015
9016template<typename Integer>
9017static Integer GetIntegerConstantValue(const Expr *E, ASTContext &Context) {
9018  return E->getIntegerConstantExpr(Context)->getExtValue();
9019}
9020
9021static llvm::Value *SignOrZeroExtend(CGBuilderTy &Builder, llvm::Value *V,
9022                                     llvm::Type *T, bool Unsigned) {
9023  // Helper function called by Tablegen-constructed ARM MVE builtin codegen,
9024  // which finds it convenient to specify signed/unsigned as a boolean flag.
9025  return Unsigned ? Builder.CreateZExt(V, T) : Builder.CreateSExt(V, T);
9026}
9027
9028static llvm::Value *MVEImmediateShr(CGBuilderTy &Builder, llvm::Value *V,
9029                                    uint32_t Shift, bool Unsigned) {
9030  // MVE helper function for integer shift right. This must handle signed vs
9031  // unsigned, and also deal specially with the case where the shift count is
  // equal to the lane size. In LLVM IR, a shift by the full lane width yields
  // a poison value, but in MVE it is well defined, so we must emit code whose
  // behavior is defined in IR.
9035  unsigned LaneBits = cast<llvm::VectorType>(V->getType())
9036                          ->getElementType()
9037                          ->getPrimitiveSizeInBits();
9038  if (Shift == LaneBits) {
9039    // An unsigned shift of the full lane size always generates zero, so we can
9040    // simply emit a zero vector. A signed shift of the full lane size does the
9041    // same thing as shifting by one bit fewer.
9042    if (Unsigned)
9043      return llvm::Constant::getNullValue(V->getType());
9044    else
9045      --Shift;
9046  }
9047  return Unsigned ? Builder.CreateLShr(V, Shift) : Builder.CreateAShr(V, Shift);
9048}
9049
9050static llvm::Value *ARMMVEVectorSplat(CGBuilderTy &Builder, llvm::Value *V) {
9051  // MVE-specific helper function for a vector splat, which infers the element
9052  // count of the output vector by knowing that MVE vectors are all 128 bits
9053  // wide.
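  // For example, an i32 scalar becomes a <4 x i32> splat and an i16 scalar an
  // <8 x i16> splat.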
9054  unsigned Elements = 128 / V->getType()->getPrimitiveSizeInBits();
9055  return Builder.CreateVectorSplat(Elements, V);
9056}
9057
9058static llvm::Value *ARMMVEVectorReinterpret(CGBuilderTy &Builder,
9059                                            CodeGenFunction *CGF,
9060                                            llvm::Value *V,
9061                                            llvm::Type *DestType) {
9062  // Convert one MVE vector type into another by reinterpreting its in-register
9063  // format.
9064  //
  // On little-endian targets, this is identical to a bitcast (which
  // reinterprets the memory format). On big-endian targets the two are not
  // necessarily the same, because
9067  // the register and memory formats map to each other differently depending on
9068  // the lane size.
9069  //
9070  // We generate a bitcast whenever we can (if we're little-endian, or if the
9071  // lane sizes are the same anyway). Otherwise we fall back to an IR intrinsic
9072  // that performs the different kind of reinterpretation.
9073  if (CGF->getTarget().isBigEndian() &&
9074      V->getType()->getScalarSizeInBits() != DestType->getScalarSizeInBits()) {
9075    return Builder.CreateCall(
9076        CGF->CGM.getIntrinsic(Intrinsic::arm_mve_vreinterpretq,
9077                              {DestType, V->getType()}),
9078        V);
9079  } else {
9080    return Builder.CreateBitCast(V, DestType);
9081  }
9082}
9083
9084static llvm::Value *VectorUnzip(CGBuilderTy &Builder, llvm::Value *V, bool Odd) {
9085  // Make a shufflevector that extracts every other element of a vector (evens
9086  // or odds, as desired).
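  // For an 8-element input this produces indices {0,2,4,6} when Odd is false
  // and {1,3,5,7} when Odd is true.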
9087  SmallVector<int, 16> Indices;
9088  unsigned InputElements =
9089      cast<llvm::FixedVectorType>(V->getType())->getNumElements();
9090  for (unsigned i = 0; i < InputElements; i += 2)
9091    Indices.push_back(i + Odd);
9092  return Builder.CreateShuffleVector(V, Indices);
9093}
9094
9095static llvm::Value *VectorZip(CGBuilderTy &Builder, llvm::Value *V0,
9096                              llvm::Value *V1) {
9097  // Make a shufflevector that interleaves two vectors element by element.
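  // For two 4-element inputs this produces indices {0,4,1,5,2,6,3,7}.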
9098  assert(V0->getType() == V1->getType() && "Can't zip different vector types");
9099  SmallVector<int, 16> Indices;
9100  unsigned InputElements =
9101      cast<llvm::FixedVectorType>(V0->getType())->getNumElements();
9102  for (unsigned i = 0; i < InputElements; i++) {
9103    Indices.push_back(i);
9104    Indices.push_back(i + InputElements);
9105  }
9106  return Builder.CreateShuffleVector(V0, V1, Indices);
9107}
9108
9109template<unsigned HighBit, unsigned OtherBits>
9110static llvm::Value *ARMMVEConstantSplat(CGBuilderTy &Builder, llvm::Type *VT) {
9111  // MVE-specific helper function to make a vector splat of a constant such as
9112  // UINT_MAX or INT_MIN, in which all bits below the highest one are equal.
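  // For example, with 16-bit lanes: HighBit=1, OtherBits=0 yields 0x8000
  // (INT16_MIN); HighBit=0, OtherBits=1 yields 0x7fff (INT16_MAX); and
  // HighBit=1, OtherBits=1 yields 0xffff (UINT16_MAX).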
9113  llvm::Type *T = cast<llvm::VectorType>(VT)->getElementType();
9114  unsigned LaneBits = T->getPrimitiveSizeInBits();
9115  uint32_t Value = HighBit << (LaneBits - 1);
9116  if (OtherBits)
9117    Value |= (1UL << (LaneBits - 1)) - 1;
9118  llvm::Value *Lane = llvm::ConstantInt::get(T, Value);
9119  return ARMMVEVectorSplat(Builder, Lane);
9120}
9121
9122static llvm::Value *ARMMVEVectorElementReverse(CGBuilderTy &Builder,
9123                                               llvm::Value *V,
9124                                               unsigned ReverseWidth) {
9125  // MVE-specific helper function which reverses the elements of a
9126  // vector within every (ReverseWidth)-bit collection of lanes.
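  // For example, reversing within every 32-bit chunk of an <8 x i16> vector
  // uses Mask = 1 and produces indices {1,0,3,2,5,4,7,6}.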
9127  SmallVector<int, 16> Indices;
9128  unsigned LaneSize = V->getType()->getScalarSizeInBits();
9129  unsigned Elements = 128 / LaneSize;
9130  unsigned Mask = ReverseWidth / LaneSize - 1;
9131  for (unsigned i = 0; i < Elements; i++)
9132    Indices.push_back(i ^ Mask);
9133  return Builder.CreateShuffleVector(V, Indices);
9134}
9135
9136Value *CodeGenFunction::EmitARMMVEBuiltinExpr(unsigned BuiltinID,
9137                                              const CallExpr *E,
9138                                              ReturnValueSlot ReturnValue,
9139                                              llvm::Triple::ArchType Arch) {
9140  enum class CustomCodeGen { VLD24, VST24 } CustomCodeGenType;
9141  Intrinsic::ID IRIntr;
9142  unsigned NumVectors;
9143
9144  // Code autogenerated by Tablegen will handle all the simple builtins.
9145  switch (BuiltinID) {
9146    #include "clang/Basic/arm_mve_builtin_cg.inc"
9147
9148    // If we didn't match an MVE builtin id at all, go back to the
9149    // main EmitARMBuiltinExpr.
9150  default:
9151    return nullptr;
9152  }
9153
9154  // Anything that breaks from that switch is an MVE builtin that
9155  // needs handwritten code to generate.
9156
9157  switch (CustomCodeGenType) {
9158
9159  case CustomCodeGen::VLD24: {
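    // vld2q/vld4q return a struct wrapping an array of NumVectors vectors,
    // whereas the IR intrinsic returns NumVectors separate vectors. Call the
    // intrinsic and repack its results into the expected aggregate.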
9160    llvm::SmallVector<Value *, 4> Ops;
9161    llvm::SmallVector<llvm::Type *, 4> Tys;
9162
9163    auto MvecCType = E->getType();
9164    auto MvecLType = ConvertType(MvecCType);
9165    assert(MvecLType->isStructTy() &&
9166           "Return type for vld[24]q should be a struct");
9167    assert(MvecLType->getStructNumElements() == 1 &&
9168           "Return-type struct for vld[24]q should have one element");
9169    auto MvecLTypeInner = MvecLType->getStructElementType(0);
9170    assert(MvecLTypeInner->isArrayTy() &&
9171           "Return-type struct for vld[24]q should contain an array");
9172    assert(MvecLTypeInner->getArrayNumElements() == NumVectors &&
9173           "Array member of return-type struct vld[24]q has wrong length");
9174    auto VecLType = MvecLTypeInner->getArrayElementType();
9175
9176    Tys.push_back(VecLType);
9177
9178    auto Addr = E->getArg(0);
9179    Ops.push_back(EmitScalarExpr(Addr));
9180    Tys.push_back(ConvertType(Addr->getType()));
9181
9182    Function *F = CGM.getIntrinsic(IRIntr, ArrayRef(Tys));
9183    Value *LoadResult = Builder.CreateCall(F, Ops);
9184    Value *MvecOut = PoisonValue::get(MvecLType);
9185    for (unsigned i = 0; i < NumVectors; ++i) {
9186      Value *Vec = Builder.CreateExtractValue(LoadResult, i);
9187      MvecOut = Builder.CreateInsertValue(MvecOut, Vec, {0, i});
9188    }
9189
9190    if (ReturnValue.isNull())
9191      return MvecOut;
9192    else
9193      return Builder.CreateStore(MvecOut, ReturnValue.getValue());
9194  }
9195
9196  case CustomCodeGen::VST24: {
9197    llvm::SmallVector<Value *, 4> Ops;
9198    llvm::SmallVector<llvm::Type *, 4> Tys;
9199
9200    auto Addr = E->getArg(0);
9201    Ops.push_back(EmitScalarExpr(Addr));
9202    Tys.push_back(ConvertType(Addr->getType()));
9203
9204    auto MvecCType = E->getArg(1)->getType();
9205    auto MvecLType = ConvertType(MvecCType);
9206    assert(MvecLType->isStructTy() && "Data type for vst2q should be a struct");
9207    assert(MvecLType->getStructNumElements() == 1 &&
9208           "Data-type struct for vst2q should have one element");
9209    auto MvecLTypeInner = MvecLType->getStructElementType(0);
9210    assert(MvecLTypeInner->isArrayTy() &&
9211           "Data-type struct for vst2q should contain an array");
9212    assert(MvecLTypeInner->getArrayNumElements() == NumVectors &&
           "Array member of data-type struct for vst2q has wrong length");
9214    auto VecLType = MvecLTypeInner->getArrayElementType();
9215
9216    Tys.push_back(VecLType);
9217
9218    AggValueSlot MvecSlot = CreateAggTemp(MvecCType);
9219    EmitAggExpr(E->getArg(1), MvecSlot);
9220    auto Mvec = Builder.CreateLoad(MvecSlot.getAddress());
9221    for (unsigned i = 0; i < NumVectors; i++)
9222      Ops.push_back(Builder.CreateExtractValue(Mvec, {0, i}));
9223
9224    Function *F = CGM.getIntrinsic(IRIntr, ArrayRef(Tys));
9225    Value *ToReturn = nullptr;
9226    for (unsigned i = 0; i < NumVectors; i++) {
9227      Ops.push_back(llvm::ConstantInt::get(Int32Ty, i));
9228      ToReturn = Builder.CreateCall(F, Ops);
9229      Ops.pop_back();
9230    }
9231    return ToReturn;
9232  }
9233  }
9234  llvm_unreachable("unknown custom codegen type.");
9235}
9236
9237Value *CodeGenFunction::EmitARMCDEBuiltinExpr(unsigned BuiltinID,
9238                                              const CallExpr *E,
9239                                              ReturnValueSlot ReturnValue,
9240                                              llvm::Triple::ArchType Arch) {
9241  switch (BuiltinID) {
9242  default:
9243    return nullptr;
9244#include "clang/Basic/arm_cde_builtin_cg.inc"
9245  }
9246}
9247
9248static Value *EmitAArch64TblBuiltinExpr(CodeGenFunction &CGF, unsigned BuiltinID,
9249                                      const CallExpr *E,
9250                                      SmallVectorImpl<Value *> &Ops,
9251                                      llvm::Triple::ArchType Arch) {
9252  unsigned int Int = 0;
9253  const char *s = nullptr;
9254
9255  switch (BuiltinID) {
9256  default:
9257    return nullptr;
9258  case NEON::BI__builtin_neon_vtbl1_v:
9259  case NEON::BI__builtin_neon_vqtbl1_v:
9260  case NEON::BI__builtin_neon_vqtbl1q_v:
9261  case NEON::BI__builtin_neon_vtbl2_v:
9262  case NEON::BI__builtin_neon_vqtbl2_v:
9263  case NEON::BI__builtin_neon_vqtbl2q_v:
9264  case NEON::BI__builtin_neon_vtbl3_v:
9265  case NEON::BI__builtin_neon_vqtbl3_v:
9266  case NEON::BI__builtin_neon_vqtbl3q_v:
9267  case NEON::BI__builtin_neon_vtbl4_v:
9268  case NEON::BI__builtin_neon_vqtbl4_v:
9269  case NEON::BI__builtin_neon_vqtbl4q_v:
9270    break;
9271  case NEON::BI__builtin_neon_vtbx1_v:
9272  case NEON::BI__builtin_neon_vqtbx1_v:
9273  case NEON::BI__builtin_neon_vqtbx1q_v:
9274  case NEON::BI__builtin_neon_vtbx2_v:
9275  case NEON::BI__builtin_neon_vqtbx2_v:
9276  case NEON::BI__builtin_neon_vqtbx2q_v:
9277  case NEON::BI__builtin_neon_vtbx3_v:
9278  case NEON::BI__builtin_neon_vqtbx3_v:
9279  case NEON::BI__builtin_neon_vqtbx3q_v:
9280  case NEON::BI__builtin_neon_vtbx4_v:
9281  case NEON::BI__builtin_neon_vqtbx4_v:
9282  case NEON::BI__builtin_neon_vqtbx4q_v:
9283    break;
9284  }
9285
9286  assert(E->getNumArgs() >= 3);
9287
9288  // Get the last argument, which specifies the vector type.
9289  const Expr *Arg = E->getArg(E->getNumArgs() - 1);
9290  std::optional<llvm::APSInt> Result =
9291      Arg->getIntegerConstantExpr(CGF.getContext());
9292  if (!Result)
9293    return nullptr;
9294
9295  // Determine the type of this overloaded NEON intrinsic.
9296  NeonTypeFlags Type = Result->getZExtValue();
9297  llvm::FixedVectorType *Ty = GetNeonType(&CGF, Type);
9298  if (!Ty)
9299    return nullptr;
9300
9301  CodeGen::CGBuilderTy &Builder = CGF.Builder;
9302
  // AArch64 scalar builtins are not overloaded; they do not have an extra
  // argument that specifies the vector type, so we need to handle each case.
9305  switch (BuiltinID) {
9306  case NEON::BI__builtin_neon_vtbl1_v: {
9307    return packTBLDVectorList(CGF, ArrayRef(Ops).slice(0, 1), nullptr, Ops[1],
9308                              Ty, Intrinsic::aarch64_neon_tbl1, "vtbl1");
9309  }
9310  case NEON::BI__builtin_neon_vtbl2_v: {
9311    return packTBLDVectorList(CGF, ArrayRef(Ops).slice(0, 2), nullptr, Ops[2],
9312                              Ty, Intrinsic::aarch64_neon_tbl1, "vtbl1");
9313  }
9314  case NEON::BI__builtin_neon_vtbl3_v: {
9315    return packTBLDVectorList(CGF, ArrayRef(Ops).slice(0, 3), nullptr, Ops[3],
9316                              Ty, Intrinsic::aarch64_neon_tbl2, "vtbl2");
9317  }
9318  case NEON::BI__builtin_neon_vtbl4_v: {
9319    return packTBLDVectorList(CGF, ArrayRef(Ops).slice(0, 4), nullptr, Ops[4],
9320                              Ty, Intrinsic::aarch64_neon_tbl2, "vtbl2");
9321  }
9322  case NEON::BI__builtin_neon_vtbx1_v: {
9323    Value *TblRes =
9324        packTBLDVectorList(CGF, ArrayRef(Ops).slice(1, 1), nullptr, Ops[2], Ty,
9325                           Intrinsic::aarch64_neon_tbl1, "vtbl1");
9326
9327    llvm::Constant *EightV = ConstantInt::get(Ty, 8);
9328    Value *CmpRes = Builder.CreateICmp(ICmpInst::ICMP_UGE, Ops[2], EightV);
9329    CmpRes = Builder.CreateSExt(CmpRes, Ty);
9330
9331    Value *EltsFromInput = Builder.CreateAnd(CmpRes, Ops[0]);
9332    Value *EltsFromTbl = Builder.CreateAnd(Builder.CreateNot(CmpRes), TblRes);
9333    return Builder.CreateOr(EltsFromInput, EltsFromTbl, "vtbx");
9334  }
9335  case NEON::BI__builtin_neon_vtbx2_v: {
9336    return packTBLDVectorList(CGF, ArrayRef(Ops).slice(1, 2), Ops[0], Ops[3],
9337                              Ty, Intrinsic::aarch64_neon_tbx1, "vtbx1");
9338  }
9339  case NEON::BI__builtin_neon_vtbx3_v: {
9340    Value *TblRes =
9341        packTBLDVectorList(CGF, ArrayRef(Ops).slice(1, 3), nullptr, Ops[4], Ty,
9342                           Intrinsic::aarch64_neon_tbl2, "vtbl2");
9343
9344    llvm::Constant *TwentyFourV = ConstantInt::get(Ty, 24);
9345    Value *CmpRes = Builder.CreateICmp(ICmpInst::ICMP_UGE, Ops[4],
9346                                           TwentyFourV);
9347    CmpRes = Builder.CreateSExt(CmpRes, Ty);
9348
9349    Value *EltsFromInput = Builder.CreateAnd(CmpRes, Ops[0]);
9350    Value *EltsFromTbl = Builder.CreateAnd(Builder.CreateNot(CmpRes), TblRes);
9351    return Builder.CreateOr(EltsFromInput, EltsFromTbl, "vtbx");
9352  }
9353  case NEON::BI__builtin_neon_vtbx4_v: {
9354    return packTBLDVectorList(CGF, ArrayRef(Ops).slice(1, 4), Ops[0], Ops[5],
9355                              Ty, Intrinsic::aarch64_neon_tbx2, "vtbx2");
9356  }
9357  case NEON::BI__builtin_neon_vqtbl1_v:
9358  case NEON::BI__builtin_neon_vqtbl1q_v:
9359    Int = Intrinsic::aarch64_neon_tbl1; s = "vtbl1"; break;
9360  case NEON::BI__builtin_neon_vqtbl2_v:
  case NEON::BI__builtin_neon_vqtbl2q_v:
9362    Int = Intrinsic::aarch64_neon_tbl2; s = "vtbl2"; break;
9363  case NEON::BI__builtin_neon_vqtbl3_v:
9364  case NEON::BI__builtin_neon_vqtbl3q_v:
9365    Int = Intrinsic::aarch64_neon_tbl3; s = "vtbl3"; break;
9366  case NEON::BI__builtin_neon_vqtbl4_v:
9367  case NEON::BI__builtin_neon_vqtbl4q_v:
9368    Int = Intrinsic::aarch64_neon_tbl4; s = "vtbl4"; break;
9369  case NEON::BI__builtin_neon_vqtbx1_v:
9370  case NEON::BI__builtin_neon_vqtbx1q_v:
9371    Int = Intrinsic::aarch64_neon_tbx1; s = "vtbx1"; break;
9372  case NEON::BI__builtin_neon_vqtbx2_v:
9373  case NEON::BI__builtin_neon_vqtbx2q_v:
9374    Int = Intrinsic::aarch64_neon_tbx2; s = "vtbx2"; break;
9375  case NEON::BI__builtin_neon_vqtbx3_v:
9376  case NEON::BI__builtin_neon_vqtbx3q_v:
9377    Int = Intrinsic::aarch64_neon_tbx3; s = "vtbx3"; break;
9378  case NEON::BI__builtin_neon_vqtbx4_v:
9379  case NEON::BI__builtin_neon_vqtbx4q_v:
9380    Int = Intrinsic::aarch64_neon_tbx4; s = "vtbx4"; break;
  }
9383
9384  if (!Int)
9385    return nullptr;
9386
9387  Function *F = CGF.CGM.getIntrinsic(Int, Ty);
9388  return CGF.EmitNeonCall(F, Ops, s);
9389}
9390
9391Value *CodeGenFunction::vectorWrapScalar16(Value *Op) {
9392  auto *VTy = llvm::FixedVectorType::get(Int16Ty, 4);
9393  Op = Builder.CreateBitCast(Op, Int16Ty);
9394  Value *V = PoisonValue::get(VTy);
9395  llvm::Constant *CI = ConstantInt::get(SizeTy, 0);
9396  Op = Builder.CreateInsertElement(V, Op, CI);
9397  return Op;
9398}
9399
9400/// SVEBuiltinMemEltTy - Returns the memory element type for this memory
9401/// access builtin.  Only required if it can't be inferred from the base pointer
9402/// operand.
9403llvm::Type *CodeGenFunction::SVEBuiltinMemEltTy(const SVETypeFlags &TypeFlags) {
9404  switch (TypeFlags.getMemEltType()) {
9405  case SVETypeFlags::MemEltTyDefault:
9406    return getEltType(TypeFlags);
9407  case SVETypeFlags::MemEltTyInt8:
9408    return Builder.getInt8Ty();
9409  case SVETypeFlags::MemEltTyInt16:
9410    return Builder.getInt16Ty();
9411  case SVETypeFlags::MemEltTyInt32:
9412    return Builder.getInt32Ty();
9413  case SVETypeFlags::MemEltTyInt64:
9414    return Builder.getInt64Ty();
9415  }
9416  llvm_unreachable("Unknown MemEltType");
9417}
9418
9419llvm::Type *CodeGenFunction::getEltType(const SVETypeFlags &TypeFlags) {
9420  switch (TypeFlags.getEltType()) {
9421  default:
9422    llvm_unreachable("Invalid SVETypeFlag!");
9423
9424  case SVETypeFlags::EltTyInt8:
9425    return Builder.getInt8Ty();
9426  case SVETypeFlags::EltTyInt16:
9427    return Builder.getInt16Ty();
9428  case SVETypeFlags::EltTyInt32:
9429    return Builder.getInt32Ty();
9430  case SVETypeFlags::EltTyInt64:
9431    return Builder.getInt64Ty();
9432  case SVETypeFlags::EltTyInt128:
9433    return Builder.getInt128Ty();
9434
9435  case SVETypeFlags::EltTyFloat16:
9436    return Builder.getHalfTy();
9437  case SVETypeFlags::EltTyFloat32:
9438    return Builder.getFloatTy();
9439  case SVETypeFlags::EltTyFloat64:
9440    return Builder.getDoubleTy();
9441
9442  case SVETypeFlags::EltTyBFloat16:
9443    return Builder.getBFloatTy();
9444
9445  case SVETypeFlags::EltTyBool8:
9446  case SVETypeFlags::EltTyBool16:
9447  case SVETypeFlags::EltTyBool32:
9448  case SVETypeFlags::EltTyBool64:
9449    return Builder.getInt1Ty();
9450  }
9451}
9452
// Return the llvm predicate vector type corresponding to the element type in
// the specified TypeFlags.
9455llvm::ScalableVectorType *
9456CodeGenFunction::getSVEPredType(const SVETypeFlags &TypeFlags) {
9457  switch (TypeFlags.getEltType()) {
9458  default: llvm_unreachable("Unhandled SVETypeFlag!");
9459
9460  case SVETypeFlags::EltTyInt8:
9461    return llvm::ScalableVectorType::get(Builder.getInt1Ty(), 16);
9462  case SVETypeFlags::EltTyInt16:
9463    return llvm::ScalableVectorType::get(Builder.getInt1Ty(), 8);
9464  case SVETypeFlags::EltTyInt32:
9465    return llvm::ScalableVectorType::get(Builder.getInt1Ty(), 4);
9466  case SVETypeFlags::EltTyInt64:
9467    return llvm::ScalableVectorType::get(Builder.getInt1Ty(), 2);
9468
9469  case SVETypeFlags::EltTyBFloat16:
9470    return llvm::ScalableVectorType::get(Builder.getInt1Ty(), 8);
9471  case SVETypeFlags::EltTyFloat16:
9472    return llvm::ScalableVectorType::get(Builder.getInt1Ty(), 8);
9473  case SVETypeFlags::EltTyFloat32:
9474    return llvm::ScalableVectorType::get(Builder.getInt1Ty(), 4);
9475  case SVETypeFlags::EltTyFloat64:
9476    return llvm::ScalableVectorType::get(Builder.getInt1Ty(), 2);
9477
9478  case SVETypeFlags::EltTyBool8:
9479    return llvm::ScalableVectorType::get(Builder.getInt1Ty(), 16);
9480  case SVETypeFlags::EltTyBool16:
9481    return llvm::ScalableVectorType::get(Builder.getInt1Ty(), 8);
9482  case SVETypeFlags::EltTyBool32:
9483    return llvm::ScalableVectorType::get(Builder.getInt1Ty(), 4);
9484  case SVETypeFlags::EltTyBool64:
9485    return llvm::ScalableVectorType::get(Builder.getInt1Ty(), 2);
9486  }
9487}
9488
// Return the llvm vector type corresponding to the element type in the
// specified TypeFlags.
9490llvm::ScalableVectorType *
9491CodeGenFunction::getSVEType(const SVETypeFlags &TypeFlags) {
9492  switch (TypeFlags.getEltType()) {
9493  default:
9494    llvm_unreachable("Invalid SVETypeFlag!");
9495
9496  case SVETypeFlags::EltTyInt8:
9497    return llvm::ScalableVectorType::get(Builder.getInt8Ty(), 16);
9498  case SVETypeFlags::EltTyInt16:
9499    return llvm::ScalableVectorType::get(Builder.getInt16Ty(), 8);
9500  case SVETypeFlags::EltTyInt32:
9501    return llvm::ScalableVectorType::get(Builder.getInt32Ty(), 4);
9502  case SVETypeFlags::EltTyInt64:
9503    return llvm::ScalableVectorType::get(Builder.getInt64Ty(), 2);
9504
9505  case SVETypeFlags::EltTyFloat16:
9506    return llvm::ScalableVectorType::get(Builder.getHalfTy(), 8);
9507  case SVETypeFlags::EltTyBFloat16:
9508    return llvm::ScalableVectorType::get(Builder.getBFloatTy(), 8);
9509  case SVETypeFlags::EltTyFloat32:
9510    return llvm::ScalableVectorType::get(Builder.getFloatTy(), 4);
9511  case SVETypeFlags::EltTyFloat64:
9512    return llvm::ScalableVectorType::get(Builder.getDoubleTy(), 2);
9513
9514  case SVETypeFlags::EltTyBool8:
9515    return llvm::ScalableVectorType::get(Builder.getInt1Ty(), 16);
9516  case SVETypeFlags::EltTyBool16:
9517    return llvm::ScalableVectorType::get(Builder.getInt1Ty(), 8);
9518  case SVETypeFlags::EltTyBool32:
9519    return llvm::ScalableVectorType::get(Builder.getInt1Ty(), 4);
9520  case SVETypeFlags::EltTyBool64:
9521    return llvm::ScalableVectorType::get(Builder.getInt1Ty(), 2);
9522  }
9523}
9524
9525llvm::Value *
9526CodeGenFunction::EmitSVEAllTruePred(const SVETypeFlags &TypeFlags) {
9527  Function *Ptrue =
9528      CGM.getIntrinsic(Intrinsic::aarch64_sve_ptrue, getSVEPredType(TypeFlags));
9529  return Builder.CreateCall(Ptrue, {Builder.getInt32(/*SV_ALL*/ 31)});
9530}
9531
9532constexpr unsigned SVEBitsPerBlock = 128;
9533
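// Return the scalable vector type that fills one 128-bit SVE block with the
// given element type, e.g. i32 -> <vscale x 4 x i32>.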
9534static llvm::ScalableVectorType *getSVEVectorForElementType(llvm::Type *EltTy) {
9535  unsigned NumElts = SVEBitsPerBlock / EltTy->getScalarSizeInBits();
9536  return llvm::ScalableVectorType::get(EltTy, NumElts);
9537}
9538
9539// Reinterpret the input predicate so that it can be used to correctly isolate
9540// the elements of the specified datatype.
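// For example, an svbool_t predicate (<vscale x 16 x i1>) used with 64-bit
// data is narrowed to <vscale x 2 x i1> via aarch64.sve.convert.from.svbool.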
9541Value *CodeGenFunction::EmitSVEPredicateCast(Value *Pred,
9542                                             llvm::ScalableVectorType *VTy) {
9543
9544  if (isa<TargetExtType>(Pred->getType()) &&
9545      cast<TargetExtType>(Pred->getType())->getName() == "aarch64.svcount")
9546    return Pred;
9547
9548  auto *RTy = llvm::VectorType::get(IntegerType::get(getLLVMContext(), 1), VTy);
9549  if (Pred->getType() == RTy)
9550    return Pred;
9551
9552  unsigned IntID;
9553  llvm::Type *IntrinsicTy;
9554  switch (VTy->getMinNumElements()) {
9555  default:
9556    llvm_unreachable("unsupported element count!");
9557  case 1:
9558  case 2:
9559  case 4:
9560  case 8:
9561    IntID = Intrinsic::aarch64_sve_convert_from_svbool;
9562    IntrinsicTy = RTy;
9563    break;
9564  case 16:
9565    IntID = Intrinsic::aarch64_sve_convert_to_svbool;
9566    IntrinsicTy = Pred->getType();
9567    break;
9568  }
9569
9570  Function *F = CGM.getIntrinsic(IntID, IntrinsicTy);
9571  Value *C = Builder.CreateCall(F, Pred);
9572  assert(C->getType() == RTy && "Unexpected return type!");
9573  return C;
9574}
9575
9576Value *CodeGenFunction::EmitSVEGatherLoad(const SVETypeFlags &TypeFlags,
9577                                          SmallVectorImpl<Value *> &Ops,
9578                                          unsigned IntID) {
9579  auto *ResultTy = getSVEType(TypeFlags);
9580  auto *OverloadedTy =
9581      llvm::ScalableVectorType::get(SVEBuiltinMemEltTy(TypeFlags), ResultTy);
9582
9583  Function *F = nullptr;
9584  if (Ops[1]->getType()->isVectorTy())
9585    // This is the "vector base, scalar offset" case. In order to uniquely
9586    // map this built-in to an LLVM IR intrinsic, we need both the return type
9587    // and the type of the vector base.
9588    F = CGM.getIntrinsic(IntID, {OverloadedTy, Ops[1]->getType()});
9589  else
    // This is the "scalar base, vector offset" case. The type of the offset
9591    // is encoded in the name of the intrinsic. We only need to specify the
9592    // return type in order to uniquely map this built-in to an LLVM IR
9593    // intrinsic.
9594    F = CGM.getIntrinsic(IntID, OverloadedTy);
9595
9596  // At the ACLE level there's only one predicate type, svbool_t, which is
9597  // mapped to <n x 16 x i1>. However, this might be incompatible with the
9598  // actual type being loaded. For example, when loading doubles (i64) the
9599  // predicate should be <n x 2 x i1> instead. At the IR level the type of
9600  // the predicate and the data being loaded must match. Cast to the type
9601  // expected by the intrinsic. The intrinsic itself should be defined in
  // a way that enforces relations between parameter types.
9603  Ops[0] = EmitSVEPredicateCast(
9604      Ops[0], cast<llvm::ScalableVectorType>(F->getArg(0)->getType()));
9605
9606  // Pass 0 when the offset is missing. This can only be applied when using
9607  // the "vector base" addressing mode for which ACLE allows no offset. The
9608  // corresponding LLVM IR always requires an offset.
9609  if (Ops.size() == 2) {
9610    assert(Ops[1]->getType()->isVectorTy() && "Scalar base requires an offset");
9611    Ops.push_back(ConstantInt::get(Int64Ty, 0));
9612  }
9613
9614  // For "vector base, scalar index" scale the index so that it becomes a
9615  // scalar offset.
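  // For example, with 32-bit elements the index is shifted left by 2
  // (i.e. multiplied by the 4-byte element size).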
9616  if (!TypeFlags.isByteIndexed() && Ops[1]->getType()->isVectorTy()) {
9617    unsigned BytesPerElt =
9618        OverloadedTy->getElementType()->getScalarSizeInBits() / 8;
9619    Ops[2] = Builder.CreateShl(Ops[2], Log2_32(BytesPerElt));
9620  }
9621
9622  Value *Call = Builder.CreateCall(F, Ops);
9623
9624  // The following sext/zext is only needed when ResultTy != OverloadedTy. In
9625  // other cases it's folded into a nop.
9626  return TypeFlags.isZExtReturn() ? Builder.CreateZExt(Call, ResultTy)
9627                                  : Builder.CreateSExt(Call, ResultTy);
9628}
9629
9630Value *CodeGenFunction::EmitSVEScatterStore(const SVETypeFlags &TypeFlags,
9631                                            SmallVectorImpl<Value *> &Ops,
9632                                            unsigned IntID) {
9633  auto *SrcDataTy = getSVEType(TypeFlags);
9634  auto *OverloadedTy =
9635      llvm::ScalableVectorType::get(SVEBuiltinMemEltTy(TypeFlags), SrcDataTy);
9636
9637  // In ACLE the source data is passed in the last argument, whereas in LLVM IR
9638  // it's the first argument. Move it accordingly.
9639  Ops.insert(Ops.begin(), Ops.pop_back_val());
9640
9641  Function *F = nullptr;
9642  if (Ops[2]->getType()->isVectorTy())
9643    // This is the "vector base, scalar offset" case. In order to uniquely
9644    // map this built-in to an LLVM IR intrinsic, we need both the return type
9645    // and the type of the vector base.
9646    F = CGM.getIntrinsic(IntID, {OverloadedTy, Ops[2]->getType()});
9647  else
    // This is the "scalar base, vector offset" case. The type of the offset
9649    // is encoded in the name of the intrinsic. We only need to specify the
9650    // return type in order to uniquely map this built-in to an LLVM IR
9651    // intrinsic.
9652    F = CGM.getIntrinsic(IntID, OverloadedTy);
9653
9654  // Pass 0 when the offset is missing. This can only be applied when using
9655  // the "vector base" addressing mode for which ACLE allows no offset. The
9656  // corresponding LLVM IR always requires an offset.
9657  if (Ops.size() == 3) {
    assert(Ops[2]->getType()->isVectorTy() && "Scalar base requires an offset");
9659    Ops.push_back(ConstantInt::get(Int64Ty, 0));
9660  }
9661
9662  // Truncation is needed when SrcDataTy != OverloadedTy. In other cases it's
9663  // folded into a nop.
9664  Ops[0] = Builder.CreateTrunc(Ops[0], OverloadedTy);
9665
9666  // At the ACLE level there's only one predicate type, svbool_t, which is
9667  // mapped to <n x 16 x i1>. However, this might be incompatible with the
9668  // actual type being stored. For example, when storing doubles (i64) the
  // predicate should be <n x 2 x i1> instead. At the IR level the type of
9670  // the predicate and the data being stored must match. Cast to the type
9671  // expected by the intrinsic. The intrinsic itself should be defined in
9672  // a way that enforces relations between parameter types.
9673  Ops[1] = EmitSVEPredicateCast(
9674      Ops[1], cast<llvm::ScalableVectorType>(F->getArg(1)->getType()));
9675
9676  // For "vector base, scalar index" scale the index so that it becomes a
9677  // scalar offset.
9678  if (!TypeFlags.isByteIndexed() && Ops[2]->getType()->isVectorTy()) {
9679    unsigned BytesPerElt =
9680        OverloadedTy->getElementType()->getScalarSizeInBits() / 8;
9681    Ops[3] = Builder.CreateShl(Ops[3], Log2_32(BytesPerElt));
9682  }
9683
9684  return Builder.CreateCall(F, Ops);
9685}
9686
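// Emit an SVE gather prefetch. The intrinsic is overloaded on whichever
// operand is the scalable vector (the base addresses or the offsets), and an
// omitted index in the "vector + imm" form defaults to 0.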
9687Value *CodeGenFunction::EmitSVEGatherPrefetch(const SVETypeFlags &TypeFlags,
9688                                              SmallVectorImpl<Value *> &Ops,
9689                                              unsigned IntID) {
9690  // The gather prefetches are overloaded on the vector input - this can be
9691  // either the vector of base addresses or the vector of offsets.
9692  auto *OverloadedTy = dyn_cast<llvm::ScalableVectorType>(Ops[1]->getType());
9693  if (!OverloadedTy)
9694    OverloadedTy = cast<llvm::ScalableVectorType>(Ops[2]->getType());
9695
9696  // Cast the predicate from svbool_t to the right number of elements.
9697  Ops[0] = EmitSVEPredicateCast(Ops[0], OverloadedTy);
9698
9699  // vector + imm addressing modes
9700  if (Ops[1]->getType()->isVectorTy()) {
9701    if (Ops.size() == 3) {
9702      // Pass 0 for 'vector+imm' when the index is omitted.
9703      Ops.push_back(ConstantInt::get(Int64Ty, 0));
9704
9705      // The sv_prfop is the last operand in the builtin and IR intrinsic.
9706      std::swap(Ops[2], Ops[3]);
9707    } else {
9708      // Index needs to be passed as scaled offset.
9709      llvm::Type *MemEltTy = SVEBuiltinMemEltTy(TypeFlags);
9710      unsigned BytesPerElt = MemEltTy->getPrimitiveSizeInBits() / 8;
9711      if (BytesPerElt > 1)
9712        Ops[2] = Builder.CreateShl(Ops[2], Log2_32(BytesPerElt));
9713    }
9714  }
9715
9716  Function *F = CGM.getIntrinsic(IntID, OverloadedTy);
9717  return Builder.CreateCall(F, Ops);
9718}
9719
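// Emit a structured load (ld2/ld3/ld4 and the multi-vector ld1/ldnt1 forms).
// The intrinsic returns N part vectors, which are concatenated here into a
// single wide result vector, the representation used for ACLE tuple types.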
9720Value *CodeGenFunction::EmitSVEStructLoad(const SVETypeFlags &TypeFlags,
9721                                          SmallVectorImpl<Value*> &Ops,
9722                                          unsigned IntID) {
9723  llvm::ScalableVectorType *VTy = getSVEType(TypeFlags);
9724
9725  unsigned N;
9726  switch (IntID) {
9727  case Intrinsic::aarch64_sve_ld2_sret:
9728  case Intrinsic::aarch64_sve_ld1_pn_x2:
9729  case Intrinsic::aarch64_sve_ldnt1_pn_x2:
9730  case Intrinsic::aarch64_sve_ld2q_sret:
9731    N = 2;
9732    break;
9733  case Intrinsic::aarch64_sve_ld3_sret:
9734  case Intrinsic::aarch64_sve_ld3q_sret:
9735    N = 3;
9736    break;
9737  case Intrinsic::aarch64_sve_ld4_sret:
9738  case Intrinsic::aarch64_sve_ld1_pn_x4:
9739  case Intrinsic::aarch64_sve_ldnt1_pn_x4:
9740  case Intrinsic::aarch64_sve_ld4q_sret:
9741    N = 4;
9742    break;
9743  default:
9744    llvm_unreachable("unknown intrinsic!");
9745  }
9746  auto RetTy = llvm::VectorType::get(VTy->getElementType(),
9747                                     VTy->getElementCount() * N);
9748
9749  Value *Predicate = EmitSVEPredicateCast(Ops[0], VTy);
9750  Value *BasePtr = Ops[1];
9751
9752  // Does the load have an offset?
9753  if (Ops.size() > 2)
9754    BasePtr = Builder.CreateGEP(VTy, BasePtr, Ops[2]);
9755
9756  Function *F = CGM.getIntrinsic(IntID, {VTy});
9757  Value *Call = Builder.CreateCall(F, {Predicate, BasePtr});
9758  unsigned MinElts = VTy->getMinNumElements();
9759  Value *Ret = llvm::PoisonValue::get(RetTy);
9760  for (unsigned I = 0; I < N; I++) {
9761    Value *Idx = ConstantInt::get(CGM.Int64Ty, I * MinElts);
9762    Value *SRet = Builder.CreateExtractValue(Call, I);
9763    Ret = Builder.CreateInsertVector(RetTy, Ret, SRet, Idx);
9764  }
9765  return Ret;
9766}
9767
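// Emit a structured store (st2/st3/st4 and the multi-vector st1/stnt1 forms).
// The last N operands are the individual part vectors of the tuple; they are
// passed to the intrinsic first, followed by the predicate and base pointer.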
9768Value *CodeGenFunction::EmitSVEStructStore(const SVETypeFlags &TypeFlags,
9769                                           SmallVectorImpl<Value*> &Ops,
9770                                           unsigned IntID) {
9771  llvm::ScalableVectorType *VTy = getSVEType(TypeFlags);
9772
9773  unsigned N;
9774  switch (IntID) {
9775  case Intrinsic::aarch64_sve_st2:
9776  case Intrinsic::aarch64_sve_st1_pn_x2:
9777  case Intrinsic::aarch64_sve_stnt1_pn_x2:
9778  case Intrinsic::aarch64_sve_st2q:
9779    N = 2;
9780    break;
9781  case Intrinsic::aarch64_sve_st3:
9782  case Intrinsic::aarch64_sve_st3q:
9783    N = 3;
9784    break;
9785  case Intrinsic::aarch64_sve_st4:
9786  case Intrinsic::aarch64_sve_st1_pn_x4:
9787  case Intrinsic::aarch64_sve_stnt1_pn_x4:
9788  case Intrinsic::aarch64_sve_st4q:
9789    N = 4;
9790    break;
9791  default:
9792    llvm_unreachable("unknown intrinsic!");
9793  }
9794
9795  Value *Predicate = EmitSVEPredicateCast(Ops[0], VTy);
9796  Value *BasePtr = Ops[1];
9797
9798  // Does the store have an offset?
9799  if (Ops.size() > (2 + N))
9800    BasePtr = Builder.CreateGEP(VTy, BasePtr, Ops[2]);
9801
9802  // The llvm.aarch64.sve.st2/3/4 intrinsics take legal part vectors, so we
9803  // need to break up the tuple vector.
9804  SmallVector<llvm::Value*, 5> Operands;
9805  for (unsigned I = Ops.size() - N; I < Ops.size(); ++I)
9806    Operands.push_back(Ops[I]);
9807  Operands.append({Predicate, BasePtr});
9808  Function *F = CGM.getIntrinsic(IntID, { VTy });
9809
9810  return Builder.CreateCall(F, Operands);
9811}
9812
9813// SVE2's svpmullb and svpmullt builtins are similar to the svpmullb_pair and
9814// svpmullt_pair intrinsics, with the exception that their results are bitcast
9815// to a wider type.
9816Value *CodeGenFunction::EmitSVEPMull(const SVETypeFlags &TypeFlags,
9817                                     SmallVectorImpl<Value *> &Ops,
9818                                     unsigned BuiltinID) {
9819  // Splat scalar operand to vector (intrinsics with _n infix)
9820  if (TypeFlags.hasSplatOperand()) {
9821    unsigned OpNo = TypeFlags.getSplatOperand();
9822    Ops[OpNo] = EmitSVEDupX(Ops[OpNo]);
9823  }
9824
9825  // The pair-wise function has a narrower overloaded type.
9826  Function *F = CGM.getIntrinsic(BuiltinID, Ops[0]->getType());
9827  Value *Call = Builder.CreateCall(F, {Ops[0], Ops[1]});
9828
9829  // Now bitcast to the wider result type.
9830  llvm::ScalableVectorType *Ty = getSVEType(TypeFlags);
9831  return EmitSVEReinterpret(Call, Ty);
9832}
9833
9834Value *CodeGenFunction::EmitSVEMovl(const SVETypeFlags &TypeFlags,
9835                                    ArrayRef<Value *> Ops, unsigned BuiltinID) {
9836  llvm::Type *OverloadedTy = getSVEType(TypeFlags);
9837  Function *F = CGM.getIntrinsic(BuiltinID, OverloadedTy);
9838  return Builder.CreateCall(F, {Ops[0], Builder.getInt32(0)});
9839}
9840
9841Value *CodeGenFunction::EmitSVEPrefetchLoad(const SVETypeFlags &TypeFlags,
9842                                            SmallVectorImpl<Value *> &Ops,
9843                                            unsigned BuiltinID) {
9844  auto *MemEltTy = SVEBuiltinMemEltTy(TypeFlags);
9845  auto *VectorTy = getSVEVectorForElementType(MemEltTy);
9846  auto *MemoryTy = llvm::ScalableVectorType::get(MemEltTy, VectorTy);
9847
9848  Value *Predicate = EmitSVEPredicateCast(Ops[0], MemoryTy);
9849  Value *BasePtr = Ops[1];
9850
9851  // Apply the index operand if it has not been omitted.
9852  if (Ops.size() > 3)
9853    BasePtr = Builder.CreateGEP(MemoryTy, BasePtr, Ops[2]);
9854
9855  Value *PrfOp = Ops.back();
9856
9857  Function *F = CGM.getIntrinsic(BuiltinID, Predicate->getType());
9858  return Builder.CreateCall(F, {Predicate, BasePtr, PrfOp});
9859}
9860
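// Emit a predicated contiguous load. The value loaded from memory may be
// narrower than the result vector; if so it is zero- or sign-extended as
// requested by the builtin. Quad-word loads (ld1uwq/ld1udq) use a
// single-element memory and predicate type and are returned unextended.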
9861Value *CodeGenFunction::EmitSVEMaskedLoad(const CallExpr *E,
9862                                          llvm::Type *ReturnTy,
9863                                          SmallVectorImpl<Value *> &Ops,
9864                                          unsigned IntrinsicID,
9865                                          bool IsZExtReturn) {
9866  QualType LangPTy = E->getArg(1)->getType();
9867  llvm::Type *MemEltTy = CGM.getTypes().ConvertType(
9868      LangPTy->castAs<PointerType>()->getPointeeType());
9869
9870  // The vector type that is returned may be different from the
9871  // eventual type loaded from memory.
9872  auto VectorTy = cast<llvm::ScalableVectorType>(ReturnTy);
9873  llvm::ScalableVectorType *MemoryTy = nullptr;
9874  llvm::ScalableVectorType *PredTy = nullptr;
9875  bool IsQuadLoad = false;
9876  switch (IntrinsicID) {
9877  case Intrinsic::aarch64_sve_ld1uwq:
9878  case Intrinsic::aarch64_sve_ld1udq:
9879    MemoryTy = llvm::ScalableVectorType::get(MemEltTy, 1);
9880    PredTy = llvm::ScalableVectorType::get(
9881        llvm::Type::getInt1Ty(getLLVMContext()), 1);
9882    IsQuadLoad = true;
9883    break;
9884  default:
9885    MemoryTy = llvm::ScalableVectorType::get(MemEltTy, VectorTy);
9886    PredTy = MemoryTy;
9887    break;
9888  }
9889
9890  Value *Predicate = EmitSVEPredicateCast(Ops[0], PredTy);
9891  Value *BasePtr = Ops[1];
9892
9893  // Does the load have an offset?
9894  if (Ops.size() > 2)
9895    BasePtr = Builder.CreateGEP(MemoryTy, BasePtr, Ops[2]);
9896
9897  Function *F = CGM.getIntrinsic(IntrinsicID, IsQuadLoad ? VectorTy : MemoryTy);
9898  auto *Load =
9899      cast<llvm::Instruction>(Builder.CreateCall(F, {Predicate, BasePtr}));
9900  auto TBAAInfo = CGM.getTBAAAccessInfo(LangPTy->getPointeeType());
9901  CGM.DecorateInstructionWithTBAA(Load, TBAAInfo);
9902
9903  if (IsQuadLoad)
9904    return Load;
9905
9906  return IsZExtReturn ? Builder.CreateZExt(Load, VectorTy)
9907                      : Builder.CreateSExt(Load, VectorTy);
9908}
9909
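// Emit a predicated contiguous store. The value being stored is truncated to
// the memory element type unless this is a quad-word store (st1wq/st1dq),
// which stores the register-sized value directly.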
9910Value *CodeGenFunction::EmitSVEMaskedStore(const CallExpr *E,
9911                                           SmallVectorImpl<Value *> &Ops,
9912                                           unsigned IntrinsicID) {
9913  QualType LangPTy = E->getArg(1)->getType();
9914  llvm::Type *MemEltTy = CGM.getTypes().ConvertType(
9915      LangPTy->castAs<PointerType>()->getPointeeType());
9916
9917  // The vector type that is stored may be different from the
9918  // eventual type stored to memory.
9919  auto VectorTy = cast<llvm::ScalableVectorType>(Ops.back()->getType());
9920  auto MemoryTy = llvm::ScalableVectorType::get(MemEltTy, VectorTy);
9921
9922  auto PredTy = MemoryTy;
9923  auto AddrMemoryTy = MemoryTy;
9924  bool IsQuadStore = false;
9925
9926  switch (IntrinsicID) {
9927  case Intrinsic::aarch64_sve_st1wq:
9928  case Intrinsic::aarch64_sve_st1dq:
9929    AddrMemoryTy = llvm::ScalableVectorType::get(MemEltTy, 1);
9930    PredTy =
9931        llvm::ScalableVectorType::get(IntegerType::get(getLLVMContext(), 1), 1);
9932    IsQuadStore = true;
9933    break;
9934  default:
9935    break;
9936  }
9937  Value *Predicate = EmitSVEPredicateCast(Ops[0], PredTy);
9938  Value *BasePtr = Ops[1];
9939
9940  // Does the store have an offset?
9941  if (Ops.size() == 4)
9942    BasePtr = Builder.CreateGEP(AddrMemoryTy, BasePtr, Ops[2]);
9943
9944  // Last value is always the data
9945  Value *Val =
9946      IsQuadStore ? Ops.back() : Builder.CreateTrunc(Ops.back(), MemoryTy);
9947
9948  Function *F =
9949      CGM.getIntrinsic(IntrinsicID, IsQuadStore ? VectorTy : MemoryTy);
9950  auto *Store =
9951      cast<llvm::Instruction>(Builder.CreateCall(F, {Val, Predicate, BasePtr}));
9952  auto TBAAInfo = CGM.getTBAAAccessInfo(LangPTy->getPointeeType());
9953  CGM.DecorateInstructionWithTBAA(Store, TBAAInfo);
9954  return Store;
9955}
9956
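// Emit an SME ZA load or store. The operands are reordered into the order the
// intrinsic expects (predicate, pointer, then the remaining ZA operands), and
// when a vnum argument is present the base pointer is advanced by vnum times
// the streaming vector length in bytes.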
9957Value *CodeGenFunction::EmitSMELd1St1(const SVETypeFlags &TypeFlags,
9958                                      SmallVectorImpl<Value *> &Ops,
9959                                      unsigned IntID) {
9960  Ops[2] = EmitSVEPredicateCast(
9961      Ops[2], getSVEVectorForElementType(SVEBuiltinMemEltTy(TypeFlags)));
9962
9963  SmallVector<Value *> NewOps;
9964  NewOps.push_back(Ops[2]);
9965
9966  llvm::Value *BasePtr = Ops[3];
9967
9968  // If the intrinsic contains the vnum parameter, multiply it by the vector
9969  // size in bytes.
9970  if (Ops.size() == 5) {
9971    Function *StreamingVectorLength =
9972        CGM.getIntrinsic(Intrinsic::aarch64_sme_cntsb);
9973    llvm::Value *StreamingVectorLengthCall =
9974        Builder.CreateCall(StreamingVectorLength);
9975    llvm::Value *Mulvl =
9976        Builder.CreateMul(StreamingVectorLengthCall, Ops[4], "mulvl");
9977    // The type of the ptr parameter is void *, so use Int8Ty here.
9978    BasePtr = Builder.CreateGEP(Int8Ty, Ops[3], Mulvl);
9979  }
9980  NewOps.push_back(BasePtr);
9981  NewOps.push_back(Ops[0]);
9982  NewOps.push_back(Ops[1]);
9983  Function *F = CGM.getIntrinsic(IntID);
9984  return Builder.CreateCall(F, NewOps);
9985}
9986
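// Emit an SME ZA read or write. The predicate sits at a different operand
// index for reads (Ops[1]) and writes (Ops[2]), so cast whichever one applies
// to the element count of the data vector.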
9987Value *CodeGenFunction::EmitSMEReadWrite(const SVETypeFlags &TypeFlags,
9988                                         SmallVectorImpl<Value *> &Ops,
9989                                         unsigned IntID) {
9990  auto *VecTy = getSVEType(TypeFlags);
9991  Function *F = CGM.getIntrinsic(IntID, VecTy);
9992  if (TypeFlags.isReadZA())
9993    Ops[1] = EmitSVEPredicateCast(Ops[1], VecTy);
9994  else if (TypeFlags.isWriteZA())
9995    Ops[2] = EmitSVEPredicateCast(Ops[2], VecTy);
9996  return Builder.CreateCall(F, Ops);
9997}
9998
9999Value *CodeGenFunction::EmitSMEZero(const SVETypeFlags &TypeFlags,
10000                                    SmallVectorImpl<Value *> &Ops,
10001                                    unsigned IntID) {
10002  // The svzero_za() intrinsic zeros the entire ZA and has no parameters.
10003  if (Ops.size() == 0)
10004    Ops.push_back(llvm::ConstantInt::get(Int32Ty, 255));
10005  Function *F = CGM.getIntrinsic(IntID, {});
10006  return Builder.CreateCall(F, Ops);
10007}
10008
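// Emit an SME LDR/STR of ZA. A missing vnum offset is materialized as zero;
// otherwise it is converted to the i32 operand the intrinsic expects.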
10009Value *CodeGenFunction::EmitSMELdrStr(const SVETypeFlags &TypeFlags,
10010                                      SmallVectorImpl<Value *> &Ops,
10011                                      unsigned IntID) {
10012  if (Ops.size() == 2)
10013    Ops.push_back(Builder.getInt32(0));
10014  else
10015    Ops[2] = Builder.CreateIntCast(Ops[2], Int32Ty, true);
10016  Function *F = CGM.getIntrinsic(IntID, {});
10017  return Builder.CreateCall(F, Ops);
10018}
10019
10020// Splat a scalar operand across all lanes of an SVE vector. This is emitted
10021// with IRBuilder::CreateVectorSplat rather than the sve dup.x intrinsic.
10022Value *CodeGenFunction::EmitSVEDupX(Value *Scalar, llvm::Type *Ty) {
10023  return Builder.CreateVectorSplat(
10024      cast<llvm::VectorType>(Ty)->getElementCount(), Scalar);
10025}
10026
10027Value *CodeGenFunction::EmitSVEDupX(Value* Scalar) {
10028  return EmitSVEDupX(Scalar, getSVEVectorForElementType(Scalar->getType()));
10029}
10030
10031Value *CodeGenFunction::EmitSVEReinterpret(Value *Val, llvm::Type *Ty) {
10032  // FIXME: For big endian this needs an additional REV, or needs a separate
10033  // intrinsic that is code-generated as a no-op, because the LLVM bitcast
10034  // instruction is defined as 'bitwise' equivalent from memory point of
10035  // view (when storing/reloading), whereas the svreinterpret builtin
10036  // implements bitwise equivalent cast from register point of view.
10037  // LLVM CodeGen for a bitcast must add an explicit REV for big-endian.
10038  return Builder.CreateBitCast(Val, Ty);
10039}
10040
10041static void InsertExplicitZeroOperand(CGBuilderTy &Builder, llvm::Type *Ty,
10042                                      SmallVectorImpl<Value *> &Ops) {
10043  auto *SplatZero = Constant::getNullValue(Ty);
10044  Ops.insert(Ops.begin(), SplatZero);
10045}
10046
10047static void InsertExplicitUndefOperand(CGBuilderTy &Builder, llvm::Type *Ty,
10048                                       SmallVectorImpl<Value *> &Ops) {
10049  auto *SplatUndef = UndefValue::get(Ty);
10050  Ops.insert(Ops.begin(), SplatUndef);
10051}
10052
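// Return the list of overload types used to select the correctly mangled
// variant of the LLVM intrinsic for a builtin, based on the overload kind
// encoded in the SVETypeFlags.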
10053SmallVector<llvm::Type *, 2>
10054CodeGenFunction::getSVEOverloadTypes(const SVETypeFlags &TypeFlags,
10055                                     llvm::Type *ResultType,
10056                                     ArrayRef<Value *> Ops) {
10057  if (TypeFlags.isOverloadNone())
10058    return {};
10059
10060  llvm::Type *DefaultType = getSVEType(TypeFlags);
10061
10062  if (TypeFlags.isOverloadWhileOrMultiVecCvt())
10063    return {DefaultType, Ops[1]->getType()};
10064
10065  if (TypeFlags.isOverloadWhileRW())
10066    return {getSVEPredType(TypeFlags), Ops[0]->getType()};
10067
10068  if (TypeFlags.isOverloadCvt())
10069    return {Ops[0]->getType(), Ops.back()->getType()};
10070
10071  if (TypeFlags.isReductionQV() && !ResultType->isScalableTy() &&
10072      ResultType->isVectorTy())
10073    return {ResultType, Ops[1]->getType()};
10074
10075  assert(TypeFlags.isOverloadDefault() && "Unexpected value for overloads");
10076  return {DefaultType};
10077}
10078
10079Value *CodeGenFunction::EmitSVETupleSetOrGet(const SVETypeFlags &TypeFlags,
10080                                             llvm::Type *Ty,
10081                                             ArrayRef<Value *> Ops) {
10082  assert((TypeFlags.isTupleSet() || TypeFlags.isTupleGet()) &&
10083         "Expects TypeFlags.isTupleSet() or TypeFlags.isTupleGet()");
10084
10085  unsigned I = cast<ConstantInt>(Ops[1])->getSExtValue();
10086  auto *SingleVecTy = dyn_cast<llvm::ScalableVectorType>(
10087                      TypeFlags.isTupleSet() ? Ops[2]->getType() : Ty);
10088  Value *Idx = ConstantInt::get(CGM.Int64Ty,
10089                                I * SingleVecTy->getMinNumElements());
10090
10091  if (TypeFlags.isTupleSet())
10092    return Builder.CreateInsertVector(Ty, Ops[0], Ops[2], Idx);
10093  return Builder.CreateExtractVector(Ty, Ops[0], Idx);
10094}
10095
10096Value *CodeGenFunction::EmitSVETupleCreate(const SVETypeFlags &TypeFlags,
10097                                             llvm::Type *Ty,
10098                                             ArrayRef<Value *> Ops) {
10099  assert(TypeFlags.isTupleCreate() && "Expects TypeFlags.isTupleCreate()");
10100
10101  auto *SrcTy = dyn_cast<llvm::ScalableVectorType>(Ops[0]->getType());
10102  unsigned MinElts = SrcTy->getMinNumElements();
10103  Value *Call = llvm::PoisonValue::get(Ty);
10104  for (unsigned I = 0; I < Ops.size(); I++) {
10105    Value *Idx = ConstantInt::get(CGM.Int64Ty, I * MinElts);
10106    Call = Builder.CreateInsertVector(Ty, Call, Ops[I], Idx);
10107  }
10108
10109  return Call;
10110}
10111
10112Value *CodeGenFunction::FormSVEBuiltinResult(Value *Call) {
10113  // Multi-vector results are returned as a struct; combine the parts into a
10114  // single (wide) result vector.
10115  auto *StructTy = dyn_cast<StructType>(Call->getType());
10116  if (!StructTy)
10117    return Call;
10118
10119  auto *VTy = dyn_cast<ScalableVectorType>(StructTy->getTypeAtIndex(0U));
10120  if (!VTy)
10121    return Call;
10122  unsigned N = StructTy->getNumElements();
10123
10124  // We may need to emit a cast to an svbool_t.
10125  bool IsPredTy = VTy->getElementType()->isIntegerTy(1);
10126  unsigned MinElts = IsPredTy ? 16 : VTy->getMinNumElements();
10127
10128  ScalableVectorType *WideVTy =
10129      ScalableVectorType::get(VTy->getElementType(), MinElts * N);
10130  Value *Ret = llvm::PoisonValue::get(WideVTy);
10131  for (unsigned I = 0; I < N; ++I) {
10132    Value *SRet = Builder.CreateExtractValue(Call, I);
10133    assert(SRet->getType() == VTy && "Unexpected type for result value");
10134    Value *Idx = ConstantInt::get(CGM.Int64Ty, I * MinElts);
10135
10136    if (IsPredTy)
10137      SRet = EmitSVEPredicateCast(
10138          SRet, ScalableVectorType::get(Builder.getInt1Ty(), 16));
10139
10140    Ret = Builder.CreateInsertVector(WideVTy, Ret, SRet, Idx);
10141  }
10142  Call = Ret;
10143
10144  return Call;
10145}
10146
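// Emit and pre-process the operands of an SVE/SME builtin call: arguments
// that must be integer constant expressions are folded to 32-bit constants,
// and wide multi-vector (tuple) arguments are split into their individual
// part vectors. For example (illustrative), a two-vector tuple of 32-bit
// elements arrives as a single <vscale x 8 x i32> value and is pushed as two
// <vscale x 4 x i32> extracts.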
10147void CodeGenFunction::GetAArch64SVEProcessedOperands(
10148    unsigned BuiltinID, const CallExpr *E, SmallVectorImpl<Value *> &Ops,
10149    SVETypeFlags TypeFlags) {
10150  // Find out if any arguments are required to be integer constant expressions.
10151  unsigned ICEArguments = 0;
10152  ASTContext::GetBuiltinTypeError Error;
10153  getContext().GetBuiltinType(BuiltinID, Error, &ICEArguments);
10154  assert(Error == ASTContext::GE_None && "Should not codegen an error");
10155
10156  // Tuple set/get only requires one insert/extract vector, which is
10157  // created by EmitSVETupleSetOrGet.
10158  bool IsTupleGetOrSet = TypeFlags.isTupleSet() || TypeFlags.isTupleGet();
10159
10160  for (unsigned i = 0, e = E->getNumArgs(); i != e; i++) {
10161    bool IsICE = ICEArguments & (1 << i);
10162    Value *Arg = EmitScalarExpr(E->getArg(i));
10163
10164    if (IsICE) {
10165      // If this is required to be a constant, constant fold it so that we know
10166      // that the generated intrinsic gets a ConstantInt.
10167      std::optional<llvm::APSInt> Result =
10168          E->getArg(i)->getIntegerConstantExpr(getContext());
10169      assert(Result && "Expected argument to be a constant");
10170
10171      // Immediates for SVE LLVM intrinsics are always 32-bit. We can safely
10172      // truncate because the immediate has been range checked and no valid
10173      // immediate requires more than a handful of bits.
10174      *Result = Result->extOrTrunc(32);
10175      Ops.push_back(llvm::ConstantInt::get(getLLVMContext(), *Result));
10176      continue;
10177    }
10178
10179    if (IsTupleGetOrSet || !isa<ScalableVectorType>(Arg->getType())) {
10180      Ops.push_back(Arg);
10181      continue;
10182    }
10183
10184    auto *VTy = cast<ScalableVectorType>(Arg->getType());
10185    unsigned MinElts = VTy->getMinNumElements();
10186    bool IsPred = VTy->getElementType()->isIntegerTy(1);
10187    unsigned N = (MinElts * VTy->getScalarSizeInBits()) / (IsPred ? 16 : 128);
10188
10189    if (N == 1) {
10190      Ops.push_back(Arg);
10191      continue;
10192    }
10193
10194    for (unsigned I = 0; I < N; ++I) {
10195      Value *Idx = ConstantInt::get(CGM.Int64Ty, (I * MinElts) / N);
10196      auto *NewVTy =
10197          ScalableVectorType::get(VTy->getElementType(), MinElts / N);
10198      Ops.push_back(Builder.CreateExtractVector(NewVTy, Arg, Idx));
10199    }
10200  }
10201}
10202
10203Value *CodeGenFunction::EmitAArch64SVEBuiltinExpr(unsigned BuiltinID,
10204                                                  const CallExpr *E) {
10205  llvm::Type *Ty = ConvertType(E->getType());
10206  if (BuiltinID >= SVE::BI__builtin_sve_reinterpret_s8_s8 &&
10207      BuiltinID <= SVE::BI__builtin_sve_reinterpret_f64_f64_x4) {
10208    Value *Val = EmitScalarExpr(E->getArg(0));
10209    return EmitSVEReinterpret(Val, Ty);
10210  }
10211
10212  auto *Builtin = findARMVectorIntrinsicInMap(AArch64SVEIntrinsicMap, BuiltinID,
10213                                              AArch64SVEIntrinsicsProvenSorted);
10214
10215  llvm::SmallVector<Value *, 4> Ops;
10216  SVETypeFlags TypeFlags(Builtin->TypeModifier);
10217  GetAArch64SVEProcessedOperands(BuiltinID, E, Ops, TypeFlags);
10218
10219  if (TypeFlags.isLoad())
10220    return EmitSVEMaskedLoad(E, Ty, Ops, Builtin->LLVMIntrinsic,
10221                             TypeFlags.isZExtReturn());
10222  else if (TypeFlags.isStore())
10223    return EmitSVEMaskedStore(E, Ops, Builtin->LLVMIntrinsic);
10224  else if (TypeFlags.isGatherLoad())
10225    return EmitSVEGatherLoad(TypeFlags, Ops, Builtin->LLVMIntrinsic);
10226  else if (TypeFlags.isScatterStore())
10227    return EmitSVEScatterStore(TypeFlags, Ops, Builtin->LLVMIntrinsic);
10228  else if (TypeFlags.isPrefetch())
10229    return EmitSVEPrefetchLoad(TypeFlags, Ops, Builtin->LLVMIntrinsic);
10230  else if (TypeFlags.isGatherPrefetch())
10231    return EmitSVEGatherPrefetch(TypeFlags, Ops, Builtin->LLVMIntrinsic);
10232  else if (TypeFlags.isStructLoad())
10233    return EmitSVEStructLoad(TypeFlags, Ops, Builtin->LLVMIntrinsic);
10234  else if (TypeFlags.isStructStore())
10235    return EmitSVEStructStore(TypeFlags, Ops, Builtin->LLVMIntrinsic);
10236  else if (TypeFlags.isTupleSet() || TypeFlags.isTupleGet())
10237    return EmitSVETupleSetOrGet(TypeFlags, Ty, Ops);
10238  else if (TypeFlags.isTupleCreate())
10239    return EmitSVETupleCreate(TypeFlags, Ty, Ops);
10240  else if (TypeFlags.isUndef())
10241    return UndefValue::get(Ty);
10242  else if (Builtin->LLVMIntrinsic != 0) {
10243    if (TypeFlags.getMergeType() == SVETypeFlags::MergeZeroExp)
10244      InsertExplicitZeroOperand(Builder, Ty, Ops);
10245
10246    if (TypeFlags.getMergeType() == SVETypeFlags::MergeAnyExp)
10247      InsertExplicitUndefOperand(Builder, Ty, Ops);
10248
10249    // Some ACLE builtins leave out the argument to specify the predicate
10250    // pattern, which is expected to be expanded to an SV_ALL pattern.
10251    if (TypeFlags.isAppendSVALL())
10252      Ops.push_back(Builder.getInt32(/*SV_ALL*/ 31));
10253    if (TypeFlags.isInsertOp1SVALL())
10254      Ops.insert(&Ops[1], Builder.getInt32(/*SV_ALL*/ 31));
10255
10256    // Predicates must match the main datatype.
10257    for (unsigned i = 0, e = Ops.size(); i != e; ++i)
10258      if (auto PredTy = dyn_cast<llvm::VectorType>(Ops[i]->getType()))
10259        if (PredTy->getElementType()->isIntegerTy(1))
10260          Ops[i] = EmitSVEPredicateCast(Ops[i], getSVEType(TypeFlags));
10261
10262    // Splat scalar operand to vector (intrinsics with _n infix)
10263    if (TypeFlags.hasSplatOperand()) {
10264      unsigned OpNo = TypeFlags.getSplatOperand();
10265      Ops[OpNo] = EmitSVEDupX(Ops[OpNo]);
10266    }
10267
10268    if (TypeFlags.isReverseCompare())
10269      std::swap(Ops[1], Ops[2]);
10270    else if (TypeFlags.isReverseUSDOT())
10271      std::swap(Ops[1], Ops[2]);
10272    else if (TypeFlags.isReverseMergeAnyBinOp() &&
10273             TypeFlags.getMergeType() == SVETypeFlags::MergeAny)
10274      std::swap(Ops[1], Ops[2]);
10275    else if (TypeFlags.isReverseMergeAnyAccOp() &&
10276             TypeFlags.getMergeType() == SVETypeFlags::MergeAny)
10277      std::swap(Ops[1], Ops[3]);
10278
10279    // Predicated intrinsics with _z suffix need a select w/ zeroinitializer.
10280    if (TypeFlags.getMergeType() == SVETypeFlags::MergeZero) {
10281      llvm::Type *OpndTy = Ops[1]->getType();
10282      auto *SplatZero = Constant::getNullValue(OpndTy);
10283      Ops[1] = Builder.CreateSelect(Ops[0], Ops[1], SplatZero);
10284    }
10285
10286    Function *F = CGM.getIntrinsic(Builtin->LLVMIntrinsic,
10287                                   getSVEOverloadTypes(TypeFlags, Ty, Ops));
10288    Value *Call = Builder.CreateCall(F, Ops);
10289
10290    // Predicate results must be converted to svbool_t.
10291    if (auto PredTy = dyn_cast<llvm::VectorType>(Call->getType()))
10292      if (PredTy->getScalarType()->isIntegerTy(1))
10293        Call = EmitSVEPredicateCast(Call, cast<llvm::ScalableVectorType>(Ty));
10294
10295    return FormSVEBuiltinResult(Call);
10296  }
10297
10298  switch (BuiltinID) {
10299  default:
10300    return nullptr;
10301
10302  case SVE::BI__builtin_sve_svreinterpret_b: {
10303    auto SVCountTy =
10304        llvm::TargetExtType::get(getLLVMContext(), "aarch64.svcount");
10305    Function *CastFromSVCountF =
10306        CGM.getIntrinsic(Intrinsic::aarch64_sve_convert_to_svbool, SVCountTy);
10307    return Builder.CreateCall(CastFromSVCountF, Ops[0]);
10308  }
10309  case SVE::BI__builtin_sve_svreinterpret_c: {
10310    auto SVCountTy =
10311        llvm::TargetExtType::get(getLLVMContext(), "aarch64.svcount");
10312    Function *CastToSVCountF =
10313        CGM.getIntrinsic(Intrinsic::aarch64_sve_convert_from_svbool, SVCountTy);
10314    return Builder.CreateCall(CastToSVCountF, Ops[0]);
10315  }
10316
10317  case SVE::BI__builtin_sve_svpsel_lane_b8:
10318  case SVE::BI__builtin_sve_svpsel_lane_b16:
10319  case SVE::BI__builtin_sve_svpsel_lane_b32:
10320  case SVE::BI__builtin_sve_svpsel_lane_b64:
10321  case SVE::BI__builtin_sve_svpsel_lane_c8:
10322  case SVE::BI__builtin_sve_svpsel_lane_c16:
10323  case SVE::BI__builtin_sve_svpsel_lane_c32:
10324  case SVE::BI__builtin_sve_svpsel_lane_c64: {
10325    bool IsSVCount = isa<TargetExtType>(Ops[0]->getType());
10326    assert(((!IsSVCount || cast<TargetExtType>(Ops[0]->getType())->getName() ==
10327                               "aarch64.svcount")) &&
10328           "Unexpected TargetExtType");
10329    auto SVCountTy =
10330        llvm::TargetExtType::get(getLLVMContext(), "aarch64.svcount");
10331    Function *CastFromSVCountF =
10332        CGM.getIntrinsic(Intrinsic::aarch64_sve_convert_to_svbool, SVCountTy);
10333    Function *CastToSVCountF =
10334        CGM.getIntrinsic(Intrinsic::aarch64_sve_convert_from_svbool, SVCountTy);
10335
10336    auto OverloadedTy = getSVEType(SVETypeFlags(Builtin->TypeModifier));
10337    Function *F = CGM.getIntrinsic(Intrinsic::aarch64_sve_psel, OverloadedTy);
10338    llvm::Value *Ops0 =
10339        IsSVCount ? Builder.CreateCall(CastFromSVCountF, Ops[0]) : Ops[0];
10340    llvm::Value *Ops1 = EmitSVEPredicateCast(Ops[1], OverloadedTy);
10341    llvm::Value *PSel = Builder.CreateCall(F, {Ops0, Ops1, Ops[2]});
10342    return IsSVCount ? Builder.CreateCall(CastToSVCountF, PSel) : PSel;
10343  }
10344  case SVE::BI__builtin_sve_svmov_b_z: {
10345    // svmov_b_z(pg, op) <=> svand_b_z(pg, op, op)
10346    SVETypeFlags TypeFlags(Builtin->TypeModifier);
10347    llvm::Type* OverloadedTy = getSVEType(TypeFlags);
10348    Function *F = CGM.getIntrinsic(Intrinsic::aarch64_sve_and_z, OverloadedTy);
10349    return Builder.CreateCall(F, {Ops[0], Ops[1], Ops[1]});
10350  }
10351
10352  case SVE::BI__builtin_sve_svnot_b_z: {
10353    // svnot_b_z(pg, op) <=> sveor_b_z(pg, op, pg)
10354    SVETypeFlags TypeFlags(Builtin->TypeModifier);
10355    llvm::Type* OverloadedTy = getSVEType(TypeFlags);
10356    Function *F = CGM.getIntrinsic(Intrinsic::aarch64_sve_eor_z, OverloadedTy);
10357    return Builder.CreateCall(F, {Ops[0], Ops[1], Ops[0]});
10358  }
10359
10360  case SVE::BI__builtin_sve_svmovlb_u16:
10361  case SVE::BI__builtin_sve_svmovlb_u32:
10362  case SVE::BI__builtin_sve_svmovlb_u64:
10363    return EmitSVEMovl(TypeFlags, Ops, Intrinsic::aarch64_sve_ushllb);
10364
10365  case SVE::BI__builtin_sve_svmovlb_s16:
10366  case SVE::BI__builtin_sve_svmovlb_s32:
10367  case SVE::BI__builtin_sve_svmovlb_s64:
10368    return EmitSVEMovl(TypeFlags, Ops, Intrinsic::aarch64_sve_sshllb);
10369
10370  case SVE::BI__builtin_sve_svmovlt_u16:
10371  case SVE::BI__builtin_sve_svmovlt_u32:
10372  case SVE::BI__builtin_sve_svmovlt_u64:
10373    return EmitSVEMovl(TypeFlags, Ops, Intrinsic::aarch64_sve_ushllt);
10374
10375  case SVE::BI__builtin_sve_svmovlt_s16:
10376  case SVE::BI__builtin_sve_svmovlt_s32:
10377  case SVE::BI__builtin_sve_svmovlt_s64:
10378    return EmitSVEMovl(TypeFlags, Ops, Intrinsic::aarch64_sve_sshllt);
10379
10380  case SVE::BI__builtin_sve_svpmullt_u16:
10381  case SVE::BI__builtin_sve_svpmullt_u64:
10382  case SVE::BI__builtin_sve_svpmullt_n_u16:
10383  case SVE::BI__builtin_sve_svpmullt_n_u64:
10384    return EmitSVEPMull(TypeFlags, Ops, Intrinsic::aarch64_sve_pmullt_pair);
10385
10386  case SVE::BI__builtin_sve_svpmullb_u16:
10387  case SVE::BI__builtin_sve_svpmullb_u64:
10388  case SVE::BI__builtin_sve_svpmullb_n_u16:
10389  case SVE::BI__builtin_sve_svpmullb_n_u64:
10390    return EmitSVEPMull(TypeFlags, Ops, Intrinsic::aarch64_sve_pmullb_pair);
10391
10392  case SVE::BI__builtin_sve_svdup_n_b8:
10393  case SVE::BI__builtin_sve_svdup_n_b16:
10394  case SVE::BI__builtin_sve_svdup_n_b32:
10395  case SVE::BI__builtin_sve_svdup_n_b64: {
10396    Value *CmpNE =
10397        Builder.CreateICmpNE(Ops[0], Constant::getNullValue(Ops[0]->getType()));
10398    llvm::ScalableVectorType *OverloadedTy = getSVEType(TypeFlags);
10399    Value *Dup = EmitSVEDupX(CmpNE, OverloadedTy);
10400    return EmitSVEPredicateCast(Dup, cast<llvm::ScalableVectorType>(Ty));
10401  }
10402
10403  case SVE::BI__builtin_sve_svdupq_n_b8:
10404  case SVE::BI__builtin_sve_svdupq_n_b16:
10405  case SVE::BI__builtin_sve_svdupq_n_b32:
10406  case SVE::BI__builtin_sve_svdupq_n_b64:
10407  case SVE::BI__builtin_sve_svdupq_n_u8:
10408  case SVE::BI__builtin_sve_svdupq_n_s8:
10409  case SVE::BI__builtin_sve_svdupq_n_u64:
10410  case SVE::BI__builtin_sve_svdupq_n_f64:
10411  case SVE::BI__builtin_sve_svdupq_n_s64:
10412  case SVE::BI__builtin_sve_svdupq_n_u16:
10413  case SVE::BI__builtin_sve_svdupq_n_f16:
10414  case SVE::BI__builtin_sve_svdupq_n_bf16:
10415  case SVE::BI__builtin_sve_svdupq_n_s16:
10416  case SVE::BI__builtin_sve_svdupq_n_u32:
10417  case SVE::BI__builtin_sve_svdupq_n_f32:
10418  case SVE::BI__builtin_sve_svdupq_n_s32: {
10419    // These builtins are implemented by building a fixed-length vector from
10420    // the operands and replicating it across the result with dupq_lane.
10421    unsigned NumOpnds = Ops.size();
10422
10423    bool IsBoolTy =
10424        cast<llvm::VectorType>(Ty)->getElementType()->isIntegerTy(1);
10425
10426    // For svdupq_n_b* the element type is an integer of width 128/numelts,
10427    // so that the compare can use the width that is natural for the expected
10428    // number of predicate lanes.
10429    llvm::Type *EltTy = Ops[0]->getType();
10430    if (IsBoolTy)
10431      EltTy = IntegerType::get(getLLVMContext(), SVEBitsPerBlock / NumOpnds);
10432
10433    SmallVector<llvm::Value *, 16> VecOps;
10434    for (unsigned I = 0; I < NumOpnds; ++I)
10435      VecOps.push_back(Builder.CreateZExt(Ops[I], EltTy));
10436    Value *Vec = BuildVector(VecOps);
10437
10438    llvm::Type *OverloadedTy = getSVEVectorForElementType(EltTy);
10439    Value *InsertSubVec = Builder.CreateInsertVector(
10440        OverloadedTy, PoisonValue::get(OverloadedTy), Vec, Builder.getInt64(0));
10441
10442    Function *F =
10443        CGM.getIntrinsic(Intrinsic::aarch64_sve_dupq_lane, OverloadedTy);
10444    Value *DupQLane =
10445        Builder.CreateCall(F, {InsertSubVec, Builder.getInt64(0)});
10446
10447    if (!IsBoolTy)
10448      return DupQLane;
10449
10450    SVETypeFlags TypeFlags(Builtin->TypeModifier);
10451    Value *Pred = EmitSVEAllTruePred(TypeFlags);
10452
10453    // For svdupq_n_b* we need to add an additional 'cmpne' with '0'.
10454    F = CGM.getIntrinsic(NumOpnds == 2 ? Intrinsic::aarch64_sve_cmpne
10455                                       : Intrinsic::aarch64_sve_cmpne_wide,
10456                         OverloadedTy);
10457    Value *Call = Builder.CreateCall(
10458        F, {Pred, DupQLane, EmitSVEDupX(Builder.getInt64(0))});
10459    return EmitSVEPredicateCast(Call, cast<llvm::ScalableVectorType>(Ty));
10460  }
10461
10462  case SVE::BI__builtin_sve_svpfalse_b:
10463    return ConstantInt::getFalse(Ty);
10464
10465  case SVE::BI__builtin_sve_svpfalse_c: {
10466    auto SVBoolTy = ScalableVectorType::get(Builder.getInt1Ty(), 16);
10467    Function *CastToSVCountF =
10468        CGM.getIntrinsic(Intrinsic::aarch64_sve_convert_from_svbool, Ty);
10469    return Builder.CreateCall(CastToSVCountF, ConstantInt::getFalse(SVBoolTy));
10470  }
10471
10472  case SVE::BI__builtin_sve_svlen_bf16:
10473  case SVE::BI__builtin_sve_svlen_f16:
10474  case SVE::BI__builtin_sve_svlen_f32:
10475  case SVE::BI__builtin_sve_svlen_f64:
10476  case SVE::BI__builtin_sve_svlen_s8:
10477  case SVE::BI__builtin_sve_svlen_s16:
10478  case SVE::BI__builtin_sve_svlen_s32:
10479  case SVE::BI__builtin_sve_svlen_s64:
10480  case SVE::BI__builtin_sve_svlen_u8:
10481  case SVE::BI__builtin_sve_svlen_u16:
10482  case SVE::BI__builtin_sve_svlen_u32:
10483  case SVE::BI__builtin_sve_svlen_u64: {
10484    SVETypeFlags TF(Builtin->TypeModifier);
10485    auto VTy = cast<llvm::VectorType>(getSVEType(TF));
10486    auto *NumEls =
10487        llvm::ConstantInt::get(Ty, VTy->getElementCount().getKnownMinValue());
10488
10489    Function *F = CGM.getIntrinsic(Intrinsic::vscale, Ty);
10490    return Builder.CreateMul(NumEls, Builder.CreateCall(F));
10491  }
10492
10493  case SVE::BI__builtin_sve_svtbl2_u8:
10494  case SVE::BI__builtin_sve_svtbl2_s8:
10495  case SVE::BI__builtin_sve_svtbl2_u16:
10496  case SVE::BI__builtin_sve_svtbl2_s16:
10497  case SVE::BI__builtin_sve_svtbl2_u32:
10498  case SVE::BI__builtin_sve_svtbl2_s32:
10499  case SVE::BI__builtin_sve_svtbl2_u64:
10500  case SVE::BI__builtin_sve_svtbl2_s64:
10501  case SVE::BI__builtin_sve_svtbl2_f16:
10502  case SVE::BI__builtin_sve_svtbl2_bf16:
10503  case SVE::BI__builtin_sve_svtbl2_f32:
10504  case SVE::BI__builtin_sve_svtbl2_f64: {
10505    SVETypeFlags TF(Builtin->TypeModifier);
10506    auto VTy = cast<llvm::ScalableVectorType>(getSVEType(TF));
10507    Function *F = CGM.getIntrinsic(Intrinsic::aarch64_sve_tbl2, VTy);
10508    return Builder.CreateCall(F, Ops);
10509  }
10510
10511  case SVE::BI__builtin_sve_svset_neonq_s8:
10512  case SVE::BI__builtin_sve_svset_neonq_s16:
10513  case SVE::BI__builtin_sve_svset_neonq_s32:
10514  case SVE::BI__builtin_sve_svset_neonq_s64:
10515  case SVE::BI__builtin_sve_svset_neonq_u8:
10516  case SVE::BI__builtin_sve_svset_neonq_u16:
10517  case SVE::BI__builtin_sve_svset_neonq_u32:
10518  case SVE::BI__builtin_sve_svset_neonq_u64:
10519  case SVE::BI__builtin_sve_svset_neonq_f16:
10520  case SVE::BI__builtin_sve_svset_neonq_f32:
10521  case SVE::BI__builtin_sve_svset_neonq_f64:
10522  case SVE::BI__builtin_sve_svset_neonq_bf16: {
10523    return Builder.CreateInsertVector(Ty, Ops[0], Ops[1], Builder.getInt64(0));
10524  }
10525
10526  case SVE::BI__builtin_sve_svget_neonq_s8:
10527  case SVE::BI__builtin_sve_svget_neonq_s16:
10528  case SVE::BI__builtin_sve_svget_neonq_s32:
10529  case SVE::BI__builtin_sve_svget_neonq_s64:
10530  case SVE::BI__builtin_sve_svget_neonq_u8:
10531  case SVE::BI__builtin_sve_svget_neonq_u16:
10532  case SVE::BI__builtin_sve_svget_neonq_u32:
10533  case SVE::BI__builtin_sve_svget_neonq_u64:
10534  case SVE::BI__builtin_sve_svget_neonq_f16:
10535  case SVE::BI__builtin_sve_svget_neonq_f32:
10536  case SVE::BI__builtin_sve_svget_neonq_f64:
10537  case SVE::BI__builtin_sve_svget_neonq_bf16: {
10538    return Builder.CreateExtractVector(Ty, Ops[0], Builder.getInt64(0));
10539  }
10540
10541  case SVE::BI__builtin_sve_svdup_neonq_s8:
10542  case SVE::BI__builtin_sve_svdup_neonq_s16:
10543  case SVE::BI__builtin_sve_svdup_neonq_s32:
10544  case SVE::BI__builtin_sve_svdup_neonq_s64:
10545  case SVE::BI__builtin_sve_svdup_neonq_u8:
10546  case SVE::BI__builtin_sve_svdup_neonq_u16:
10547  case SVE::BI__builtin_sve_svdup_neonq_u32:
10548  case SVE::BI__builtin_sve_svdup_neonq_u64:
10549  case SVE::BI__builtin_sve_svdup_neonq_f16:
10550  case SVE::BI__builtin_sve_svdup_neonq_f32:
10551  case SVE::BI__builtin_sve_svdup_neonq_f64:
10552  case SVE::BI__builtin_sve_svdup_neonq_bf16: {
10553    Value *Insert = Builder.CreateInsertVector(Ty, PoisonValue::get(Ty), Ops[0],
10554                                               Builder.getInt64(0));
10555    return Builder.CreateIntrinsic(Intrinsic::aarch64_sve_dupq_lane, {Ty},
10556                                   {Insert, Builder.getInt64(0)});
10557  }
10558  }
10559
10560  // Should not happen.
10561  return nullptr;
10562}
10563
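// A few SME builtins (the svsumla/svsudot variants below) are emitted via an
// intrinsic that takes its multi-vector operands in the opposite order, so
// swap them into place here.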
10564static void swapCommutativeSMEOperands(unsigned BuiltinID,
10565                                       SmallVectorImpl<Value *> &Ops) {
10566  unsigned MultiVec;
10567  switch (BuiltinID) {
10568  default:
10569    return;
10570  case SME::BI__builtin_sme_svsumla_za32_s8_vg4x1:
10571    MultiVec = 1;
10572    break;
10573  case SME::BI__builtin_sme_svsumla_za32_s8_vg4x2:
10574  case SME::BI__builtin_sme_svsudot_za32_s8_vg1x2:
10575    MultiVec = 2;
10576    break;
10577  case SME::BI__builtin_sme_svsudot_za32_s8_vg1x4:
10578  case SME::BI__builtin_sme_svsumla_za32_s8_vg4x4:
10579    MultiVec = 4;
10580    break;
10581  }
10582
10583  if (MultiVec > 0)
10584    for (unsigned I = 0; I < MultiVec; ++I)
10585      std::swap(Ops[I + 1], Ops[I + 1 + MultiVec]);
10586}
10587
10588Value *CodeGenFunction::EmitAArch64SMEBuiltinExpr(unsigned BuiltinID,
10589                                                  const CallExpr *E) {
10590  auto *Builtin = findARMVectorIntrinsicInMap(AArch64SMEIntrinsicMap, BuiltinID,
10591                                              AArch64SMEIntrinsicsProvenSorted);
10592
10593  llvm::SmallVector<Value *, 4> Ops;
10594  SVETypeFlags TypeFlags(Builtin->TypeModifier);
10595  GetAArch64SVEProcessedOperands(BuiltinID, E, Ops, TypeFlags);
10596
10597  if (TypeFlags.isLoad() || TypeFlags.isStore())
10598    return EmitSMELd1St1(TypeFlags, Ops, Builtin->LLVMIntrinsic);
10599  else if (TypeFlags.isReadZA() || TypeFlags.isWriteZA())
10600    return EmitSMEReadWrite(TypeFlags, Ops, Builtin->LLVMIntrinsic);
10601  else if (BuiltinID == SME::BI__builtin_sme_svzero_mask_za ||
10602           BuiltinID == SME::BI__builtin_sme_svzero_za)
10603    return EmitSMEZero(TypeFlags, Ops, Builtin->LLVMIntrinsic);
10604  else if (BuiltinID == SME::BI__builtin_sme_svldr_vnum_za ||
10605           BuiltinID == SME::BI__builtin_sme_svstr_vnum_za ||
10606           BuiltinID == SME::BI__builtin_sme_svldr_za ||
10607           BuiltinID == SME::BI__builtin_sme_svstr_za)
10608    return EmitSMELdrStr(TypeFlags, Ops, Builtin->LLVMIntrinsic);
10609
10610  // Handle builtins which require their multi-vector operands to be swapped
10611  swapCommutativeSMEOperands(BuiltinID, Ops);
10612
10613  // Should not happen!
10614  if (Builtin->LLVMIntrinsic == 0)
10615    return nullptr;
10616
10617  // Predicates must match the main datatype.
10618  for (unsigned i = 0, e = Ops.size(); i != e; ++i)
10619    if (auto PredTy = dyn_cast<llvm::VectorType>(Ops[i]->getType()))
10620      if (PredTy->getElementType()->isIntegerTy(1))
10621        Ops[i] = EmitSVEPredicateCast(Ops[i], getSVEType(TypeFlags));
10622
10623  Function *F =
10624      TypeFlags.isOverloadNone()
10625          ? CGM.getIntrinsic(Builtin->LLVMIntrinsic)
10626          : CGM.getIntrinsic(Builtin->LLVMIntrinsic, {getSVEType(TypeFlags)});
10627  Value *Call = Builder.CreateCall(F, Ops);
10628
10629  return FormSVEBuiltinResult(Call);
10630}
10631
10632Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
10633                                               const CallExpr *E,
10634                                               llvm::Triple::ArchType Arch) {
10635  if (BuiltinID >= clang::AArch64::FirstSVEBuiltin &&
10636      BuiltinID <= clang::AArch64::LastSVEBuiltin)
10637    return EmitAArch64SVEBuiltinExpr(BuiltinID, E);
10638
10639  if (BuiltinID >= clang::AArch64::FirstSMEBuiltin &&
10640      BuiltinID <= clang::AArch64::LastSMEBuiltin)
10641    return EmitAArch64SMEBuiltinExpr(BuiltinID, E);
10642
10643  unsigned HintID = static_cast<unsigned>(-1);
10644  switch (BuiltinID) {
10645  default: break;
10646  case clang::AArch64::BI__builtin_arm_nop:
10647    HintID = 0;
10648    break;
10649  case clang::AArch64::BI__builtin_arm_yield:
10650  case clang::AArch64::BI__yield:
10651    HintID = 1;
10652    break;
10653  case clang::AArch64::BI__builtin_arm_wfe:
10654  case clang::AArch64::BI__wfe:
10655    HintID = 2;
10656    break;
10657  case clang::AArch64::BI__builtin_arm_wfi:
10658  case clang::AArch64::BI__wfi:
10659    HintID = 3;
10660    break;
10661  case clang::AArch64::BI__builtin_arm_sev:
10662  case clang::AArch64::BI__sev:
10663    HintID = 4;
10664    break;
10665  case clang::AArch64::BI__builtin_arm_sevl:
10666  case clang::AArch64::BI__sevl:
10667    HintID = 5;
10668    break;
10669  }
10670
10671  if (HintID != static_cast<unsigned>(-1)) {
10672    Function *F = CGM.getIntrinsic(Intrinsic::aarch64_hint);
10673    return Builder.CreateCall(F, llvm::ConstantInt::get(Int32Ty, HintID));
10674  }
10675
10676  if (BuiltinID == clang::AArch64::BI__builtin_arm_get_sme_state) {
10677    // Create call to __arm_sme_state and store the results to the two pointers.
10678    CallInst *CI = EmitRuntimeCall(CGM.CreateRuntimeFunction(
10679        llvm::FunctionType::get(StructType::get(CGM.Int64Ty, CGM.Int64Ty), {},
10680                                false),
10681        "__arm_sme_state"));
10682    auto Attrs =
10683        AttributeList()
10684            .addFnAttribute(getLLVMContext(), "aarch64_pstate_sm_compatible")
10685            .addFnAttribute(getLLVMContext(), "aarch64_pstate_za_preserved");
10686    CI->setAttributes(Attrs);
10687    CI->setCallingConv(
10688        llvm::CallingConv::
10689            AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2);
10690    Builder.CreateStore(Builder.CreateExtractValue(CI, 0),
10691                        EmitPointerWithAlignment(E->getArg(0)));
10692    return Builder.CreateStore(Builder.CreateExtractValue(CI, 1),
10693                               EmitPointerWithAlignment(E->getArg(1)));
10694  }
10695
10696  if (BuiltinID == clang::AArch64::BI__builtin_arm_rbit) {
10697    assert((getContext().getTypeSize(E->getType()) == 32) &&
10698           "rbit of unusual size!");
10699    llvm::Value *Arg = EmitScalarExpr(E->getArg(0));
10700    return Builder.CreateCall(
10701        CGM.getIntrinsic(Intrinsic::bitreverse, Arg->getType()), Arg, "rbit");
10702  }
10703  if (BuiltinID == clang::AArch64::BI__builtin_arm_rbit64) {
10704    assert((getContext().getTypeSize(E->getType()) == 64) &&
10705           "rbit of unusual size!");
10706    llvm::Value *Arg = EmitScalarExpr(E->getArg(0));
10707    return Builder.CreateCall(
10708        CGM.getIntrinsic(Intrinsic::bitreverse, Arg->getType()), Arg, "rbit");
10709  }
10710
10711  if (BuiltinID == clang::AArch64::BI__builtin_arm_clz ||
10712      BuiltinID == clang::AArch64::BI__builtin_arm_clz64) {
10713    llvm::Value *Arg = EmitScalarExpr(E->getArg(0));
10714    Function *F = CGM.getIntrinsic(Intrinsic::ctlz, Arg->getType());
10715    Value *Res = Builder.CreateCall(F, {Arg, Builder.getInt1(false)});
10716    if (BuiltinID == clang::AArch64::BI__builtin_arm_clz64)
10717      Res = Builder.CreateTrunc(Res, Builder.getInt32Ty());
10718    return Res;
10719  }
10720
10721  if (BuiltinID == clang::AArch64::BI__builtin_arm_cls) {
10722    llvm::Value *Arg = EmitScalarExpr(E->getArg(0));
10723    return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::aarch64_cls), Arg,
10724                              "cls");
10725  }
10726  if (BuiltinID == clang::AArch64::BI__builtin_arm_cls64) {
10727    llvm::Value *Arg = EmitScalarExpr(E->getArg(0));
10728    return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::aarch64_cls64), Arg,
10729                              "cls");
10730  }
10731
10732  if (BuiltinID == clang::AArch64::BI__builtin_arm_rint32zf ||
10733      BuiltinID == clang::AArch64::BI__builtin_arm_rint32z) {
10734    llvm::Value *Arg = EmitScalarExpr(E->getArg(0));
10735    llvm::Type *Ty = Arg->getType();
10736    return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::aarch64_frint32z, Ty),
10737                              Arg, "frint32z");
10738  }
10739
10740  if (BuiltinID == clang::AArch64::BI__builtin_arm_rint64zf ||
10741      BuiltinID == clang::AArch64::BI__builtin_arm_rint64z) {
10742    llvm::Value *Arg = EmitScalarExpr(E->getArg(0));
10743    llvm::Type *Ty = Arg->getType();
10744    return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::aarch64_frint64z, Ty),
10745                              Arg, "frint64z");
10746  }
10747
10748  if (BuiltinID == clang::AArch64::BI__builtin_arm_rint32xf ||
10749      BuiltinID == clang::AArch64::BI__builtin_arm_rint32x) {
10750    llvm::Value *Arg = EmitScalarExpr(E->getArg(0));
10751    llvm::Type *Ty = Arg->getType();
10752    return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::aarch64_frint32x, Ty),
10753                              Arg, "frint32x");
10754  }
10755
10756  if (BuiltinID == clang::AArch64::BI__builtin_arm_rint64xf ||
10757      BuiltinID == clang::AArch64::BI__builtin_arm_rint64x) {
10758    llvm::Value *Arg = EmitScalarExpr(E->getArg(0));
10759    llvm::Type *Ty = Arg->getType();
10760    return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::aarch64_frint64x, Ty),
10761                              Arg, "frint64x");
10762  }
10763
10764  if (BuiltinID == clang::AArch64::BI__builtin_arm_jcvt) {
10765    assert((getContext().getTypeSize(E->getType()) == 32) &&
10766           "__jcvt of unusual size!");
10767    llvm::Value *Arg = EmitScalarExpr(E->getArg(0));
10768    return Builder.CreateCall(
10769        CGM.getIntrinsic(Intrinsic::aarch64_fjcvtzs), Arg);
10770  }
10771
10772  if (BuiltinID == clang::AArch64::BI__builtin_arm_ld64b ||
10773      BuiltinID == clang::AArch64::BI__builtin_arm_st64b ||
10774      BuiltinID == clang::AArch64::BI__builtin_arm_st64bv ||
10775      BuiltinID == clang::AArch64::BI__builtin_arm_st64bv0) {
10776    llvm::Value *MemAddr = EmitScalarExpr(E->getArg(0));
10777    llvm::Value *ValPtr = EmitScalarExpr(E->getArg(1));
10778
10779    if (BuiltinID == clang::AArch64::BI__builtin_arm_ld64b) {
10780      // Load from the address via an LLVM intrinsic, receiving a
10781      // tuple of 8 i64 words, and store each one to ValPtr.
10782      Function *F = CGM.getIntrinsic(Intrinsic::aarch64_ld64b);
10783      llvm::Value *Val = Builder.CreateCall(F, MemAddr);
10784      llvm::Value *ToRet;
10785      for (size_t i = 0; i < 8; i++) {
10786        llvm::Value *ValOffsetPtr =
10787            Builder.CreateGEP(Int64Ty, ValPtr, Builder.getInt32(i));
10788        Address Addr =
10789            Address(ValOffsetPtr, Int64Ty, CharUnits::fromQuantity(8));
10790        ToRet = Builder.CreateStore(Builder.CreateExtractValue(Val, i), Addr);
10791      }
10792      return ToRet;
10793    } else {
10794      // Load 8 i64 words from ValPtr, and store them to the address
10795      // via an LLVM intrinsic.
10796      SmallVector<llvm::Value *, 9> Args;
10797      Args.push_back(MemAddr);
10798      for (size_t i = 0; i < 8; i++) {
10799        llvm::Value *ValOffsetPtr =
10800            Builder.CreateGEP(Int64Ty, ValPtr, Builder.getInt32(i));
10801        Address Addr =
10802            Address(ValOffsetPtr, Int64Ty, CharUnits::fromQuantity(8));
10803        Args.push_back(Builder.CreateLoad(Addr));
10804      }
10805
10806      auto Intr = (BuiltinID == clang::AArch64::BI__builtin_arm_st64b
10807                       ? Intrinsic::aarch64_st64b
10808                   : BuiltinID == clang::AArch64::BI__builtin_arm_st64bv
10809                       ? Intrinsic::aarch64_st64bv
10810                       : Intrinsic::aarch64_st64bv0);
10811      Function *F = CGM.getIntrinsic(Intr);
10812      return Builder.CreateCall(F, Args);
10813    }
10814  }
10815
10816  if (BuiltinID == clang::AArch64::BI__builtin_arm_rndr ||
10817      BuiltinID == clang::AArch64::BI__builtin_arm_rndrrs) {
10818
10819    auto Intr = (BuiltinID == clang::AArch64::BI__builtin_arm_rndr
10820                     ? Intrinsic::aarch64_rndr
10821                     : Intrinsic::aarch64_rndrrs);
10822    Function *F = CGM.getIntrinsic(Intr);
10823    llvm::Value *Val = Builder.CreateCall(F);
10824    Value *RandomValue = Builder.CreateExtractValue(Val, 0);
10825    Value *Status = Builder.CreateExtractValue(Val, 1);
10826
10827    Address MemAddress = EmitPointerWithAlignment(E->getArg(0));
10828    Builder.CreateStore(RandomValue, MemAddress);
10829    Status = Builder.CreateZExt(Status, Int32Ty);
10830    return Status;
10831  }
10832
10833  if (BuiltinID == clang::AArch64::BI__clear_cache) {
10834    assert(E->getNumArgs() == 2 && "__clear_cache takes 2 arguments");
10835    const FunctionDecl *FD = E->getDirectCallee();
10836    Value *Ops[2];
10837    for (unsigned i = 0; i < 2; i++)
10838      Ops[i] = EmitScalarExpr(E->getArg(i));
10839    llvm::Type *Ty = CGM.getTypes().ConvertType(FD->getType());
10840    llvm::FunctionType *FTy = cast<llvm::FunctionType>(Ty);
10841    StringRef Name = FD->getName();
10842    return EmitNounwindRuntimeCall(CGM.CreateRuntimeFunction(FTy, Name), Ops);
10843  }
10844
10845  if ((BuiltinID == clang::AArch64::BI__builtin_arm_ldrex ||
10846       BuiltinID == clang::AArch64::BI__builtin_arm_ldaex) &&
10847      getContext().getTypeSize(E->getType()) == 128) {
10848    Function *F =
10849        CGM.getIntrinsic(BuiltinID == clang::AArch64::BI__builtin_arm_ldaex
10850                             ? Intrinsic::aarch64_ldaxp
10851                             : Intrinsic::aarch64_ldxp);
10852
10853    Value *LdPtr = EmitScalarExpr(E->getArg(0));
10854    Value *Val = Builder.CreateCall(F, LdPtr, "ldxp");
10855
10856    Value *Val0 = Builder.CreateExtractValue(Val, 1);
10857    Value *Val1 = Builder.CreateExtractValue(Val, 0);
10858    llvm::Type *Int128Ty = llvm::IntegerType::get(getLLVMContext(), 128);
10859    Val0 = Builder.CreateZExt(Val0, Int128Ty);
10860    Val1 = Builder.CreateZExt(Val1, Int128Ty);
10861
10862    Value *ShiftCst = llvm::ConstantInt::get(Int128Ty, 64);
10863    Val = Builder.CreateShl(Val0, ShiftCst, "shl", true /* nuw */);
10864    Val = Builder.CreateOr(Val, Val1);
10865    return Builder.CreateBitCast(Val, ConvertType(E->getType()));
10866  } else if (BuiltinID == clang::AArch64::BI__builtin_arm_ldrex ||
10867             BuiltinID == clang::AArch64::BI__builtin_arm_ldaex) {
10868    Value *LoadAddr = EmitScalarExpr(E->getArg(0));
10869
10870    QualType Ty = E->getType();
10871    llvm::Type *RealResTy = ConvertType(Ty);
10872    llvm::Type *IntTy =
10873        llvm::IntegerType::get(getLLVMContext(), getContext().getTypeSize(Ty));
10874
10875    Function *F =
10876        CGM.getIntrinsic(BuiltinID == clang::AArch64::BI__builtin_arm_ldaex
10877                             ? Intrinsic::aarch64_ldaxr
10878                             : Intrinsic::aarch64_ldxr,
10879                         UnqualPtrTy);
10880    CallInst *Val = Builder.CreateCall(F, LoadAddr, "ldxr");
10881    Val->addParamAttr(
10882        0, Attribute::get(getLLVMContext(), Attribute::ElementType, IntTy));
10883
10884    if (RealResTy->isPointerTy())
10885      return Builder.CreateIntToPtr(Val, RealResTy);
10886
10887    llvm::Type *IntResTy = llvm::IntegerType::get(
10888        getLLVMContext(), CGM.getDataLayout().getTypeSizeInBits(RealResTy));
10889    return Builder.CreateBitCast(Builder.CreateTruncOrBitCast(Val, IntResTy),
10890                                 RealResTy);
10891  }
10892
10893  if ((BuiltinID == clang::AArch64::BI__builtin_arm_strex ||
10894       BuiltinID == clang::AArch64::BI__builtin_arm_stlex) &&
10895      getContext().getTypeSize(E->getArg(0)->getType()) == 128) {
10896    Function *F =
10897        CGM.getIntrinsic(BuiltinID == clang::AArch64::BI__builtin_arm_stlex
10898                             ? Intrinsic::aarch64_stlxp
10899                             : Intrinsic::aarch64_stxp);
10900    llvm::Type *STy = llvm::StructType::get(Int64Ty, Int64Ty);
10901
10902    Address Tmp = CreateMemTemp(E->getArg(0)->getType());
10903    EmitAnyExprToMem(E->getArg(0), Tmp, Qualifiers(), /*init*/ true);
10904
10905    Tmp = Tmp.withElementType(STy);
10906    llvm::Value *Val = Builder.CreateLoad(Tmp);
10907
10908    Value *Arg0 = Builder.CreateExtractValue(Val, 0);
10909    Value *Arg1 = Builder.CreateExtractValue(Val, 1);
10910    Value *StPtr = EmitScalarExpr(E->getArg(1));
10911    return Builder.CreateCall(F, {Arg0, Arg1, StPtr}, "stxp");
10912  }
10913
10914  if (BuiltinID == clang::AArch64::BI__builtin_arm_strex ||
10915      BuiltinID == clang::AArch64::BI__builtin_arm_stlex) {
10916    Value *StoreVal = EmitScalarExpr(E->getArg(0));
10917    Value *StoreAddr = EmitScalarExpr(E->getArg(1));
10918
10919    QualType Ty = E->getArg(0)->getType();
10920    llvm::Type *StoreTy =
10921        llvm::IntegerType::get(getLLVMContext(), getContext().getTypeSize(Ty));
10922
10923    if (StoreVal->getType()->isPointerTy())
10924      StoreVal = Builder.CreatePtrToInt(StoreVal, Int64Ty);
10925    else {
10926      llvm::Type *IntTy = llvm::IntegerType::get(
10927          getLLVMContext(),
10928          CGM.getDataLayout().getTypeSizeInBits(StoreVal->getType()));
10929      StoreVal = Builder.CreateBitCast(StoreVal, IntTy);
10930      StoreVal = Builder.CreateZExtOrBitCast(StoreVal, Int64Ty);
10931    }
10932
10933    Function *F =
10934        CGM.getIntrinsic(BuiltinID == clang::AArch64::BI__builtin_arm_stlex
10935                             ? Intrinsic::aarch64_stlxr
10936                             : Intrinsic::aarch64_stxr,
10937                         StoreAddr->getType());
10938    CallInst *CI = Builder.CreateCall(F, {StoreVal, StoreAddr}, "stxr");
10939    CI->addParamAttr(
10940        1, Attribute::get(getLLVMContext(), Attribute::ElementType, StoreTy));
10941    return CI;
10942  }
10943
10944  if (BuiltinID == clang::AArch64::BI__getReg) {
10945    Expr::EvalResult Result;
10946    if (!E->getArg(0)->EvaluateAsInt(Result, CGM.getContext()))
10947      llvm_unreachable("Sema will ensure that the parameter is constant");
10948
10949    llvm::APSInt Value = Result.Val.getInt();
10950    LLVMContext &Context = CGM.getLLVMContext();
10951    std::string Reg = Value == 31 ? "sp" : "x" + toString(Value, 10);
10952
10953    llvm::Metadata *Ops[] = {llvm::MDString::get(Context, Reg)};
10954    llvm::MDNode *RegName = llvm::MDNode::get(Context, Ops);
10955    llvm::Value *Metadata = llvm::MetadataAsValue::get(Context, RegName);
10956
10957    llvm::Function *F =
10958        CGM.getIntrinsic(llvm::Intrinsic::read_register, {Int64Ty});
10959    return Builder.CreateCall(F, Metadata);
10960  }
10961
10962  if (BuiltinID == clang::AArch64::BI__break) {
10963    Expr::EvalResult Result;
10964    if (!E->getArg(0)->EvaluateAsInt(Result, CGM.getContext()))
10965      llvm_unreachable("Sema will ensure that the parameter is constant");
10966
10967    llvm::Function *F = CGM.getIntrinsic(llvm::Intrinsic::aarch64_break);
10968    return Builder.CreateCall(F, {EmitScalarExpr(E->getArg(0))});
10969  }
10970
10971  if (BuiltinID == clang::AArch64::BI__builtin_arm_clrex) {
10972    Function *F = CGM.getIntrinsic(Intrinsic::aarch64_clrex);
10973    return Builder.CreateCall(F);
10974  }
10975
10976  if (BuiltinID == clang::AArch64::BI_ReadWriteBarrier)
10977    return Builder.CreateFence(llvm::AtomicOrdering::SequentiallyConsistent,
10978                               llvm::SyncScope::SingleThread);
10979
10980  // CRC32
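  // Each CRC32 builtin maps directly onto the corresponding aarch64.crc32*
  // intrinsic; the data operand is zero-extended to the intrinsic's parameter
  // type (e.g. the byte variant passes the byte zero-extended to i32).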
10981  Intrinsic::ID CRCIntrinsicID = Intrinsic::not_intrinsic;
10982  switch (BuiltinID) {
10983  case clang::AArch64::BI__builtin_arm_crc32b:
10984    CRCIntrinsicID = Intrinsic::aarch64_crc32b; break;
10985  case clang::AArch64::BI__builtin_arm_crc32cb:
10986    CRCIntrinsicID = Intrinsic::aarch64_crc32cb; break;
10987  case clang::AArch64::BI__builtin_arm_crc32h:
10988    CRCIntrinsicID = Intrinsic::aarch64_crc32h; break;
10989  case clang::AArch64::BI__builtin_arm_crc32ch:
10990    CRCIntrinsicID = Intrinsic::aarch64_crc32ch; break;
10991  case clang::AArch64::BI__builtin_arm_crc32w:
10992    CRCIntrinsicID = Intrinsic::aarch64_crc32w; break;
10993  case clang::AArch64::BI__builtin_arm_crc32cw:
10994    CRCIntrinsicID = Intrinsic::aarch64_crc32cw; break;
10995  case clang::AArch64::BI__builtin_arm_crc32d:
10996    CRCIntrinsicID = Intrinsic::aarch64_crc32x; break;
10997  case clang::AArch64::BI__builtin_arm_crc32cd:
10998    CRCIntrinsicID = Intrinsic::aarch64_crc32cx; break;
10999  }
11000
11001  if (CRCIntrinsicID != Intrinsic::not_intrinsic) {
11002    Value *Arg0 = EmitScalarExpr(E->getArg(0));
11003    Value *Arg1 = EmitScalarExpr(E->getArg(1));
11004    Function *F = CGM.getIntrinsic(CRCIntrinsicID);
11005
11006    llvm::Type *DataTy = F->getFunctionType()->getParamType(1);
11007    Arg1 = Builder.CreateZExtOrBitCast(Arg1, DataTy);
11008
11009    return Builder.CreateCall(F, {Arg0, Arg1});
11010  }
11011
11012  // Memory Operations (MOPS)
11013  if (BuiltinID == AArch64::BI__builtin_arm_mops_memset_tag) {
11014    Value *Dst = EmitScalarExpr(E->getArg(0));
11015    Value *Val = EmitScalarExpr(E->getArg(1));
11016    Value *Size = EmitScalarExpr(E->getArg(2));
11017    Dst = Builder.CreatePointerCast(Dst, Int8PtrTy);
11018    Val = Builder.CreateTrunc(Val, Int8Ty);
11019    Size = Builder.CreateIntCast(Size, Int64Ty, false);
11020    return Builder.CreateCall(
11021        CGM.getIntrinsic(Intrinsic::aarch64_mops_memset_tag), {Dst, Val, Size});
11022  }
11023
11024  // Memory Tagging Extensions (MTE) Intrinsics
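  // Each MTE builtin lowers to a single aarch64.* intrinsic that operates on
  // i8* pointers (and an i64 mask or tag offset where applicable); pointer
  // results are cast back to the builtin's declared return type.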
11025  Intrinsic::ID MTEIntrinsicID = Intrinsic::not_intrinsic;
11026  switch (BuiltinID) {
11027  case clang::AArch64::BI__builtin_arm_irg:
11028    MTEIntrinsicID = Intrinsic::aarch64_irg; break;
11029  case clang::AArch64::BI__builtin_arm_addg:
11030    MTEIntrinsicID = Intrinsic::aarch64_addg; break;
11031  case clang::AArch64::BI__builtin_arm_gmi:
11032    MTEIntrinsicID = Intrinsic::aarch64_gmi; break;
11033  case clang::AArch64::BI__builtin_arm_ldg:
11034    MTEIntrinsicID = Intrinsic::aarch64_ldg; break;
11035  case clang::AArch64::BI__builtin_arm_stg:
11036    MTEIntrinsicID = Intrinsic::aarch64_stg; break;
11037  case clang::AArch64::BI__builtin_arm_subp:
11038    MTEIntrinsicID = Intrinsic::aarch64_subp; break;
11039  }
11040
11041  if (MTEIntrinsicID != Intrinsic::not_intrinsic) {
11042    llvm::Type *T = ConvertType(E->getType());
11043
11044    if (MTEIntrinsicID == Intrinsic::aarch64_irg) {
11045      Value *Pointer = EmitScalarExpr(E->getArg(0));
11046      Value *Mask = EmitScalarExpr(E->getArg(1));
11047
11048      Pointer = Builder.CreatePointerCast(Pointer, Int8PtrTy);
11049      Mask = Builder.CreateZExt(Mask, Int64Ty);
11050      Value *RV = Builder.CreateCall(
11051                       CGM.getIntrinsic(MTEIntrinsicID), {Pointer, Mask});
11052      return Builder.CreatePointerCast(RV, T);
11053    }
11054    if (MTEIntrinsicID == Intrinsic::aarch64_addg) {
11055      Value *Pointer = EmitScalarExpr(E->getArg(0));
11056      Value *TagOffset = EmitScalarExpr(E->getArg(1));
11057
11058      Pointer = Builder.CreatePointerCast(Pointer, Int8PtrTy);
11059      TagOffset = Builder.CreateZExt(TagOffset, Int64Ty);
11060      Value *RV = Builder.CreateCall(
11061                       CGM.getIntrinsic(MTEIntrinsicID), {Pointer, TagOffset});
11062      return Builder.CreatePointerCast(RV, T);
11063    }
11064    if (MTEIntrinsicID == Intrinsic::aarch64_gmi) {
11065      Value *Pointer = EmitScalarExpr(E->getArg(0));
11066      Value *ExcludedMask = EmitScalarExpr(E->getArg(1));
11067
11068      ExcludedMask = Builder.CreateZExt(ExcludedMask, Int64Ty);
11069      Pointer = Builder.CreatePointerCast(Pointer, Int8PtrTy);
11070      return Builder.CreateCall(
11071                       CGM.getIntrinsic(MTEIntrinsicID), {Pointer, ExcludedMask});
11072    }
11073    // Although it is possible to supply a different return address (the
11074    // first argument) to this intrinsic, for now we set the return
11075    // address to the same value as the input address.
11076    if (MTEIntrinsicID == Intrinsic::aarch64_ldg) {
11077      Value *TagAddress = EmitScalarExpr(E->getArg(0));
11078      TagAddress = Builder.CreatePointerCast(TagAddress, Int8PtrTy);
11079      Value *RV = Builder.CreateCall(
11080                    CGM.getIntrinsic(MTEIntrinsicID), {TagAddress, TagAddress});
11081      return Builder.CreatePointerCast(RV, T);
11082    }
11083    // Although it is possible to supply a different tag (to set) to this
11084    // intrinsic (as the first argument), for now we supply the tag that is
11085    // already in the input address argument (the common use case).
11086    if (MTEIntrinsicID == Intrinsic::aarch64_stg) {
11087      Value *TagAddress = EmitScalarExpr(E->getArg(0));
11088      TagAddress = Builder.CreatePointerCast(TagAddress, Int8PtrTy);
11089      return Builder.CreateCall(
11090               CGM.getIntrinsic(MTEIntrinsicID), {TagAddress, TagAddress});
11091    }
11092    if (MTEIntrinsicID == Intrinsic::aarch64_subp) {
11093      Value *PointerA = EmitScalarExpr(E->getArg(0));
11094      Value *PointerB = EmitScalarExpr(E->getArg(1));
11095      PointerA = Builder.CreatePointerCast(PointerA, Int8PtrTy);
11096      PointerB = Builder.CreatePointerCast(PointerB, Int8PtrTy);
11097      return Builder.CreateCall(
11098                       CGM.getIntrinsic(MTEIntrinsicID), {PointerA, PointerB});
11099    }
11100  }
11101
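  // __builtin_arm_rsr* / __builtin_arm_wsr* read or write a system register.
  // The register itself is accessed as a 64-bit quantity (128-bit for the
  // *128 variants); EmitSpecialRegisterBuiltin converts between the register
  // type and the value type of the builtin.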
11102  if (BuiltinID == clang::AArch64::BI__builtin_arm_rsr ||
11103      BuiltinID == clang::AArch64::BI__builtin_arm_rsr64 ||
11104      BuiltinID == clang::AArch64::BI__builtin_arm_rsr128 ||
11105      BuiltinID == clang::AArch64::BI__builtin_arm_rsrp ||
11106      BuiltinID == clang::AArch64::BI__builtin_arm_wsr ||
11107      BuiltinID == clang::AArch64::BI__builtin_arm_wsr64 ||
11108      BuiltinID == clang::AArch64::BI__builtin_arm_wsr128 ||
11109      BuiltinID == clang::AArch64::BI__builtin_arm_wsrp) {
11110
11111    SpecialRegisterAccessKind AccessKind = Write;
11112    if (BuiltinID == clang::AArch64::BI__builtin_arm_rsr ||
11113        BuiltinID == clang::AArch64::BI__builtin_arm_rsr64 ||
11114        BuiltinID == clang::AArch64::BI__builtin_arm_rsr128 ||
11115        BuiltinID == clang::AArch64::BI__builtin_arm_rsrp)
11116      AccessKind = VolatileRead;
11117
11118    bool IsPointerBuiltin = BuiltinID == clang::AArch64::BI__builtin_arm_rsrp ||
11119                            BuiltinID == clang::AArch64::BI__builtin_arm_wsrp;
11120
11121    bool Is32Bit = BuiltinID == clang::AArch64::BI__builtin_arm_rsr ||
11122                   BuiltinID == clang::AArch64::BI__builtin_arm_wsr;
11123
11124    bool Is128Bit = BuiltinID == clang::AArch64::BI__builtin_arm_rsr128 ||
11125                    BuiltinID == clang::AArch64::BI__builtin_arm_wsr128;
11126
11127    llvm::Type *ValueType;
11128    llvm::Type *RegisterType = Int64Ty;
11129    if (Is32Bit) {
11130      ValueType = Int32Ty;
11131    } else if (Is128Bit) {
11132      llvm::Type *Int128Ty =
11133          llvm::IntegerType::getInt128Ty(CGM.getLLVMContext());
11134      ValueType = Int128Ty;
11135      RegisterType = Int128Ty;
11136    } else if (IsPointerBuiltin) {
11137      ValueType = VoidPtrTy;
11138    } else {
11139      ValueType = Int64Ty;
11140    }
11141
11142    return EmitSpecialRegisterBuiltin(*this, E, RegisterType, ValueType,
11143                                      AccessKind);
11144  }
11145
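  // _ReadStatusReg / _WriteStatusReg take an MSVC-encoded system-register
  // operand; decode it into the "op0:op1:CRn:CRm:op2" string form expected by
  // the llvm.read_register / llvm.write_register intrinsics.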
11146  if (BuiltinID == clang::AArch64::BI_ReadStatusReg ||
11147      BuiltinID == clang::AArch64::BI_WriteStatusReg) {
11148    LLVMContext &Context = CGM.getLLVMContext();
11149
11150    unsigned SysReg =
11151      E->getArg(0)->EvaluateKnownConstInt(getContext()).getZExtValue();
11152
11153    std::string SysRegStr;
11154    llvm::raw_string_ostream(SysRegStr) <<
11155                       ((1 << 1) | ((SysReg >> 14) & 1))  << ":" <<
11156                       ((SysReg >> 11) & 7)               << ":" <<
11157                       ((SysReg >> 7)  & 15)              << ":" <<
11158                       ((SysReg >> 3)  & 15)              << ":" <<
11159                       ( SysReg        & 7);
11160
11161    llvm::Metadata *Ops[] = { llvm::MDString::get(Context, SysRegStr) };
11162    llvm::MDNode *RegName = llvm::MDNode::get(Context, Ops);
11163    llvm::Value *Metadata = llvm::MetadataAsValue::get(Context, RegName);
11164
11165    llvm::Type *RegisterType = Int64Ty;
11166    llvm::Type *Types[] = { RegisterType };
11167
11168    if (BuiltinID == clang::AArch64::BI_ReadStatusReg) {
11169      llvm::Function *F = CGM.getIntrinsic(llvm::Intrinsic::read_register, Types);
11170
11171      return Builder.CreateCall(F, Metadata);
11172    }
11173
11174    llvm::Function *F = CGM.getIntrinsic(llvm::Intrinsic::write_register, Types);
11175    llvm::Value *ArgValue = EmitScalarExpr(E->getArg(1));
11176
11177    return Builder.CreateCall(F, { Metadata, ArgValue });
11178  }
11179
11180  if (BuiltinID == clang::AArch64::BI_AddressOfReturnAddress) {
11181    llvm::Function *F =
11182        CGM.getIntrinsic(Intrinsic::addressofreturnaddress, AllocaInt8PtrTy);
11183    return Builder.CreateCall(F);
11184  }
11185
11186  if (BuiltinID == clang::AArch64::BI__builtin_sponentry) {
11187    llvm::Function *F = CGM.getIntrinsic(Intrinsic::sponentry, AllocaInt8PtrTy);
11188    return Builder.CreateCall(F);
11189  }
11190
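  // __mulh / __umulh return the high 64 bits of a 64x64-bit multiply: widen
  // both operands to i128, multiply, and shift the product right by 64.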
11191  if (BuiltinID == clang::AArch64::BI__mulh ||
11192      BuiltinID == clang::AArch64::BI__umulh) {
11193    llvm::Type *ResType = ConvertType(E->getType());
11194    llvm::Type *Int128Ty = llvm::IntegerType::get(getLLVMContext(), 128);
11195
11196    bool IsSigned = BuiltinID == clang::AArch64::BI__mulh;
11197    Value *LHS =
11198        Builder.CreateIntCast(EmitScalarExpr(E->getArg(0)), Int128Ty, IsSigned);
11199    Value *RHS =
11200        Builder.CreateIntCast(EmitScalarExpr(E->getArg(1)), Int128Ty, IsSigned);
11201
11202    Value *MulResult, *HigherBits;
11203    if (IsSigned) {
11204      MulResult = Builder.CreateNSWMul(LHS, RHS);
11205      HigherBits = Builder.CreateAShr(MulResult, 64);
11206    } else {
11207      MulResult = Builder.CreateNUWMul(LHS, RHS);
11208      HigherBits = Builder.CreateLShr(MulResult, 64);
11209    }
11210    HigherBits = Builder.CreateIntCast(HigherBits, ResType, IsSigned);
11211
11212    return HigherBits;
11213  }
11214
11215  if (BuiltinID == AArch64::BI__writex18byte ||
11216      BuiltinID == AArch64::BI__writex18word ||
11217      BuiltinID == AArch64::BI__writex18dword ||
11218      BuiltinID == AArch64::BI__writex18qword) {
11219    // Read x18 as i8*
11220    LLVMContext &Context = CGM.getLLVMContext();
11221    llvm::Metadata *Ops[] = {llvm::MDString::get(Context, "x18")};
11222    llvm::MDNode *RegName = llvm::MDNode::get(Context, Ops);
11223    llvm::Value *Metadata = llvm::MetadataAsValue::get(Context, RegName);
11224    llvm::Function *F =
11225        CGM.getIntrinsic(llvm::Intrinsic::read_register, {Int64Ty});
11226    llvm::Value *X18 = Builder.CreateCall(F, Metadata);
11227    X18 = Builder.CreateIntToPtr(X18, Int8PtrTy);
11228
11229    // Store val at x18 + offset
11230    Value *Offset = Builder.CreateZExt(EmitScalarExpr(E->getArg(0)), Int64Ty);
11231    Value *Ptr = Builder.CreateGEP(Int8Ty, X18, Offset);
11232    Value *Val = EmitScalarExpr(E->getArg(1));
11233    StoreInst *Store = Builder.CreateAlignedStore(Val, Ptr, CharUnits::One());
11234    return Store;
11235  }
11236
11237  if (BuiltinID == AArch64::BI__readx18byte ||
11238      BuiltinID == AArch64::BI__readx18word ||
11239      BuiltinID == AArch64::BI__readx18dword ||
11240      BuiltinID == AArch64::BI__readx18qword) {
11241    llvm::Type *IntTy = ConvertType(E->getType());
11242
11243    // Read x18 as i8*
11244    LLVMContext &Context = CGM.getLLVMContext();
11245    llvm::Metadata *Ops[] = {llvm::MDString::get(Context, "x18")};
11246    llvm::MDNode *RegName = llvm::MDNode::get(Context, Ops);
11247    llvm::Value *Metadata = llvm::MetadataAsValue::get(Context, RegName);
11248    llvm::Function *F =
11249        CGM.getIntrinsic(llvm::Intrinsic::read_register, {Int64Ty});
11250    llvm::Value *X18 = Builder.CreateCall(F, Metadata);
11251    X18 = Builder.CreateIntToPtr(X18, Int8PtrTy);
11252
11253    // Load the value at x18 + offset
11254    Value *Offset = Builder.CreateZExt(EmitScalarExpr(E->getArg(0)), Int64Ty);
11255    Value *Ptr = Builder.CreateGEP(Int8Ty, X18, Offset);
11256    LoadInst *Load = Builder.CreateAlignedLoad(IntTy, Ptr, CharUnits::One());
11257    return Load;
11258  }
11259
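  // The _Copy*From* builtins are plain bitcasts between a floating-point value
  // and the integer of the same width.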
11260  if (BuiltinID == AArch64::BI_CopyDoubleFromInt64 ||
11261      BuiltinID == AArch64::BI_CopyFloatFromInt32 ||
11262      BuiltinID == AArch64::BI_CopyInt32FromFloat ||
11263      BuiltinID == AArch64::BI_CopyInt64FromDouble) {
11264    Value *Arg = EmitScalarExpr(E->getArg(0));
11265    llvm::Type *RetTy = ConvertType(E->getType());
11266    return Builder.CreateBitCast(Arg, RetTy);
11267  }
11268
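  // _CountLeadingZeros* map to llvm.ctlz; _CountLeadingOnes* invert the input
  // first. The 64-bit variants truncate the result to the 32-bit return type.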
11269  if (BuiltinID == AArch64::BI_CountLeadingOnes ||
11270      BuiltinID == AArch64::BI_CountLeadingOnes64 ||
11271      BuiltinID == AArch64::BI_CountLeadingZeros ||
11272      BuiltinID == AArch64::BI_CountLeadingZeros64) {
11273    Value *Arg = EmitScalarExpr(E->getArg(0));
11274    llvm::Type *ArgType = Arg->getType();
11275
11276    if (BuiltinID == AArch64::BI_CountLeadingOnes ||
11277        BuiltinID == AArch64::BI_CountLeadingOnes64)
11278      Arg = Builder.CreateXor(Arg, Constant::getAllOnesValue(ArgType));
11279
11280    Function *F = CGM.getIntrinsic(Intrinsic::ctlz, ArgType);
11281    Value *Result = Builder.CreateCall(F, {Arg, Builder.getInt1(false)});
11282
11283    if (BuiltinID == AArch64::BI_CountLeadingOnes64 ||
11284        BuiltinID == AArch64::BI_CountLeadingZeros64)
11285      Result = Builder.CreateTrunc(Result, Builder.getInt32Ty());
11286    return Result;
11287  }
11288
11289  if (BuiltinID == AArch64::BI_CountLeadingSigns ||
11290      BuiltinID == AArch64::BI_CountLeadingSigns64) {
11291    Value *Arg = EmitScalarExpr(E->getArg(0));
11292
11293    Function *F = (BuiltinID == AArch64::BI_CountLeadingSigns)
11294                      ? CGM.getIntrinsic(Intrinsic::aarch64_cls)
11295                      : CGM.getIntrinsic(Intrinsic::aarch64_cls64);
11296
11297    Value *Result = Builder.CreateCall(F, Arg, "cls");
11298    if (BuiltinID == AArch64::BI_CountLeadingSigns64)
11299      Result = Builder.CreateTrunc(Result, Builder.getInt32Ty());
11300    return Result;
11301  }
11302
11303  if (BuiltinID == AArch64::BI_CountOneBits ||
11304      BuiltinID == AArch64::BI_CountOneBits64) {
11305    Value *ArgValue = EmitScalarExpr(E->getArg(0));
11306    llvm::Type *ArgType = ArgValue->getType();
11307    Function *F = CGM.getIntrinsic(Intrinsic::ctpop, ArgType);
11308
11309    Value *Result = Builder.CreateCall(F, ArgValue);
11310    if (BuiltinID == AArch64::BI_CountOneBits64)
11311      Result = Builder.CreateTrunc(Result, Builder.getInt32Ty());
11312    return Result;
11313  }
11314
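  // __prefetch(addr) emits llvm.prefetch with "read", maximal locality, and
  // data-cache operands.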
11315  if (BuiltinID == AArch64::BI__prefetch) {
11316    Value *Address = EmitScalarExpr(E->getArg(0));
11317    Value *RW = llvm::ConstantInt::get(Int32Ty, 0);
11318    Value *Locality = ConstantInt::get(Int32Ty, 3);
11319    Value *Data = llvm::ConstantInt::get(Int32Ty, 1);
11320    Function *F = CGM.getIntrinsic(Intrinsic::prefetch, Address->getType());
11321    return Builder.CreateCall(F, {Address, RW, Locality, Data});
11322  }
11323
11324  // Handle MSVC intrinsics before argument evaluation to prevent double
11325  // evaluation.
11326  if (std::optional<MSVCIntrin> MsvcIntId =
11327          translateAarch64ToMsvcIntrin(BuiltinID))
11328    return EmitMSVCBuiltinExpr(*MsvcIntId, E);
11329
11330  // Some intrinsics are equivalent; if so, use the base intrinsic ID.
11331  auto It = llvm::find_if(NEONEquivalentIntrinsicMap, [BuiltinID](auto &P) {
11332    return P.first == BuiltinID;
11333  });
11334  if (It != end(NEONEquivalentIntrinsicMap))
11335    BuiltinID = It->second;
11336
11337  // Find out if any arguments are required to be integer constant
11338  // expressions.
11339  unsigned ICEArguments = 0;
11340  ASTContext::GetBuiltinTypeError Error;
11341  getContext().GetBuiltinType(BuiltinID, Error, &ICEArguments);
11342  assert(Error == ASTContext::GE_None && "Should not codegen an error");
11343
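  // Gather all operands except the last one (for overloaded NEON builtins the
  // last argument encodes the type and is handled below). For the vld1/vst1
  // family, also keep the pointer operand's alignment for later use.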
11344  llvm::SmallVector<Value*, 4> Ops;
11345  Address PtrOp0 = Address::invalid();
11346  for (unsigned i = 0, e = E->getNumArgs() - 1; i != e; i++) {
11347    if (i == 0) {
11348      switch (BuiltinID) {
11349      case NEON::BI__builtin_neon_vld1_v:
11350      case NEON::BI__builtin_neon_vld1q_v:
11351      case NEON::BI__builtin_neon_vld1_dup_v:
11352      case NEON::BI__builtin_neon_vld1q_dup_v:
11353      case NEON::BI__builtin_neon_vld1_lane_v:
11354      case NEON::BI__builtin_neon_vld1q_lane_v:
11355      case NEON::BI__builtin_neon_vst1_v:
11356      case NEON::BI__builtin_neon_vst1q_v:
11357      case NEON::BI__builtin_neon_vst1_lane_v:
11358      case NEON::BI__builtin_neon_vst1q_lane_v:
11359      case NEON::BI__builtin_neon_vldap1_lane_s64:
11360      case NEON::BI__builtin_neon_vldap1q_lane_s64:
11361      case NEON::BI__builtin_neon_vstl1_lane_s64:
11362      case NEON::BI__builtin_neon_vstl1q_lane_s64:
11363        // Get the alignment for the argument in addition to the value;
11364        // we'll use it later.
11365        PtrOp0 = EmitPointerWithAlignment(E->getArg(0));
11366        Ops.push_back(PtrOp0.getPointer());
11367        continue;
11368      }
11369    }
11370    Ops.push_back(EmitScalarOrConstFoldImmArg(ICEArguments, i, E));
11371  }
11372
11373  auto SISDMap = ArrayRef(AArch64SISDIntrinsicMap);
11374  const ARMVectorIntrinsicInfo *Builtin = findARMVectorIntrinsicInMap(
11375      SISDMap, BuiltinID, AArch64SISDIntrinsicsProvenSorted);
11376
11377  if (Builtin) {
11378    Ops.push_back(EmitScalarExpr(E->getArg(E->getNumArgs() - 1)));
11379    Value *Result = EmitCommonNeonSISDBuiltinExpr(*this, *Builtin, Ops, E);
11380    assert(Result && "SISD intrinsic should have been handled");
11381    return Result;
11382  }
11383
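  // For the remaining overloaded NEON builtins, the last argument is an
  // integer constant encoding the NeonTypeFlags of the call.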
11384  const Expr *Arg = E->getArg(E->getNumArgs()-1);
11385  NeonTypeFlags Type(0);
11386  if (std::optional<llvm::APSInt> Result =
11387          Arg->getIntegerConstantExpr(getContext()))
11388    // Determine the type of this overloaded NEON intrinsic.
11389    Type = NeonTypeFlags(Result->getZExtValue());
11390
11391  bool usgn = Type.isUnsigned();
11392  bool quad = Type.isQuad();
11393
11394  // Handle non-overloaded intrinsics first.
11395  switch (BuiltinID) {
11396  default: break;
11397  case NEON::BI__builtin_neon_vabsh_f16:
11398    Ops.push_back(EmitScalarExpr(E->getArg(0)));
11399    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::fabs, HalfTy), Ops, "vabs");
11400  case NEON::BI__builtin_neon_vaddq_p128: {
11401    llvm::Type *Ty = GetNeonType(this, NeonTypeFlags::Poly128);
11402    Ops.push_back(EmitScalarExpr(E->getArg(1)));
11403    Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
11404    Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
11405    Ops[0] = Builder.CreateXor(Ops[0], Ops[1]);
11406    llvm::Type *Int128Ty = llvm::Type::getIntNTy(getLLVMContext(), 128);
11407    return Builder.CreateBitCast(Ops[0], Int128Ty);
11408  }
11409  case NEON::BI__builtin_neon_vldrq_p128: {
11410    llvm::Type *Int128Ty = llvm::Type::getIntNTy(getLLVMContext(), 128);
11411    Value *Ptr = EmitScalarExpr(E->getArg(0));
11412    return Builder.CreateAlignedLoad(Int128Ty, Ptr,
11413                                     CharUnits::fromQuantity(16));
11414  }
11415  case NEON::BI__builtin_neon_vstrq_p128: {
11416    Value *Ptr = Ops[0];
11417    return Builder.CreateDefaultAlignedStore(EmitScalarExpr(E->getArg(1)), Ptr);
11418  }
11419  case NEON::BI__builtin_neon_vcvts_f32_u32:
11420  case NEON::BI__builtin_neon_vcvtd_f64_u64:
11421    usgn = true;
11422    [[fallthrough]];
11423  case NEON::BI__builtin_neon_vcvts_f32_s32:
11424  case NEON::BI__builtin_neon_vcvtd_f64_s64: {
11425    Ops.push_back(EmitScalarExpr(E->getArg(0)));
11426    bool Is64 = Ops[0]->getType()->getPrimitiveSizeInBits() == 64;
11427    llvm::Type *InTy = Is64 ? Int64Ty : Int32Ty;
11428    llvm::Type *FTy = Is64 ? DoubleTy : FloatTy;
11429    Ops[0] = Builder.CreateBitCast(Ops[0], InTy);
11430    if (usgn)
11431      return Builder.CreateUIToFP(Ops[0], FTy);
11432    return Builder.CreateSIToFP(Ops[0], FTy);
11433  }
11434  case NEON::BI__builtin_neon_vcvth_f16_u16:
11435  case NEON::BI__builtin_neon_vcvth_f16_u32:
11436  case NEON::BI__builtin_neon_vcvth_f16_u64:
11437    usgn = true;
11438    [[fallthrough]];
11439  case NEON::BI__builtin_neon_vcvth_f16_s16:
11440  case NEON::BI__builtin_neon_vcvth_f16_s32:
11441  case NEON::BI__builtin_neon_vcvth_f16_s64: {
11442    Ops.push_back(EmitScalarExpr(E->getArg(0)));
11443    llvm::Type *FTy = HalfTy;
11444    llvm::Type *InTy;
11445    if (Ops[0]->getType()->getPrimitiveSizeInBits() == 64)
11446      InTy = Int64Ty;
11447    else if (Ops[0]->getType()->getPrimitiveSizeInBits() == 32)
11448      InTy = Int32Ty;
11449    else
11450      InTy = Int16Ty;
11451    Ops[0] = Builder.CreateBitCast(Ops[0], InTy);
11452    if (usgn)
11453      return Builder.CreateUIToFP(Ops[0], FTy);
11454    return Builder.CreateSIToFP(Ops[0], FTy);
11455  }
11456  case NEON::BI__builtin_neon_vcvtah_u16_f16:
11457  case NEON::BI__builtin_neon_vcvtmh_u16_f16:
11458  case NEON::BI__builtin_neon_vcvtnh_u16_f16:
11459  case NEON::BI__builtin_neon_vcvtph_u16_f16:
11460  case NEON::BI__builtin_neon_vcvth_u16_f16:
11461  case NEON::BI__builtin_neon_vcvtah_s16_f16:
11462  case NEON::BI__builtin_neon_vcvtmh_s16_f16:
11463  case NEON::BI__builtin_neon_vcvtnh_s16_f16:
11464  case NEON::BI__builtin_neon_vcvtph_s16_f16:
11465  case NEON::BI__builtin_neon_vcvth_s16_f16: {
11466    unsigned Int;
11467    llvm::Type* InTy = Int32Ty;
11468    llvm::Type* FTy  = HalfTy;
11469    llvm::Type *Tys[2] = {InTy, FTy};
11470    Ops.push_back(EmitScalarExpr(E->getArg(0)));
11471    switch (BuiltinID) {
11472    default: llvm_unreachable("missing builtin ID in switch!");
11473    case NEON::BI__builtin_neon_vcvtah_u16_f16:
11474      Int = Intrinsic::aarch64_neon_fcvtau; break;
11475    case NEON::BI__builtin_neon_vcvtmh_u16_f16:
11476      Int = Intrinsic::aarch64_neon_fcvtmu; break;
11477    case NEON::BI__builtin_neon_vcvtnh_u16_f16:
11478      Int = Intrinsic::aarch64_neon_fcvtnu; break;
11479    case NEON::BI__builtin_neon_vcvtph_u16_f16:
11480      Int = Intrinsic::aarch64_neon_fcvtpu; break;
11481    case NEON::BI__builtin_neon_vcvth_u16_f16:
11482      Int = Intrinsic::aarch64_neon_fcvtzu; break;
11483    case NEON::BI__builtin_neon_vcvtah_s16_f16:
11484      Int = Intrinsic::aarch64_neon_fcvtas; break;
11485    case NEON::BI__builtin_neon_vcvtmh_s16_f16:
11486      Int = Intrinsic::aarch64_neon_fcvtms; break;
11487    case NEON::BI__builtin_neon_vcvtnh_s16_f16:
11488      Int = Intrinsic::aarch64_neon_fcvtns; break;
11489    case NEON::BI__builtin_neon_vcvtph_s16_f16:
11490      Int = Intrinsic::aarch64_neon_fcvtps; break;
11491    case NEON::BI__builtin_neon_vcvth_s16_f16:
11492      Int = Intrinsic::aarch64_neon_fcvtzs; break;
11493    }
11494    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "fcvt");
11495    return Builder.CreateTrunc(Ops[0], Int16Ty);
11496  }
11497  case NEON::BI__builtin_neon_vcaleh_f16:
11498  case NEON::BI__builtin_neon_vcalth_f16:
11499  case NEON::BI__builtin_neon_vcageh_f16:
11500  case NEON::BI__builtin_neon_vcagth_f16: {
11501    unsigned Int;
11502    llvm::Type* InTy = Int32Ty;
11503    llvm::Type* FTy  = HalfTy;
11504    llvm::Type *Tys[2] = {InTy, FTy};
11505    Ops.push_back(EmitScalarExpr(E->getArg(1)));
11506    switch (BuiltinID) {
11507    default: llvm_unreachable("missing builtin ID in switch!");
11508    case NEON::BI__builtin_neon_vcageh_f16:
11509      Int = Intrinsic::aarch64_neon_facge; break;
11510    case NEON::BI__builtin_neon_vcagth_f16:
11511      Int = Intrinsic::aarch64_neon_facgt; break;
11512    case NEON::BI__builtin_neon_vcaleh_f16:
11513      Int = Intrinsic::aarch64_neon_facge; std::swap(Ops[0], Ops[1]); break;
11514    case NEON::BI__builtin_neon_vcalth_f16:
11515      Int = Intrinsic::aarch64_neon_facgt; std::swap(Ops[0], Ops[1]); break;
11516    }
11517    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "facg");
11518    return Builder.CreateTrunc(Ops[0], Int16Ty);
11519  }
11520  case NEON::BI__builtin_neon_vcvth_n_s16_f16:
11521  case NEON::BI__builtin_neon_vcvth_n_u16_f16: {
11522    unsigned Int;
11523    llvm::Type* InTy = Int32Ty;
11524    llvm::Type* FTy  = HalfTy;
11525    llvm::Type *Tys[2] = {InTy, FTy};
11526    Ops.push_back(EmitScalarExpr(E->getArg(1)));
11527    switch (BuiltinID) {
11528    default: llvm_unreachable("missing builtin ID in switch!");
11529    case NEON::BI__builtin_neon_vcvth_n_s16_f16:
11530      Int = Intrinsic::aarch64_neon_vcvtfp2fxs; break;
11531    case NEON::BI__builtin_neon_vcvth_n_u16_f16:
11532      Int = Intrinsic::aarch64_neon_vcvtfp2fxu; break;
11533    }
11534    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "fcvth_n");
11535    return Builder.CreateTrunc(Ops[0], Int16Ty);
11536  }
11537  case NEON::BI__builtin_neon_vcvth_n_f16_s16:
11538  case NEON::BI__builtin_neon_vcvth_n_f16_u16: {
11539    unsigned Int;
11540    llvm::Type* FTy  = HalfTy;
11541    llvm::Type* InTy = Int32Ty;
11542    llvm::Type *Tys[2] = {FTy, InTy};
11543    Ops.push_back(EmitScalarExpr(E->getArg(1)));
11544    switch (BuiltinID) {
11545    default: llvm_unreachable("missing builtin ID in switch!");
11546    case NEON::BI__builtin_neon_vcvth_n_f16_s16:
11547      Int = Intrinsic::aarch64_neon_vcvtfxs2fp;
11548      Ops[0] = Builder.CreateSExt(Ops[0], InTy, "sext");
11549      break;
11550    case NEON::BI__builtin_neon_vcvth_n_f16_u16:
11551      Int = Intrinsic::aarch64_neon_vcvtfxu2fp;
11552      Ops[0] = Builder.CreateZExt(Ops[0], InTy);
11553      break;
11554    }
11555    return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "fcvth_n");
11556  }
11557  case NEON::BI__builtin_neon_vpaddd_s64: {
11558    auto *Ty = llvm::FixedVectorType::get(Int64Ty, 2);
11559    Value *Vec = EmitScalarExpr(E->getArg(0));
11560    // The vector is v2i64, so make sure it's bitcast to that.
11561    Vec = Builder.CreateBitCast(Vec, Ty, "v2i64");
11562    llvm::Value *Idx0 = llvm::ConstantInt::get(SizeTy, 0);
11563    llvm::Value *Idx1 = llvm::ConstantInt::get(SizeTy, 1);
11564    Value *Op0 = Builder.CreateExtractElement(Vec, Idx0, "lane0");
11565    Value *Op1 = Builder.CreateExtractElement(Vec, Idx1, "lane1");
11566    // Pairwise addition of a v2i64 into a scalar i64.
11567    return Builder.CreateAdd(Op0, Op1, "vpaddd");
11568  }
11569  case NEON::BI__builtin_neon_vpaddd_f64: {
11570    auto *Ty = llvm::FixedVectorType::get(DoubleTy, 2);
11571    Value *Vec = EmitScalarExpr(E->getArg(0));
11572    // The vector is v2f64, so make sure it's bitcast to that.
11573    Vec = Builder.CreateBitCast(Vec, Ty, "v2f64");
11574    llvm::Value *Idx0 = llvm::ConstantInt::get(SizeTy, 0);
11575    llvm::Value *Idx1 = llvm::ConstantInt::get(SizeTy, 1);
11576    Value *Op0 = Builder.CreateExtractElement(Vec, Idx0, "lane0");
11577    Value *Op1 = Builder.CreateExtractElement(Vec, Idx1, "lane1");
11578    // Pairwise addition of a v2f64 into a scalar f64.
11579    return Builder.CreateFAdd(Op0, Op1, "vpaddd");
11580  }
11581  case NEON::BI__builtin_neon_vpadds_f32: {
11582    auto *Ty = llvm::FixedVectorType::get(FloatTy, 2);
11583    Value *Vec = EmitScalarExpr(E->getArg(0));
11584    // The vector is v2f32, so make sure it's bitcast to that.
11585    Vec = Builder.CreateBitCast(Vec, Ty, "v2f32");
11586    llvm::Value *Idx0 = llvm::ConstantInt::get(SizeTy, 0);
11587    llvm::Value *Idx1 = llvm::ConstantInt::get(SizeTy, 1);
11588    Value *Op0 = Builder.CreateExtractElement(Vec, Idx0, "lane0");
11589    Value *Op1 = Builder.CreateExtractElement(Vec, Idx1, "lane1");
11590    // Pairwise addition of a v2f32 into a scalar f32.
11591    return Builder.CreateFAdd(Op0, Op1, "vpaddd");
11592  }
11593  case NEON::BI__builtin_neon_vceqzd_s64:
11594  case NEON::BI__builtin_neon_vceqzd_f64:
11595  case NEON::BI__builtin_neon_vceqzs_f32:
11596  case NEON::BI__builtin_neon_vceqzh_f16:
11597    Ops.push_back(EmitScalarExpr(E->getArg(0)));
11598    return EmitAArch64CompareBuiltinExpr(
11599        Ops[0], ConvertType(E->getCallReturnType(getContext())),
11600        ICmpInst::FCMP_OEQ, ICmpInst::ICMP_EQ, "vceqz");
11601  case NEON::BI__builtin_neon_vcgezd_s64:
11602  case NEON::BI__builtin_neon_vcgezd_f64:
11603  case NEON::BI__builtin_neon_vcgezs_f32:
11604  case NEON::BI__builtin_neon_vcgezh_f16:
11605    Ops.push_back(EmitScalarExpr(E->getArg(0)));
11606    return EmitAArch64CompareBuiltinExpr(
11607        Ops[0], ConvertType(E->getCallReturnType(getContext())),
11608        ICmpInst::FCMP_OGE, ICmpInst::ICMP_SGE, "vcgez");
11609  case NEON::BI__builtin_neon_vclezd_s64:
11610  case NEON::BI__builtin_neon_vclezd_f64:
11611  case NEON::BI__builtin_neon_vclezs_f32:
11612  case NEON::BI__builtin_neon_vclezh_f16:
11613    Ops.push_back(EmitScalarExpr(E->getArg(0)));
11614    return EmitAArch64CompareBuiltinExpr(
11615        Ops[0], ConvertType(E->getCallReturnType(getContext())),
11616        ICmpInst::FCMP_OLE, ICmpInst::ICMP_SLE, "vclez");
11617  case NEON::BI__builtin_neon_vcgtzd_s64:
11618  case NEON::BI__builtin_neon_vcgtzd_f64:
11619  case NEON::BI__builtin_neon_vcgtzs_f32:
11620  case NEON::BI__builtin_neon_vcgtzh_f16:
11621    Ops.push_back(EmitScalarExpr(E->getArg(0)));
11622    return EmitAArch64CompareBuiltinExpr(
11623        Ops[0], ConvertType(E->getCallReturnType(getContext())),
11624        ICmpInst::FCMP_OGT, ICmpInst::ICMP_SGT, "vcgtz");
11625  case NEON::BI__builtin_neon_vcltzd_s64:
11626  case NEON::BI__builtin_neon_vcltzd_f64:
11627  case NEON::BI__builtin_neon_vcltzs_f32:
11628  case NEON::BI__builtin_neon_vcltzh_f16:
11629    Ops.push_back(EmitScalarExpr(E->getArg(0)));
11630    return EmitAArch64CompareBuiltinExpr(
11631        Ops[0], ConvertType(E->getCallReturnType(getContext())),
11632        ICmpInst::FCMP_OLT, ICmpInst::ICMP_SLT, "vcltz");
11633
11634  case NEON::BI__builtin_neon_vceqzd_u64: {
11635    Ops.push_back(EmitScalarExpr(E->getArg(0)));
11636    Ops[0] = Builder.CreateBitCast(Ops[0], Int64Ty);
11637    Ops[0] =
11638        Builder.CreateICmpEQ(Ops[0], llvm::Constant::getNullValue(Int64Ty));
11639    return Builder.CreateSExt(Ops[0], Int64Ty, "vceqzd");
11640  }
11641  case NEON::BI__builtin_neon_vceqd_f64:
11642  case NEON::BI__builtin_neon_vcled_f64:
11643  case NEON::BI__builtin_neon_vcltd_f64:
11644  case NEON::BI__builtin_neon_vcged_f64:
11645  case NEON::BI__builtin_neon_vcgtd_f64: {
11646    llvm::CmpInst::Predicate P;
11647    switch (BuiltinID) {
11648    default: llvm_unreachable("missing builtin ID in switch!");
11649    case NEON::BI__builtin_neon_vceqd_f64: P = llvm::FCmpInst::FCMP_OEQ; break;
11650    case NEON::BI__builtin_neon_vcled_f64: P = llvm::FCmpInst::FCMP_OLE; break;
11651    case NEON::BI__builtin_neon_vcltd_f64: P = llvm::FCmpInst::FCMP_OLT; break;
11652    case NEON::BI__builtin_neon_vcged_f64: P = llvm::FCmpInst::FCMP_OGE; break;
11653    case NEON::BI__builtin_neon_vcgtd_f64: P = llvm::FCmpInst::FCMP_OGT; break;
11654    }
11655    Ops.push_back(EmitScalarExpr(E->getArg(1)));
11656    Ops[0] = Builder.CreateBitCast(Ops[0], DoubleTy);
11657    Ops[1] = Builder.CreateBitCast(Ops[1], DoubleTy);
11658    if (P == llvm::FCmpInst::FCMP_OEQ)
11659      Ops[0] = Builder.CreateFCmp(P, Ops[0], Ops[1]);
11660    else
11661      Ops[0] = Builder.CreateFCmpS(P, Ops[0], Ops[1]);
11662    return Builder.CreateSExt(Ops[0], Int64Ty, "vcmpd");
11663  }
11664  case NEON::BI__builtin_neon_vceqs_f32:
11665  case NEON::BI__builtin_neon_vcles_f32:
11666  case NEON::BI__builtin_neon_vclts_f32:
11667  case NEON::BI__builtin_neon_vcges_f32:
11668  case NEON::BI__builtin_neon_vcgts_f32: {
11669    llvm::CmpInst::Predicate P;
11670    switch (BuiltinID) {
11671    default: llvm_unreachable("missing builtin ID in switch!");
11672    case NEON::BI__builtin_neon_vceqs_f32: P = llvm::FCmpInst::FCMP_OEQ; break;
11673    case NEON::BI__builtin_neon_vcles_f32: P = llvm::FCmpInst::FCMP_OLE; break;
11674    case NEON::BI__builtin_neon_vclts_f32: P = llvm::FCmpInst::FCMP_OLT; break;
11675    case NEON::BI__builtin_neon_vcges_f32: P = llvm::FCmpInst::FCMP_OGE; break;
11676    case NEON::BI__builtin_neon_vcgts_f32: P = llvm::FCmpInst::FCMP_OGT; break;
11677    }
11678    Ops.push_back(EmitScalarExpr(E->getArg(1)));
11679    Ops[0] = Builder.CreateBitCast(Ops[0], FloatTy);
11680    Ops[1] = Builder.CreateBitCast(Ops[1], FloatTy);
11681    if (P == llvm::FCmpInst::FCMP_OEQ)
11682      Ops[0] = Builder.CreateFCmp(P, Ops[0], Ops[1]);
11683    else
11684      Ops[0] = Builder.CreateFCmpS(P, Ops[0], Ops[1]);
11685    return Builder.CreateSExt(Ops[0], Int32Ty, "vcmpd");
11686  }
11687  case NEON::BI__builtin_neon_vceqh_f16:
11688  case NEON::BI__builtin_neon_vcleh_f16:
11689  case NEON::BI__builtin_neon_vclth_f16:
11690  case NEON::BI__builtin_neon_vcgeh_f16:
11691  case NEON::BI__builtin_neon_vcgth_f16: {
11692    llvm::CmpInst::Predicate P;
11693    switch (BuiltinID) {
11694    default: llvm_unreachable("missing builtin ID in switch!");
11695    case NEON::BI__builtin_neon_vceqh_f16: P = llvm::FCmpInst::FCMP_OEQ; break;
11696    case NEON::BI__builtin_neon_vcleh_f16: P = llvm::FCmpInst::FCMP_OLE; break;
11697    case NEON::BI__builtin_neon_vclth_f16: P = llvm::FCmpInst::FCMP_OLT; break;
11698    case NEON::BI__builtin_neon_vcgeh_f16: P = llvm::FCmpInst::FCMP_OGE; break;
11699    case NEON::BI__builtin_neon_vcgth_f16: P = llvm::FCmpInst::FCMP_OGT; break;
11700    }
11701    Ops.push_back(EmitScalarExpr(E->getArg(1)));
11702    Ops[0] = Builder.CreateBitCast(Ops[0], HalfTy);
11703    Ops[1] = Builder.CreateBitCast(Ops[1], HalfTy);
11704    if (P == llvm::FCmpInst::FCMP_OEQ)
11705      Ops[0] = Builder.CreateFCmp(P, Ops[0], Ops[1]);
11706    else
11707      Ops[0] = Builder.CreateFCmpS(P, Ops[0], Ops[1]);
11708    return Builder.CreateSExt(Ops[0], Int16Ty, "vcmpd");
11709  }
11710  case NEON::BI__builtin_neon_vceqd_s64:
11711  case NEON::BI__builtin_neon_vceqd_u64:
11712  case NEON::BI__builtin_neon_vcgtd_s64:
11713  case NEON::BI__builtin_neon_vcgtd_u64:
11714  case NEON::BI__builtin_neon_vcltd_s64:
11715  case NEON::BI__builtin_neon_vcltd_u64:
11716  case NEON::BI__builtin_neon_vcged_u64:
11717  case NEON::BI__builtin_neon_vcged_s64:
11718  case NEON::BI__builtin_neon_vcled_u64:
11719  case NEON::BI__builtin_neon_vcled_s64: {
11720    llvm::CmpInst::Predicate P;
11721    switch (BuiltinID) {
11722    default: llvm_unreachable("missing builtin ID in switch!");
11723    case NEON::BI__builtin_neon_vceqd_s64:
11724    case NEON::BI__builtin_neon_vceqd_u64:P = llvm::ICmpInst::ICMP_EQ;break;
11725    case NEON::BI__builtin_neon_vcgtd_s64:P = llvm::ICmpInst::ICMP_SGT;break;
11726    case NEON::BI__builtin_neon_vcgtd_u64:P = llvm::ICmpInst::ICMP_UGT;break;
11727    case NEON::BI__builtin_neon_vcltd_s64:P = llvm::ICmpInst::ICMP_SLT;break;
11728    case NEON::BI__builtin_neon_vcltd_u64:P = llvm::ICmpInst::ICMP_ULT;break;
11729    case NEON::BI__builtin_neon_vcged_u64:P = llvm::ICmpInst::ICMP_UGE;break;
11730    case NEON::BI__builtin_neon_vcged_s64:P = llvm::ICmpInst::ICMP_SGE;break;
11731    case NEON::BI__builtin_neon_vcled_u64:P = llvm::ICmpInst::ICMP_ULE;break;
11732    case NEON::BI__builtin_neon_vcled_s64:P = llvm::ICmpInst::ICMP_SLE;break;
11733    }
11734    Ops.push_back(EmitScalarExpr(E->getArg(1)));
11735    Ops[0] = Builder.CreateBitCast(Ops[0], Int64Ty);
11736    Ops[1] = Builder.CreateBitCast(Ops[1], Int64Ty);
11737    Ops[0] = Builder.CreateICmp(P, Ops[0], Ops[1]);
11738    return Builder.CreateSExt(Ops[0], Int64Ty, "vceqd");
11739  }
11740  case NEON::BI__builtin_neon_vtstd_s64:
11741  case NEON::BI__builtin_neon_vtstd_u64: {
11742    Ops.push_back(EmitScalarExpr(E->getArg(1)));
11743    Ops[0] = Builder.CreateBitCast(Ops[0], Int64Ty);
11744    Ops[1] = Builder.CreateBitCast(Ops[1], Int64Ty);
11745    Ops[0] = Builder.CreateAnd(Ops[0], Ops[1]);
11746    Ops[0] = Builder.CreateICmp(ICmpInst::ICMP_NE, Ops[0],
11747                                llvm::Constant::getNullValue(Int64Ty));
11748    return Builder.CreateSExt(Ops[0], Int64Ty, "vtstd");
11749  }
11750  case NEON::BI__builtin_neon_vset_lane_i8:
11751  case NEON::BI__builtin_neon_vset_lane_i16:
11752  case NEON::BI__builtin_neon_vset_lane_i32:
11753  case NEON::BI__builtin_neon_vset_lane_i64:
11754  case NEON::BI__builtin_neon_vset_lane_bf16:
11755  case NEON::BI__builtin_neon_vset_lane_f32:
11756  case NEON::BI__builtin_neon_vsetq_lane_i8:
11757  case NEON::BI__builtin_neon_vsetq_lane_i16:
11758  case NEON::BI__builtin_neon_vsetq_lane_i32:
11759  case NEON::BI__builtin_neon_vsetq_lane_i64:
11760  case NEON::BI__builtin_neon_vsetq_lane_bf16:
11761  case NEON::BI__builtin_neon_vsetq_lane_f32:
11762    Ops.push_back(EmitScalarExpr(E->getArg(2)));
11763    return Builder.CreateInsertElement(Ops[1], Ops[0], Ops[2], "vset_lane");
11764  case NEON::BI__builtin_neon_vset_lane_f64:
11765    // The vector type needs a cast for the v1f64 variant.
11766    Ops[1] =
11767        Builder.CreateBitCast(Ops[1], llvm::FixedVectorType::get(DoubleTy, 1));
11768    Ops.push_back(EmitScalarExpr(E->getArg(2)));
11769    return Builder.CreateInsertElement(Ops[1], Ops[0], Ops[2], "vset_lane");
11770  case NEON::BI__builtin_neon_vsetq_lane_f64:
11771    // The vector type needs a cast for the v2f64 variant.
11772    Ops[1] =
11773        Builder.CreateBitCast(Ops[1], llvm::FixedVectorType::get(DoubleTy, 2));
11774    Ops.push_back(EmitScalarExpr(E->getArg(2)));
11775    return Builder.CreateInsertElement(Ops[1], Ops[0], Ops[2], "vset_lane");
11776
11777  case NEON::BI__builtin_neon_vget_lane_i8:
11778  case NEON::BI__builtin_neon_vdupb_lane_i8:
11779    Ops[0] =
11780        Builder.CreateBitCast(Ops[0], llvm::FixedVectorType::get(Int8Ty, 8));
11781    return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
11782                                        "vget_lane");
11783  case NEON::BI__builtin_neon_vgetq_lane_i8:
11784  case NEON::BI__builtin_neon_vdupb_laneq_i8:
11785    Ops[0] =
11786        Builder.CreateBitCast(Ops[0], llvm::FixedVectorType::get(Int8Ty, 16));
11787    return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
11788                                        "vgetq_lane");
11789  case NEON::BI__builtin_neon_vget_lane_i16:
11790  case NEON::BI__builtin_neon_vduph_lane_i16:
11791    Ops[0] =
11792        Builder.CreateBitCast(Ops[0], llvm::FixedVectorType::get(Int16Ty, 4));
11793    return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
11794                                        "vget_lane");
11795  case NEON::BI__builtin_neon_vgetq_lane_i16:
11796  case NEON::BI__builtin_neon_vduph_laneq_i16:
11797    Ops[0] =
11798        Builder.CreateBitCast(Ops[0], llvm::FixedVectorType::get(Int16Ty, 8));
11799    return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
11800                                        "vgetq_lane");
11801  case NEON::BI__builtin_neon_vget_lane_i32:
11802  case NEON::BI__builtin_neon_vdups_lane_i32:
11803    Ops[0] =
11804        Builder.CreateBitCast(Ops[0], llvm::FixedVectorType::get(Int32Ty, 2));
11805    return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
11806                                        "vget_lane");
11807  case NEON::BI__builtin_neon_vdups_lane_f32:
11808    Ops[0] =
11809        Builder.CreateBitCast(Ops[0], llvm::FixedVectorType::get(FloatTy, 2));
11810    return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
11811                                        "vdups_lane");
11812  case NEON::BI__builtin_neon_vgetq_lane_i32:
11813  case NEON::BI__builtin_neon_vdups_laneq_i32:
11814    Ops[0] =
11815        Builder.CreateBitCast(Ops[0], llvm::FixedVectorType::get(Int32Ty, 4));
11816    return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
11817                                        "vgetq_lane");
11818  case NEON::BI__builtin_neon_vget_lane_i64:
11819  case NEON::BI__builtin_neon_vdupd_lane_i64:
11820    Ops[0] =
11821        Builder.CreateBitCast(Ops[0], llvm::FixedVectorType::get(Int64Ty, 1));
11822    return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
11823                                        "vget_lane");
11824  case NEON::BI__builtin_neon_vdupd_lane_f64:
11825    Ops[0] =
11826        Builder.CreateBitCast(Ops[0], llvm::FixedVectorType::get(DoubleTy, 1));
11827    return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
11828                                        "vdupd_lane");
11829  case NEON::BI__builtin_neon_vgetq_lane_i64:
11830  case NEON::BI__builtin_neon_vdupd_laneq_i64:
11831    Ops[0] =
11832        Builder.CreateBitCast(Ops[0], llvm::FixedVectorType::get(Int64Ty, 2));
11833    return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
11834                                        "vgetq_lane");
11835  case NEON::BI__builtin_neon_vget_lane_f32:
11836    Ops[0] =
11837        Builder.CreateBitCast(Ops[0], llvm::FixedVectorType::get(FloatTy, 2));
11838    return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
11839                                        "vget_lane");
11840  case NEON::BI__builtin_neon_vget_lane_f64:
11841    Ops[0] =
11842        Builder.CreateBitCast(Ops[0], llvm::FixedVectorType::get(DoubleTy, 1));
11843    return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
11844                                        "vget_lane");
11845  case NEON::BI__builtin_neon_vgetq_lane_f32:
11846  case NEON::BI__builtin_neon_vdups_laneq_f32:
11847    Ops[0] =
11848        Builder.CreateBitCast(Ops[0], llvm::FixedVectorType::get(FloatTy, 4));
11849    return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
11850                                        "vgetq_lane");
11851  case NEON::BI__builtin_neon_vgetq_lane_f64:
11852  case NEON::BI__builtin_neon_vdupd_laneq_f64:
11853    Ops[0] =
11854        Builder.CreateBitCast(Ops[0], llvm::FixedVectorType::get(DoubleTy, 2));
11855    return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
11856                                        "vgetq_lane");
11857  case NEON::BI__builtin_neon_vaddh_f16:
11858    Ops.push_back(EmitScalarExpr(E->getArg(1)));
11859    return Builder.CreateFAdd(Ops[0], Ops[1], "vaddh");
11860  case NEON::BI__builtin_neon_vsubh_f16:
11861    Ops.push_back(EmitScalarExpr(E->getArg(1)));
11862    return Builder.CreateFSub(Ops[0], Ops[1], "vsubh");
11863  case NEON::BI__builtin_neon_vmulh_f16:
11864    Ops.push_back(EmitScalarExpr(E->getArg(1)));
11865    return Builder.CreateFMul(Ops[0], Ops[1], "vmulh");
11866  case NEON::BI__builtin_neon_vdivh_f16:
11867    Ops.push_back(EmitScalarExpr(E->getArg(1)));
11868    return Builder.CreateFDiv(Ops[0], Ops[1], "vdivh");
11869  case NEON::BI__builtin_neon_vfmah_f16:
11870    // NEON intrinsic puts accumulator first, unlike the LLVM fma.
11871    return emitCallMaybeConstrainedFPBuiltin(
11872        *this, Intrinsic::fma, Intrinsic::experimental_constrained_fma, HalfTy,
11873        {EmitScalarExpr(E->getArg(1)), EmitScalarExpr(E->getArg(2)), Ops[0]});
11874  case NEON::BI__builtin_neon_vfmsh_f16: {
11875    Value* Neg = Builder.CreateFNeg(EmitScalarExpr(E->getArg(1)), "vsubh");
11876
11877    // NEON intrinsic puts accumulator first, unlike the LLVM fma.
11878    return emitCallMaybeConstrainedFPBuiltin(
11879        *this, Intrinsic::fma, Intrinsic::experimental_constrained_fma, HalfTy,
11880        {Neg, EmitScalarExpr(E->getArg(2)), Ops[0]});
11881  }
11882  case NEON::BI__builtin_neon_vaddd_s64:
11883  case NEON::BI__builtin_neon_vaddd_u64:
11884    return Builder.CreateAdd(Ops[0], EmitScalarExpr(E->getArg(1)), "vaddd");
11885  case NEON::BI__builtin_neon_vsubd_s64:
11886  case NEON::BI__builtin_neon_vsubd_u64:
11887    return Builder.CreateSub(Ops[0], EmitScalarExpr(E->getArg(1)), "vsubd");
11888  case NEON::BI__builtin_neon_vqdmlalh_s16:
11889  case NEON::BI__builtin_neon_vqdmlslh_s16: {
11890    SmallVector<Value *, 2> ProductOps;
11891    ProductOps.push_back(vectorWrapScalar16(Ops[1]));
11892    ProductOps.push_back(vectorWrapScalar16(EmitScalarExpr(E->getArg(2))));
11893    auto *VTy = llvm::FixedVectorType::get(Int32Ty, 4);
11894    Ops[1] = EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_sqdmull, VTy),
11895                          ProductOps, "vqdmlXl");
11896    Constant *CI = ConstantInt::get(SizeTy, 0);
11897    Ops[1] = Builder.CreateExtractElement(Ops[1], CI, "lane0");
11898
11899    unsigned AccumInt = BuiltinID == NEON::BI__builtin_neon_vqdmlalh_s16
11900                                        ? Intrinsic::aarch64_neon_sqadd
11901                                        : Intrinsic::aarch64_neon_sqsub;
11902    return EmitNeonCall(CGM.getIntrinsic(AccumInt, Int32Ty), Ops, "vqdmlXl");
11903  }
11904  case NEON::BI__builtin_neon_vqshlud_n_s64: {
11905    Ops.push_back(EmitScalarExpr(E->getArg(1)));
11906    Ops[1] = Builder.CreateZExt(Ops[1], Int64Ty);
11907    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_sqshlu, Int64Ty),
11908                        Ops, "vqshlu_n");
11909  }
11910  case NEON::BI__builtin_neon_vqshld_n_u64:
11911  case NEON::BI__builtin_neon_vqshld_n_s64: {
11912    unsigned Int = BuiltinID == NEON::BI__builtin_neon_vqshld_n_u64
11913                                   ? Intrinsic::aarch64_neon_uqshl
11914                                   : Intrinsic::aarch64_neon_sqshl;
11915    Ops.push_back(EmitScalarExpr(E->getArg(1)));
11916    Ops[1] = Builder.CreateZExt(Ops[1], Int64Ty);
11917    return EmitNeonCall(CGM.getIntrinsic(Int, Int64Ty), Ops, "vqshl_n");
11918  }
11919  case NEON::BI__builtin_neon_vrshrd_n_u64:
11920  case NEON::BI__builtin_neon_vrshrd_n_s64: {
11921    unsigned Int = BuiltinID == NEON::BI__builtin_neon_vrshrd_n_u64
11922                                   ? Intrinsic::aarch64_neon_urshl
11923                                   : Intrinsic::aarch64_neon_srshl;
11924    Ops.push_back(EmitScalarExpr(E->getArg(1)));
11925    int SV = cast<ConstantInt>(Ops[1])->getSExtValue();
11926    Ops[1] = ConstantInt::get(Int64Ty, -SV);
11927    return EmitNeonCall(CGM.getIntrinsic(Int, Int64Ty), Ops, "vrshr_n");
11928  }
11929  case NEON::BI__builtin_neon_vrsrad_n_u64:
11930  case NEON::BI__builtin_neon_vrsrad_n_s64: {
11931    unsigned Int = BuiltinID == NEON::BI__builtin_neon_vrsrad_n_u64
11932                                   ? Intrinsic::aarch64_neon_urshl
11933                                   : Intrinsic::aarch64_neon_srshl;
11934    Ops[1] = Builder.CreateBitCast(Ops[1], Int64Ty);
11935    Ops.push_back(Builder.CreateNeg(EmitScalarExpr(E->getArg(2))));
11936    Ops[1] = Builder.CreateCall(CGM.getIntrinsic(Int, Int64Ty),
11937                                {Ops[1], Builder.CreateSExt(Ops[2], Int64Ty)});
11938    return Builder.CreateAdd(Ops[0], Builder.CreateBitCast(Ops[1], Int64Ty));
11939  }
11940  case NEON::BI__builtin_neon_vshld_n_s64:
11941  case NEON::BI__builtin_neon_vshld_n_u64: {
11942    llvm::ConstantInt *Amt = cast<ConstantInt>(EmitScalarExpr(E->getArg(1)));
11943    return Builder.CreateShl(
11944        Ops[0], ConstantInt::get(Int64Ty, Amt->getZExtValue()), "shld_n");
11945  }
11946  case NEON::BI__builtin_neon_vshrd_n_s64: {
11947    llvm::ConstantInt *Amt = cast<ConstantInt>(EmitScalarExpr(E->getArg(1)));
11948    return Builder.CreateAShr(
11949        Ops[0], ConstantInt::get(Int64Ty, std::min(static_cast<uint64_t>(63),
11950                                                   Amt->getZExtValue())),
11951        "shrd_n");
11952  }
11953  case NEON::BI__builtin_neon_vshrd_n_u64: {
11954    llvm::ConstantInt *Amt = cast<ConstantInt>(EmitScalarExpr(E->getArg(1)));
11955    uint64_t ShiftAmt = Amt->getZExtValue();
11956    // Right-shifting an unsigned value by its size yields 0.
11957    if (ShiftAmt == 64)
11958      return ConstantInt::get(Int64Ty, 0);
11959    return Builder.CreateLShr(Ops[0], ConstantInt::get(Int64Ty, ShiftAmt),
11960                              "shrd_n");
11961  }
11962  case NEON::BI__builtin_neon_vsrad_n_s64: {
11963    llvm::ConstantInt *Amt = cast<ConstantInt>(EmitScalarExpr(E->getArg(2)));
11964    Ops[1] = Builder.CreateAShr(
11965        Ops[1], ConstantInt::get(Int64Ty, std::min(static_cast<uint64_t>(63),
11966                                                   Amt->getZExtValue())),
11967        "shrd_n");
11968    return Builder.CreateAdd(Ops[0], Ops[1]);
11969  }
11970  case NEON::BI__builtin_neon_vsrad_n_u64: {
11971    llvm::ConstantInt *Amt = cast<ConstantInt>(EmitScalarExpr(E->getArg(2)));
11972    uint64_t ShiftAmt = Amt->getZExtValue();
11973    // Right-shifting an unsigned value by its size yields 0.
11974    // As Op + 0 = Op, return Ops[0] directly.
11975    if (ShiftAmt == 64)
11976      return Ops[0];
11977    Ops[1] = Builder.CreateLShr(Ops[1], ConstantInt::get(Int64Ty, ShiftAmt),
11978                                "shrd_n");
11979    return Builder.CreateAdd(Ops[0], Ops[1]);
11980  }
11981  case NEON::BI__builtin_neon_vqdmlalh_lane_s16:
11982  case NEON::BI__builtin_neon_vqdmlalh_laneq_s16:
11983  case NEON::BI__builtin_neon_vqdmlslh_lane_s16:
11984  case NEON::BI__builtin_neon_vqdmlslh_laneq_s16: {
11985    Ops[2] = Builder.CreateExtractElement(Ops[2], EmitScalarExpr(E->getArg(3)),
11986                                          "lane");
11987    SmallVector<Value *, 2> ProductOps;
11988    ProductOps.push_back(vectorWrapScalar16(Ops[1]));
11989    ProductOps.push_back(vectorWrapScalar16(Ops[2]));
11990    auto *VTy = llvm::FixedVectorType::get(Int32Ty, 4);
11991    Ops[1] = EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_sqdmull, VTy),
11992                          ProductOps, "vqdmlXl");
11993    Constant *CI = ConstantInt::get(SizeTy, 0);
11994    Ops[1] = Builder.CreateExtractElement(Ops[1], CI, "lane0");
11995    Ops.pop_back();
11996
11997    unsigned AccInt = (BuiltinID == NEON::BI__builtin_neon_vqdmlalh_lane_s16 ||
11998                       BuiltinID == NEON::BI__builtin_neon_vqdmlalh_laneq_s16)
11999                          ? Intrinsic::aarch64_neon_sqadd
12000                          : Intrinsic::aarch64_neon_sqsub;
12001    return EmitNeonCall(CGM.getIntrinsic(AccInt, Int32Ty), Ops, "vqdmlXl");
12002  }
12003  case NEON::BI__builtin_neon_vqdmlals_s32:
12004  case NEON::BI__builtin_neon_vqdmlsls_s32: {
12005    SmallVector<Value *, 2> ProductOps;
12006    ProductOps.push_back(Ops[1]);
12007    ProductOps.push_back(EmitScalarExpr(E->getArg(2)));
12008    Ops[1] =
12009        EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_sqdmulls_scalar),
12010                     ProductOps, "vqdmlXl");
12011
12012    unsigned AccumInt = BuiltinID == NEON::BI__builtin_neon_vqdmlals_s32
12013                                        ? Intrinsic::aarch64_neon_sqadd
12014                                        : Intrinsic::aarch64_neon_sqsub;
12015    return EmitNeonCall(CGM.getIntrinsic(AccumInt, Int64Ty), Ops, "vqdmlXl");
12016  }
12017  case NEON::BI__builtin_neon_vqdmlals_lane_s32:
12018  case NEON::BI__builtin_neon_vqdmlals_laneq_s32:
12019  case NEON::BI__builtin_neon_vqdmlsls_lane_s32:
12020  case NEON::BI__builtin_neon_vqdmlsls_laneq_s32: {
12021    Ops[2] = Builder.CreateExtractElement(Ops[2], EmitScalarExpr(E->getArg(3)),
12022                                          "lane");
12023    SmallVector<Value *, 2> ProductOps;
12024    ProductOps.push_back(Ops[1]);
12025    ProductOps.push_back(Ops[2]);
12026    Ops[1] =
12027        EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_sqdmulls_scalar),
12028                     ProductOps, "vqdmlXl");
12029    Ops.pop_back();
12030
12031    unsigned AccInt = (BuiltinID == NEON::BI__builtin_neon_vqdmlals_lane_s32 ||
12032                       BuiltinID == NEON::BI__builtin_neon_vqdmlals_laneq_s32)
12033                          ? Intrinsic::aarch64_neon_sqadd
12034                          : Intrinsic::aarch64_neon_sqsub;
12035    return EmitNeonCall(CGM.getIntrinsic(AccInt, Int64Ty), Ops, "vqdmlXl");
12036  }
12037  case NEON::BI__builtin_neon_vget_lane_bf16:
12038  case NEON::BI__builtin_neon_vduph_lane_bf16:
12039  case NEON::BI__builtin_neon_vduph_lane_f16: {
12040    return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
12041                                        "vget_lane");
12042  }
12043  case NEON::BI__builtin_neon_vgetq_lane_bf16:
12044  case NEON::BI__builtin_neon_vduph_laneq_bf16:
12045  case NEON::BI__builtin_neon_vduph_laneq_f16: {
12046    return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
12047                                        "vgetq_lane");
12048  }
12049
12050  case clang::AArch64::BI_InterlockedAdd: {
12051    Address DestAddr = CheckAtomicAlignment(*this, E);
12052    Value *Val = EmitScalarExpr(E->getArg(1));
12053    AtomicRMWInst *RMWI =
12054        Builder.CreateAtomicRMW(AtomicRMWInst::Add, DestAddr, Val,
12055                                llvm::AtomicOrdering::SequentiallyConsistent);
12056    return Builder.CreateAdd(RMWI, Val);
12057  }
12058  }
12059
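  // Everything below handles overloaded vector builtins: build the LLVM vector
  // type from the NEON type flags, then try the shared NEON lowering tables
  // before falling back to the AArch64-specific cases.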
12060  llvm::FixedVectorType *VTy = GetNeonType(this, Type);
12061  llvm::Type *Ty = VTy;
12062  if (!Ty)
12063    return nullptr;
12064
12065  // Not all intrinsics handled by the common case work for AArch64 yet, so only
12066  // defer to common code if it's been added to our special map.
12067  Builtin = findARMVectorIntrinsicInMap(AArch64SIMDIntrinsicMap, BuiltinID,
12068                                        AArch64SIMDIntrinsicsProvenSorted);
12069
12070  if (Builtin)
12071    return EmitCommonNeonBuiltinExpr(
12072        Builtin->BuiltinID, Builtin->LLVMIntrinsic, Builtin->AltLLVMIntrinsic,
12073        Builtin->NameHint, Builtin->TypeModifier, E, Ops,
12074        /*never use addresses*/ Address::invalid(), Address::invalid(), Arch);
12075
12076  if (Value *V = EmitAArch64TblBuiltinExpr(*this, BuiltinID, E, Ops, Arch))
12077    return V;
12078
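  // AArch64-specific overloaded NEON builtins that need custom lowering.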
12079  unsigned Int;
12080  switch (BuiltinID) {
12081  default: return nullptr;
12082  case NEON::BI__builtin_neon_vbsl_v:
12083  case NEON::BI__builtin_neon_vbslq_v: {
12084    llvm::Type *BitTy = llvm::VectorType::getInteger(VTy);
12085    Ops[0] = Builder.CreateBitCast(Ops[0], BitTy, "vbsl");
12086    Ops[1] = Builder.CreateBitCast(Ops[1], BitTy, "vbsl");
12087    Ops[2] = Builder.CreateBitCast(Ops[2], BitTy, "vbsl");
12088
12089    Ops[1] = Builder.CreateAnd(Ops[0], Ops[1], "vbsl");
12090    Ops[2] = Builder.CreateAnd(Builder.CreateNot(Ops[0]), Ops[2], "vbsl");
12091    Ops[0] = Builder.CreateOr(Ops[1], Ops[2], "vbsl");
12092    return Builder.CreateBitCast(Ops[0], Ty);
12093  }
12094  case NEON::BI__builtin_neon_vfma_lane_v:
12095  case NEON::BI__builtin_neon_vfmaq_lane_v: { // Only used for FP types
12096    // The ARM builtins (and instructions) have the addend as the first
12097    // operand, but the 'fma' intrinsics have it last. Swap it around here.
12098    Value *Addend = Ops[0];
12099    Value *Multiplicand = Ops[1];
12100    Value *LaneSource = Ops[2];
12101    Ops[0] = Multiplicand;
12102    Ops[1] = LaneSource;
12103    Ops[2] = Addend;
12104
12105    // Now adjust things to handle the lane access.
12106    auto *SourceTy = BuiltinID == NEON::BI__builtin_neon_vfmaq_lane_v
12107                         ? llvm::FixedVectorType::get(VTy->getElementType(),
12108                                                      VTy->getNumElements() / 2)
12109                         : VTy;
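         // Broadcast the requested lane: the 'q' form takes its lane from a
         // 64-bit source vector (SourceTy above), and the splat mask widens
         // the result back to the full vector width.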
12110    llvm::Constant *cst = cast<Constant>(Ops[3]);
12111    Value *SV = llvm::ConstantVector::getSplat(VTy->getElementCount(), cst);
12112    Ops[1] = Builder.CreateBitCast(Ops[1], SourceTy);
12113    Ops[1] = Builder.CreateShuffleVector(Ops[1], Ops[1], SV, "lane");
12114
12115    Ops.pop_back();
12116    Int = Builder.getIsFPConstrained() ? Intrinsic::experimental_constrained_fma
12117                                       : Intrinsic::fma;
12118    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "fmla");
12119  }
12120  case NEON::BI__builtin_neon_vfma_laneq_v: {
12121    auto *VTy = cast<llvm::FixedVectorType>(Ty);
12122    // v1f64 fma should be mapped to Neon scalar f64 fma.
12123    if (VTy->getElementType() == DoubleTy) {
12124      Ops[0] = Builder.CreateBitCast(Ops[0], DoubleTy);
12125      Ops[1] = Builder.CreateBitCast(Ops[1], DoubleTy);
12126      llvm::FixedVectorType *VTy =
12127          GetNeonType(this, NeonTypeFlags(NeonTypeFlags::Float64, false, true));
12128      Ops[2] = Builder.CreateBitCast(Ops[2], VTy);
12129      Ops[2] = Builder.CreateExtractElement(Ops[2], Ops[3], "extract");
12130      Value *Result;
12131      Result = emitCallMaybeConstrainedFPBuiltin(
12132          *this, Intrinsic::fma, Intrinsic::experimental_constrained_fma,
12133          DoubleTy, {Ops[1], Ops[2], Ops[0]});
12134      return Builder.CreateBitCast(Result, Ty);
12135    }
12136    Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
12137    Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
12138
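         // The laneq form indexes into a 128-bit vector, so view the lane
         // source as a vector with twice as many elements before broadcasting
         // the selected lane.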
12139    auto *STy = llvm::FixedVectorType::get(VTy->getElementType(),
12140                                           VTy->getNumElements() * 2);
12141    Ops[2] = Builder.CreateBitCast(Ops[2], STy);
12142    Value *SV = llvm::ConstantVector::getSplat(VTy->getElementCount(),
12143                                               cast<ConstantInt>(Ops[3]));
12144    Ops[2] = Builder.CreateShuffleVector(Ops[2], Ops[2], SV, "lane");
12145
12146    return emitCallMaybeConstrainedFPBuiltin(
12147        *this, Intrinsic::fma, Intrinsic::experimental_constrained_fma, Ty,
12148        {Ops[2], Ops[1], Ops[0]});
12149  }
12150  case NEON::BI__builtin_neon_vfmaq_laneq_v: {
12151    Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
12152    Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
12153
12154    Ops[2] = Builder.CreateBitCast(Ops[2], Ty);
12155    Ops[2] = EmitNeonSplat(Ops[2], cast<ConstantInt>(Ops[3]));
12156    return emitCallMaybeConstrainedFPBuiltin(
12157        *this, Intrinsic::fma, Intrinsic::experimental_constrained_fma, Ty,
12158        {Ops[2], Ops[1], Ops[0]});
12159  }
12160  case NEON::BI__builtin_neon_vfmah_lane_f16:
12161  case NEON::BI__builtin_neon_vfmas_lane_f32:
12162  case NEON::BI__builtin_neon_vfmah_laneq_f16:
12163  case NEON::BI__builtin_neon_vfmas_laneq_f32:
12164  case NEON::BI__builtin_neon_vfmad_lane_f64:
12165  case NEON::BI__builtin_neon_vfmad_laneq_f64: {
12166    Ops.push_back(EmitScalarExpr(E->getArg(3)));
12167    llvm::Type *Ty = ConvertType(E->getCallReturnType(getContext()));
12168    Ops[2] = Builder.CreateExtractElement(Ops[2], Ops[3], "extract");
12169    return emitCallMaybeConstrainedFPBuiltin(
12170        *this, Intrinsic::fma, Intrinsic::experimental_constrained_fma, Ty,
12171        {Ops[1], Ops[2], Ops[0]});
12172  }
12173  case NEON::BI__builtin_neon_vmull_v:
12174    // FIXME: improve sharing scheme to cope with 3 alternative LLVM intrinsics.
12175    Int = usgn ? Intrinsic::aarch64_neon_umull : Intrinsic::aarch64_neon_smull;
12176    if (Type.isPoly()) Int = Intrinsic::aarch64_neon_pmull;
12177    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vmull");
12178  case NEON::BI__builtin_neon_vmax_v:
12179  case NEON::BI__builtin_neon_vmaxq_v:
12180    // FIXME: improve sharing scheme to cope with 3 alternative LLVM intrinsics.
12181    Int = usgn ? Intrinsic::aarch64_neon_umax : Intrinsic::aarch64_neon_smax;
12182    if (Ty->isFPOrFPVectorTy()) Int = Intrinsic::aarch64_neon_fmax;
12183    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vmax");
12184  case NEON::BI__builtin_neon_vmaxh_f16: {
12185    Ops.push_back(EmitScalarExpr(E->getArg(1)));
12186    Int = Intrinsic::aarch64_neon_fmax;
12187    return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vmax");
12188  }
12189  case NEON::BI__builtin_neon_vmin_v:
12190  case NEON::BI__builtin_neon_vminq_v:
12191    // FIXME: improve sharing scheme to cope with 3 alternative LLVM intrinsics.
12192    Int = usgn ? Intrinsic::aarch64_neon_umin : Intrinsic::aarch64_neon_smin;
12193    if (Ty->isFPOrFPVectorTy()) Int = Intrinsic::aarch64_neon_fmin;
12194    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vmin");
12195  case NEON::BI__builtin_neon_vminh_f16: {
12196    Ops.push_back(EmitScalarExpr(E->getArg(1)));
12197    Int = Intrinsic::aarch64_neon_fmin;
12198    return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vmin");
12199  }
12200  case NEON::BI__builtin_neon_vabd_v:
12201  case NEON::BI__builtin_neon_vabdq_v:
12202    // FIXME: improve sharing scheme to cope with 3 alternative LLVM intrinsics.
12203    Int = usgn ? Intrinsic::aarch64_neon_uabd : Intrinsic::aarch64_neon_sabd;
12204    if (Ty->isFPOrFPVectorTy()) Int = Intrinsic::aarch64_neon_fabd;
12205    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vabd");
12206  case NEON::BI__builtin_neon_vpadal_v:
12207  case NEON::BI__builtin_neon_vpadalq_v: {
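         // vpadal is lowered as a pairwise long add (uaddlp/saddlp) of the
         // source vector followed by an ordinary add with the accumulator.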
12208    unsigned ArgElts = VTy->getNumElements();
12209    llvm::IntegerType *EltTy = cast<IntegerType>(VTy->getElementType());
12210    unsigned BitWidth = EltTy->getBitWidth();
12211    auto *ArgTy = llvm::FixedVectorType::get(
12212        llvm::IntegerType::get(getLLVMContext(), BitWidth / 2), 2 * ArgElts);
12213    llvm::Type* Tys[2] = { VTy, ArgTy };
12214    Int = usgn ? Intrinsic::aarch64_neon_uaddlp : Intrinsic::aarch64_neon_saddlp;
12215    SmallVector<llvm::Value*, 1> TmpOps;
12216    TmpOps.push_back(Ops[1]);
12217    Function *F = CGM.getIntrinsic(Int, Tys);
12218    llvm::Value *tmp = EmitNeonCall(F, TmpOps, "vpadal");
12219    llvm::Value *addend = Builder.CreateBitCast(Ops[0], tmp->getType());
12220    return Builder.CreateAdd(tmp, addend);
12221  }
12222  case NEON::BI__builtin_neon_vpmin_v:
12223  case NEON::BI__builtin_neon_vpminq_v:
12224    // FIXME: improve sharing scheme to cope with 3 alternative LLVM intrinsics.
12225    Int = usgn ? Intrinsic::aarch64_neon_uminp : Intrinsic::aarch64_neon_sminp;
12226    if (Ty->isFPOrFPVectorTy()) Int = Intrinsic::aarch64_neon_fminp;
12227    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vpmin");
12228  case NEON::BI__builtin_neon_vpmax_v:
12229  case NEON::BI__builtin_neon_vpmaxq_v:
12230    // FIXME: improve sharing scheme to cope with 3 alternative LLVM intrinsics.
12231    Int = usgn ? Intrinsic::aarch64_neon_umaxp : Intrinsic::aarch64_neon_smaxp;
12232    if (Ty->isFPOrFPVectorTy()) Int = Intrinsic::aarch64_neon_fmaxp;
12233    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vpmax");
12234  case NEON::BI__builtin_neon_vminnm_v:
12235  case NEON::BI__builtin_neon_vminnmq_v:
12236    Int = Intrinsic::aarch64_neon_fminnm;
12237    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vminnm");
12238  case NEON::BI__builtin_neon_vminnmh_f16:
12239    Ops.push_back(EmitScalarExpr(E->getArg(1)));
12240    Int = Intrinsic::aarch64_neon_fminnm;
12241    return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vminnm");
12242  case NEON::BI__builtin_neon_vmaxnm_v:
12243  case NEON::BI__builtin_neon_vmaxnmq_v:
12244    Int = Intrinsic::aarch64_neon_fmaxnm;
12245    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vmaxnm");
12246  case NEON::BI__builtin_neon_vmaxnmh_f16:
12247    Ops.push_back(EmitScalarExpr(E->getArg(1)));
12248    Int = Intrinsic::aarch64_neon_fmaxnm;
12249    return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vmaxnm");
12250  case NEON::BI__builtin_neon_vrecpss_f32: {
12251    Ops.push_back(EmitScalarExpr(E->getArg(1)));
12252    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_frecps, FloatTy),
12253                        Ops, "vrecps");
12254  }
12255  case NEON::BI__builtin_neon_vrecpsd_f64:
12256    Ops.push_back(EmitScalarExpr(E->getArg(1)));
12257    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_frecps, DoubleTy),
12258                        Ops, "vrecps");
12259  case NEON::BI__builtin_neon_vrecpsh_f16:
12260    Ops.push_back(EmitScalarExpr(E->getArg(1)));
12261    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_frecps, HalfTy),
12262                        Ops, "vrecps");
12263  case NEON::BI__builtin_neon_vqshrun_n_v:
12264    Int = Intrinsic::aarch64_neon_sqshrun;
12265    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vqshrun_n");
12266  case NEON::BI__builtin_neon_vqrshrun_n_v:
12267    Int = Intrinsic::aarch64_neon_sqrshrun;
12268    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vqrshrun_n");
12269  case NEON::BI__builtin_neon_vqshrn_n_v:
12270    Int = usgn ? Intrinsic::aarch64_neon_uqshrn : Intrinsic::aarch64_neon_sqshrn;
12271    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vqshrn_n");
12272  case NEON::BI__builtin_neon_vrshrn_n_v:
12273    Int = Intrinsic::aarch64_neon_rshrn;
12274    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrshrn_n");
12275  case NEON::BI__builtin_neon_vqrshrn_n_v:
12276    Int = usgn ? Intrinsic::aarch64_neon_uqrshrn : Intrinsic::aarch64_neon_sqrshrn;
12277    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vqrshrn_n");
12278  case NEON::BI__builtin_neon_vrndah_f16: {
12279    Ops.push_back(EmitScalarExpr(E->getArg(0)));
12280    Int = Builder.getIsFPConstrained()
12281              ? Intrinsic::experimental_constrained_round
12282              : Intrinsic::round;
12283    return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vrnda");
12284  }
12285  case NEON::BI__builtin_neon_vrnda_v:
12286  case NEON::BI__builtin_neon_vrndaq_v: {
12287    Int = Builder.getIsFPConstrained()
12288              ? Intrinsic::experimental_constrained_round
12289              : Intrinsic::round;
12290    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrnda");
12291  }
12292  case NEON::BI__builtin_neon_vrndih_f16: {
12293    Ops.push_back(EmitScalarExpr(E->getArg(0)));
12294    Int = Builder.getIsFPConstrained()
12295              ? Intrinsic::experimental_constrained_nearbyint
12296              : Intrinsic::nearbyint;
12297    return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vrndi");
12298  }
12299  case NEON::BI__builtin_neon_vrndmh_f16: {
12300    Ops.push_back(EmitScalarExpr(E->getArg(0)));
12301    Int = Builder.getIsFPConstrained()
12302              ? Intrinsic::experimental_constrained_floor
12303              : Intrinsic::floor;
12304    return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vrndm");
12305  }
12306  case NEON::BI__builtin_neon_vrndm_v:
12307  case NEON::BI__builtin_neon_vrndmq_v: {
12308    Int = Builder.getIsFPConstrained()
12309              ? Intrinsic::experimental_constrained_floor
12310              : Intrinsic::floor;
12311    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrndm");
12312  }
12313  case NEON::BI__builtin_neon_vrndnh_f16: {
12314    Ops.push_back(EmitScalarExpr(E->getArg(0)));
12315    Int = Builder.getIsFPConstrained()
12316              ? Intrinsic::experimental_constrained_roundeven
12317              : Intrinsic::roundeven;
12318    return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vrndn");
12319  }
12320  case NEON::BI__builtin_neon_vrndn_v:
12321  case NEON::BI__builtin_neon_vrndnq_v: {
12322    Int = Builder.getIsFPConstrained()
12323              ? Intrinsic::experimental_constrained_roundeven
12324              : Intrinsic::roundeven;
12325    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrndn");
12326  }
12327  case NEON::BI__builtin_neon_vrndns_f32: {
12328    Ops.push_back(EmitScalarExpr(E->getArg(0)));
12329    Int = Builder.getIsFPConstrained()
12330              ? Intrinsic::experimental_constrained_roundeven
12331              : Intrinsic::roundeven;
12332    return EmitNeonCall(CGM.getIntrinsic(Int, FloatTy), Ops, "vrndn");
12333  }
12334  case NEON::BI__builtin_neon_vrndph_f16: {
12335    Ops.push_back(EmitScalarExpr(E->getArg(0)));
12336    Int = Builder.getIsFPConstrained()
12337              ? Intrinsic::experimental_constrained_ceil
12338              : Intrinsic::ceil;
12339    return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vrndp");
12340  }
12341  case NEON::BI__builtin_neon_vrndp_v:
12342  case NEON::BI__builtin_neon_vrndpq_v: {
12343    Int = Builder.getIsFPConstrained()
12344              ? Intrinsic::experimental_constrained_ceil
12345              : Intrinsic::ceil;
12346    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrndp");
12347  }
12348  case NEON::BI__builtin_neon_vrndxh_f16: {
12349    Ops.push_back(EmitScalarExpr(E->getArg(0)));
12350    Int = Builder.getIsFPConstrained()
12351              ? Intrinsic::experimental_constrained_rint
12352              : Intrinsic::rint;
12353    return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vrndx");
12354  }
12355  case NEON::BI__builtin_neon_vrndx_v:
12356  case NEON::BI__builtin_neon_vrndxq_v: {
12357    Int = Builder.getIsFPConstrained()
12358              ? Intrinsic::experimental_constrained_rint
12359              : Intrinsic::rint;
12360    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrndx");
12361  }
12362  case NEON::BI__builtin_neon_vrndh_f16: {
12363    Ops.push_back(EmitScalarExpr(E->getArg(0)));
12364    Int = Builder.getIsFPConstrained()
12365              ? Intrinsic::experimental_constrained_trunc
12366              : Intrinsic::trunc;
12367    return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vrndz");
12368  }
12369  case NEON::BI__builtin_neon_vrnd32x_f32:
12370  case NEON::BI__builtin_neon_vrnd32xq_f32:
12371  case NEON::BI__builtin_neon_vrnd32x_f64:
12372  case NEON::BI__builtin_neon_vrnd32xq_f64: {
12373    Ops.push_back(EmitScalarExpr(E->getArg(0)));
12374    Int = Intrinsic::aarch64_neon_frint32x;
12375    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrnd32x");
12376  }
12377  case NEON::BI__builtin_neon_vrnd32z_f32:
12378  case NEON::BI__builtin_neon_vrnd32zq_f32:
12379  case NEON::BI__builtin_neon_vrnd32z_f64:
12380  case NEON::BI__builtin_neon_vrnd32zq_f64: {
12381    Ops.push_back(EmitScalarExpr(E->getArg(0)));
12382    Int = Intrinsic::aarch64_neon_frint32z;
12383    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrnd32z");
12384  }
12385  case NEON::BI__builtin_neon_vrnd64x_f32:
12386  case NEON::BI__builtin_neon_vrnd64xq_f32:
12387  case NEON::BI__builtin_neon_vrnd64x_f64:
12388  case NEON::BI__builtin_neon_vrnd64xq_f64: {
12389    Ops.push_back(EmitScalarExpr(E->getArg(0)));
12390    Int = Intrinsic::aarch64_neon_frint64x;
12391    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrnd64x");
12392  }
12393  case NEON::BI__builtin_neon_vrnd64z_f32:
12394  case NEON::BI__builtin_neon_vrnd64zq_f32:
12395  case NEON::BI__builtin_neon_vrnd64z_f64:
12396  case NEON::BI__builtin_neon_vrnd64zq_f64: {
12397    Ops.push_back(EmitScalarExpr(E->getArg(0)));
12398    Int = Intrinsic::aarch64_neon_frint64z;
12399    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrnd64z");
12400  }
12401  case NEON::BI__builtin_neon_vrnd_v:
12402  case NEON::BI__builtin_neon_vrndq_v: {
12403    Int = Builder.getIsFPConstrained()
12404              ? Intrinsic::experimental_constrained_trunc
12405              : Intrinsic::trunc;
12406    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrndz");
12407  }
12408  case NEON::BI__builtin_neon_vcvt_f64_v:
12409  case NEON::BI__builtin_neon_vcvtq_f64_v:
12410    Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
12411    Ty = GetNeonType(this, NeonTypeFlags(NeonTypeFlags::Float64, false, quad));
12412    return usgn ? Builder.CreateUIToFP(Ops[0], Ty, "vcvt")
12413                : Builder.CreateSIToFP(Ops[0], Ty, "vcvt");
12414  case NEON::BI__builtin_neon_vcvt_f64_f32: {
12415    assert(Type.getEltType() == NeonTypeFlags::Float64 && quad &&
12416           "unexpected vcvt_f64_f32 builtin");
12417    NeonTypeFlags SrcFlag = NeonTypeFlags(NeonTypeFlags::Float32, false, false);
12418    Ops[0] = Builder.CreateBitCast(Ops[0], GetNeonType(this, SrcFlag));
12419
12420    return Builder.CreateFPExt(Ops[0], Ty, "vcvt");
12421  }
12422  case NEON::BI__builtin_neon_vcvt_f32_f64: {
12423    assert(Type.getEltType() == NeonTypeFlags::Float32 &&
12424           "unexpected vcvt_f32_f64 builtin");
12425    NeonTypeFlags SrcFlag = NeonTypeFlags(NeonTypeFlags::Float64, false, true);
12426    Ops[0] = Builder.CreateBitCast(Ops[0], GetNeonType(this, SrcFlag));
12427
12428    return Builder.CreateFPTrunc(Ops[0], Ty, "vcvt");
12429  }
12430  case NEON::BI__builtin_neon_vcvt_s32_v:
12431  case NEON::BI__builtin_neon_vcvt_u32_v:
12432  case NEON::BI__builtin_neon_vcvt_s64_v:
12433  case NEON::BI__builtin_neon_vcvt_u64_v:
12434  case NEON::BI__builtin_neon_vcvt_s16_f16:
12435  case NEON::BI__builtin_neon_vcvt_u16_f16:
12436  case NEON::BI__builtin_neon_vcvtq_s32_v:
12437  case NEON::BI__builtin_neon_vcvtq_u32_v:
12438  case NEON::BI__builtin_neon_vcvtq_s64_v:
12439  case NEON::BI__builtin_neon_vcvtq_u64_v:
12440  case NEON::BI__builtin_neon_vcvtq_s16_f16:
12441  case NEON::BI__builtin_neon_vcvtq_u16_f16: {
12442    Int =
12443        usgn ? Intrinsic::aarch64_neon_fcvtzu : Intrinsic::aarch64_neon_fcvtzs;
12444    llvm::Type *Tys[2] = {Ty, GetFloatNeonType(this, Type)};
12445    return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vcvtz");
12446  }
12447  case NEON::BI__builtin_neon_vcvta_s16_f16:
12448  case NEON::BI__builtin_neon_vcvta_u16_f16:
12449  case NEON::BI__builtin_neon_vcvta_s32_v:
12450  case NEON::BI__builtin_neon_vcvtaq_s16_f16:
12451  case NEON::BI__builtin_neon_vcvtaq_s32_v:
12452  case NEON::BI__builtin_neon_vcvta_u32_v:
12453  case NEON::BI__builtin_neon_vcvtaq_u16_f16:
12454  case NEON::BI__builtin_neon_vcvtaq_u32_v:
12455  case NEON::BI__builtin_neon_vcvta_s64_v:
12456  case NEON::BI__builtin_neon_vcvtaq_s64_v:
12457  case NEON::BI__builtin_neon_vcvta_u64_v:
12458  case NEON::BI__builtin_neon_vcvtaq_u64_v: {
12459    Int = usgn ? Intrinsic::aarch64_neon_fcvtau : Intrinsic::aarch64_neon_fcvtas;
12460    llvm::Type *Tys[2] = { Ty, GetFloatNeonType(this, Type) };
12461    return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vcvta");
12462  }
12463  case NEON::BI__builtin_neon_vcvtm_s16_f16:
12464  case NEON::BI__builtin_neon_vcvtm_s32_v:
12465  case NEON::BI__builtin_neon_vcvtmq_s16_f16:
12466  case NEON::BI__builtin_neon_vcvtmq_s32_v:
12467  case NEON::BI__builtin_neon_vcvtm_u16_f16:
12468  case NEON::BI__builtin_neon_vcvtm_u32_v:
12469  case NEON::BI__builtin_neon_vcvtmq_u16_f16:
12470  case NEON::BI__builtin_neon_vcvtmq_u32_v:
12471  case NEON::BI__builtin_neon_vcvtm_s64_v:
12472  case NEON::BI__builtin_neon_vcvtmq_s64_v:
12473  case NEON::BI__builtin_neon_vcvtm_u64_v:
12474  case NEON::BI__builtin_neon_vcvtmq_u64_v: {
12475    Int = usgn ? Intrinsic::aarch64_neon_fcvtmu : Intrinsic::aarch64_neon_fcvtms;
12476    llvm::Type *Tys[2] = { Ty, GetFloatNeonType(this, Type) };
12477    return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vcvtm");
12478  }
12479  case NEON::BI__builtin_neon_vcvtn_s16_f16:
12480  case NEON::BI__builtin_neon_vcvtn_s32_v:
12481  case NEON::BI__builtin_neon_vcvtnq_s16_f16:
12482  case NEON::BI__builtin_neon_vcvtnq_s32_v:
12483  case NEON::BI__builtin_neon_vcvtn_u16_f16:
12484  case NEON::BI__builtin_neon_vcvtn_u32_v:
12485  case NEON::BI__builtin_neon_vcvtnq_u16_f16:
12486  case NEON::BI__builtin_neon_vcvtnq_u32_v:
12487  case NEON::BI__builtin_neon_vcvtn_s64_v:
12488  case NEON::BI__builtin_neon_vcvtnq_s64_v:
12489  case NEON::BI__builtin_neon_vcvtn_u64_v:
12490  case NEON::BI__builtin_neon_vcvtnq_u64_v: {
12491    Int = usgn ? Intrinsic::aarch64_neon_fcvtnu : Intrinsic::aarch64_neon_fcvtns;
12492    llvm::Type *Tys[2] = { Ty, GetFloatNeonType(this, Type) };
12493    return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vcvtn");
12494  }
12495  case NEON::BI__builtin_neon_vcvtp_s16_f16:
12496  case NEON::BI__builtin_neon_vcvtp_s32_v:
12497  case NEON::BI__builtin_neon_vcvtpq_s16_f16:
12498  case NEON::BI__builtin_neon_vcvtpq_s32_v:
12499  case NEON::BI__builtin_neon_vcvtp_u16_f16:
12500  case NEON::BI__builtin_neon_vcvtp_u32_v:
12501  case NEON::BI__builtin_neon_vcvtpq_u16_f16:
12502  case NEON::BI__builtin_neon_vcvtpq_u32_v:
12503  case NEON::BI__builtin_neon_vcvtp_s64_v:
12504  case NEON::BI__builtin_neon_vcvtpq_s64_v:
12505  case NEON::BI__builtin_neon_vcvtp_u64_v:
12506  case NEON::BI__builtin_neon_vcvtpq_u64_v: {
12507    Int = usgn ? Intrinsic::aarch64_neon_fcvtpu : Intrinsic::aarch64_neon_fcvtps;
12508    llvm::Type *Tys[2] = { Ty, GetFloatNeonType(this, Type) };
12509    return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vcvtp");
12510  }
12511  case NEON::BI__builtin_neon_vmulx_v:
12512  case NEON::BI__builtin_neon_vmulxq_v: {
12513    Int = Intrinsic::aarch64_neon_fmulx;
12514    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vmulx");
12515  }
12516  case NEON::BI__builtin_neon_vmulxh_lane_f16:
12517  case NEON::BI__builtin_neon_vmulxh_laneq_f16: {
12518    // vmulx_lane should be mapped to Neon scalar mulx after
12519    // extracting the scalar element
12520    Ops.push_back(EmitScalarExpr(E->getArg(2)));
12521    Ops[1] = Builder.CreateExtractElement(Ops[1], Ops[2], "extract");
12522    Ops.pop_back();
12523    Int = Intrinsic::aarch64_neon_fmulx;
12524    return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vmulx");
12525  }
12526  case NEON::BI__builtin_neon_vmul_lane_v:
12527  case NEON::BI__builtin_neon_vmul_laneq_v: {
12528    // v1f64 vmul_lane should be mapped to Neon scalar mul lane
12529    bool Quad = false;
12530    if (BuiltinID == NEON::BI__builtin_neon_vmul_laneq_v)
12531      Quad = true;
12532    Ops[0] = Builder.CreateBitCast(Ops[0], DoubleTy);
12533    llvm::FixedVectorType *VTy =
12534        GetNeonType(this, NeonTypeFlags(NeonTypeFlags::Float64, false, Quad));
12535    Ops[1] = Builder.CreateBitCast(Ops[1], VTy);
12536    Ops[1] = Builder.CreateExtractElement(Ops[1], Ops[2], "extract");
12537    Value *Result = Builder.CreateFMul(Ops[0], Ops[1]);
12538    return Builder.CreateBitCast(Result, Ty);
12539  }
12540  case NEON::BI__builtin_neon_vnegd_s64:
12541    return Builder.CreateNeg(EmitScalarExpr(E->getArg(0)), "vnegd");
12542  case NEON::BI__builtin_neon_vnegh_f16:
12543    return Builder.CreateFNeg(EmitScalarExpr(E->getArg(0)), "vnegh");
12544  case NEON::BI__builtin_neon_vpmaxnm_v:
12545  case NEON::BI__builtin_neon_vpmaxnmq_v: {
12546    Int = Intrinsic::aarch64_neon_fmaxnmp;
12547    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vpmaxnm");
12548  }
12549  case NEON::BI__builtin_neon_vpminnm_v:
12550  case NEON::BI__builtin_neon_vpminnmq_v: {
12551    Int = Intrinsic::aarch64_neon_fminnmp;
12552    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vpminnm");
12553  }
12554  case NEON::BI__builtin_neon_vsqrth_f16: {
12555    Ops.push_back(EmitScalarExpr(E->getArg(0)));
12556    Int = Builder.getIsFPConstrained()
12557              ? Intrinsic::experimental_constrained_sqrt
12558              : Intrinsic::sqrt;
12559    return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vsqrt");
12560  }
12561  case NEON::BI__builtin_neon_vsqrt_v:
12562  case NEON::BI__builtin_neon_vsqrtq_v: {
12563    Int = Builder.getIsFPConstrained()
12564              ? Intrinsic::experimental_constrained_sqrt
12565              : Intrinsic::sqrt;
12566    Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
12567    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vsqrt");
12568  }
12569  case NEON::BI__builtin_neon_vrbit_v:
12570  case NEON::BI__builtin_neon_vrbitq_v: {
12571    Int = Intrinsic::bitreverse;
12572    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrbit");
12573  }
12574  case NEON::BI__builtin_neon_vaddv_u8:
12575    // FIXME: These are handled by the AArch64 scalar code.
12576    usgn = true;
12577    [[fallthrough]];
12578  case NEON::BI__builtin_neon_vaddv_s8: {
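         // The across-vector add intrinsic is emitted with an i32 result, so
         // the value is truncated back to the element type below.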
12579    Int = usgn ? Intrinsic::aarch64_neon_uaddv : Intrinsic::aarch64_neon_saddv;
12580    Ty = Int32Ty;
12581    VTy = llvm::FixedVectorType::get(Int8Ty, 8);
12582    llvm::Type *Tys[2] = { Ty, VTy };
12583    Ops.push_back(EmitScalarExpr(E->getArg(0)));
12584    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddv");
12585    return Builder.CreateTrunc(Ops[0], Int8Ty);
12586  }
12587  case NEON::BI__builtin_neon_vaddv_u16:
12588    usgn = true;
12589    [[fallthrough]];
12590  case NEON::BI__builtin_neon_vaddv_s16: {
12591    Int = usgn ? Intrinsic::aarch64_neon_uaddv : Intrinsic::aarch64_neon_saddv;
12592    Ty = Int32Ty;
12593    VTy = llvm::FixedVectorType::get(Int16Ty, 4);
12594    llvm::Type *Tys[2] = { Ty, VTy };
12595    Ops.push_back(EmitScalarExpr(E->getArg(0)));
12596    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddv");
12597    return Builder.CreateTrunc(Ops[0], Int16Ty);
12598  }
12599  case NEON::BI__builtin_neon_vaddvq_u8:
12600    usgn = true;
12601    [[fallthrough]];
12602  case NEON::BI__builtin_neon_vaddvq_s8: {
12603    Int = usgn ? Intrinsic::aarch64_neon_uaddv : Intrinsic::aarch64_neon_saddv;
12604    Ty = Int32Ty;
12605    VTy = llvm::FixedVectorType::get(Int8Ty, 16);
12606    llvm::Type *Tys[2] = { Ty, VTy };
12607    Ops.push_back(EmitScalarExpr(E->getArg(0)));
12608    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddv");
12609    return Builder.CreateTrunc(Ops[0], Int8Ty);
12610  }
12611  case NEON::BI__builtin_neon_vaddvq_u16:
12612    usgn = true;
12613    [[fallthrough]];
12614  case NEON::BI__builtin_neon_vaddvq_s16: {
12615    Int = usgn ? Intrinsic::aarch64_neon_uaddv : Intrinsic::aarch64_neon_saddv;
12616    Ty = Int32Ty;
12617    VTy = llvm::FixedVectorType::get(Int16Ty, 8);
12618    llvm::Type *Tys[2] = { Ty, VTy };
12619    Ops.push_back(EmitScalarExpr(E->getArg(0)));
12620    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddv");
12621    return Builder.CreateTrunc(Ops[0], Int16Ty);
12622  }
12623  case NEON::BI__builtin_neon_vmaxv_u8: {
12624    Int = Intrinsic::aarch64_neon_umaxv;
12625    Ty = Int32Ty;
12626    VTy = llvm::FixedVectorType::get(Int8Ty, 8);
12627    llvm::Type *Tys[2] = { Ty, VTy };
12628    Ops.push_back(EmitScalarExpr(E->getArg(0)));
12629    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxv");
12630    return Builder.CreateTrunc(Ops[0], Int8Ty);
12631  }
12632  case NEON::BI__builtin_neon_vmaxv_u16: {
12633    Int = Intrinsic::aarch64_neon_umaxv;
12634    Ty = Int32Ty;
12635    VTy = llvm::FixedVectorType::get(Int16Ty, 4);
12636    llvm::Type *Tys[2] = { Ty, VTy };
12637    Ops.push_back(EmitScalarExpr(E->getArg(0)));
12638    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxv");
12639    return Builder.CreateTrunc(Ops[0], Int16Ty);
12640  }
12641  case NEON::BI__builtin_neon_vmaxvq_u8: {
12642    Int = Intrinsic::aarch64_neon_umaxv;
12643    Ty = Int32Ty;
12644    VTy = llvm::FixedVectorType::get(Int8Ty, 16);
12645    llvm::Type *Tys[2] = { Ty, VTy };
12646    Ops.push_back(EmitScalarExpr(E->getArg(0)));
12647    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxv");
12648    return Builder.CreateTrunc(Ops[0], Int8Ty);
12649  }
12650  case NEON::BI__builtin_neon_vmaxvq_u16: {
12651    Int = Intrinsic::aarch64_neon_umaxv;
12652    Ty = Int32Ty;
12653    VTy = llvm::FixedVectorType::get(Int16Ty, 8);
12654    llvm::Type *Tys[2] = { Ty, VTy };
12655    Ops.push_back(EmitScalarExpr(E->getArg(0)));
12656    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxv");
12657    return Builder.CreateTrunc(Ops[0], Int16Ty);
12658  }
12659  case NEON::BI__builtin_neon_vmaxv_s8: {
12660    Int = Intrinsic::aarch64_neon_smaxv;
12661    Ty = Int32Ty;
12662    VTy = llvm::FixedVectorType::get(Int8Ty, 8);
12663    llvm::Type *Tys[2] = { Ty, VTy };
12664    Ops.push_back(EmitScalarExpr(E->getArg(0)));
12665    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxv");
12666    return Builder.CreateTrunc(Ops[0], Int8Ty);
12667  }
12668  case NEON::BI__builtin_neon_vmaxv_s16: {
12669    Int = Intrinsic::aarch64_neon_smaxv;
12670    Ty = Int32Ty;
12671    VTy = llvm::FixedVectorType::get(Int16Ty, 4);
12672    llvm::Type *Tys[2] = { Ty, VTy };
12673    Ops.push_back(EmitScalarExpr(E->getArg(0)));
12674    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxv");
12675    return Builder.CreateTrunc(Ops[0], Int16Ty);
12676  }
12677  case NEON::BI__builtin_neon_vmaxvq_s8: {
12678    Int = Intrinsic::aarch64_neon_smaxv;
12679    Ty = Int32Ty;
12680    VTy = llvm::FixedVectorType::get(Int8Ty, 16);
12681    llvm::Type *Tys[2] = { Ty, VTy };
12682    Ops.push_back(EmitScalarExpr(E->getArg(0)));
12683    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxv");
12684    return Builder.CreateTrunc(Ops[0], Int8Ty);
12685  }
12686  case NEON::BI__builtin_neon_vmaxvq_s16: {
12687    Int = Intrinsic::aarch64_neon_smaxv;
12688    Ty = Int32Ty;
12689    VTy = llvm::FixedVectorType::get(Int16Ty, 8);
12690    llvm::Type *Tys[2] = { Ty, VTy };
12691    Ops.push_back(EmitScalarExpr(E->getArg(0)));
12692    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxv");
12693    return Builder.CreateTrunc(Ops[0], Int16Ty);
12694  }
12695  case NEON::BI__builtin_neon_vmaxv_f16: {
12696    Int = Intrinsic::aarch64_neon_fmaxv;
12697    Ty = HalfTy;
12698    VTy = llvm::FixedVectorType::get(HalfTy, 4);
12699    llvm::Type *Tys[2] = { Ty, VTy };
12700    Ops.push_back(EmitScalarExpr(E->getArg(0)));
12701    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxv");
12702    return Builder.CreateTrunc(Ops[0], HalfTy);
12703  }
12704  case NEON::BI__builtin_neon_vmaxvq_f16: {
12705    Int = Intrinsic::aarch64_neon_fmaxv;
12706    Ty = HalfTy;
12707    VTy = llvm::FixedVectorType::get(HalfTy, 8);
12708    llvm::Type *Tys[2] = { Ty, VTy };
12709    Ops.push_back(EmitScalarExpr(E->getArg(0)));
12710    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxv");
12711    return Builder.CreateTrunc(Ops[0], HalfTy);
12712  }
12713  case NEON::BI__builtin_neon_vminv_u8: {
12714    Int = Intrinsic::aarch64_neon_uminv;
12715    Ty = Int32Ty;
12716    VTy = llvm::FixedVectorType::get(Int8Ty, 8);
12717    llvm::Type *Tys[2] = { Ty, VTy };
12718    Ops.push_back(EmitScalarExpr(E->getArg(0)));
12719    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminv");
12720    return Builder.CreateTrunc(Ops[0], Int8Ty);
12721  }
12722  case NEON::BI__builtin_neon_vminv_u16: {
12723    Int = Intrinsic::aarch64_neon_uminv;
12724    Ty = Int32Ty;
12725    VTy = llvm::FixedVectorType::get(Int16Ty, 4);
12726    llvm::Type *Tys[2] = { Ty, VTy };
12727    Ops.push_back(EmitScalarExpr(E->getArg(0)));
12728    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminv");
12729    return Builder.CreateTrunc(Ops[0], Int16Ty);
12730  }
12731  case NEON::BI__builtin_neon_vminvq_u8: {
12732    Int = Intrinsic::aarch64_neon_uminv;
12733    Ty = Int32Ty;
12734    VTy = llvm::FixedVectorType::get(Int8Ty, 16);
12735    llvm::Type *Tys[2] = { Ty, VTy };
12736    Ops.push_back(EmitScalarExpr(E->getArg(0)));
12737    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminv");
12738    return Builder.CreateTrunc(Ops[0], Int8Ty);
12739  }
12740  case NEON::BI__builtin_neon_vminvq_u16: {
12741    Int = Intrinsic::aarch64_neon_uminv;
12742    Ty = Int32Ty;
12743    VTy = llvm::FixedVectorType::get(Int16Ty, 8);
12744    llvm::Type *Tys[2] = { Ty, VTy };
12745    Ops.push_back(EmitScalarExpr(E->getArg(0)));
12746    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminv");
12747    return Builder.CreateTrunc(Ops[0], Int16Ty);
12748  }
12749  case NEON::BI__builtin_neon_vminv_s8: {
12750    Int = Intrinsic::aarch64_neon_sminv;
12751    Ty = Int32Ty;
12752    VTy = llvm::FixedVectorType::get(Int8Ty, 8);
12753    llvm::Type *Tys[2] = { Ty, VTy };
12754    Ops.push_back(EmitScalarExpr(E->getArg(0)));
12755    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminv");
12756    return Builder.CreateTrunc(Ops[0], Int8Ty);
12757  }
12758  case NEON::BI__builtin_neon_vminv_s16: {
12759    Int = Intrinsic::aarch64_neon_sminv;
12760    Ty = Int32Ty;
12761    VTy = llvm::FixedVectorType::get(Int16Ty, 4);
12762    llvm::Type *Tys[2] = { Ty, VTy };
12763    Ops.push_back(EmitScalarExpr(E->getArg(0)));
12764    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminv");
12765    return Builder.CreateTrunc(Ops[0], Int16Ty);
12766  }
12767  case NEON::BI__builtin_neon_vminvq_s8: {
12768    Int = Intrinsic::aarch64_neon_sminv;
12769    Ty = Int32Ty;
12770    VTy = llvm::FixedVectorType::get(Int8Ty, 16);
12771    llvm::Type *Tys[2] = { Ty, VTy };
12772    Ops.push_back(EmitScalarExpr(E->getArg(0)));
12773    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminv");
12774    return Builder.CreateTrunc(Ops[0], Int8Ty);
12775  }
12776  case NEON::BI__builtin_neon_vminvq_s16: {
12777    Int = Intrinsic::aarch64_neon_sminv;
12778    Ty = Int32Ty;
12779    VTy = llvm::FixedVectorType::get(Int16Ty, 8);
12780    llvm::Type *Tys[2] = { Ty, VTy };
12781    Ops.push_back(EmitScalarExpr(E->getArg(0)));
12782    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminv");
12783    return Builder.CreateTrunc(Ops[0], Int16Ty);
12784  }
12785  case NEON::BI__builtin_neon_vminv_f16: {
12786    Int = Intrinsic::aarch64_neon_fminv;
12787    Ty = HalfTy;
12788    VTy = llvm::FixedVectorType::get(HalfTy, 4);
12789    llvm::Type *Tys[2] = { Ty, VTy };
12790    Ops.push_back(EmitScalarExpr(E->getArg(0)));
12791    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminv");
12792    return Builder.CreateTrunc(Ops[0], HalfTy);
12793  }
12794  case NEON::BI__builtin_neon_vminvq_f16: {
12795    Int = Intrinsic::aarch64_neon_fminv;
12796    Ty = HalfTy;
12797    VTy = llvm::FixedVectorType::get(HalfTy, 8);
12798    llvm::Type *Tys[2] = { Ty, VTy };
12799    Ops.push_back(EmitScalarExpr(E->getArg(0)));
12800    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminv");
12801    return Builder.CreateTrunc(Ops[0], HalfTy);
12802  }
12803  case NEON::BI__builtin_neon_vmaxnmv_f16: {
12804    Int = Intrinsic::aarch64_neon_fmaxnmv;
12805    Ty = HalfTy;
12806    VTy = llvm::FixedVectorType::get(HalfTy, 4);
12807    llvm::Type *Tys[2] = { Ty, VTy };
12808    Ops.push_back(EmitScalarExpr(E->getArg(0)));
12809    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxnmv");
12810    return Builder.CreateTrunc(Ops[0], HalfTy);
12811  }
12812  case NEON::BI__builtin_neon_vmaxnmvq_f16: {
12813    Int = Intrinsic::aarch64_neon_fmaxnmv;
12814    Ty = HalfTy;
12815    VTy = llvm::FixedVectorType::get(HalfTy, 8);
12816    llvm::Type *Tys[2] = { Ty, VTy };
12817    Ops.push_back(EmitScalarExpr(E->getArg(0)));
12818    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxnmv");
12819    return Builder.CreateTrunc(Ops[0], HalfTy);
12820  }
12821  case NEON::BI__builtin_neon_vminnmv_f16: {
12822    Int = Intrinsic::aarch64_neon_fminnmv;
12823    Ty = HalfTy;
12824    VTy = llvm::FixedVectorType::get(HalfTy, 4);
12825    llvm::Type *Tys[2] = { Ty, VTy };
12826    Ops.push_back(EmitScalarExpr(E->getArg(0)));
12827    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminnmv");
12828    return Builder.CreateTrunc(Ops[0], HalfTy);
12829  }
12830  case NEON::BI__builtin_neon_vminnmvq_f16: {
12831    Int = Intrinsic::aarch64_neon_fminnmv;
12832    Ty = HalfTy;
12833    VTy = llvm::FixedVectorType::get(HalfTy, 8);
12834    llvm::Type *Tys[2] = { Ty, VTy };
12835    Ops.push_back(EmitScalarExpr(E->getArg(0)));
12836    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminnmv");
12837    return Builder.CreateTrunc(Ops[0], HalfTy);
12838  }
12839  case NEON::BI__builtin_neon_vmul_n_f64: {
12840    Ops[0] = Builder.CreateBitCast(Ops[0], DoubleTy);
12841    Value *RHS = Builder.CreateBitCast(EmitScalarExpr(E->getArg(1)), DoubleTy);
12842    return Builder.CreateFMul(Ops[0], RHS);
12843  }
12844  case NEON::BI__builtin_neon_vaddlv_u8: {
12845    Int = Intrinsic::aarch64_neon_uaddlv;
12846    Ty = Int32Ty;
12847    VTy = llvm::FixedVectorType::get(Int8Ty, 8);
12848    llvm::Type *Tys[2] = { Ty, VTy };
12849    Ops.push_back(EmitScalarExpr(E->getArg(0)));
12850    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddlv");
12851    return Builder.CreateTrunc(Ops[0], Int16Ty);
12852  }
12853  case NEON::BI__builtin_neon_vaddlv_u16: {
12854    Int = Intrinsic::aarch64_neon_uaddlv;
12855    Ty = Int32Ty;
12856    VTy = llvm::FixedVectorType::get(Int16Ty, 4);
12857    llvm::Type *Tys[2] = { Ty, VTy };
12858    Ops.push_back(EmitScalarExpr(E->getArg(0)));
12859    return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddlv");
12860  }
12861  case NEON::BI__builtin_neon_vaddlvq_u8: {
12862    Int = Intrinsic::aarch64_neon_uaddlv;
12863    Ty = Int32Ty;
12864    VTy = llvm::FixedVectorType::get(Int8Ty, 16);
12865    llvm::Type *Tys[2] = { Ty, VTy };
12866    Ops.push_back(EmitScalarExpr(E->getArg(0)));
12867    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddlv");
12868    return Builder.CreateTrunc(Ops[0], Int16Ty);
12869  }
12870  case NEON::BI__builtin_neon_vaddlvq_u16: {
12871    Int = Intrinsic::aarch64_neon_uaddlv;
12872    Ty = Int32Ty;
12873    VTy = llvm::FixedVectorType::get(Int16Ty, 8);
12874    llvm::Type *Tys[2] = { Ty, VTy };
12875    Ops.push_back(EmitScalarExpr(E->getArg(0)));
12876    return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddlv");
12877  }
12878  case NEON::BI__builtin_neon_vaddlv_s8: {
12879    Int = Intrinsic::aarch64_neon_saddlv;
12880    Ty = Int32Ty;
12881    VTy = llvm::FixedVectorType::get(Int8Ty, 8);
12882    llvm::Type *Tys[2] = { Ty, VTy };
12883    Ops.push_back(EmitScalarExpr(E->getArg(0)));
12884    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddlv");
12885    return Builder.CreateTrunc(Ops[0], Int16Ty);
12886  }
12887  case NEON::BI__builtin_neon_vaddlv_s16: {
12888    Int = Intrinsic::aarch64_neon_saddlv;
12889    Ty = Int32Ty;
12890    VTy = llvm::FixedVectorType::get(Int16Ty, 4);
12891    llvm::Type *Tys[2] = { Ty, VTy };
12892    Ops.push_back(EmitScalarExpr(E->getArg(0)));
12893    return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddlv");
12894  }
12895  case NEON::BI__builtin_neon_vaddlvq_s8: {
12896    Int = Intrinsic::aarch64_neon_saddlv;
12897    Ty = Int32Ty;
12898    VTy = llvm::FixedVectorType::get(Int8Ty, 16);
12899    llvm::Type *Tys[2] = { Ty, VTy };
12900    Ops.push_back(EmitScalarExpr(E->getArg(0)));
12901    Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddlv");
12902    return Builder.CreateTrunc(Ops[0], Int16Ty);
12903  }
12904  case NEON::BI__builtin_neon_vaddlvq_s16: {
12905    Int = Intrinsic::aarch64_neon_saddlv;
12906    Ty = Int32Ty;
12907    VTy = llvm::FixedVectorType::get(Int16Ty, 8);
12908    llvm::Type *Tys[2] = { Ty, VTy };
12909    Ops.push_back(EmitScalarExpr(E->getArg(0)));
12910    return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddlv");
12911  }
12912  case NEON::BI__builtin_neon_vsri_n_v:
12913  case NEON::BI__builtin_neon_vsriq_n_v: {
12914    Int = Intrinsic::aarch64_neon_vsri;
12915    llvm::Function *Intrin = CGM.getIntrinsic(Int, Ty);
12916    return EmitNeonCall(Intrin, Ops, "vsri_n");
12917  }
12918  case NEON::BI__builtin_neon_vsli_n_v:
12919  case NEON::BI__builtin_neon_vsliq_n_v: {
12920    Int = Intrinsic::aarch64_neon_vsli;
12921    llvm::Function *Intrin = CGM.getIntrinsic(Int, Ty);
12922    return EmitNeonCall(Intrin, Ops, "vsli_n");
12923  }
12924  case NEON::BI__builtin_neon_vsra_n_v:
12925  case NEON::BI__builtin_neon_vsraq_n_v:
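         // Shift the source right by the immediate (logical or arithmetic
         // depending on signedness), then accumulate into the first operand.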
12926    Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
12927    Ops[1] = EmitNeonRShiftImm(Ops[1], Ops[2], Ty, usgn, "vsra_n");
12928    return Builder.CreateAdd(Ops[0], Ops[1]);
12929  case NEON::BI__builtin_neon_vrsra_n_v:
12930  case NEON::BI__builtin_neon_vrsraq_n_v: {
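         // Emit the rounding shift right via urshl/srshl with a negated shift
         // amount, then add the result to the accumulator.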
12931    Int = usgn ? Intrinsic::aarch64_neon_urshl : Intrinsic::aarch64_neon_srshl;
12932    SmallVector<llvm::Value*,2> TmpOps;
12933    TmpOps.push_back(Ops[1]);
12934    TmpOps.push_back(Ops[2]);
12935    Function* F = CGM.getIntrinsic(Int, Ty);
12936    llvm::Value *tmp = EmitNeonCall(F, TmpOps, "vrshr_n", 1, true);
12937    Ops[0] = Builder.CreateBitCast(Ops[0], VTy);
12938    return Builder.CreateAdd(Ops[0], tmp);
12939  }
12940  case NEON::BI__builtin_neon_vld1_v:
12941  case NEON::BI__builtin_neon_vld1q_v: {
12942    return Builder.CreateAlignedLoad(VTy, Ops[0], PtrOp0.getAlignment());
12943  }
12944  case NEON::BI__builtin_neon_vst1_v:
12945  case NEON::BI__builtin_neon_vst1q_v:
12946    Ops[1] = Builder.CreateBitCast(Ops[1], VTy);
12947    return Builder.CreateAlignedStore(Ops[1], Ops[0], PtrOp0.getAlignment());
12948  case NEON::BI__builtin_neon_vld1_lane_v:
12949  case NEON::BI__builtin_neon_vld1q_lane_v: {
12950    Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
12951    Ops[0] = Builder.CreateAlignedLoad(VTy->getElementType(), Ops[0],
12952                                       PtrOp0.getAlignment());
12953    return Builder.CreateInsertElement(Ops[1], Ops[0], Ops[2], "vld1_lane");
12954  }
12955  case NEON::BI__builtin_neon_vldap1_lane_s64:
12956  case NEON::BI__builtin_neon_vldap1q_lane_s64: {
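         // As vld1_lane, but the scalar element load carries acquire ordering.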
12957    Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
12958    llvm::LoadInst *LI = Builder.CreateAlignedLoad(
12959        VTy->getElementType(), Ops[0], PtrOp0.getAlignment());
12960    LI->setAtomic(llvm::AtomicOrdering::Acquire);
12961    Ops[0] = LI;
12962    return Builder.CreateInsertElement(Ops[1], Ops[0], Ops[2], "vldap1_lane");
12963  }
12964  case NEON::BI__builtin_neon_vld1_dup_v:
12965  case NEON::BI__builtin_neon_vld1q_dup_v: {
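         // Load a single element, insert it into lane 0 of a poison vector,
         // then splat lane 0 across all elements.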
12966    Value *V = PoisonValue::get(Ty);
12967    Ops[0] = Builder.CreateAlignedLoad(VTy->getElementType(), Ops[0],
12968                                       PtrOp0.getAlignment());
12969    llvm::Constant *CI = ConstantInt::get(Int32Ty, 0);
12970    Ops[0] = Builder.CreateInsertElement(V, Ops[0], CI);
12971    return EmitNeonSplat(Ops[0], CI);
12972  }
12973  case NEON::BI__builtin_neon_vst1_lane_v:
12974  case NEON::BI__builtin_neon_vst1q_lane_v:
12975    Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
12976    Ops[1] = Builder.CreateExtractElement(Ops[1], Ops[2]);
12977    return Builder.CreateAlignedStore(Ops[1], Ops[0], PtrOp0.getAlignment());
12978  case NEON::BI__builtin_neon_vstl1_lane_s64:
12979  case NEON::BI__builtin_neon_vstl1q_lane_s64: {
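         // As vst1_lane, but the scalar element store carries release ordering.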
12980    Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
12981    Ops[1] = Builder.CreateExtractElement(Ops[1], Ops[2]);
12982    llvm::StoreInst *SI =
12983        Builder.CreateAlignedStore(Ops[1], Ops[0], PtrOp0.getAlignment());
12984    SI->setAtomic(llvm::AtomicOrdering::Release);
12985    return SI;
12986  }
12987  case NEON::BI__builtin_neon_vld2_v:
12988  case NEON::BI__builtin_neon_vld2q_v: {
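         // ld2 returns a struct of two vectors, which is stored through the
         // sret pointer passed as the first operand.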
12989    llvm::Type *Tys[2] = {VTy, UnqualPtrTy};
12990    Function *F = CGM.getIntrinsic(Intrinsic::aarch64_neon_ld2, Tys);
12991    Ops[1] = Builder.CreateCall(F, Ops[1], "vld2");
12992    return Builder.CreateDefaultAlignedStore(Ops[1], Ops[0]);
12993  }
12994  case NEON::BI__builtin_neon_vld3_v:
12995  case NEON::BI__builtin_neon_vld3q_v: {
12996    llvm::Type *Tys[2] = {VTy, UnqualPtrTy};
12997    Function *F = CGM.getIntrinsic(Intrinsic::aarch64_neon_ld3, Tys);
12998    Ops[1] = Builder.CreateCall(F, Ops[1], "vld3");
12999    return Builder.CreateDefaultAlignedStore(Ops[1], Ops[0]);
13000  }
13001  case NEON::BI__builtin_neon_vld4_v:
13002  case NEON::BI__builtin_neon_vld4q_v: {
13003    llvm::Type *Tys[2] = {VTy, UnqualPtrTy};
13004    Function *F = CGM.getIntrinsic(Intrinsic::aarch64_neon_ld4, Tys);
13005    Ops[1] = Builder.CreateCall(F, Ops[1], "vld4");
13006    return Builder.CreateDefaultAlignedStore(Ops[1], Ops[0]);
13007  }
13008  case NEON::BI__builtin_neon_vld2_dup_v:
13009  case NEON::BI__builtin_neon_vld2q_dup_v: {
13010    llvm::Type *Tys[2] = {VTy, UnqualPtrTy};
13011    Function *F = CGM.getIntrinsic(Intrinsic::aarch64_neon_ld2r, Tys);
13012    Ops[1] = Builder.CreateCall(F, Ops[1], "vld2");
13013    return Builder.CreateDefaultAlignedStore(Ops[1], Ops[0]);
13014  }
13015  case NEON::BI__builtin_neon_vld3_dup_v:
13016  case NEON::BI__builtin_neon_vld3q_dup_v: {
13017    llvm::Type *Tys[2] = {VTy, UnqualPtrTy};
13018    Function *F = CGM.getIntrinsic(Intrinsic::aarch64_neon_ld3r, Tys);
13019    Ops[1] = Builder.CreateCall(F, Ops[1], "vld3");
13020    return Builder.CreateDefaultAlignedStore(Ops[1], Ops[0]);
13021  }
13022  case NEON::BI__builtin_neon_vld4_dup_v:
13023  case NEON::BI__builtin_neon_vld4q_dup_v: {
13024    llvm::Type *Tys[2] = {VTy, UnqualPtrTy};
13025    Function *F = CGM.getIntrinsic(Intrinsic::aarch64_neon_ld4r, Tys);
13026    Ops[1] = Builder.CreateCall(F, Ops[1], "vld4");
13027    return Builder.CreateDefaultAlignedStore(Ops[1], Ops[0]);
13028  }
13029  case NEON::BI__builtin_neon_vld2_lane_v:
13030  case NEON::BI__builtin_neon_vld2q_lane_v: {
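         // Rotate the source pointer to the end so the operands match the
         // ld2lane signature (vectors, lane, pointer); the returned struct is
         // stored through the sret pointer in Ops[0].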
13031    llvm::Type *Tys[2] = { VTy, Ops[1]->getType() };
13032    Function *F = CGM.getIntrinsic(Intrinsic::aarch64_neon_ld2lane, Tys);
13033    std::rotate(Ops.begin() + 1, Ops.begin() + 2, Ops.end());
13034    Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
13035    Ops[2] = Builder.CreateBitCast(Ops[2], Ty);
13036    Ops[3] = Builder.CreateZExt(Ops[3], Int64Ty);
13037    Ops[1] = Builder.CreateCall(F, ArrayRef(Ops).slice(1), "vld2_lane");
13038    return Builder.CreateDefaultAlignedStore(Ops[1], Ops[0]);
13039  }
13040  case NEON::BI__builtin_neon_vld3_lane_v:
13041  case NEON::BI__builtin_neon_vld3q_lane_v: {
13042    llvm::Type *Tys[2] = { VTy, Ops[1]->getType() };
13043    Function *F = CGM.getIntrinsic(Intrinsic::aarch64_neon_ld3lane, Tys);
13044    std::rotate(Ops.begin() + 1, Ops.begin() + 2, Ops.end());
13045    Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
13046    Ops[2] = Builder.CreateBitCast(Ops[2], Ty);
13047    Ops[3] = Builder.CreateBitCast(Ops[3], Ty);
13048    Ops[4] = Builder.CreateZExt(Ops[4], Int64Ty);
13049    Ops[1] = Builder.CreateCall(F, ArrayRef(Ops).slice(1), "vld3_lane");
13050    return Builder.CreateDefaultAlignedStore(Ops[1], Ops[0]);
13051  }
13052  case NEON::BI__builtin_neon_vld4_lane_v:
13053  case NEON::BI__builtin_neon_vld4q_lane_v: {
13054    llvm::Type *Tys[2] = { VTy, Ops[1]->getType() };
13055    Function *F = CGM.getIntrinsic(Intrinsic::aarch64_neon_ld4lane, Tys);
13056    std::rotate(Ops.begin() + 1, Ops.begin() + 2, Ops.end());
13057    Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
13058    Ops[2] = Builder.CreateBitCast(Ops[2], Ty);
13059    Ops[3] = Builder.CreateBitCast(Ops[3], Ty);
13060    Ops[4] = Builder.CreateBitCast(Ops[4], Ty);
13061    Ops[5] = Builder.CreateZExt(Ops[5], Int64Ty);
13062    Ops[1] = Builder.CreateCall(F, ArrayRef(Ops).slice(1), "vld4_lane");
13063    return Builder.CreateDefaultAlignedStore(Ops[1], Ops[0]);
13064  }
13065  case NEON::BI__builtin_neon_vst2_v:
13066  case NEON::BI__builtin_neon_vst2q_v: {
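         // Rotate the destination pointer to the end to match the st2
         // intrinsic signature (vectors..., pointer).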
13067    std::rotate(Ops.begin(), Ops.begin() + 1, Ops.end());
13068    llvm::Type *Tys[2] = { VTy, Ops[2]->getType() };
13069    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_st2, Tys),
13070                        Ops, "");
13071  }
13072  case NEON::BI__builtin_neon_vst2_lane_v:
13073  case NEON::BI__builtin_neon_vst2q_lane_v: {
13074    std::rotate(Ops.begin(), Ops.begin() + 1, Ops.end());
13075    Ops[2] = Builder.CreateZExt(Ops[2], Int64Ty);
13076    llvm::Type *Tys[2] = { VTy, Ops[3]->getType() };
13077    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_st2lane, Tys),
13078                        Ops, "");
13079  }
13080  case NEON::BI__builtin_neon_vst3_v:
13081  case NEON::BI__builtin_neon_vst3q_v: {
13082    std::rotate(Ops.begin(), Ops.begin() + 1, Ops.end());
13083    llvm::Type *Tys[2] = { VTy, Ops[3]->getType() };
13084    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_st3, Tys),
13085                        Ops, "");
13086  }
13087  case NEON::BI__builtin_neon_vst3_lane_v:
13088  case NEON::BI__builtin_neon_vst3q_lane_v: {
13089    std::rotate(Ops.begin(), Ops.begin() + 1, Ops.end());
13090    Ops[3] = Builder.CreateZExt(Ops[3], Int64Ty);
13091    llvm::Type *Tys[2] = { VTy, Ops[4]->getType() };
13092    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_st3lane, Tys),
13093                        Ops, "");
13094  }
13095  case NEON::BI__builtin_neon_vst4_v:
13096  case NEON::BI__builtin_neon_vst4q_v: {
13097    std::rotate(Ops.begin(), Ops.begin() + 1, Ops.end());
13098    llvm::Type *Tys[2] = { VTy, Ops[4]->getType() };
13099    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_st4, Tys),
13100                        Ops, "");
13101  }
13102  case NEON::BI__builtin_neon_vst4_lane_v:
13103  case NEON::BI__builtin_neon_vst4q_lane_v: {
13104    std::rotate(Ops.begin(), Ops.begin() + 1, Ops.end());
13105    Ops[4] = Builder.CreateZExt(Ops[4], Int64Ty);
13106    llvm::Type *Tys[2] = { VTy, Ops[5]->getType() };
13107    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_st4lane, Tys),
13108                        Ops, "");
13109  }
13110  case NEON::BI__builtin_neon_vtrn_v:
13111  case NEON::BI__builtin_neon_vtrnq_v: {
13112    Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
13113    Ops[2] = Builder.CreateBitCast(Ops[2], Ty);
13114    Value *SV = nullptr;
13115
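         // Emit the two transpose halves as shuffles (vi selects the even or
         // odd lanes) and store each into consecutive vectors of the sret
         // destination.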
13116    for (unsigned vi = 0; vi != 2; ++vi) {
13117      SmallVector<int, 16> Indices;
13118      for (unsigned i = 0, e = VTy->getNumElements(); i != e; i += 2) {
13119        Indices.push_back(i+vi);
13120        Indices.push_back(i+e+vi);
13121      }
13122      Value *Addr = Builder.CreateConstInBoundsGEP1_32(Ty, Ops[0], vi);
13123      SV = Builder.CreateShuffleVector(Ops[1], Ops[2], Indices, "vtrn");
13124      SV = Builder.CreateDefaultAlignedStore(SV, Addr);
13125    }
13126    return SV;
13127  }
13128  case NEON::BI__builtin_neon_vuzp_v:
13129  case NEON::BI__builtin_neon_vuzpq_v: {
13130    Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
13131    Ops[2] = Builder.CreateBitCast(Ops[2], Ty);
13132    Value *SV = nullptr;
13133
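         // Emit the two unzip halves: vi == 0 gathers the even-indexed
         // elements, vi == 1 the odd-indexed ones; each result is stored into
         // consecutive vectors of the sret destination.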
13134    for (unsigned vi = 0; vi != 2; ++vi) {
13135      SmallVector<int, 16> Indices;
13136      for (unsigned i = 0, e = VTy->getNumElements(); i != e; ++i)
13137        Indices.push_back(2*i+vi);
13138
13139      Value *Addr = Builder.CreateConstInBoundsGEP1_32(Ty, Ops[0], vi);
13140      SV = Builder.CreateShuffleVector(Ops[1], Ops[2], Indices, "vuzp");
13141      SV = Builder.CreateDefaultAlignedStore(SV, Addr);
13142    }
13143    return SV;
13144  }
13145  case NEON::BI__builtin_neon_vzip_v:
13146  case NEON::BI__builtin_neon_vzipq_v: {
13147    Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
13148    Ops[2] = Builder.CreateBitCast(Ops[2], Ty);
13149    Value *SV = nullptr;
13150
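         // Emit the two zip halves: vi == 0 interleaves the low halves of the
         // inputs, vi == 1 the high halves; each result is stored into
         // consecutive vectors of the sret destination.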
13151    for (unsigned vi = 0; vi != 2; ++vi) {
13152      SmallVector<int, 16> Indices;
13153      for (unsigned i = 0, e = VTy->getNumElements(); i != e; i += 2) {
13154        Indices.push_back((i + vi*e) >> 1);
13155        Indices.push_back(((i + vi*e) >> 1)+e);
13156      }
13157      Value *Addr = Builder.CreateConstInBoundsGEP1_32(Ty, Ops[0], vi);
13158      SV = Builder.CreateShuffleVector(Ops[1], Ops[2], Indices, "vzip");
13159      SV = Builder.CreateDefaultAlignedStore(SV, Addr);
13160    }
13161    return SV;
13162  }
13163  case NEON::BI__builtin_neon_vqtbl1q_v: {
13164    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_tbl1, Ty),
13165                        Ops, "vtbl1");
13166  }
13167  case NEON::BI__builtin_neon_vqtbl2q_v: {
13168    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_tbl2, Ty),
13169                        Ops, "vtbl2");
13170  }
13171  case NEON::BI__builtin_neon_vqtbl3q_v: {
13172    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_tbl3, Ty),
13173                        Ops, "vtbl3");
13174  }
13175  case NEON::BI__builtin_neon_vqtbl4q_v: {
13176    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_tbl4, Ty),
13177                        Ops, "vtbl4");
13178  }
13179  case NEON::BI__builtin_neon_vqtbx1q_v: {
13180    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_tbx1, Ty),
13181                        Ops, "vtbx1");
13182  }
13183  case NEON::BI__builtin_neon_vqtbx2q_v: {
13184    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_tbx2, Ty),
13185                        Ops, "vtbx2");
13186  }
13187  case NEON::BI__builtin_neon_vqtbx3q_v: {
13188    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_tbx3, Ty),
13189                        Ops, "vtbx3");
13190  }
13191  case NEON::BI__builtin_neon_vqtbx4q_v: {
13192    return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_tbx4, Ty),
13193                        Ops, "vtbx4");
13194  }
13195  case NEON::BI__builtin_neon_vsqadd_v:
13196  case NEON::BI__builtin_neon_vsqaddq_v: {
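         // vsqadd is the unsigned saturating accumulate of a signed value
         // (usqadd).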
13197    Int = Intrinsic::aarch64_neon_usqadd;
13198    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vsqadd");
13199  }
13200  case NEON::BI__builtin_neon_vuqadd_v:
13201  case NEON::BI__builtin_neon_vuqaddq_v: {
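         // vuqadd is the signed saturating accumulate of an unsigned value
         // (suqadd).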
13202    Int = Intrinsic::aarch64_neon_suqadd;
13203    return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vuqadd");
13204  }
13205  }
13206}
13207
13208Value *CodeGenFunction::EmitBPFBuiltinExpr(unsigned BuiltinID,
13209                                           const CallExpr *E) {
13210  assert((BuiltinID == BPF::BI__builtin_preserve_field_info ||
13211          BuiltinID == BPF::BI__builtin_btf_type_id ||
13212          BuiltinID == BPF::BI__builtin_preserve_type_info ||
13213          BuiltinID == BPF::BI__builtin_preserve_enum_value) &&
13214         "unexpected BPF builtin");
13215
13216  // A sequence number, injected into IR builtin functions, to prevent CSE
13217  // when the only difference between otherwise identical calls is their
13218  // debuginfo metadata.
13219  static uint32_t BuiltinSeqNum;
13220
13221  switch (BuiltinID) {
13222  default:
13223    llvm_unreachable("Unexpected BPF builtin");
13224  case BPF::BI__builtin_preserve_field_info: {
13225    const Expr *Arg = E->getArg(0);
13226    bool IsBitField = Arg->IgnoreParens()->getObjectKind() == OK_BitField;
13227
13228    if (!getDebugInfo()) {
13229      CGM.Error(E->getExprLoc(),
13230                "using __builtin_preserve_field_info() without -g");
13231      return IsBitField ? EmitLValue(Arg).getBitFieldPointer()
13232                        : EmitLValue(Arg).getPointer(*this);
13233    }
13234
13235    // Enable underlying preserve_*_access_index() generation.
13236    bool OldIsInPreservedAIRegion = IsInPreservedAIRegion;
13237    IsInPreservedAIRegion = true;
13238    Value *FieldAddr = IsBitField ? EmitLValue(Arg).getBitFieldPointer()
13239                                  : EmitLValue(Arg).getPointer(*this);
13240    IsInPreservedAIRegion = OldIsInPreservedAIRegion;
13241
13242    ConstantInt *C = cast<ConstantInt>(EmitScalarExpr(E->getArg(1)));
13243    Value *InfoKind = ConstantInt::get(Int64Ty, C->getSExtValue());
13244
13245    // Build the IR for the preserve_field_info intrinsic.
13246    llvm::Function *FnGetFieldInfo = llvm::Intrinsic::getDeclaration(
13247        &CGM.getModule(), llvm::Intrinsic::bpf_preserve_field_info,
13248        {FieldAddr->getType()});
13249    return Builder.CreateCall(FnGetFieldInfo, {FieldAddr, InfoKind});
13250  }
13251  case BPF::BI__builtin_btf_type_id:
13252  case BPF::BI__builtin_preserve_type_info: {
13253    if (!getDebugInfo()) {
13254      CGM.Error(E->getExprLoc(), "using builtin function without -g");
13255      return nullptr;
13256    }
13257
13258    const Expr *Arg0 = E->getArg(0);
13259    llvm::DIType *DbgInfo = getDebugInfo()->getOrCreateStandaloneType(
13260        Arg0->getType(), Arg0->getExprLoc());
13261
13262    ConstantInt *Flag = cast<ConstantInt>(EmitScalarExpr(E->getArg(1)));
13263    Value *FlagValue = ConstantInt::get(Int64Ty, Flag->getSExtValue());
13264    Value *SeqNumVal = ConstantInt::get(Int32Ty, BuiltinSeqNum++);
13265
13266    llvm::Function *FnDecl;
13267    if (BuiltinID == BPF::BI__builtin_btf_type_id)
13268      FnDecl = llvm::Intrinsic::getDeclaration(
13269          &CGM.getModule(), llvm::Intrinsic::bpf_btf_type_id, {});
13270    else
13271      FnDecl = llvm::Intrinsic::getDeclaration(
13272          &CGM.getModule(), llvm::Intrinsic::bpf_preserve_type_info, {});
13273    CallInst *Fn = Builder.CreateCall(FnDecl, {SeqNumVal, FlagValue});
13274    Fn->setMetadata(LLVMContext::MD_preserve_access_index, DbgInfo);
13275    return Fn;
13276  }
13277  case BPF::BI__builtin_preserve_enum_value: {
13278    if (!getDebugInfo()) {
13279      CGM.Error(E->getExprLoc(), "using builtin function without -g");
13280      return nullptr;
13281    }
13282
13283    const Expr *Arg0 = E->getArg(0);
13284    llvm::DIType *DbgInfo = getDebugInfo()->getOrCreateStandaloneType(
13285        Arg0->getType(), Arg0->getExprLoc());
13286
13287    // Find the enumerator referenced by the argument.
13288    const auto *UO = cast<UnaryOperator>(Arg0->IgnoreParens());
13289    const auto *CE = cast<CStyleCastExpr>(UO->getSubExpr());
13290    const auto *DR = cast<DeclRefExpr>(CE->getSubExpr());
13291    const auto *Enumerator = cast<EnumConstantDecl>(DR->getDecl());
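    // A hedged note on the expected shape: the argument is assumed to be a
    // unary operator (typically a dereference) applied to a C-style cast of
    // an enum constant, e.g. `*(CastType)EnumeratorName`; the casts above
    // unwrap exactly that form and would assert on anything else in an
    // asserts build.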
13292
13293    auto InitVal = Enumerator->getInitVal();
13294    std::string InitValStr;
13295    if (InitVal.isNegative() || InitVal > uint64_t(INT64_MAX))
13296      InitValStr = std::to_string(InitVal.getSExtValue());
13297    else
13298      InitValStr = std::to_string(InitVal.getZExtValue());
13299    std::string EnumStr = Enumerator->getNameAsString() + ":" + InitValStr;
13300    Value *EnumStrVal = Builder.CreateGlobalStringPtr(EnumStr);
13301
13302    ConstantInt *Flag = cast<ConstantInt>(EmitScalarExpr(E->getArg(1)));
13303    Value *FlagValue = ConstantInt::get(Int64Ty, Flag->getSExtValue());
13304    Value *SeqNumVal = ConstantInt::get(Int32Ty, BuiltinSeqNum++);
13305
13306    llvm::Function *IntrinsicFn = llvm::Intrinsic::getDeclaration(
13307        &CGM.getModule(), llvm::Intrinsic::bpf_preserve_enum_value, {});
13308    CallInst *Fn =
13309        Builder.CreateCall(IntrinsicFn, {SeqNumVal, EnumStrVal, FlagValue});
13310    Fn->setMetadata(LLVMContext::MD_preserve_access_index, DbgInfo);
13311    return Fn;
13312  }
13313  }
13314}
13315
13316llvm::Value *CodeGenFunction::
13317BuildVector(ArrayRef<llvm::Value*> Ops) {
13318  assert((Ops.size() & (Ops.size() - 1)) == 0 &&
13319         "Not a power-of-two sized vector!");
13320  bool AllConstants = true;
13321  for (unsigned i = 0, e = Ops.size(); i != e && AllConstants; ++i)
13322    AllConstants &= isa<Constant>(Ops[i]);
13323
13324  // If this is a constant vector, create a ConstantVector.
13325  if (AllConstants) {
13326    SmallVector<llvm::Constant*, 16> CstOps;
13327    for (unsigned i = 0, e = Ops.size(); i != e; ++i)
13328      CstOps.push_back(cast<Constant>(Ops[i]));
13329    return llvm::ConstantVector::get(CstOps);
13330  }
13331
13332  // Otherwise, insertelement the values to build the vector.
13333  Value *Result = llvm::PoisonValue::get(
13334      llvm::FixedVectorType::get(Ops[0]->getType(), Ops.size()));
13335
13336  for (unsigned i = 0, e = Ops.size(); i != e; ++i)
13337    Result = Builder.CreateInsertElement(Result, Ops[i], Builder.getInt64(i));
13338
13339  return Result;
13340}
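// Illustrative sketch (not emitted verbatim): for two non-constant i32 values
// BuildVector produces an insertelement chain over a poison vector, roughly:
//   %v0 = insertelement <2 x i32> poison, i32 %a, i64 0
//   %v1 = insertelement <2 x i32> %v0, i32 %b, i64 1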
13341
13342// Convert the mask from an integer type to a vector of i1.
13343static Value *getMaskVecValue(CodeGenFunction &CGF, Value *Mask,
13344                              unsigned NumElts) {
13345
13346  auto *MaskTy = llvm::FixedVectorType::get(
13347      CGF.Builder.getInt1Ty(),
13348      cast<IntegerType>(Mask->getType())->getBitWidth());
13349  Value *MaskVec = CGF.Builder.CreateBitCast(Mask, MaskTy);
13350
13351  // If we have fewer than 8 elements, then the starting mask was an i8 and
13352  // we need to extract down to the right number of elements.
13353  if (NumElts < 8) {
13354    int Indices[4];
13355    for (unsigned i = 0; i != NumElts; ++i)
13356      Indices[i] = i;
13357    MaskVec = CGF.Builder.CreateShuffleVector(
13358        MaskVec, MaskVec, ArrayRef(Indices, NumElts), "extract");
13359  }
13360  return MaskVec;
13361}
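// Illustrative sketch of getMaskVecValue (SSA names are assumptions): an i8
// mask narrowed to 4 elements becomes roughly
//   %maskvec = bitcast i8 %mask to <8 x i1>
//   %extract = shufflevector <8 x i1> %maskvec, <8 x i1> %maskvec,
//                            <4 x i32> <i32 0, i32 1, i32 2, i32 3>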
13362
13363static Value *EmitX86MaskedStore(CodeGenFunction &CGF, ArrayRef<Value *> Ops,
13364                                 Align Alignment) {
13365  Value *Ptr = Ops[0];
13366
13367  Value *MaskVec = getMaskVecValue(
13368      CGF, Ops[2],
13369      cast<llvm::FixedVectorType>(Ops[1]->getType())->getNumElements());
13370
13371  return CGF.Builder.CreateMaskedStore(Ops[1], Ptr, Alignment, MaskVec);
13372}
13373
13374static Value *EmitX86MaskedLoad(CodeGenFunction &CGF, ArrayRef<Value *> Ops,
13375                                Align Alignment) {
13376  llvm::Type *Ty = Ops[1]->getType();
13377  Value *Ptr = Ops[0];
13378
13379  Value *MaskVec = getMaskVecValue(
13380      CGF, Ops[2], cast<llvm::FixedVectorType>(Ty)->getNumElements());
13381
13382  return CGF.Builder.CreateMaskedLoad(Ty, Ptr, Alignment, MaskVec, Ops[1]);
13383}
13384
13385static Value *EmitX86ExpandLoad(CodeGenFunction &CGF,
13386                                ArrayRef<Value *> Ops) {
13387  auto *ResultTy = cast<llvm::VectorType>(Ops[1]->getType());
13388  Value *Ptr = Ops[0];
13389
13390  Value *MaskVec = getMaskVecValue(
13391      CGF, Ops[2], cast<FixedVectorType>(ResultTy)->getNumElements());
13392
13393  llvm::Function *F = CGF.CGM.getIntrinsic(Intrinsic::masked_expandload,
13394                                           ResultTy);
13395  return CGF.Builder.CreateCall(F, { Ptr, MaskVec, Ops[1] });
13396}
13397
13398static Value *EmitX86CompressExpand(CodeGenFunction &CGF,
13399                                    ArrayRef<Value *> Ops,
13400                                    bool IsCompress) {
13401  auto *ResultTy = cast<llvm::FixedVectorType>(Ops[1]->getType());
13402
13403  Value *MaskVec = getMaskVecValue(CGF, Ops[2], ResultTy->getNumElements());
13404
13405  Intrinsic::ID IID = IsCompress ? Intrinsic::x86_avx512_mask_compress
13406                                 : Intrinsic::x86_avx512_mask_expand;
13407  llvm::Function *F = CGF.CGM.getIntrinsic(IID, ResultTy);
13408  return CGF.Builder.CreateCall(F, { Ops[0], Ops[1], MaskVec });
13409}
13410
13411static Value *EmitX86CompressStore(CodeGenFunction &CGF,
13412                                   ArrayRef<Value *> Ops) {
13413  auto *ResultTy = cast<llvm::FixedVectorType>(Ops[1]->getType());
13414  Value *Ptr = Ops[0];
13415
13416  Value *MaskVec = getMaskVecValue(CGF, Ops[2], ResultTy->getNumElements());
13417
13418  llvm::Function *F = CGF.CGM.getIntrinsic(Intrinsic::masked_compressstore,
13419                                           ResultTy);
13420  return CGF.Builder.CreateCall(F, { Ops[1], Ptr, MaskVec });
13421}
13422
13423static Value *EmitX86MaskLogic(CodeGenFunction &CGF, Instruction::BinaryOps Opc,
13424                              ArrayRef<Value *> Ops,
13425                              bool InvertLHS = false) {
13426  unsigned NumElts = Ops[0]->getType()->getIntegerBitWidth();
13427  Value *LHS = getMaskVecValue(CGF, Ops[0], NumElts);
13428  Value *RHS = getMaskVecValue(CGF, Ops[1], NumElts);
13429
13430  if (InvertLHS)
13431    LHS = CGF.Builder.CreateNot(LHS);
13432
13433  return CGF.Builder.CreateBitCast(CGF.Builder.CreateBinOp(Opc, LHS, RHS),
13434                                   Ops[0]->getType());
13435}
13436
13437static Value *EmitX86FunnelShift(CodeGenFunction &CGF, Value *Op0, Value *Op1,
13438                                 Value *Amt, bool IsRight) {
13439  llvm::Type *Ty = Op0->getType();
13440
13441  // The amount may be a scalar immediate, in which case we create a splat vector.
13442  // Funnel shift amounts are taken modulo the element width, and the types are
13443  // all power-of-2 sized, so we only care about the lowest log2 bits anyway.
13444  if (Amt->getType() != Ty) {
13445    unsigned NumElts = cast<llvm::FixedVectorType>(Ty)->getNumElements();
13446    Amt = CGF.Builder.CreateIntCast(Amt, Ty->getScalarType(), false);
13447    Amt = CGF.Builder.CreateVectorSplat(NumElts, Amt);
13448  }
13449
13450  unsigned IID = IsRight ? Intrinsic::fshr : Intrinsic::fshl;
13451  Function *F = CGF.CGM.getIntrinsic(IID, Ty);
13452  return CGF.Builder.CreateCall(F, {Op0, Op1, Amt});
13453}
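// Illustrative sketch for a scalar shift amount with <4 x i32> operands (SSA
// names are assumptions): the amount is cast and splatted before the call,
// roughly
//   %amt = <4 x i32> splat of the zero-extended/truncated scalar amount
//   %res = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a, <4 x i32> %b,
//                                          <4 x i32> %amt)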
13454
13455static Value *EmitX86vpcom(CodeGenFunction &CGF, ArrayRef<Value *> Ops,
13456                           bool IsSigned) {
13457  Value *Op0 = Ops[0];
13458  Value *Op1 = Ops[1];
13459  llvm::Type *Ty = Op0->getType();
13460  uint64_t Imm = cast<llvm::ConstantInt>(Ops[2])->getZExtValue() & 0x7;
13461
13462  CmpInst::Predicate Pred;
13463  switch (Imm) {
13464  case 0x0:
13465    Pred = IsSigned ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT;
13466    break;
13467  case 0x1:
13468    Pred = IsSigned ? ICmpInst::ICMP_SLE : ICmpInst::ICMP_ULE;
13469    break;
13470  case 0x2:
13471    Pred = IsSigned ? ICmpInst::ICMP_SGT : ICmpInst::ICMP_UGT;
13472    break;
13473  case 0x3:
13474    Pred = IsSigned ? ICmpInst::ICMP_SGE : ICmpInst::ICMP_UGE;
13475    break;
13476  case 0x4:
13477    Pred = ICmpInst::ICMP_EQ;
13478    break;
13479  case 0x5:
13480    Pred = ICmpInst::ICMP_NE;
13481    break;
13482  case 0x6:
13483    return llvm::Constant::getNullValue(Ty); // FALSE
13484  case 0x7:
13485    return llvm::Constant::getAllOnesValue(Ty); // TRUE
13486  default:
13487    llvm_unreachable("Unexpected XOP vpcom/vpcomu predicate");
13488  }
13489
13490  Value *Cmp = CGF.Builder.CreateICmp(Pred, Op0, Op1);
13491  Value *Res = CGF.Builder.CreateSExt(Cmp, Ty);
13492  return Res;
13493}
13494
13495static Value *EmitX86Select(CodeGenFunction &CGF,
13496                            Value *Mask, Value *Op0, Value *Op1) {
13497
13498  // If the mask is all ones, just return the first argument.
13499  if (const auto *C = dyn_cast<Constant>(Mask))
13500    if (C->isAllOnesValue())
13501      return Op0;
13502
13503  Mask = getMaskVecValue(
13504      CGF, Mask, cast<llvm::FixedVectorType>(Op0->getType())->getNumElements());
13505
13506  return CGF.Builder.CreateSelect(Mask, Op0, Op1);
13507}
13508
13509static Value *EmitX86ScalarSelect(CodeGenFunction &CGF,
13510                                  Value *Mask, Value *Op0, Value *Op1) {
13511  // If the mask is all ones, just return the first argument.
13512  if (const auto *C = dyn_cast<Constant>(Mask))
13513    if (C->isAllOnesValue())
13514      return Op0;
13515
13516  auto *MaskTy = llvm::FixedVectorType::get(
13517      CGF.Builder.getInt1Ty(), Mask->getType()->getIntegerBitWidth());
13518  Mask = CGF.Builder.CreateBitCast(Mask, MaskTy);
13519  Mask = CGF.Builder.CreateExtractElement(Mask, (uint64_t)0);
13520  return CGF.Builder.CreateSelect(Mask, Op0, Op1);
13521}
13522
13523static Value *EmitX86MaskedCompareResult(CodeGenFunction &CGF, Value *Cmp,
13524                                         unsigned NumElts, Value *MaskIn) {
13525  if (MaskIn) {
13526    const auto *C = dyn_cast<Constant>(MaskIn);
13527    if (!C || !C->isAllOnesValue())
13528      Cmp = CGF.Builder.CreateAnd(Cmp, getMaskVecValue(CGF, MaskIn, NumElts));
13529  }
13530
13531  if (NumElts < 8) {
13532    int Indices[8];
13533    for (unsigned i = 0; i != NumElts; ++i)
13534      Indices[i] = i;
13535    for (unsigned i = NumElts; i != 8; ++i)
13536      Indices[i] = i % NumElts + NumElts;
13537    Cmp = CGF.Builder.CreateShuffleVector(
13538        Cmp, llvm::Constant::getNullValue(Cmp->getType()), Indices);
13539  }
13540
13541  return CGF.Builder.CreateBitCast(Cmp,
13542                                   IntegerType::get(CGF.getLLVMContext(),
13543                                                    std::max(NumElts, 8U)));
13544}
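// Illustrative sketch (not emitted verbatim): for a 4-element compare, the i1
// results are widened to 8 lanes with zeros taken from a null vector before
// the final bitcast, roughly
//   %pad = shufflevector <4 x i1> %cmp, <4 x i1> zeroinitializer,
//                        <8 x i32> <i32 0, i32 1, i32 2, i32 3,
//                                   i32 4, i32 5, i32 6, i32 7>
//   %res = bitcast <8 x i1> %pad to i8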
13545
13546static Value *EmitX86MaskedCompare(CodeGenFunction &CGF, unsigned CC,
13547                                   bool Signed, ArrayRef<Value *> Ops) {
13548  assert((Ops.size() == 2 || Ops.size() == 4) &&
13549         "Unexpected number of arguments");
13550  unsigned NumElts =
13551      cast<llvm::FixedVectorType>(Ops[0]->getType())->getNumElements();
13552  Value *Cmp;
13553
13554  if (CC == 3) {
13555    Cmp = Constant::getNullValue(
13556        llvm::FixedVectorType::get(CGF.Builder.getInt1Ty(), NumElts));
13557  } else if (CC == 7) {
13558    Cmp = Constant::getAllOnesValue(
13559        llvm::FixedVectorType::get(CGF.Builder.getInt1Ty(), NumElts));
13560  } else {
13561    ICmpInst::Predicate Pred;
13562    switch (CC) {
13563    default: llvm_unreachable("Unknown condition code");
13564    case 0: Pred = ICmpInst::ICMP_EQ;  break;
13565    case 1: Pred = Signed ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT; break;
13566    case 2: Pred = Signed ? ICmpInst::ICMP_SLE : ICmpInst::ICMP_ULE; break;
13567    case 4: Pred = ICmpInst::ICMP_NE;  break;
13568    case 5: Pred = Signed ? ICmpInst::ICMP_SGE : ICmpInst::ICMP_UGE; break;
13569    case 6: Pred = Signed ? ICmpInst::ICMP_SGT : ICmpInst::ICMP_UGT; break;
13570    }
13571    Cmp = CGF.Builder.CreateICmp(Pred, Ops[0], Ops[1]);
13572  }
13573
13574  Value *MaskIn = nullptr;
13575  if (Ops.size() == 4)
13576    MaskIn = Ops[3];
13577
13578  return EmitX86MaskedCompareResult(CGF, Cmp, NumElts, MaskIn);
13579}
13580
13581static Value *EmitX86ConvertToMask(CodeGenFunction &CGF, Value *In) {
13582  Value *Zero = Constant::getNullValue(In->getType());
13583  return EmitX86MaskedCompare(CGF, 1, true, { In, Zero });
13584}
13585
13586static Value *EmitX86ConvertIntToFp(CodeGenFunction &CGF, const CallExpr *E,
13587                                    ArrayRef<Value *> Ops, bool IsSigned) {
13588  unsigned Rnd = cast<llvm::ConstantInt>(Ops[3])->getZExtValue();
13589  llvm::Type *Ty = Ops[1]->getType();
13590
13591  Value *Res;
13592  if (Rnd != 4) {
13593    Intrinsic::ID IID = IsSigned ? Intrinsic::x86_avx512_sitofp_round
13594                                 : Intrinsic::x86_avx512_uitofp_round;
13595    Function *F = CGF.CGM.getIntrinsic(IID, { Ty, Ops[0]->getType() });
13596    Res = CGF.Builder.CreateCall(F, { Ops[0], Ops[3] });
13597  } else {
13598    CodeGenFunction::CGFPOptionsRAII FPOptsRAII(CGF, E);
13599    Res = IsSigned ? CGF.Builder.CreateSIToFP(Ops[0], Ty)
13600                   : CGF.Builder.CreateUIToFP(Ops[0], Ty);
13601  }
13602
13603  return EmitX86Select(CGF, Ops[2], Res, Ops[1]);
13604}
13605
13606// Lowers X86 FMA intrinsics to IR.
13607static Value *EmitX86FMAExpr(CodeGenFunction &CGF, const CallExpr *E,
13608                             ArrayRef<Value *> Ops, unsigned BuiltinID,
13609                             bool IsAddSub) {
13610
13611  bool Subtract = false;
13612  Intrinsic::ID IID = Intrinsic::not_intrinsic;
13613  switch (BuiltinID) {
13614  default: break;
13615  case clang::X86::BI__builtin_ia32_vfmsubph512_mask3:
13616    Subtract = true;
13617    [[fallthrough]];
13618  case clang::X86::BI__builtin_ia32_vfmaddph512_mask:
13619  case clang::X86::BI__builtin_ia32_vfmaddph512_maskz:
13620  case clang::X86::BI__builtin_ia32_vfmaddph512_mask3:
13621    IID = llvm::Intrinsic::x86_avx512fp16_vfmadd_ph_512;
13622    break;
13623  case clang::X86::BI__builtin_ia32_vfmsubaddph512_mask3:
13624    Subtract = true;
13625    [[fallthrough]];
13626  case clang::X86::BI__builtin_ia32_vfmaddsubph512_mask:
13627  case clang::X86::BI__builtin_ia32_vfmaddsubph512_maskz:
13628  case clang::X86::BI__builtin_ia32_vfmaddsubph512_mask3:
13629    IID = llvm::Intrinsic::x86_avx512fp16_vfmaddsub_ph_512;
13630    break;
13631  case clang::X86::BI__builtin_ia32_vfmsubps512_mask3:
13632    Subtract = true;
13633    [[fallthrough]];
13634  case clang::X86::BI__builtin_ia32_vfmaddps512_mask:
13635  case clang::X86::BI__builtin_ia32_vfmaddps512_maskz:
13636  case clang::X86::BI__builtin_ia32_vfmaddps512_mask3:
13637    IID = llvm::Intrinsic::x86_avx512_vfmadd_ps_512; break;
13638  case clang::X86::BI__builtin_ia32_vfmsubpd512_mask3:
13639    Subtract = true;
13640    [[fallthrough]];
13641  case clang::X86::BI__builtin_ia32_vfmaddpd512_mask:
13642  case clang::X86::BI__builtin_ia32_vfmaddpd512_maskz:
13643  case clang::X86::BI__builtin_ia32_vfmaddpd512_mask3:
13644    IID = llvm::Intrinsic::x86_avx512_vfmadd_pd_512; break;
13645  case clang::X86::BI__builtin_ia32_vfmsubaddps512_mask3:
13646    Subtract = true;
13647    [[fallthrough]];
13648  case clang::X86::BI__builtin_ia32_vfmaddsubps512_mask:
13649  case clang::X86::BI__builtin_ia32_vfmaddsubps512_maskz:
13650  case clang::X86::BI__builtin_ia32_vfmaddsubps512_mask3:
13651    IID = llvm::Intrinsic::x86_avx512_vfmaddsub_ps_512;
13652    break;
13653  case clang::X86::BI__builtin_ia32_vfmsubaddpd512_mask3:
13654    Subtract = true;
13655    [[fallthrough]];
13656  case clang::X86::BI__builtin_ia32_vfmaddsubpd512_mask:
13657  case clang::X86::BI__builtin_ia32_vfmaddsubpd512_maskz:
13658  case clang::X86::BI__builtin_ia32_vfmaddsubpd512_mask3:
13659    IID = llvm::Intrinsic::x86_avx512_vfmaddsub_pd_512;
13660    break;
13661  }
13662
13663  Value *A = Ops[0];
13664  Value *B = Ops[1];
13665  Value *C = Ops[2];
13666
13667  if (Subtract)
13668    C = CGF.Builder.CreateFNeg(C);
13669
13670  Value *Res;
13671
13672  // Use the plain fma intrinsic only for _MM_FROUND_CUR_DIRECTION/4 (no rounding) non-addsub forms.
13673  if (IID != Intrinsic::not_intrinsic &&
13674      (cast<llvm::ConstantInt>(Ops.back())->getZExtValue() != (uint64_t)4 ||
13675       IsAddSub)) {
13676    Function *Intr = CGF.CGM.getIntrinsic(IID);
13677    Res = CGF.Builder.CreateCall(Intr, {A, B, C, Ops.back() });
13678  } else {
13679    llvm::Type *Ty = A->getType();
13680    Function *FMA;
13681    if (CGF.Builder.getIsFPConstrained()) {
13682      CodeGenFunction::CGFPOptionsRAII FPOptsRAII(CGF, E);
13683      FMA = CGF.CGM.getIntrinsic(Intrinsic::experimental_constrained_fma, Ty);
13684      Res = CGF.Builder.CreateConstrainedFPCall(FMA, {A, B, C});
13685    } else {
13686      FMA = CGF.CGM.getIntrinsic(Intrinsic::fma, Ty);
13687      Res = CGF.Builder.CreateCall(FMA, {A, B, C});
13688    }
13689  }
13690
13691  // Handle any required masking.
13692  Value *MaskFalseVal = nullptr;
13693  switch (BuiltinID) {
13694  case clang::X86::BI__builtin_ia32_vfmaddph512_mask:
13695  case clang::X86::BI__builtin_ia32_vfmaddps512_mask:
13696  case clang::X86::BI__builtin_ia32_vfmaddpd512_mask:
13697  case clang::X86::BI__builtin_ia32_vfmaddsubph512_mask:
13698  case clang::X86::BI__builtin_ia32_vfmaddsubps512_mask:
13699  case clang::X86::BI__builtin_ia32_vfmaddsubpd512_mask:
13700    MaskFalseVal = Ops[0];
13701    break;
13702  case clang::X86::BI__builtin_ia32_vfmaddph512_maskz:
13703  case clang::X86::BI__builtin_ia32_vfmaddps512_maskz:
13704  case clang::X86::BI__builtin_ia32_vfmaddpd512_maskz:
13705  case clang::X86::BI__builtin_ia32_vfmaddsubph512_maskz:
13706  case clang::X86::BI__builtin_ia32_vfmaddsubps512_maskz:
13707  case clang::X86::BI__builtin_ia32_vfmaddsubpd512_maskz:
13708    MaskFalseVal = Constant::getNullValue(Ops[0]->getType());
13709    break;
13710  case clang::X86::BI__builtin_ia32_vfmsubph512_mask3:
13711  case clang::X86::BI__builtin_ia32_vfmaddph512_mask3:
13712  case clang::X86::BI__builtin_ia32_vfmsubps512_mask3:
13713  case clang::X86::BI__builtin_ia32_vfmaddps512_mask3:
13714  case clang::X86::BI__builtin_ia32_vfmsubpd512_mask3:
13715  case clang::X86::BI__builtin_ia32_vfmaddpd512_mask3:
13716  case clang::X86::BI__builtin_ia32_vfmsubaddph512_mask3:
13717  case clang::X86::BI__builtin_ia32_vfmaddsubph512_mask3:
13718  case clang::X86::BI__builtin_ia32_vfmsubaddps512_mask3:
13719  case clang::X86::BI__builtin_ia32_vfmaddsubps512_mask3:
13720  case clang::X86::BI__builtin_ia32_vfmsubaddpd512_mask3:
13721  case clang::X86::BI__builtin_ia32_vfmaddsubpd512_mask3:
13722    MaskFalseVal = Ops[2];
13723    break;
13724  }
13725
13726  if (MaskFalseVal)
13727    return EmitX86Select(CGF, Ops[3], Res, MaskFalseVal);
13728
13729  return Res;
13730}
13731
13732static Value *EmitScalarFMAExpr(CodeGenFunction &CGF, const CallExpr *E,
13733                                MutableArrayRef<Value *> Ops, Value *Upper,
13734                                bool ZeroMask = false, unsigned PTIdx = 0,
13735                                bool NegAcc = false) {
13736  unsigned Rnd = 4;
13737  if (Ops.size() > 4)
13738    Rnd = cast<llvm::ConstantInt>(Ops[4])->getZExtValue();
13739
13740  if (NegAcc)
13741    Ops[2] = CGF.Builder.CreateFNeg(Ops[2]);
13742
13743  Ops[0] = CGF.Builder.CreateExtractElement(Ops[0], (uint64_t)0);
13744  Ops[1] = CGF.Builder.CreateExtractElement(Ops[1], (uint64_t)0);
13745  Ops[2] = CGF.Builder.CreateExtractElement(Ops[2], (uint64_t)0);
13746  Value *Res;
13747  if (Rnd != 4) {
13748    Intrinsic::ID IID;
13749
13750    switch (Ops[0]->getType()->getPrimitiveSizeInBits()) {
13751    case 16:
13752      IID = Intrinsic::x86_avx512fp16_vfmadd_f16;
13753      break;
13754    case 32:
13755      IID = Intrinsic::x86_avx512_vfmadd_f32;
13756      break;
13757    case 64:
13758      IID = Intrinsic::x86_avx512_vfmadd_f64;
13759      break;
13760    default:
13761      llvm_unreachable("Unexpected size");
13762    }
13763    Res = CGF.Builder.CreateCall(CGF.CGM.getIntrinsic(IID),
13764                                 {Ops[0], Ops[1], Ops[2], Ops[4]});
13765  } else if (CGF.Builder.getIsFPConstrained()) {
13766    CodeGenFunction::CGFPOptionsRAII FPOptsRAII(CGF, E);
13767    Function *FMA = CGF.CGM.getIntrinsic(
13768        Intrinsic::experimental_constrained_fma, Ops[0]->getType());
13769    Res = CGF.Builder.CreateConstrainedFPCall(FMA, Ops.slice(0, 3));
13770  } else {
13771    Function *FMA = CGF.CGM.getIntrinsic(Intrinsic::fma, Ops[0]->getType());
13772    Res = CGF.Builder.CreateCall(FMA, Ops.slice(0, 3));
13773  }
13774  // If we have more than 3 arguments, we need to do masking.
13775  if (Ops.size() > 3) {
13776    Value *PassThru = ZeroMask ? Constant::getNullValue(Res->getType())
13777                               : Ops[PTIdx];
13778
13779    // If we negated the accumulator and it is also the PassThru value, we need
13780    // to bypass the negate. Conveniently, Upper should be the same thing in
13781    // this case.
13782    if (NegAcc && PTIdx == 2)
13783      PassThru = CGF.Builder.CreateExtractElement(Upper, (uint64_t)0);
13784
13785    Res = EmitX86ScalarSelect(CGF, Ops[3], Res, PassThru);
13786  }
13787  return CGF.Builder.CreateInsertElement(Upper, Res, (uint64_t)0);
13788}
13789
13790static Value *EmitX86Muldq(CodeGenFunction &CGF, bool IsSigned,
13791                           ArrayRef<Value *> Ops) {
13792  llvm::Type *Ty = Ops[0]->getType();
13793  // Arguments have a vXi32 type so cast to vXi64.
13794  Ty = llvm::FixedVectorType::get(CGF.Int64Ty,
13795                                  Ty->getPrimitiveSizeInBits() / 64);
13796  Value *LHS = CGF.Builder.CreateBitCast(Ops[0], Ty);
13797  Value *RHS = CGF.Builder.CreateBitCast(Ops[1], Ty);
13798
13799  if (IsSigned) {
13800    // Shift left then arithmetic shift right.
13801    Constant *ShiftAmt = ConstantInt::get(Ty, 32);
13802    LHS = CGF.Builder.CreateShl(LHS, ShiftAmt);
13803    LHS = CGF.Builder.CreateAShr(LHS, ShiftAmt);
13804    RHS = CGF.Builder.CreateShl(RHS, ShiftAmt);
13805    RHS = CGF.Builder.CreateAShr(RHS, ShiftAmt);
13806  } else {
13807    // Clear the upper bits.
13808    Constant *Mask = ConstantInt::get(Ty, 0xffffffff);
13809    LHS = CGF.Builder.CreateAnd(LHS, Mask);
13810    RHS = CGF.Builder.CreateAnd(RHS, Mask);
13811  }
13812
13813  return CGF.Builder.CreateMul(LHS, RHS);
13814}
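// Illustrative sketch of the signed case on 128-bit operands (SSA names are
// assumptions): each vXi32 input is reinterpreted as vXi64 and its low 32 bits
// sign-extended in place before the multiply, roughly
//   %a    = bitcast <4 x i32> %x to <2 x i64>
//   %a.sh = shl <2 x i64> %a, <i64 32, i64 32>
//   %a.se = ashr <2 x i64> %a.sh, <i64 32, i64 32>
//   ; ...likewise for the other operand, then:
//   %res  = mul <2 x i64> %a.se, %b.se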
13815
13816// Emit a masked pternlog intrinsic. This only exists because the header has to
13817// use a macro and we aren't able to pass the input argument to both a pternlog
13818// builtin and a select builtin without evaluating it twice.
13819static Value *EmitX86Ternlog(CodeGenFunction &CGF, bool ZeroMask,
13820                             ArrayRef<Value *> Ops) {
13821  llvm::Type *Ty = Ops[0]->getType();
13822
13823  unsigned VecWidth = Ty->getPrimitiveSizeInBits();
13824  unsigned EltWidth = Ty->getScalarSizeInBits();
13825  Intrinsic::ID IID;
13826  if (VecWidth == 128 && EltWidth == 32)
13827    IID = Intrinsic::x86_avx512_pternlog_d_128;
13828  else if (VecWidth == 256 && EltWidth == 32)
13829    IID = Intrinsic::x86_avx512_pternlog_d_256;
13830  else if (VecWidth == 512 && EltWidth == 32)
13831    IID = Intrinsic::x86_avx512_pternlog_d_512;
13832  else if (VecWidth == 128 && EltWidth == 64)
13833    IID = Intrinsic::x86_avx512_pternlog_q_128;
13834  else if (VecWidth == 256 && EltWidth == 64)
13835    IID = Intrinsic::x86_avx512_pternlog_q_256;
13836  else if (VecWidth == 512 && EltWidth == 64)
13837    IID = Intrinsic::x86_avx512_pternlog_q_512;
13838  else
13839    llvm_unreachable("Unexpected intrinsic");
13840
13841  Value *Ternlog = CGF.Builder.CreateCall(CGF.CGM.getIntrinsic(IID),
13842                                          Ops.drop_back());
13843  Value *PassThru = ZeroMask ? ConstantAggregateZero::get(Ty) : Ops[0];
13844  return EmitX86Select(CGF, Ops[4], Ternlog, PassThru);
13845}
13846
13847static Value *EmitX86SExtMask(CodeGenFunction &CGF, Value *Op,
13848                              llvm::Type *DstTy) {
13849  unsigned NumberOfElements =
13850      cast<llvm::FixedVectorType>(DstTy)->getNumElements();
13851  Value *Mask = getMaskVecValue(CGF, Op, NumberOfElements);
13852  return CGF.Builder.CreateSExt(Mask, DstTy, "vpmovm2");
13853}
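// Illustrative sketch (names are assumptions): an i8 mask sign-extended into
// an <8 x i16> destination becomes roughly
//   %m = bitcast i8 %mask to <8 x i1>
//   %r = sext <8 x i1> %m to <8 x i16>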
13854
13855Value *CodeGenFunction::EmitX86CpuIs(const CallExpr *E) {
13856  const Expr *CPUExpr = E->getArg(0)->IgnoreParenCasts();
13857  StringRef CPUStr = cast<clang::StringLiteral>(CPUExpr)->getString();
13858  return EmitX86CpuIs(CPUStr);
13859}
13860
13861// Convert F16 halves to floats.
13862static Value *EmitX86CvtF16ToFloatExpr(CodeGenFunction &CGF,
13863                                       ArrayRef<Value *> Ops,
13864                                       llvm::Type *DstTy) {
13865  assert((Ops.size() == 1 || Ops.size() == 3 || Ops.size() == 4) &&
13866         "Unknown cvtph2ps intrinsic");
13867
13868  // If the SAE intrinsic doesn't use default rounding then we can't upgrade.
13869  if (Ops.size() == 4 && cast<llvm::ConstantInt>(Ops[3])->getZExtValue() != 4) {
13870    Function *F =
13871        CGF.CGM.getIntrinsic(Intrinsic::x86_avx512_mask_vcvtph2ps_512);
13872    return CGF.Builder.CreateCall(F, {Ops[0], Ops[1], Ops[2], Ops[3]});
13873  }
13874
13875  unsigned NumDstElts = cast<llvm::FixedVectorType>(DstTy)->getNumElements();
13876  Value *Src = Ops[0];
13877
13878  // Extract the subvector.
13879  if (NumDstElts !=
13880      cast<llvm::FixedVectorType>(Src->getType())->getNumElements()) {
13881    assert(NumDstElts == 4 && "Unexpected vector size");
13882    Src = CGF.Builder.CreateShuffleVector(Src, ArrayRef<int>{0, 1, 2, 3});
13883  }
13884
13885  // Bitcast from vXi16 to vXf16.
13886  auto *HalfTy = llvm::FixedVectorType::get(
13887      llvm::Type::getHalfTy(CGF.getLLVMContext()), NumDstElts);
13888  Src = CGF.Builder.CreateBitCast(Src, HalfTy);
13889
13890  // Perform the fp-extension.
13891  Value *Res = CGF.Builder.CreateFPExt(Src, DstTy, "cvtph2ps");
13892
13893  if (Ops.size() >= 3)
13894    Res = EmitX86Select(CGF, Ops[2], Res, Ops[1]);
13895  return Res;
13896}
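// Illustrative sketch of the common path for a 4-element conversion (not
// emitted verbatim):
//   %h = bitcast <4 x i16> %src to <4 x half>
//   %f = fpext <4 x half> %h to <4 x float>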
13897
13898Value *CodeGenFunction::EmitX86CpuIs(StringRef CPUStr) {
13899
13900  llvm::Type *Int32Ty = Builder.getInt32Ty();
13901
13902  // Matching the struct layout from the compiler-rt/libgcc structure that is
13903  // filled in:
13904  // unsigned int __cpu_vendor;
13905  // unsigned int __cpu_type;
13906  // unsigned int __cpu_subtype;
13907  // unsigned int __cpu_features[1];
13908  llvm::Type *STy = llvm::StructType::get(Int32Ty, Int32Ty, Int32Ty,
13909                                          llvm::ArrayType::get(Int32Ty, 1));
13910
13911  // Grab the global __cpu_model.
13912  llvm::Constant *CpuModel = CGM.CreateRuntimeVariable(STy, "__cpu_model");
13913  cast<llvm::GlobalValue>(CpuModel)->setDSOLocal(true);
13914
13915  // Calculate the index needed to access the correct field based on the
13916  // range. Also adjust the expected value.
13917  unsigned Index;
13918  unsigned Value;
13919  std::tie(Index, Value) = StringSwitch<std::pair<unsigned, unsigned>>(CPUStr)
13920#define X86_VENDOR(ENUM, STRING)                                               \
13921  .Case(STRING, {0u, static_cast<unsigned>(llvm::X86::ENUM)})
13922#define X86_CPU_TYPE_ALIAS(ENUM, ALIAS)                                        \
13923  .Case(ALIAS, {1u, static_cast<unsigned>(llvm::X86::ENUM)})
13924#define X86_CPU_TYPE(ENUM, STR)                                                \
13925  .Case(STR, {1u, static_cast<unsigned>(llvm::X86::ENUM)})
13926#define X86_CPU_SUBTYPE_ALIAS(ENUM, ALIAS)                                     \
13927  .Case(ALIAS, {2u, static_cast<unsigned>(llvm::X86::ENUM)})
13928#define X86_CPU_SUBTYPE(ENUM, STR)                                             \
13929  .Case(STR, {2u, static_cast<unsigned>(llvm::X86::ENUM)})
13930#include "llvm/TargetParser/X86TargetParser.def"
13931                               .Default({0, 0});
13932  assert(Value != 0 && "Invalid CPUStr passed to CpuIs");
13933
13934  // Grab the appropriate field from __cpu_model.
13935  llvm::Value *Idxs[] = {ConstantInt::get(Int32Ty, 0),
13936                         ConstantInt::get(Int32Ty, Index)};
13937  llvm::Value *CpuValue = Builder.CreateGEP(STy, CpuModel, Idxs);
13938  CpuValue = Builder.CreateAlignedLoad(Int32Ty, CpuValue,
13939                                       CharUnits::fromQuantity(4));
13940
13941  // Check the value of the field against the requested value.
13942  return Builder.CreateICmpEQ(CpuValue,
13943                                  llvm::ConstantInt::get(Int32Ty, Value));
13944}
13945
13946Value *CodeGenFunction::EmitX86CpuSupports(const CallExpr *E) {
13947  const Expr *FeatureExpr = E->getArg(0)->IgnoreParenCasts();
13948  StringRef FeatureStr = cast<StringLiteral>(FeatureExpr)->getString();
13949  return EmitX86CpuSupports(FeatureStr);
13950}
13951
13952Value *CodeGenFunction::EmitX86CpuSupports(ArrayRef<StringRef> FeatureStrs) {
13953  return EmitX86CpuSupports(llvm::X86::getCpuSupportsMask(FeatureStrs));
13954}
13955
13956llvm::Value *
13957CodeGenFunction::EmitX86CpuSupports(std::array<uint32_t, 4> FeatureMask) {
13958  Value *Result = Builder.getTrue();
13959  if (FeatureMask[0] != 0) {
13960    // Matching the struct layout from the compiler-rt/libgcc structure that is
13961    // filled in:
13962    // unsigned int __cpu_vendor;
13963    // unsigned int __cpu_type;
13964    // unsigned int __cpu_subtype;
13965    // unsigned int __cpu_features[1];
13966    llvm::Type *STy = llvm::StructType::get(Int32Ty, Int32Ty, Int32Ty,
13967                                            llvm::ArrayType::get(Int32Ty, 1));
13968
13969    // Grab the global __cpu_model.
13970    llvm::Constant *CpuModel = CGM.CreateRuntimeVariable(STy, "__cpu_model");
13971    cast<llvm::GlobalValue>(CpuModel)->setDSOLocal(true);
13972
13973    // Grab the first (0th) element of the __cpu_features field from the
13974    // __cpu_model global (whose type is STy).
13975    Value *Idxs[] = {Builder.getInt32(0), Builder.getInt32(3),
13976                     Builder.getInt32(0)};
13977    Value *CpuFeatures = Builder.CreateGEP(STy, CpuModel, Idxs);
13978    Value *Features = Builder.CreateAlignedLoad(Int32Ty, CpuFeatures,
13979                                                CharUnits::fromQuantity(4));
13980
13981    // Check the value of the bit corresponding to the feature requested.
13982    Value *Mask = Builder.getInt32(FeatureMask[0]);
13983    Value *Bitset = Builder.CreateAnd(Features, Mask);
13984    Value *Cmp = Builder.CreateICmpEQ(Bitset, Mask);
13985    Result = Builder.CreateAnd(Result, Cmp);
13986  }
13987
13988  llvm::Type *ATy = llvm::ArrayType::get(Int32Ty, 3);
13989  llvm::Constant *CpuFeatures2 =
13990      CGM.CreateRuntimeVariable(ATy, "__cpu_features2");
13991  cast<llvm::GlobalValue>(CpuFeatures2)->setDSOLocal(true);
13992  for (int i = 1; i != 4; ++i) {
13993    const uint32_t M = FeatureMask[i];
13994    if (!M)
13995      continue;
13996    Value *Idxs[] = {Builder.getInt32(0), Builder.getInt32(i - 1)};
13997    Value *Features = Builder.CreateAlignedLoad(
13998        Int32Ty, Builder.CreateGEP(ATy, CpuFeatures2, Idxs),
13999        CharUnits::fromQuantity(4));
14000    // Check the value of the bit corresponding to the feature requested.
14001    Value *Mask = Builder.getInt32(M);
14002    Value *Bitset = Builder.CreateAnd(Features, Mask);
14003    Value *Cmp = Builder.CreateICmpEQ(Bitset, Mask);
14004    Result = Builder.CreateAnd(Result, Cmp);
14005  }
14006
14007  return Result;
14008}
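// Rough shape of the check emitted above for a bit in the first feature word
// (the actual mask depends on the requested features; names are illustrative):
//   %features = load i32 from __cpu_model.__cpu_features[0]
//   %bitset   = and i32 %features, <mask>
//   %ok       = icmp eq i32 %bitset, <mask>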
14009
14010Value *CodeGenFunction::EmitAArch64CpuInit() {
14011  llvm::FunctionType *FTy = llvm::FunctionType::get(VoidTy, false);
14012  llvm::FunctionCallee Func =
14013      CGM.CreateRuntimeFunction(FTy, "__init_cpu_features_resolver");
14014  cast<llvm::GlobalValue>(Func.getCallee())->setDSOLocal(true);
14015  cast<llvm::GlobalValue>(Func.getCallee())
14016      ->setDLLStorageClass(llvm::GlobalValue::DefaultStorageClass);
14017  return Builder.CreateCall(Func);
14018}
14019
14020Value *CodeGenFunction::EmitX86CpuInit() {
14021  llvm::FunctionType *FTy = llvm::FunctionType::get(VoidTy,
14022                                                    /*Variadic*/ false);
14023  llvm::FunctionCallee Func =
14024      CGM.CreateRuntimeFunction(FTy, "__cpu_indicator_init");
14025  cast<llvm::GlobalValue>(Func.getCallee())->setDSOLocal(true);
14026  cast<llvm::GlobalValue>(Func.getCallee())
14027      ->setDLLStorageClass(llvm::GlobalValue::DefaultStorageClass);
14028  return Builder.CreateCall(Func);
14029}
14030
14031llvm::Value *
14032CodeGenFunction::EmitAArch64CpuSupports(ArrayRef<StringRef> FeaturesStrs) {
14033  uint64_t FeaturesMask = llvm::AArch64::getCpuSupportsMask(FeaturesStrs);
14034  Value *Result = Builder.getTrue();
14035  if (FeaturesMask != 0) {
14036    // Get the features from the structure defined by the runtime library:
14037    // struct {
14038    //   unsigned long long features;
14039    // } __aarch64_cpu_features;
14040    llvm::Type *STy = llvm::StructType::get(Int64Ty);
14041    llvm::Constant *AArch64CPUFeatures =
14042        CGM.CreateRuntimeVariable(STy, "__aarch64_cpu_features");
14043    cast<llvm::GlobalValue>(AArch64CPUFeatures)->setDSOLocal(true);
14044    llvm::Value *CpuFeatures = Builder.CreateGEP(
14045        STy, AArch64CPUFeatures,
14046        {ConstantInt::get(Int32Ty, 0), ConstantInt::get(Int32Ty, 0)});
14047    Value *Features = Builder.CreateAlignedLoad(Int64Ty, CpuFeatures,
14048                                                CharUnits::fromQuantity(8));
14049    Value *Mask = Builder.getInt64(FeaturesMask);
14050    Value *Bitset = Builder.CreateAnd(Features, Mask);
14051    Value *Cmp = Builder.CreateICmpEQ(Bitset, Mask);
14052    Result = Builder.CreateAnd(Result, Cmp);
14053  }
14054  return Result;
14055}
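// Rough shape of the emitted feature test (the mask depends on the requested
// features; names are illustrative):
//   %features = load i64 from __aarch64_cpu_features.features
//   %bitset   = and i64 %features, <mask>
//   %ok       = icmp eq i64 %bitset, <mask>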
14056
14057Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
14058                                           const CallExpr *E) {
14059  if (BuiltinID == X86::BI__builtin_cpu_is)
14060    return EmitX86CpuIs(E);
14061  if (BuiltinID == X86::BI__builtin_cpu_supports)
14062    return EmitX86CpuSupports(E);
14063  if (BuiltinID == X86::BI__builtin_cpu_init)
14064    return EmitX86CpuInit();
14065
14066  // Handle MSVC intrinsics before argument evaluation to prevent double
14067  // evaluation.
14068  if (std::optional<MSVCIntrin> MsvcIntId = translateX86ToMsvcIntrin(BuiltinID))
14069    return EmitMSVCBuiltinExpr(*MsvcIntId, E);
14070
14071  SmallVector<Value*, 4> Ops;
14072  bool IsMaskFCmp = false;
14073  bool IsConjFMA = false;
14074
14075  // Find out if any arguments are required to be integer constant expressions.
14076  unsigned ICEArguments = 0;
14077  ASTContext::GetBuiltinTypeError Error;
14078  getContext().GetBuiltinType(BuiltinID, Error, &ICEArguments);
14079  assert(Error == ASTContext::GE_None && "Should not codegen an error");
14080
14081  for (unsigned i = 0, e = E->getNumArgs(); i != e; i++) {
14082    Ops.push_back(EmitScalarOrConstFoldImmArg(ICEArguments, i, E));
14083  }
14084
14085  // These exist so that the builtin that takes an immediate can be bounds
14086  // checked by clang to avoid passing bad immediates to the backend. Since
14087  // AVX has a larger immediate than SSE we would need separate builtins to
14088  // do the different bounds checking. Rather than create a clang-specific
14089  // SSE-only builtin, this implements eight separate builtins to match the
14090  // gcc implementation.
14091  auto getCmpIntrinsicCall = [this, &Ops](Intrinsic::ID ID, unsigned Imm) {
14092    Ops.push_back(llvm::ConstantInt::get(Int8Ty, Imm));
14093    llvm::Function *F = CGM.getIntrinsic(ID);
14094    return Builder.CreateCall(F, Ops);
14095  };
14096
14097  // For the vector forms of FP comparisons, translate the builtins directly to
14098  // IR.
14099  // TODO: The builtins could be removed if the SSE header files used vector
14100  // extension comparisons directly (vector ordered/unordered may need
14101  // additional support via __builtin_isnan()).
14102  auto getVectorFCmpIR = [this, &Ops, E](CmpInst::Predicate Pred,
14103                                         bool IsSignaling) {
14104    CodeGenFunction::CGFPOptionsRAII FPOptsRAII(*this, E);
14105    Value *Cmp;
14106    if (IsSignaling)
14107      Cmp = Builder.CreateFCmpS(Pred, Ops[0], Ops[1]);
14108    else
14109      Cmp = Builder.CreateFCmp(Pred, Ops[0], Ops[1]);
14110    llvm::VectorType *FPVecTy = cast<llvm::VectorType>(Ops[0]->getType());
14111    llvm::VectorType *IntVecTy = llvm::VectorType::getInteger(FPVecTy);
14112    Value *Sext = Builder.CreateSExt(Cmp, IntVecTy);
14113    return Builder.CreateBitCast(Sext, FPVecTy);
14114  };
14115
14116  switch (BuiltinID) {
14117  default: return nullptr;
14118  case X86::BI_mm_prefetch: {
14119    Value *Address = Ops[0];
14120    ConstantInt *C = cast<ConstantInt>(Ops[1]);
14121    Value *RW = ConstantInt::get(Int32Ty, (C->getZExtValue() >> 2) & 0x1);
14122    Value *Locality = ConstantInt::get(Int32Ty, C->getZExtValue() & 0x3);
14123    Value *Data = ConstantInt::get(Int32Ty, 1);
14124    Function *F = CGM.getIntrinsic(Intrinsic::prefetch, Address->getType());
14125    return Builder.CreateCall(F, {Address, RW, Locality, Data});
14126  }
14127  case X86::BI_mm_clflush: {
14128    return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::x86_sse2_clflush),
14129                              Ops[0]);
14130  }
14131  case X86::BI_mm_lfence: {
14132    return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::x86_sse2_lfence));
14133  }
14134  case X86::BI_mm_mfence: {
14135    return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::x86_sse2_mfence));
14136  }
14137  case X86::BI_mm_sfence: {
14138    return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::x86_sse_sfence));
14139  }
14140  case X86::BI_mm_pause: {
14141    return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::x86_sse2_pause));
14142  }
14143  case X86::BI__rdtsc: {
14144    return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::x86_rdtsc));
14145  }
14146  case X86::BI__builtin_ia32_rdtscp: {
14147    Value *Call = Builder.CreateCall(CGM.getIntrinsic(Intrinsic::x86_rdtscp));
14148    Builder.CreateDefaultAlignedStore(Builder.CreateExtractValue(Call, 1),
14149                                      Ops[0]);
14150    return Builder.CreateExtractValue(Call, 0);
14151  }
14152  case X86::BI__builtin_ia32_lzcnt_u16:
14153  case X86::BI__builtin_ia32_lzcnt_u32:
14154  case X86::BI__builtin_ia32_lzcnt_u64: {
14155    Function *F = CGM.getIntrinsic(Intrinsic::ctlz, Ops[0]->getType());
14156    return Builder.CreateCall(F, {Ops[0], Builder.getInt1(false)});
14157  }
14158  case X86::BI__builtin_ia32_tzcnt_u16:
14159  case X86::BI__builtin_ia32_tzcnt_u32:
14160  case X86::BI__builtin_ia32_tzcnt_u64: {
14161    Function *F = CGM.getIntrinsic(Intrinsic::cttz, Ops[0]->getType());
14162    return Builder.CreateCall(F, {Ops[0], Builder.getInt1(false)});
14163  }
14164  case X86::BI__builtin_ia32_undef128:
14165  case X86::BI__builtin_ia32_undef256:
14166  case X86::BI__builtin_ia32_undef512:
14167    // The x86 definition of "undef" is not the same as the LLVM definition
14168    // (PR32176). We leave optimizing away an unnecessary zero constant to the
14169    // IR optimizer and backend.
14170    // TODO: If we had a "freeze" IR instruction to generate a fixed undef
14171    // value, we should use that here instead of a zero.
14172    return llvm::Constant::getNullValue(ConvertType(E->getType()));
14173  case X86::BI__builtin_ia32_vec_init_v8qi:
14174  case X86::BI__builtin_ia32_vec_init_v4hi:
14175  case X86::BI__builtin_ia32_vec_init_v2si:
14176    return Builder.CreateBitCast(BuildVector(Ops),
14177                                 llvm::Type::getX86_MMXTy(getLLVMContext()));
14178  case X86::BI__builtin_ia32_vec_ext_v2si:
14179  case X86::BI__builtin_ia32_vec_ext_v16qi:
14180  case X86::BI__builtin_ia32_vec_ext_v8hi:
14181  case X86::BI__builtin_ia32_vec_ext_v4si:
14182  case X86::BI__builtin_ia32_vec_ext_v4sf:
14183  case X86::BI__builtin_ia32_vec_ext_v2di:
14184  case X86::BI__builtin_ia32_vec_ext_v32qi:
14185  case X86::BI__builtin_ia32_vec_ext_v16hi:
14186  case X86::BI__builtin_ia32_vec_ext_v8si:
14187  case X86::BI__builtin_ia32_vec_ext_v4di: {
14188    unsigned NumElts =
14189        cast<llvm::FixedVectorType>(Ops[0]->getType())->getNumElements();
14190    uint64_t Index = cast<ConstantInt>(Ops[1])->getZExtValue();
14191    Index &= NumElts - 1;
14192    // These builtins exist so we can ensure the index is an ICE and in range.
14193    // Otherwise we could just do this in the header file.
14194    return Builder.CreateExtractElement(Ops[0], Index);
14195  }
14196  case X86::BI__builtin_ia32_vec_set_v16qi:
14197  case X86::BI__builtin_ia32_vec_set_v8hi:
14198  case X86::BI__builtin_ia32_vec_set_v4si:
14199  case X86::BI__builtin_ia32_vec_set_v2di:
14200  case X86::BI__builtin_ia32_vec_set_v32qi:
14201  case X86::BI__builtin_ia32_vec_set_v16hi:
14202  case X86::BI__builtin_ia32_vec_set_v8si:
14203  case X86::BI__builtin_ia32_vec_set_v4di: {
14204    unsigned NumElts =
14205        cast<llvm::FixedVectorType>(Ops[0]->getType())->getNumElements();
14206    unsigned Index = cast<ConstantInt>(Ops[2])->getZExtValue();
14207    Index &= NumElts - 1;
14208    // These builtins exist so we can ensure the index is an ICE and in range.
14209    // Otherwise we could just do this in the header file.
14210    return Builder.CreateInsertElement(Ops[0], Ops[1], Index);
14211  }
14212  case X86::BI_mm_setcsr:
14213  case X86::BI__builtin_ia32_ldmxcsr: {
14214    Address Tmp = CreateMemTemp(E->getArg(0)->getType());
14215    Builder.CreateStore(Ops[0], Tmp);
14216    return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::x86_sse_ldmxcsr),
14217                              Tmp.getPointer());
14218  }
14219  case X86::BI_mm_getcsr:
14220  case X86::BI__builtin_ia32_stmxcsr: {
14221    Address Tmp = CreateMemTemp(E->getType());
14222    Builder.CreateCall(CGM.getIntrinsic(Intrinsic::x86_sse_stmxcsr),
14223                       Tmp.getPointer());
14224    return Builder.CreateLoad(Tmp, "stmxcsr");
14225  }
14226  case X86::BI__builtin_ia32_xsave:
14227  case X86::BI__builtin_ia32_xsave64:
14228  case X86::BI__builtin_ia32_xrstor:
14229  case X86::BI__builtin_ia32_xrstor64:
14230  case X86::BI__builtin_ia32_xsaveopt:
14231  case X86::BI__builtin_ia32_xsaveopt64:
14232  case X86::BI__builtin_ia32_xrstors:
14233  case X86::BI__builtin_ia32_xrstors64:
14234  case X86::BI__builtin_ia32_xsavec:
14235  case X86::BI__builtin_ia32_xsavec64:
14236  case X86::BI__builtin_ia32_xsaves:
14237  case X86::BI__builtin_ia32_xsaves64:
14238  case X86::BI__builtin_ia32_xsetbv:
14239  case X86::BI_xsetbv: {
14240    Intrinsic::ID ID;
14241#define INTRINSIC_X86_XSAVE_ID(NAME) \
14242    case X86::BI__builtin_ia32_##NAME: \
14243      ID = Intrinsic::x86_##NAME; \
14244      break
14245    switch (BuiltinID) {
14246    default: llvm_unreachable("Unsupported intrinsic!");
14247    INTRINSIC_X86_XSAVE_ID(xsave);
14248    INTRINSIC_X86_XSAVE_ID(xsave64);
14249    INTRINSIC_X86_XSAVE_ID(xrstor);
14250    INTRINSIC_X86_XSAVE_ID(xrstor64);
14251    INTRINSIC_X86_XSAVE_ID(xsaveopt);
14252    INTRINSIC_X86_XSAVE_ID(xsaveopt64);
14253    INTRINSIC_X86_XSAVE_ID(xrstors);
14254    INTRINSIC_X86_XSAVE_ID(xrstors64);
14255    INTRINSIC_X86_XSAVE_ID(xsavec);
14256    INTRINSIC_X86_XSAVE_ID(xsavec64);
14257    INTRINSIC_X86_XSAVE_ID(xsaves);
14258    INTRINSIC_X86_XSAVE_ID(xsaves64);
14259    INTRINSIC_X86_XSAVE_ID(xsetbv);
14260    case X86::BI_xsetbv:
14261      ID = Intrinsic::x86_xsetbv;
14262      break;
14263    }
14264#undef INTRINSIC_X86_XSAVE_ID
14265    Value *Mhi = Builder.CreateTrunc(
14266      Builder.CreateLShr(Ops[1], ConstantInt::get(Int64Ty, 32)), Int32Ty);
14267    Value *Mlo = Builder.CreateTrunc(Ops[1], Int32Ty);
14268    Ops[1] = Mhi;
14269    Ops.push_back(Mlo);
14270    return Builder.CreateCall(CGM.getIntrinsic(ID), Ops);
14271  }
14272  case X86::BI__builtin_ia32_xgetbv:
14273  case X86::BI_xgetbv:
14274    return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::x86_xgetbv), Ops);
14275  case X86::BI__builtin_ia32_storedqudi128_mask:
14276  case X86::BI__builtin_ia32_storedqusi128_mask:
14277  case X86::BI__builtin_ia32_storedquhi128_mask:
14278  case X86::BI__builtin_ia32_storedquqi128_mask:
14279  case X86::BI__builtin_ia32_storeupd128_mask:
14280  case X86::BI__builtin_ia32_storeups128_mask:
14281  case X86::BI__builtin_ia32_storedqudi256_mask:
14282  case X86::BI__builtin_ia32_storedqusi256_mask:
14283  case X86::BI__builtin_ia32_storedquhi256_mask:
14284  case X86::BI__builtin_ia32_storedquqi256_mask:
14285  case X86::BI__builtin_ia32_storeupd256_mask:
14286  case X86::BI__builtin_ia32_storeups256_mask:
14287  case X86::BI__builtin_ia32_storedqudi512_mask:
14288  case X86::BI__builtin_ia32_storedqusi512_mask:
14289  case X86::BI__builtin_ia32_storedquhi512_mask:
14290  case X86::BI__builtin_ia32_storedquqi512_mask:
14291  case X86::BI__builtin_ia32_storeupd512_mask:
14292  case X86::BI__builtin_ia32_storeups512_mask:
14293    return EmitX86MaskedStore(*this, Ops, Align(1));
14294
14295  case X86::BI__builtin_ia32_storesh128_mask:
14296  case X86::BI__builtin_ia32_storess128_mask:
14297  case X86::BI__builtin_ia32_storesd128_mask:
14298    return EmitX86MaskedStore(*this, Ops, Align(1));
14299
14300  case X86::BI__builtin_ia32_vpopcntb_128:
14301  case X86::BI__builtin_ia32_vpopcntd_128:
14302  case X86::BI__builtin_ia32_vpopcntq_128:
14303  case X86::BI__builtin_ia32_vpopcntw_128:
14304  case X86::BI__builtin_ia32_vpopcntb_256:
14305  case X86::BI__builtin_ia32_vpopcntd_256:
14306  case X86::BI__builtin_ia32_vpopcntq_256:
14307  case X86::BI__builtin_ia32_vpopcntw_256:
14308  case X86::BI__builtin_ia32_vpopcntb_512:
14309  case X86::BI__builtin_ia32_vpopcntd_512:
14310  case X86::BI__builtin_ia32_vpopcntq_512:
14311  case X86::BI__builtin_ia32_vpopcntw_512: {
14312    llvm::Type *ResultType = ConvertType(E->getType());
14313    llvm::Function *F = CGM.getIntrinsic(Intrinsic::ctpop, ResultType);
14314    return Builder.CreateCall(F, Ops);
14315  }
14316  case X86::BI__builtin_ia32_cvtmask2b128:
14317  case X86::BI__builtin_ia32_cvtmask2b256:
14318  case X86::BI__builtin_ia32_cvtmask2b512:
14319  case X86::BI__builtin_ia32_cvtmask2w128:
14320  case X86::BI__builtin_ia32_cvtmask2w256:
14321  case X86::BI__builtin_ia32_cvtmask2w512:
14322  case X86::BI__builtin_ia32_cvtmask2d128:
14323  case X86::BI__builtin_ia32_cvtmask2d256:
14324  case X86::BI__builtin_ia32_cvtmask2d512:
14325  case X86::BI__builtin_ia32_cvtmask2q128:
14326  case X86::BI__builtin_ia32_cvtmask2q256:
14327  case X86::BI__builtin_ia32_cvtmask2q512:
14328    return EmitX86SExtMask(*this, Ops[0], ConvertType(E->getType()));
14329
14330  case X86::BI__builtin_ia32_cvtb2mask128:
14331  case X86::BI__builtin_ia32_cvtb2mask256:
14332  case X86::BI__builtin_ia32_cvtb2mask512:
14333  case X86::BI__builtin_ia32_cvtw2mask128:
14334  case X86::BI__builtin_ia32_cvtw2mask256:
14335  case X86::BI__builtin_ia32_cvtw2mask512:
14336  case X86::BI__builtin_ia32_cvtd2mask128:
14337  case X86::BI__builtin_ia32_cvtd2mask256:
14338  case X86::BI__builtin_ia32_cvtd2mask512:
14339  case X86::BI__builtin_ia32_cvtq2mask128:
14340  case X86::BI__builtin_ia32_cvtq2mask256:
14341  case X86::BI__builtin_ia32_cvtq2mask512:
14342    return EmitX86ConvertToMask(*this, Ops[0]);
14343
14344  case X86::BI__builtin_ia32_cvtdq2ps512_mask:
14345  case X86::BI__builtin_ia32_cvtqq2ps512_mask:
14346  case X86::BI__builtin_ia32_cvtqq2pd512_mask:
14347  case X86::BI__builtin_ia32_vcvtw2ph512_mask:
14348  case X86::BI__builtin_ia32_vcvtdq2ph512_mask:
14349  case X86::BI__builtin_ia32_vcvtqq2ph512_mask:
14350    return EmitX86ConvertIntToFp(*this, E, Ops, /*IsSigned*/ true);
14351  case X86::BI__builtin_ia32_cvtudq2ps512_mask:
14352  case X86::BI__builtin_ia32_cvtuqq2ps512_mask:
14353  case X86::BI__builtin_ia32_cvtuqq2pd512_mask:
14354  case X86::BI__builtin_ia32_vcvtuw2ph512_mask:
14355  case X86::BI__builtin_ia32_vcvtudq2ph512_mask:
14356  case X86::BI__builtin_ia32_vcvtuqq2ph512_mask:
14357    return EmitX86ConvertIntToFp(*this, E, Ops, /*IsSigned*/ false);
14358
14359  case X86::BI__builtin_ia32_vfmaddss3:
14360  case X86::BI__builtin_ia32_vfmaddsd3:
14361  case X86::BI__builtin_ia32_vfmaddsh3_mask:
14362  case X86::BI__builtin_ia32_vfmaddss3_mask:
14363  case X86::BI__builtin_ia32_vfmaddsd3_mask:
14364    return EmitScalarFMAExpr(*this, E, Ops, Ops[0]);
14365  case X86::BI__builtin_ia32_vfmaddss:
14366  case X86::BI__builtin_ia32_vfmaddsd:
14367    return EmitScalarFMAExpr(*this, E, Ops,
14368                             Constant::getNullValue(Ops[0]->getType()));
14369  case X86::BI__builtin_ia32_vfmaddsh3_maskz:
14370  case X86::BI__builtin_ia32_vfmaddss3_maskz:
14371  case X86::BI__builtin_ia32_vfmaddsd3_maskz:
14372    return EmitScalarFMAExpr(*this, E, Ops, Ops[0], /*ZeroMask*/ true);
14373  case X86::BI__builtin_ia32_vfmaddsh3_mask3:
14374  case X86::BI__builtin_ia32_vfmaddss3_mask3:
14375  case X86::BI__builtin_ia32_vfmaddsd3_mask3:
14376    return EmitScalarFMAExpr(*this, E, Ops, Ops[2], /*ZeroMask*/ false, 2);
14377  case X86::BI__builtin_ia32_vfmsubsh3_mask3:
14378  case X86::BI__builtin_ia32_vfmsubss3_mask3:
14379  case X86::BI__builtin_ia32_vfmsubsd3_mask3:
14380    return EmitScalarFMAExpr(*this, E, Ops, Ops[2], /*ZeroMask*/ false, 2,
14381                             /*NegAcc*/ true);
14382  case X86::BI__builtin_ia32_vfmaddph:
14383  case X86::BI__builtin_ia32_vfmaddps:
14384  case X86::BI__builtin_ia32_vfmaddpd:
14385  case X86::BI__builtin_ia32_vfmaddph256:
14386  case X86::BI__builtin_ia32_vfmaddps256:
14387  case X86::BI__builtin_ia32_vfmaddpd256:
14388  case X86::BI__builtin_ia32_vfmaddph512_mask:
14389  case X86::BI__builtin_ia32_vfmaddph512_maskz:
14390  case X86::BI__builtin_ia32_vfmaddph512_mask3:
14391  case X86::BI__builtin_ia32_vfmaddps512_mask:
14392  case X86::BI__builtin_ia32_vfmaddps512_maskz:
14393  case X86::BI__builtin_ia32_vfmaddps512_mask3:
14394  case X86::BI__builtin_ia32_vfmsubps512_mask3:
14395  case X86::BI__builtin_ia32_vfmaddpd512_mask:
14396  case X86::BI__builtin_ia32_vfmaddpd512_maskz:
14397  case X86::BI__builtin_ia32_vfmaddpd512_mask3:
14398  case X86::BI__builtin_ia32_vfmsubpd512_mask3:
14399  case X86::BI__builtin_ia32_vfmsubph512_mask3:
14400    return EmitX86FMAExpr(*this, E, Ops, BuiltinID, /*IsAddSub*/ false);
14401  case X86::BI__builtin_ia32_vfmaddsubph512_mask:
14402  case X86::BI__builtin_ia32_vfmaddsubph512_maskz:
14403  case X86::BI__builtin_ia32_vfmaddsubph512_mask3:
14404  case X86::BI__builtin_ia32_vfmsubaddph512_mask3:
14405  case X86::BI__builtin_ia32_vfmaddsubps512_mask:
14406  case X86::BI__builtin_ia32_vfmaddsubps512_maskz:
14407  case X86::BI__builtin_ia32_vfmaddsubps512_mask3:
14408  case X86::BI__builtin_ia32_vfmsubaddps512_mask3:
14409  case X86::BI__builtin_ia32_vfmaddsubpd512_mask:
14410  case X86::BI__builtin_ia32_vfmaddsubpd512_maskz:
14411  case X86::BI__builtin_ia32_vfmaddsubpd512_mask3:
14412  case X86::BI__builtin_ia32_vfmsubaddpd512_mask3:
14413    return EmitX86FMAExpr(*this, E, Ops, BuiltinID, /*IsAddSub*/ true);
14414
14415  case X86::BI__builtin_ia32_movdqa32store128_mask:
14416  case X86::BI__builtin_ia32_movdqa64store128_mask:
14417  case X86::BI__builtin_ia32_storeaps128_mask:
14418  case X86::BI__builtin_ia32_storeapd128_mask:
14419  case X86::BI__builtin_ia32_movdqa32store256_mask:
14420  case X86::BI__builtin_ia32_movdqa64store256_mask:
14421  case X86::BI__builtin_ia32_storeaps256_mask:
14422  case X86::BI__builtin_ia32_storeapd256_mask:
14423  case X86::BI__builtin_ia32_movdqa32store512_mask:
14424  case X86::BI__builtin_ia32_movdqa64store512_mask:
14425  case X86::BI__builtin_ia32_storeaps512_mask:
14426  case X86::BI__builtin_ia32_storeapd512_mask:
14427    return EmitX86MaskedStore(
14428        *this, Ops,
14429        getContext().getTypeAlignInChars(E->getArg(1)->getType()).getAsAlign());
14430
14431  case X86::BI__builtin_ia32_loadups128_mask:
14432  case X86::BI__builtin_ia32_loadups256_mask:
14433  case X86::BI__builtin_ia32_loadups512_mask:
14434  case X86::BI__builtin_ia32_loadupd128_mask:
14435  case X86::BI__builtin_ia32_loadupd256_mask:
14436  case X86::BI__builtin_ia32_loadupd512_mask:
14437  case X86::BI__builtin_ia32_loaddquqi128_mask:
14438  case X86::BI__builtin_ia32_loaddquqi256_mask:
14439  case X86::BI__builtin_ia32_loaddquqi512_mask:
14440  case X86::BI__builtin_ia32_loaddquhi128_mask:
14441  case X86::BI__builtin_ia32_loaddquhi256_mask:
14442  case X86::BI__builtin_ia32_loaddquhi512_mask:
14443  case X86::BI__builtin_ia32_loaddqusi128_mask:
14444  case X86::BI__builtin_ia32_loaddqusi256_mask:
14445  case X86::BI__builtin_ia32_loaddqusi512_mask:
14446  case X86::BI__builtin_ia32_loaddqudi128_mask:
14447  case X86::BI__builtin_ia32_loaddqudi256_mask:
14448  case X86::BI__builtin_ia32_loaddqudi512_mask:
14449    return EmitX86MaskedLoad(*this, Ops, Align(1));
14450
14451  case X86::BI__builtin_ia32_loadsh128_mask:
14452  case X86::BI__builtin_ia32_loadss128_mask:
14453  case X86::BI__builtin_ia32_loadsd128_mask:
14454    return EmitX86MaskedLoad(*this, Ops, Align(1));
14455
14456  case X86::BI__builtin_ia32_loadaps128_mask:
14457  case X86::BI__builtin_ia32_loadaps256_mask:
14458  case X86::BI__builtin_ia32_loadaps512_mask:
14459  case X86::BI__builtin_ia32_loadapd128_mask:
14460  case X86::BI__builtin_ia32_loadapd256_mask:
14461  case X86::BI__builtin_ia32_loadapd512_mask:
14462  case X86::BI__builtin_ia32_movdqa32load128_mask:
14463  case X86::BI__builtin_ia32_movdqa32load256_mask:
14464  case X86::BI__builtin_ia32_movdqa32load512_mask:
14465  case X86::BI__builtin_ia32_movdqa64load128_mask:
14466  case X86::BI__builtin_ia32_movdqa64load256_mask:
14467  case X86::BI__builtin_ia32_movdqa64load512_mask:
14468    return EmitX86MaskedLoad(
14469        *this, Ops,
14470        getContext().getTypeAlignInChars(E->getArg(1)->getType()).getAsAlign());
14471
14472  case X86::BI__builtin_ia32_expandloaddf128_mask:
14473  case X86::BI__builtin_ia32_expandloaddf256_mask:
14474  case X86::BI__builtin_ia32_expandloaddf512_mask:
14475  case X86::BI__builtin_ia32_expandloadsf128_mask:
14476  case X86::BI__builtin_ia32_expandloadsf256_mask:
14477  case X86::BI__builtin_ia32_expandloadsf512_mask:
14478  case X86::BI__builtin_ia32_expandloaddi128_mask:
14479  case X86::BI__builtin_ia32_expandloaddi256_mask:
14480  case X86::BI__builtin_ia32_expandloaddi512_mask:
14481  case X86::BI__builtin_ia32_expandloadsi128_mask:
14482  case X86::BI__builtin_ia32_expandloadsi256_mask:
14483  case X86::BI__builtin_ia32_expandloadsi512_mask:
14484  case X86::BI__builtin_ia32_expandloadhi128_mask:
14485  case X86::BI__builtin_ia32_expandloadhi256_mask:
14486  case X86::BI__builtin_ia32_expandloadhi512_mask:
14487  case X86::BI__builtin_ia32_expandloadqi128_mask:
14488  case X86::BI__builtin_ia32_expandloadqi256_mask:
14489  case X86::BI__builtin_ia32_expandloadqi512_mask:
14490    return EmitX86ExpandLoad(*this, Ops);
14491
14492  case X86::BI__builtin_ia32_compressstoredf128_mask:
14493  case X86::BI__builtin_ia32_compressstoredf256_mask:
14494  case X86::BI__builtin_ia32_compressstoredf512_mask:
14495  case X86::BI__builtin_ia32_compressstoresf128_mask:
14496  case X86::BI__builtin_ia32_compressstoresf256_mask:
14497  case X86::BI__builtin_ia32_compressstoresf512_mask:
14498  case X86::BI__builtin_ia32_compressstoredi128_mask:
14499  case X86::BI__builtin_ia32_compressstoredi256_mask:
14500  case X86::BI__builtin_ia32_compressstoredi512_mask:
14501  case X86::BI__builtin_ia32_compressstoresi128_mask:
14502  case X86::BI__builtin_ia32_compressstoresi256_mask:
14503  case X86::BI__builtin_ia32_compressstoresi512_mask:
14504  case X86::BI__builtin_ia32_compressstorehi128_mask:
14505  case X86::BI__builtin_ia32_compressstorehi256_mask:
14506  case X86::BI__builtin_ia32_compressstorehi512_mask:
14507  case X86::BI__builtin_ia32_compressstoreqi128_mask:
14508  case X86::BI__builtin_ia32_compressstoreqi256_mask:
14509  case X86::BI__builtin_ia32_compressstoreqi512_mask:
14510    return EmitX86CompressStore(*this, Ops);
14511
14512  case X86::BI__builtin_ia32_expanddf128_mask:
14513  case X86::BI__builtin_ia32_expanddf256_mask:
14514  case X86::BI__builtin_ia32_expanddf512_mask:
14515  case X86::BI__builtin_ia32_expandsf128_mask:
14516  case X86::BI__builtin_ia32_expandsf256_mask:
14517  case X86::BI__builtin_ia32_expandsf512_mask:
14518  case X86::BI__builtin_ia32_expanddi128_mask:
14519  case X86::BI__builtin_ia32_expanddi256_mask:
14520  case X86::BI__builtin_ia32_expanddi512_mask:
14521  case X86::BI__builtin_ia32_expandsi128_mask:
14522  case X86::BI__builtin_ia32_expandsi256_mask:
14523  case X86::BI__builtin_ia32_expandsi512_mask:
14524  case X86::BI__builtin_ia32_expandhi128_mask:
14525  case X86::BI__builtin_ia32_expandhi256_mask:
14526  case X86::BI__builtin_ia32_expandhi512_mask:
14527  case X86::BI__builtin_ia32_expandqi128_mask:
14528  case X86::BI__builtin_ia32_expandqi256_mask:
14529  case X86::BI__builtin_ia32_expandqi512_mask:
14530    return EmitX86CompressExpand(*this, Ops, /*IsCompress*/false);
14531
14532  case X86::BI__builtin_ia32_compressdf128_mask:
14533  case X86::BI__builtin_ia32_compressdf256_mask:
14534  case X86::BI__builtin_ia32_compressdf512_mask:
14535  case X86::BI__builtin_ia32_compresssf128_mask:
14536  case X86::BI__builtin_ia32_compresssf256_mask:
14537  case X86::BI__builtin_ia32_compresssf512_mask:
14538  case X86::BI__builtin_ia32_compressdi128_mask:
14539  case X86::BI__builtin_ia32_compressdi256_mask:
14540  case X86::BI__builtin_ia32_compressdi512_mask:
14541  case X86::BI__builtin_ia32_compresssi128_mask:
14542  case X86::BI__builtin_ia32_compresssi256_mask:
14543  case X86::BI__builtin_ia32_compresssi512_mask:
14544  case X86::BI__builtin_ia32_compresshi128_mask:
14545  case X86::BI__builtin_ia32_compresshi256_mask:
14546  case X86::BI__builtin_ia32_compresshi512_mask:
14547  case X86::BI__builtin_ia32_compressqi128_mask:
14548  case X86::BI__builtin_ia32_compressqi256_mask:
14549  case X86::BI__builtin_ia32_compressqi512_mask:
14550    return EmitX86CompressExpand(*this, Ops, /*IsCompress*/true);
14551
14552  case X86::BI__builtin_ia32_gather3div2df:
14553  case X86::BI__builtin_ia32_gather3div2di:
14554  case X86::BI__builtin_ia32_gather3div4df:
14555  case X86::BI__builtin_ia32_gather3div4di:
14556  case X86::BI__builtin_ia32_gather3div4sf:
14557  case X86::BI__builtin_ia32_gather3div4si:
14558  case X86::BI__builtin_ia32_gather3div8sf:
14559  case X86::BI__builtin_ia32_gather3div8si:
14560  case X86::BI__builtin_ia32_gather3siv2df:
14561  case X86::BI__builtin_ia32_gather3siv2di:
14562  case X86::BI__builtin_ia32_gather3siv4df:
14563  case X86::BI__builtin_ia32_gather3siv4di:
14564  case X86::BI__builtin_ia32_gather3siv4sf:
14565  case X86::BI__builtin_ia32_gather3siv4si:
14566  case X86::BI__builtin_ia32_gather3siv8sf:
14567  case X86::BI__builtin_ia32_gather3siv8si:
14568  case X86::BI__builtin_ia32_gathersiv8df:
14569  case X86::BI__builtin_ia32_gathersiv16sf:
14570  case X86::BI__builtin_ia32_gatherdiv8df:
14571  case X86::BI__builtin_ia32_gatherdiv16sf:
14572  case X86::BI__builtin_ia32_gathersiv8di:
14573  case X86::BI__builtin_ia32_gathersiv16si:
14574  case X86::BI__builtin_ia32_gatherdiv8di:
14575  case X86::BI__builtin_ia32_gatherdiv16si: {
14576    Intrinsic::ID IID;
14577    switch (BuiltinID) {
14578    default: llvm_unreachable("Unexpected builtin");
14579    case X86::BI__builtin_ia32_gather3div2df:
14580      IID = Intrinsic::x86_avx512_mask_gather3div2_df;
14581      break;
14582    case X86::BI__builtin_ia32_gather3div2di:
14583      IID = Intrinsic::x86_avx512_mask_gather3div2_di;
14584      break;
14585    case X86::BI__builtin_ia32_gather3div4df:
14586      IID = Intrinsic::x86_avx512_mask_gather3div4_df;
14587      break;
14588    case X86::BI__builtin_ia32_gather3div4di:
14589      IID = Intrinsic::x86_avx512_mask_gather3div4_di;
14590      break;
14591    case X86::BI__builtin_ia32_gather3div4sf:
14592      IID = Intrinsic::x86_avx512_mask_gather3div4_sf;
14593      break;
14594    case X86::BI__builtin_ia32_gather3div4si:
14595      IID = Intrinsic::x86_avx512_mask_gather3div4_si;
14596      break;
14597    case X86::BI__builtin_ia32_gather3div8sf:
14598      IID = Intrinsic::x86_avx512_mask_gather3div8_sf;
14599      break;
14600    case X86::BI__builtin_ia32_gather3div8si:
14601      IID = Intrinsic::x86_avx512_mask_gather3div8_si;
14602      break;
14603    case X86::BI__builtin_ia32_gather3siv2df:
14604      IID = Intrinsic::x86_avx512_mask_gather3siv2_df;
14605      break;
14606    case X86::BI__builtin_ia32_gather3siv2di:
14607      IID = Intrinsic::x86_avx512_mask_gather3siv2_di;
14608      break;
14609    case X86::BI__builtin_ia32_gather3siv4df:
14610      IID = Intrinsic::x86_avx512_mask_gather3siv4_df;
14611      break;
14612    case X86::BI__builtin_ia32_gather3siv4di:
14613      IID = Intrinsic::x86_avx512_mask_gather3siv4_di;
14614      break;
14615    case X86::BI__builtin_ia32_gather3siv4sf:
14616      IID = Intrinsic::x86_avx512_mask_gather3siv4_sf;
14617      break;
14618    case X86::BI__builtin_ia32_gather3siv4si:
14619      IID = Intrinsic::x86_avx512_mask_gather3siv4_si;
14620      break;
14621    case X86::BI__builtin_ia32_gather3siv8sf:
14622      IID = Intrinsic::x86_avx512_mask_gather3siv8_sf;
14623      break;
14624    case X86::BI__builtin_ia32_gather3siv8si:
14625      IID = Intrinsic::x86_avx512_mask_gather3siv8_si;
14626      break;
14627    case X86::BI__builtin_ia32_gathersiv8df:
14628      IID = Intrinsic::x86_avx512_mask_gather_dpd_512;
14629      break;
14630    case X86::BI__builtin_ia32_gathersiv16sf:
14631      IID = Intrinsic::x86_avx512_mask_gather_dps_512;
14632      break;
14633    case X86::BI__builtin_ia32_gatherdiv8df:
14634      IID = Intrinsic::x86_avx512_mask_gather_qpd_512;
14635      break;
14636    case X86::BI__builtin_ia32_gatherdiv16sf:
14637      IID = Intrinsic::x86_avx512_mask_gather_qps_512;
14638      break;
14639    case X86::BI__builtin_ia32_gathersiv8di:
14640      IID = Intrinsic::x86_avx512_mask_gather_dpq_512;
14641      break;
14642    case X86::BI__builtin_ia32_gathersiv16si:
14643      IID = Intrinsic::x86_avx512_mask_gather_dpi_512;
14644      break;
14645    case X86::BI__builtin_ia32_gatherdiv8di:
14646      IID = Intrinsic::x86_avx512_mask_gather_qpq_512;
14647      break;
14648    case X86::BI__builtin_ia32_gatherdiv16si:
14649      IID = Intrinsic::x86_avx512_mask_gather_qpi_512;
14650      break;
14651    }
14652
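    // Note: the mask operand covers only as many elements as the narrower of
    // the pass-through (Ops[0]) and index (Ops[2]) vectors, so normalize it to
    // that width before calling the intrinsic.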
14653    unsigned MinElts = std::min(
14654        cast<llvm::FixedVectorType>(Ops[0]->getType())->getNumElements(),
14655        cast<llvm::FixedVectorType>(Ops[2]->getType())->getNumElements());
14656    Ops[3] = getMaskVecValue(*this, Ops[3], MinElts);
14657    Function *Intr = CGM.getIntrinsic(IID);
14658    return Builder.CreateCall(Intr, Ops);
14659  }
14660
14661  case X86::BI__builtin_ia32_scattersiv8df:
14662  case X86::BI__builtin_ia32_scattersiv16sf:
14663  case X86::BI__builtin_ia32_scatterdiv8df:
14664  case X86::BI__builtin_ia32_scatterdiv16sf:
14665  case X86::BI__builtin_ia32_scattersiv8di:
14666  case X86::BI__builtin_ia32_scattersiv16si:
14667  case X86::BI__builtin_ia32_scatterdiv8di:
14668  case X86::BI__builtin_ia32_scatterdiv16si:
14669  case X86::BI__builtin_ia32_scatterdiv2df:
14670  case X86::BI__builtin_ia32_scatterdiv2di:
14671  case X86::BI__builtin_ia32_scatterdiv4df:
14672  case X86::BI__builtin_ia32_scatterdiv4di:
14673  case X86::BI__builtin_ia32_scatterdiv4sf:
14674  case X86::BI__builtin_ia32_scatterdiv4si:
14675  case X86::BI__builtin_ia32_scatterdiv8sf:
14676  case X86::BI__builtin_ia32_scatterdiv8si:
14677  case X86::BI__builtin_ia32_scattersiv2df:
14678  case X86::BI__builtin_ia32_scattersiv2di:
14679  case X86::BI__builtin_ia32_scattersiv4df:
14680  case X86::BI__builtin_ia32_scattersiv4di:
14681  case X86::BI__builtin_ia32_scattersiv4sf:
14682  case X86::BI__builtin_ia32_scattersiv4si:
14683  case X86::BI__builtin_ia32_scattersiv8sf:
14684  case X86::BI__builtin_ia32_scattersiv8si: {
14685    Intrinsic::ID IID;
14686    switch (BuiltinID) {
14687    default: llvm_unreachable("Unexpected builtin");
14688    case X86::BI__builtin_ia32_scattersiv8df:
14689      IID = Intrinsic::x86_avx512_mask_scatter_dpd_512;
14690      break;
14691    case X86::BI__builtin_ia32_scattersiv16sf:
14692      IID = Intrinsic::x86_avx512_mask_scatter_dps_512;
14693      break;
14694    case X86::BI__builtin_ia32_scatterdiv8df:
14695      IID = Intrinsic::x86_avx512_mask_scatter_qpd_512;
14696      break;
14697    case X86::BI__builtin_ia32_scatterdiv16sf:
14698      IID = Intrinsic::x86_avx512_mask_scatter_qps_512;
14699      break;
14700    case X86::BI__builtin_ia32_scattersiv8di:
14701      IID = Intrinsic::x86_avx512_mask_scatter_dpq_512;
14702      break;
14703    case X86::BI__builtin_ia32_scattersiv16si:
14704      IID = Intrinsic::x86_avx512_mask_scatter_dpi_512;
14705      break;
14706    case X86::BI__builtin_ia32_scatterdiv8di:
14707      IID = Intrinsic::x86_avx512_mask_scatter_qpq_512;
14708      break;
14709    case X86::BI__builtin_ia32_scatterdiv16si:
14710      IID = Intrinsic::x86_avx512_mask_scatter_qpi_512;
14711      break;
14712    case X86::BI__builtin_ia32_scatterdiv2df:
14713      IID = Intrinsic::x86_avx512_mask_scatterdiv2_df;
14714      break;
14715    case X86::BI__builtin_ia32_scatterdiv2di:
14716      IID = Intrinsic::x86_avx512_mask_scatterdiv2_di;
14717      break;
14718    case X86::BI__builtin_ia32_scatterdiv4df:
14719      IID = Intrinsic::x86_avx512_mask_scatterdiv4_df;
14720      break;
14721    case X86::BI__builtin_ia32_scatterdiv4di:
14722      IID = Intrinsic::x86_avx512_mask_scatterdiv4_di;
14723      break;
14724    case X86::BI__builtin_ia32_scatterdiv4sf:
14725      IID = Intrinsic::x86_avx512_mask_scatterdiv4_sf;
14726      break;
14727    case X86::BI__builtin_ia32_scatterdiv4si:
14728      IID = Intrinsic::x86_avx512_mask_scatterdiv4_si;
14729      break;
14730    case X86::BI__builtin_ia32_scatterdiv8sf:
14731      IID = Intrinsic::x86_avx512_mask_scatterdiv8_sf;
14732      break;
14733    case X86::BI__builtin_ia32_scatterdiv8si:
14734      IID = Intrinsic::x86_avx512_mask_scatterdiv8_si;
14735      break;
14736    case X86::BI__builtin_ia32_scattersiv2df:
14737      IID = Intrinsic::x86_avx512_mask_scattersiv2_df;
14738      break;
14739    case X86::BI__builtin_ia32_scattersiv2di:
14740      IID = Intrinsic::x86_avx512_mask_scattersiv2_di;
14741      break;
14742    case X86::BI__builtin_ia32_scattersiv4df:
14743      IID = Intrinsic::x86_avx512_mask_scattersiv4_df;
14744      break;
14745    case X86::BI__builtin_ia32_scattersiv4di:
14746      IID = Intrinsic::x86_avx512_mask_scattersiv4_di;
14747      break;
14748    case X86::BI__builtin_ia32_scattersiv4sf:
14749      IID = Intrinsic::x86_avx512_mask_scattersiv4_sf;
14750      break;
14751    case X86::BI__builtin_ia32_scattersiv4si:
14752      IID = Intrinsic::x86_avx512_mask_scattersiv4_si;
14753      break;
14754    case X86::BI__builtin_ia32_scattersiv8sf:
14755      IID = Intrinsic::x86_avx512_mask_scattersiv8_sf;
14756      break;
14757    case X86::BI__builtin_ia32_scattersiv8si:
14758      IID = Intrinsic::x86_avx512_mask_scattersiv8_si;
14759      break;
14760    }
14761
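    // Note: for scatters the mask is Ops[1]; size it to the narrower of the
    // index (Ops[2]) and source value (Ops[3]) vectors.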
14762    unsigned MinElts = std::min(
14763        cast<llvm::FixedVectorType>(Ops[2]->getType())->getNumElements(),
14764        cast<llvm::FixedVectorType>(Ops[3]->getType())->getNumElements());
14765    Ops[1] = getMaskVecValue(*this, Ops[1], MinElts);
14766    Function *Intr = CGM.getIntrinsic(IID);
14767    return Builder.CreateCall(Intr, Ops);
14768  }
14769
14770  case X86::BI__builtin_ia32_vextractf128_pd256:
14771  case X86::BI__builtin_ia32_vextractf128_ps256:
14772  case X86::BI__builtin_ia32_vextractf128_si256:
14773  case X86::BI__builtin_ia32_extract128i256:
14774  case X86::BI__builtin_ia32_extractf64x4_mask:
14775  case X86::BI__builtin_ia32_extractf32x4_mask:
14776  case X86::BI__builtin_ia32_extracti64x4_mask:
14777  case X86::BI__builtin_ia32_extracti32x4_mask:
14778  case X86::BI__builtin_ia32_extractf32x8_mask:
14779  case X86::BI__builtin_ia32_extracti32x8_mask:
14780  case X86::BI__builtin_ia32_extractf32x4_256_mask:
14781  case X86::BI__builtin_ia32_extracti32x4_256_mask:
14782  case X86::BI__builtin_ia32_extractf64x2_256_mask:
14783  case X86::BI__builtin_ia32_extracti64x2_256_mask:
14784  case X86::BI__builtin_ia32_extractf64x2_512_mask:
14785  case X86::BI__builtin_ia32_extracti64x2_512_mask: {
14786    auto *DstTy = cast<llvm::FixedVectorType>(ConvertType(E->getType()));
14787    unsigned NumElts = DstTy->getNumElements();
14788    unsigned SrcNumElts =
14789        cast<llvm::FixedVectorType>(Ops[0]->getType())->getNumElements();
14790    unsigned SubVectors = SrcNumElts / NumElts;
14791    unsigned Index = cast<ConstantInt>(Ops[1])->getZExtValue();
14792    assert(llvm::isPowerOf2_32(SubVectors) && "Expected power of 2 subvectors");
14793    Index &= SubVectors - 1; // Remove any extra bits.
14794    Index *= NumElts;
14795
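    // E.g. extracting the upper half of a v8f64 into a v4f64 (Index == 1)
    // produces the shuffle mask <4, 5, 6, 7>.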
14796    int Indices[16];
14797    for (unsigned i = 0; i != NumElts; ++i)
14798      Indices[i] = i + Index;
14799
14800    Value *Res = Builder.CreateShuffleVector(Ops[0], ArrayRef(Indices, NumElts),
14801                                             "extract");
14802
14803    if (Ops.size() == 4)
14804      Res = EmitX86Select(*this, Ops[3], Res, Ops[2]);
14805
14806    return Res;
14807  }
14808  case X86::BI__builtin_ia32_vinsertf128_pd256:
14809  case X86::BI__builtin_ia32_vinsertf128_ps256:
14810  case X86::BI__builtin_ia32_vinsertf128_si256:
14811  case X86::BI__builtin_ia32_insert128i256:
14812  case X86::BI__builtin_ia32_insertf64x4:
14813  case X86::BI__builtin_ia32_insertf32x4:
14814  case X86::BI__builtin_ia32_inserti64x4:
14815  case X86::BI__builtin_ia32_inserti32x4:
14816  case X86::BI__builtin_ia32_insertf32x8:
14817  case X86::BI__builtin_ia32_inserti32x8:
14818  case X86::BI__builtin_ia32_insertf32x4_256:
14819  case X86::BI__builtin_ia32_inserti32x4_256:
14820  case X86::BI__builtin_ia32_insertf64x2_256:
14821  case X86::BI__builtin_ia32_inserti64x2_256:
14822  case X86::BI__builtin_ia32_insertf64x2_512:
14823  case X86::BI__builtin_ia32_inserti64x2_512: {
14824    unsigned DstNumElts =
14825        cast<llvm::FixedVectorType>(Ops[0]->getType())->getNumElements();
14826    unsigned SrcNumElts =
14827        cast<llvm::FixedVectorType>(Ops[1]->getType())->getNumElements();
14828    unsigned SubVectors = DstNumElts / SrcNumElts;
14829    unsigned Index = cast<ConstantInt>(Ops[2])->getZExtValue();
14830    assert(llvm::isPowerOf2_32(SubVectors) && "Expected power of 2 subvectors");
14831    Index &= SubVectors - 1; // Remove any extra bits.
14832    Index *= SrcNumElts;
14833
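    // First widen the source subvector to the destination width, then blend it
    // into the destination at the selected subvector position.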
14834    int Indices[16];
14835    for (unsigned i = 0; i != DstNumElts; ++i)
14836      Indices[i] = (i >= SrcNumElts) ? SrcNumElts + (i % SrcNumElts) : i;
14837
14838    Value *Op1 = Builder.CreateShuffleVector(
14839        Ops[1], ArrayRef(Indices, DstNumElts), "widen");
14840
14841    for (unsigned i = 0; i != DstNumElts; ++i) {
14842      if (i >= Index && i < (Index + SrcNumElts))
14843        Indices[i] = (i - Index) + DstNumElts;
14844      else
14845        Indices[i] = i;
14846    }
14847
14848    return Builder.CreateShuffleVector(Ops[0], Op1,
14849                                       ArrayRef(Indices, DstNumElts), "insert");
14850  }
14851  case X86::BI__builtin_ia32_pmovqd512_mask:
14852  case X86::BI__builtin_ia32_pmovwb512_mask: {
14853    Value *Res = Builder.CreateTrunc(Ops[0], Ops[1]->getType());
14854    return EmitX86Select(*this, Ops[2], Res, Ops[1]);
14855  }
14856  case X86::BI__builtin_ia32_pmovdb512_mask:
14857  case X86::BI__builtin_ia32_pmovdw512_mask:
14858  case X86::BI__builtin_ia32_pmovqw512_mask: {
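    // If the mask is all ones, the operation is a plain truncation; otherwise
    // fall back to the masked target intrinsic.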
14859    if (const auto *C = dyn_cast<Constant>(Ops[2]))
14860      if (C->isAllOnesValue())
14861        return Builder.CreateTrunc(Ops[0], Ops[1]->getType());
14862
14863    Intrinsic::ID IID;
14864    switch (BuiltinID) {
14865    default: llvm_unreachable("Unsupported intrinsic!");
14866    case X86::BI__builtin_ia32_pmovdb512_mask:
14867      IID = Intrinsic::x86_avx512_mask_pmov_db_512;
14868      break;
14869    case X86::BI__builtin_ia32_pmovdw512_mask:
14870      IID = Intrinsic::x86_avx512_mask_pmov_dw_512;
14871      break;
14872    case X86::BI__builtin_ia32_pmovqw512_mask:
14873      IID = Intrinsic::x86_avx512_mask_pmov_qw_512;
14874      break;
14875    }
14876
14877    Function *Intr = CGM.getIntrinsic(IID);
14878    return Builder.CreateCall(Intr, Ops);
14879  }
14880  case X86::BI__builtin_ia32_pblendw128:
14881  case X86::BI__builtin_ia32_blendpd:
14882  case X86::BI__builtin_ia32_blendps:
14883  case X86::BI__builtin_ia32_blendpd256:
14884  case X86::BI__builtin_ia32_blendps256:
14885  case X86::BI__builtin_ia32_pblendw256:
14886  case X86::BI__builtin_ia32_pblendd128:
14887  case X86::BI__builtin_ia32_pblendd256: {
14888    unsigned NumElts =
14889        cast<llvm::FixedVectorType>(Ops[0]->getType())->getNumElements();
14890    unsigned Imm = cast<llvm::ConstantInt>(Ops[2])->getZExtValue();
14891
14892    int Indices[16];
14893    // If there are more than 8 elements, the immediate is used twice so make
14894    // sure we handle that.
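    // E.g. for pblendw128 with Imm == 0x0F, elements 0-3 come from Ops[1] and
    // elements 4-7 come from Ops[0].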
14895    for (unsigned i = 0; i != NumElts; ++i)
14896      Indices[i] = ((Imm >> (i % 8)) & 0x1) ? NumElts + i : i;
14897
14898    return Builder.CreateShuffleVector(Ops[0], Ops[1],
14899                                       ArrayRef(Indices, NumElts), "blend");
14900  }
14901  case X86::BI__builtin_ia32_pshuflw:
14902  case X86::BI__builtin_ia32_pshuflw256:
14903  case X86::BI__builtin_ia32_pshuflw512: {
14904    uint32_t Imm = cast<llvm::ConstantInt>(Ops[1])->getZExtValue();
14905    auto *Ty = cast<llvm::FixedVectorType>(Ops[0]->getType());
14906    unsigned NumElts = Ty->getNumElements();
14907
14908    // Splat the 8 bits of the immediate 4 times to help the loop wrap around.
14909    Imm = (Imm & 0xff) * 0x01010101;
14910
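    // Within each 128-bit lane the low four words are permuted by the
    // immediate and the high four words are passed through unchanged.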
14911    int Indices[32];
14912    for (unsigned l = 0; l != NumElts; l += 8) {
14913      for (unsigned i = 0; i != 4; ++i) {
14914        Indices[l + i] = l + (Imm & 3);
14915        Imm >>= 2;
14916      }
14917      for (unsigned i = 4; i != 8; ++i)
14918        Indices[l + i] = l + i;
14919    }
14920
14921    return Builder.CreateShuffleVector(Ops[0], ArrayRef(Indices, NumElts),
14922                                       "pshuflw");
14923  }
14924  case X86::BI__builtin_ia32_pshufhw:
14925  case X86::BI__builtin_ia32_pshufhw256:
14926  case X86::BI__builtin_ia32_pshufhw512: {
14927    uint32_t Imm = cast<llvm::ConstantInt>(Ops[1])->getZExtValue();
14928    auto *Ty = cast<llvm::FixedVectorType>(Ops[0]->getType());
14929    unsigned NumElts = Ty->getNumElements();
14930
14931    // Splat the 8 bits of the immediate 4 times to help the loop wrap around.
14932    Imm = (Imm & 0xff) * 0x01010101;
14933
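    // Mirror of pshuflw: the high four words of each lane are permuted and the
    // low four words are passed through unchanged.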
14934    int Indices[32];
14935    for (unsigned l = 0; l != NumElts; l += 8) {
14936      for (unsigned i = 0; i != 4; ++i)
14937        Indices[l + i] = l + i;
14938      for (unsigned i = 4; i != 8; ++i) {
14939        Indices[l + i] = l + 4 + (Imm & 3);
14940        Imm >>= 2;
14941      }
14942    }
14943
14944    return Builder.CreateShuffleVector(Ops[0], ArrayRef(Indices, NumElts),
14945                                       "pshufhw");
14946  }
14947  case X86::BI__builtin_ia32_pshufd:
14948  case X86::BI__builtin_ia32_pshufd256:
14949  case X86::BI__builtin_ia32_pshufd512:
14950  case X86::BI__builtin_ia32_vpermilpd:
14951  case X86::BI__builtin_ia32_vpermilps:
14952  case X86::BI__builtin_ia32_vpermilpd256:
14953  case X86::BI__builtin_ia32_vpermilps256:
14954  case X86::BI__builtin_ia32_vpermilpd512:
14955  case X86::BI__builtin_ia32_vpermilps512: {
14956    uint32_t Imm = cast<llvm::ConstantInt>(Ops[1])->getZExtValue();
14957    auto *Ty = cast<llvm::FixedVectorType>(Ops[0]->getType());
14958    unsigned NumElts = Ty->getNumElements();
14959    unsigned NumLanes = Ty->getPrimitiveSizeInBits() / 128;
14960    unsigned NumLaneElts = NumElts / NumLanes;
14961
14962    // Splat the 8 bits of the immediate 4 times to help the loop wrap around.
14963    Imm = (Imm & 0xff) * 0x01010101;
14964
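    // Consecutive fields of the immediate each select one element within a
    // 128-bit lane.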
14965    int Indices[16];
14966    for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
14967      for (unsigned i = 0; i != NumLaneElts; ++i) {
14968        Indices[i + l] = (Imm % NumLaneElts) + l;
14969        Imm /= NumLaneElts;
14970      }
14971    }
14972
14973    return Builder.CreateShuffleVector(Ops[0], ArrayRef(Indices, NumElts),
14974                                       "permil");
14975  }
14976  case X86::BI__builtin_ia32_shufpd:
14977  case X86::BI__builtin_ia32_shufpd256:
14978  case X86::BI__builtin_ia32_shufpd512:
14979  case X86::BI__builtin_ia32_shufps:
14980  case X86::BI__builtin_ia32_shufps256:
14981  case X86::BI__builtin_ia32_shufps512: {
14982    uint32_t Imm = cast<llvm::ConstantInt>(Ops[2])->getZExtValue();
14983    auto *Ty = cast<llvm::FixedVectorType>(Ops[0]->getType());
14984    unsigned NumElts = Ty->getNumElements();
14985    unsigned NumLanes = Ty->getPrimitiveSizeInBits() / 128;
14986    unsigned NumLaneElts = NumElts / NumLanes;
14987
14988    // Splat the 8 bits of the immediate 4 times to help the loop wrap around.
14989    Imm = (Imm & 0xff) * 0x01010101;
14990
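    // Within each lane, the low half of the result comes from Ops[0] and the
    // high half from Ops[1]; each element is chosen by an immediate field.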
14991    int Indices[16];
14992    for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
14993      for (unsigned i = 0; i != NumLaneElts; ++i) {
14994        unsigned Index = Imm % NumLaneElts;
14995        Imm /= NumLaneElts;
14996        if (i >= (NumLaneElts / 2))
14997          Index += NumElts;
14998        Indices[l + i] = l + Index;
14999      }
15000    }
15001
15002    return Builder.CreateShuffleVector(Ops[0], Ops[1],
15003                                       ArrayRef(Indices, NumElts), "shufp");
15004  }
15005  case X86::BI__builtin_ia32_permdi256:
15006  case X86::BI__builtin_ia32_permdf256:
15007  case X86::BI__builtin_ia32_permdi512:
15008  case X86::BI__builtin_ia32_permdf512: {
15009    unsigned Imm = cast<llvm::ConstantInt>(Ops[1])->getZExtValue();
15010    auto *Ty = cast<llvm::FixedVectorType>(Ops[0]->getType());
15011    unsigned NumElts = Ty->getNumElements();
15012
15013    // These intrinsics operate on 256-bit lanes of four 64-bit elements.
15014    int Indices[8];
15015    for (unsigned l = 0; l != NumElts; l += 4)
15016      for (unsigned i = 0; i != 4; ++i)
15017        Indices[l + i] = l + ((Imm >> (2 * i)) & 0x3);
15018
15019    return Builder.CreateShuffleVector(Ops[0], ArrayRef(Indices, NumElts),
15020                                       "perm");
15021  }
15022  case X86::BI__builtin_ia32_palignr128:
15023  case X86::BI__builtin_ia32_palignr256:
15024  case X86::BI__builtin_ia32_palignr512: {
15025    unsigned ShiftVal = cast<llvm::ConstantInt>(Ops[2])->getZExtValue() & 0xff;
15026
15027    unsigned NumElts =
15028        cast<llvm::FixedVectorType>(Ops[0]->getType())->getNumElements();
15029    assert(NumElts % 16 == 0);
15030
15031    // If palignr is shifting the pair of vectors more than the size of two
15032    // lanes, emit zero.
15033    if (ShiftVal >= 32)
15034      return llvm::Constant::getNullValue(ConvertType(E->getType()));
15035
15036    // If palignr is shifting the pair of input vectors more than one lane,
15037    // but less than two lanes, convert to shifting in zeroes.
15038    if (ShiftVal > 16) {
15039      ShiftVal -= 16;
15040      Ops[1] = Ops[0];
15041      Ops[0] = llvm::Constant::getNullValue(Ops[0]->getType());
15042    }
15043
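    // E.g. with ShiftVal == 4 on 128-bit vectors, result bytes 0-11 come from
    // bytes 4-15 of Ops[1] and bytes 12-15 come from bytes 0-3 of Ops[0].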
15044    int Indices[64];
15045    // 256/512-bit palignr operates on 128-bit lanes, so handle each lane separately.
15046    for (unsigned l = 0; l != NumElts; l += 16) {
15047      for (unsigned i = 0; i != 16; ++i) {
15048        unsigned Idx = ShiftVal + i;
15049        if (Idx >= 16)
15050          Idx += NumElts - 16; // End of lane, switch operand.
15051        Indices[l + i] = Idx + l;
15052      }
15053    }
15054
15055    return Builder.CreateShuffleVector(Ops[1], Ops[0],
15056                                       ArrayRef(Indices, NumElts), "palignr");
15057  }
15058  case X86::BI__builtin_ia32_alignd128:
15059  case X86::BI__builtin_ia32_alignd256:
15060  case X86::BI__builtin_ia32_alignd512:
15061  case X86::BI__builtin_ia32_alignq128:
15062  case X86::BI__builtin_ia32_alignq256:
15063  case X86::BI__builtin_ia32_alignq512: {
15064    unsigned NumElts =
15065        cast<llvm::FixedVectorType>(Ops[0]->getType())->getNumElements();
15066    unsigned ShiftVal = cast<llvm::ConstantInt>(Ops[2])->getZExtValue() & 0xff;
15067
15068    // Mask the shift amount to the width of the vector.
15069    ShiftVal &= NumElts - 1;
15070
15071    int Indices[16];
15072    for (unsigned i = 0; i != NumElts; ++i)
15073      Indices[i] = i + ShiftVal;
15074
15075    return Builder.CreateShuffleVector(Ops[1], Ops[0],
15076                                       ArrayRef(Indices, NumElts), "valign");
15077  }
15078  case X86::BI__builtin_ia32_shuf_f32x4_256:
15079  case X86::BI__builtin_ia32_shuf_f64x2_256:
15080  case X86::BI__builtin_ia32_shuf_i32x4_256:
15081  case X86::BI__builtin_ia32_shuf_i64x2_256:
15082  case X86::BI__builtin_ia32_shuf_f32x4:
15083  case X86::BI__builtin_ia32_shuf_f64x2:
15084  case X86::BI__builtin_ia32_shuf_i32x4:
15085  case X86::BI__builtin_ia32_shuf_i64x2: {
15086    unsigned Imm = cast<llvm::ConstantInt>(Ops[2])->getZExtValue();
15087    auto *Ty = cast<llvm::FixedVectorType>(Ops[0]->getType());
15088    unsigned NumElts = Ty->getNumElements();
15089    unsigned NumLanes = Ty->getPrimitiveSizeInBits() == 512 ? 4 : 2;
15090    unsigned NumLaneElts = NumElts / NumLanes;
15091
15092    int Indices[16];
15093    for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
15094      unsigned Index = (Imm % NumLanes) * NumLaneElts;
15095      Imm /= NumLanes; // Discard the bits we just used.
15096      if (l >= (NumElts / 2))
15097        Index += NumElts; // Switch to other source.
15098      for (unsigned i = 0; i != NumLaneElts; ++i) {
15099        Indices[l + i] = Index + i;
15100      }
15101    }
15102
15103    return Builder.CreateShuffleVector(Ops[0], Ops[1],
15104                                       ArrayRef(Indices, NumElts), "shuf");
15105  }
15106
15107  case X86::BI__builtin_ia32_vperm2f128_pd256:
15108  case X86::BI__builtin_ia32_vperm2f128_ps256:
15109  case X86::BI__builtin_ia32_vperm2f128_si256:
15110  case X86::BI__builtin_ia32_permti256: {
15111    unsigned Imm = cast<llvm::ConstantInt>(Ops[2])->getZExtValue();
15112    unsigned NumElts =
15113        cast<llvm::FixedVectorType>(Ops[0]->getType())->getNumElements();
15114
15115    // This takes a very simple approach since there are two lanes and a
15116    // shuffle can have 2 inputs. So we reserve the first input for the first
15117    // lane and the second input for the second lane. This may result in
15118    // duplicate sources, but this can be dealt with in the backend.
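    // Each 4-bit field of the immediate describes one result lane: bit 3
    // zeroes the lane, bit 1 picks Ops[1] over Ops[0], and bit 0 selects the
    // high half of the chosen source.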
15119
15120    Value *OutOps[2];
15121    int Indices[8];
15122    for (unsigned l = 0; l != 2; ++l) {
15123      // Determine the source for this lane.
15124      if (Imm & (1 << ((l * 4) + 3)))
15125        OutOps[l] = llvm::ConstantAggregateZero::get(Ops[0]->getType());
15126      else if (Imm & (1 << ((l * 4) + 1)))
15127        OutOps[l] = Ops[1];
15128      else
15129        OutOps[l] = Ops[0];
15130
15131      for (unsigned i = 0; i != NumElts/2; ++i) {
15132        // Start with ith element of the source for this lane.
15133        unsigned Idx = (l * NumElts) + i;
15134        // If bit 0 of the immediate half is set, switch to the high half of
15135        // the source.
15136        if (Imm & (1 << (l * 4)))
15137          Idx += NumElts/2;
15138        Indices[(l * (NumElts/2)) + i] = Idx;
15139      }
15140    }
15141
15142    return Builder.CreateShuffleVector(OutOps[0], OutOps[1],
15143                                       ArrayRef(Indices, NumElts), "vperm");
15144  }
15145
15146  case X86::BI__builtin_ia32_pslldqi128_byteshift:
15147  case X86::BI__builtin_ia32_pslldqi256_byteshift:
15148  case X86::BI__builtin_ia32_pslldqi512_byteshift: {
15149    unsigned ShiftVal = cast<llvm::ConstantInt>(Ops[1])->getZExtValue() & 0xff;
15150    auto *ResultType = cast<llvm::FixedVectorType>(Ops[0]->getType());
15151    // The builtin type is vXi64, so multiply by 8 to get the number of bytes.
15152    unsigned NumElts = ResultType->getNumElements() * 8;
15153
15154    // If pslldq is shifting the vector more than 15 bytes, emit zero.
15155    if (ShiftVal >= 16)
15156      return llvm::Constant::getNullValue(ResultType);
15157
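    // Result byte i of each lane is zero for i < ShiftVal and source byte
    // (i - ShiftVal) otherwise, i.e. a left byte shift in element order.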
15158    int Indices[64];
15159    // 256/512-bit pslldq operates on 128-bit lanes, so handle each lane separately.
15160    for (unsigned l = 0; l != NumElts; l += 16) {
15161      for (unsigned i = 0; i != 16; ++i) {
15162        unsigned Idx = NumElts + i - ShiftVal;
15163        if (Idx < NumElts) Idx -= NumElts - 16; // end of lane, switch operand.
15164        Indices[l + i] = Idx + l;
15165      }
15166    }
15167
15168    auto *VecTy = llvm::FixedVectorType::get(Int8Ty, NumElts);
15169    Value *Cast = Builder.CreateBitCast(Ops[0], VecTy, "cast");
15170    Value *Zero = llvm::Constant::getNullValue(VecTy);
15171    Value *SV = Builder.CreateShuffleVector(
15172        Zero, Cast, ArrayRef(Indices, NumElts), "pslldq");
15173    return Builder.CreateBitCast(SV, Ops[0]->getType(), "cast");
15174  }
15175  case X86::BI__builtin_ia32_psrldqi128_byteshift:
15176  case X86::BI__builtin_ia32_psrldqi256_byteshift:
15177  case X86::BI__builtin_ia32_psrldqi512_byteshift: {
15178    unsigned ShiftVal = cast<llvm::ConstantInt>(Ops[1])->getZExtValue() & 0xff;
15179    auto *ResultType = cast<llvm::FixedVectorType>(Ops[0]->getType());
15180    // The builtin type is vXi64, so multiply by 8 to get the number of bytes.
15181    unsigned NumElts = ResultType->getNumElements() * 8;
15182
15183    // If psrldq is shifting the vector more than 15 bytes, emit zero.
15184    if (ShiftVal >= 16)
15185      return llvm::Constant::getNullValue(ResultType);
15186
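    // Result byte i of each lane is source byte (i + ShiftVal), with zeros
    // shifted in from the top of the lane.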
15187    int Indices[64];
15188    // 256/512-bit psrldq operates on 128-bit lanes, so handle each lane separately.
15189    for (unsigned l = 0; l != NumElts; l += 16) {
15190      for (unsigned i = 0; i != 16; ++i) {
15191        unsigned Idx = i + ShiftVal;
15192        if (Idx >= 16) Idx += NumElts - 16; // end of lane, switch operand.
15193        Indices[l + i] = Idx + l;
15194      }
15195    }
15196
15197    auto *VecTy = llvm::FixedVectorType::get(Int8Ty, NumElts);
15198    Value *Cast = Builder.CreateBitCast(Ops[0], VecTy, "cast");
15199    Value *Zero = llvm::Constant::getNullValue(VecTy);
15200    Value *SV = Builder.CreateShuffleVector(
15201        Cast, Zero, ArrayRef(Indices, NumElts), "psrldq");
15202    return Builder.CreateBitCast(SV, ResultType, "cast");
15203  }
15204  case X86::BI__builtin_ia32_kshiftliqi:
15205  case X86::BI__builtin_ia32_kshiftlihi:
15206  case X86::BI__builtin_ia32_kshiftlisi:
15207  case X86::BI__builtin_ia32_kshiftlidi: {
15208    unsigned ShiftVal = cast<llvm::ConstantInt>(Ops[1])->getZExtValue() & 0xff;
15209    unsigned NumElts = Ops[0]->getType()->getIntegerBitWidth();
15210
15211    if (ShiftVal >= NumElts)
15212      return llvm::Constant::getNullValue(Ops[0]->getType());
15213
15214    Value *In = getMaskVecValue(*this, Ops[0], NumElts);
15215
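    // Shift the mask left by shuffling zeros into the low bit positions.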
15216    int Indices[64];
15217    for (unsigned i = 0; i != NumElts; ++i)
15218      Indices[i] = NumElts + i - ShiftVal;
15219
15220    Value *Zero = llvm::Constant::getNullValue(In->getType());
15221    Value *SV = Builder.CreateShuffleVector(
15222        Zero, In, ArrayRef(Indices, NumElts), "kshiftl");
15223    return Builder.CreateBitCast(SV, Ops[0]->getType());
15224  }
15225  case X86::BI__builtin_ia32_kshiftriqi:
15226  case X86::BI__builtin_ia32_kshiftrihi:
15227  case X86::BI__builtin_ia32_kshiftrisi:
15228  case X86::BI__builtin_ia32_kshiftridi: {
15229    unsigned ShiftVal = cast<llvm::ConstantInt>(Ops[1])->getZExtValue() & 0xff;
15230    unsigned NumElts = Ops[0]->getType()->getIntegerBitWidth();
15231
15232    if (ShiftVal >= NumElts)
15233      return llvm::Constant::getNullValue(Ops[0]->getType());
15234
15235    Value *In = getMaskVecValue(*this, Ops[0], NumElts);
15236
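    // Shift the mask right by shuffling zeros into the high bit positions.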
15237    int Indices[64];
15238    for (unsigned i = 0; i != NumElts; ++i)
15239      Indices[i] = i + ShiftVal;
15240
15241    Value *Zero = llvm::Constant::getNullValue(In->getType());
15242    Value *SV = Builder.CreateShuffleVector(
15243        In, Zero, ArrayRef(Indices, NumElts), "kshiftr");
15244    return Builder.CreateBitCast(SV, Ops[0]->getType());
15245  }
15246  case X86::BI__builtin_ia32_movnti:
15247  case X86::BI__builtin_ia32_movnti64:
15248  case X86::BI__builtin_ia32_movntsd:
15249  case X86::BI__builtin_ia32_movntss: {
15250    llvm::MDNode *Node = llvm::MDNode::get(
15251        getLLVMContext(), llvm::ConstantAsMetadata::get(Builder.getInt32(1)));
15252
15253    Value *Ptr = Ops[0];
15254    Value *Src = Ops[1];
15255
15256    // Extract the 0'th element of the source vector.
15257    if (BuiltinID == X86::BI__builtin_ia32_movntsd ||
15258        BuiltinID == X86::BI__builtin_ia32_movntss)
15259      Src = Builder.CreateExtractElement(Src, (uint64_t)0, "extract");
15260
15261    // Unaligned nontemporal store of the scalar value.
15262    StoreInst *SI = Builder.CreateDefaultAlignedStore(Src, Ptr);
15263    SI->setMetadata(llvm::LLVMContext::MD_nontemporal, Node);
15264    SI->setAlignment(llvm::Align(1));
15265    return SI;
15266  }
15267  // Rotate is a special case of funnel shift - the first two args are the same.
15268  case X86::BI__builtin_ia32_vprotb:
15269  case X86::BI__builtin_ia32_vprotw:
15270  case X86::BI__builtin_ia32_vprotd:
15271  case X86::BI__builtin_ia32_vprotq:
15272  case X86::BI__builtin_ia32_vprotbi:
15273  case X86::BI__builtin_ia32_vprotwi:
15274  case X86::BI__builtin_ia32_vprotdi:
15275  case X86::BI__builtin_ia32_vprotqi:
15276  case X86::BI__builtin_ia32_prold128:
15277  case X86::BI__builtin_ia32_prold256:
15278  case X86::BI__builtin_ia32_prold512:
15279  case X86::BI__builtin_ia32_prolq128:
15280  case X86::BI__builtin_ia32_prolq256:
15281  case X86::BI__builtin_ia32_prolq512:
15282  case X86::BI__builtin_ia32_prolvd128:
15283  case X86::BI__builtin_ia32_prolvd256:
15284  case X86::BI__builtin_ia32_prolvd512:
15285  case X86::BI__builtin_ia32_prolvq128:
15286  case X86::BI__builtin_ia32_prolvq256:
15287  case X86::BI__builtin_ia32_prolvq512:
15288    return EmitX86FunnelShift(*this, Ops[0], Ops[0], Ops[1], false);
15289  case X86::BI__builtin_ia32_prord128:
15290  case X86::BI__builtin_ia32_prord256:
15291  case X86::BI__builtin_ia32_prord512:
15292  case X86::BI__builtin_ia32_prorq128:
15293  case X86::BI__builtin_ia32_prorq256:
15294  case X86::BI__builtin_ia32_prorq512:
15295  case X86::BI__builtin_ia32_prorvd128:
15296  case X86::BI__builtin_ia32_prorvd256:
15297  case X86::BI__builtin_ia32_prorvd512:
15298  case X86::BI__builtin_ia32_prorvq128:
15299  case X86::BI__builtin_ia32_prorvq256:
15300  case X86::BI__builtin_ia32_prorvq512:
15301    return EmitX86FunnelShift(*this, Ops[0], Ops[0], Ops[1], true);
15302  case X86::BI__builtin_ia32_selectb_128:
15303  case X86::BI__builtin_ia32_selectb_256:
15304  case X86::BI__builtin_ia32_selectb_512:
15305  case X86::BI__builtin_ia32_selectw_128:
15306  case X86::BI__builtin_ia32_selectw_256:
15307  case X86::BI__builtin_ia32_selectw_512:
15308  case X86::BI__builtin_ia32_selectd_128:
15309  case X86::BI__builtin_ia32_selectd_256:
15310  case X86::BI__builtin_ia32_selectd_512:
15311  case X86::BI__builtin_ia32_selectq_128:
15312  case X86::BI__builtin_ia32_selectq_256:
15313  case X86::BI__builtin_ia32_selectq_512:
15314  case X86::BI__builtin_ia32_selectph_128:
15315  case X86::BI__builtin_ia32_selectph_256:
15316  case X86::BI__builtin_ia32_selectph_512:
15317  case X86::BI__builtin_ia32_selectpbf_128:
15318  case X86::BI__builtin_ia32_selectpbf_256:
15319  case X86::BI__builtin_ia32_selectpbf_512:
15320  case X86::BI__builtin_ia32_selectps_128:
15321  case X86::BI__builtin_ia32_selectps_256:
15322  case X86::BI__builtin_ia32_selectps_512:
15323  case X86::BI__builtin_ia32_selectpd_128:
15324  case X86::BI__builtin_ia32_selectpd_256:
15325  case X86::BI__builtin_ia32_selectpd_512:
15326    return EmitX86Select(*this, Ops[0], Ops[1], Ops[2]);
15327  case X86::BI__builtin_ia32_selectsh_128:
15328  case X86::BI__builtin_ia32_selectsbf_128:
15329  case X86::BI__builtin_ia32_selectss_128:
15330  case X86::BI__builtin_ia32_selectsd_128: {
15331    Value *A = Builder.CreateExtractElement(Ops[1], (uint64_t)0);
15332    Value *B = Builder.CreateExtractElement(Ops[2], (uint64_t)0);
15333    A = EmitX86ScalarSelect(*this, Ops[0], A, B);
15334    return Builder.CreateInsertElement(Ops[1], A, (uint64_t)0);
15335  }
15336  case X86::BI__builtin_ia32_cmpb128_mask:
15337  case X86::BI__builtin_ia32_cmpb256_mask:
15338  case X86::BI__builtin_ia32_cmpb512_mask:
15339  case X86::BI__builtin_ia32_cmpw128_mask:
15340  case X86::BI__builtin_ia32_cmpw256_mask:
15341  case X86::BI__builtin_ia32_cmpw512_mask:
15342  case X86::BI__builtin_ia32_cmpd128_mask:
15343  case X86::BI__builtin_ia32_cmpd256_mask:
15344  case X86::BI__builtin_ia32_cmpd512_mask:
15345  case X86::BI__builtin_ia32_cmpq128_mask:
15346  case X86::BI__builtin_ia32_cmpq256_mask:
15347  case X86::BI__builtin_ia32_cmpq512_mask: {
15348    unsigned CC = cast<llvm::ConstantInt>(Ops[2])->getZExtValue() & 0x7;
15349    return EmitX86MaskedCompare(*this, CC, true, Ops);
15350  }
15351  case X86::BI__builtin_ia32_ucmpb128_mask:
15352  case X86::BI__builtin_ia32_ucmpb256_mask:
15353  case X86::BI__builtin_ia32_ucmpb512_mask:
15354  case X86::BI__builtin_ia32_ucmpw128_mask:
15355  case X86::BI__builtin_ia32_ucmpw256_mask:
15356  case X86::BI__builtin_ia32_ucmpw512_mask:
15357  case X86::BI__builtin_ia32_ucmpd128_mask:
15358  case X86::BI__builtin_ia32_ucmpd256_mask:
15359  case X86::BI__builtin_ia32_ucmpd512_mask:
15360  case X86::BI__builtin_ia32_ucmpq128_mask:
15361  case X86::BI__builtin_ia32_ucmpq256_mask:
15362  case X86::BI__builtin_ia32_ucmpq512_mask: {
15363    unsigned CC = cast<llvm::ConstantInt>(Ops[2])->getZExtValue() & 0x7;
15364    return EmitX86MaskedCompare(*this, CC, false, Ops);
15365  }
15366  case X86::BI__builtin_ia32_vpcomb:
15367  case X86::BI__builtin_ia32_vpcomw:
15368  case X86::BI__builtin_ia32_vpcomd:
15369  case X86::BI__builtin_ia32_vpcomq:
15370    return EmitX86vpcom(*this, Ops, true);
15371  case X86::BI__builtin_ia32_vpcomub:
15372  case X86::BI__builtin_ia32_vpcomuw:
15373  case X86::BI__builtin_ia32_vpcomud:
15374  case X86::BI__builtin_ia32_vpcomuq:
15375    return EmitX86vpcom(*this, Ops, false);
15376
15377  case X86::BI__builtin_ia32_kortestcqi:
15378  case X86::BI__builtin_ia32_kortestchi:
15379  case X86::BI__builtin_ia32_kortestcsi:
15380  case X86::BI__builtin_ia32_kortestcdi: {
15381    Value *Or = EmitX86MaskLogic(*this, Instruction::Or, Ops);
15382    Value *C = llvm::Constant::getAllOnesValue(Ops[0]->getType());
15383    Value *Cmp = Builder.CreateICmpEQ(Or, C);
15384    return Builder.CreateZExt(Cmp, ConvertType(E->getType()));
15385  }
15386  case X86::BI__builtin_ia32_kortestzqi:
15387  case X86::BI__builtin_ia32_kortestzhi:
15388  case X86::BI__builtin_ia32_kortestzsi:
15389  case X86::BI__builtin_ia32_kortestzdi: {
15390    Value *Or = EmitX86MaskLogic(*this, Instruction::Or, Ops);
15391    Value *C = llvm::Constant::getNullValue(Ops[0]->getType());
15392    Value *Cmp = Builder.CreateICmpEQ(Or, C);
15393    return Builder.CreateZExt(Cmp, ConvertType(E->getType()));
15394  }
15395
15396  case X86::BI__builtin_ia32_ktestcqi:
15397  case X86::BI__builtin_ia32_ktestzqi:
15398  case X86::BI__builtin_ia32_ktestchi:
15399  case X86::BI__builtin_ia32_ktestzhi:
15400  case X86::BI__builtin_ia32_ktestcsi:
15401  case X86::BI__builtin_ia32_ktestzsi:
15402  case X86::BI__builtin_ia32_ktestcdi:
15403  case X86::BI__builtin_ia32_ktestzdi: {
15404    Intrinsic::ID IID;
15405    switch (BuiltinID) {
15406    default: llvm_unreachable("Unsupported intrinsic!");
15407    case X86::BI__builtin_ia32_ktestcqi:
15408      IID = Intrinsic::x86_avx512_ktestc_b;
15409      break;
15410    case X86::BI__builtin_ia32_ktestzqi:
15411      IID = Intrinsic::x86_avx512_ktestz_b;
15412      break;
15413    case X86::BI__builtin_ia32_ktestchi:
15414      IID = Intrinsic::x86_avx512_ktestc_w;
15415      break;
15416    case X86::BI__builtin_ia32_ktestzhi:
15417      IID = Intrinsic::x86_avx512_ktestz_w;
15418      break;
15419    case X86::BI__builtin_ia32_ktestcsi:
15420      IID = Intrinsic::x86_avx512_ktestc_d;
15421      break;
15422    case X86::BI__builtin_ia32_ktestzsi:
15423      IID = Intrinsic::x86_avx512_ktestz_d;
15424      break;
15425    case X86::BI__builtin_ia32_ktestcdi:
15426      IID = Intrinsic::x86_avx512_ktestc_q;
15427      break;
15428    case X86::BI__builtin_ia32_ktestzdi:
15429      IID = Intrinsic::x86_avx512_ktestz_q;
15430      break;
15431    }
15432
15433    unsigned NumElts = Ops[0]->getType()->getIntegerBitWidth();
15434    Value *LHS = getMaskVecValue(*this, Ops[0], NumElts);
15435    Value *RHS = getMaskVecValue(*this, Ops[1], NumElts);
15436    Function *Intr = CGM.getIntrinsic(IID);
15437    return Builder.CreateCall(Intr, {LHS, RHS});
15438  }
15439
15440  case X86::BI__builtin_ia32_kaddqi:
15441  case X86::BI__builtin_ia32_kaddhi:
15442  case X86::BI__builtin_ia32_kaddsi:
15443  case X86::BI__builtin_ia32_kadddi: {
15444    Intrinsic::ID IID;
15445    switch (BuiltinID) {
15446    default: llvm_unreachable("Unsupported intrinsic!");
15447    case X86::BI__builtin_ia32_kaddqi:
15448      IID = Intrinsic::x86_avx512_kadd_b;
15449      break;
15450    case X86::BI__builtin_ia32_kaddhi:
15451      IID = Intrinsic::x86_avx512_kadd_w;
15452      break;
15453    case X86::BI__builtin_ia32_kaddsi:
15454      IID = Intrinsic::x86_avx512_kadd_d;
15455      break;
15456    case X86::BI__builtin_ia32_kadddi:
15457      IID = Intrinsic::x86_avx512_kadd_q;
15458      break;
15459    }
15460
15461    unsigned NumElts = Ops[0]->getType()->getIntegerBitWidth();
15462    Value *LHS = getMaskVecValue(*this, Ops[0], NumElts);
15463    Value *RHS = getMaskVecValue(*this, Ops[1], NumElts);
15464    Function *Intr = CGM.getIntrinsic(IID);
15465    Value *Res = Builder.CreateCall(Intr, {LHS, RHS});
15466    return Builder.CreateBitCast(Res, Ops[0]->getType());
15467  }
15468  case X86::BI__builtin_ia32_kandqi:
15469  case X86::BI__builtin_ia32_kandhi:
15470  case X86::BI__builtin_ia32_kandsi:
15471  case X86::BI__builtin_ia32_kanddi:
15472    return EmitX86MaskLogic(*this, Instruction::And, Ops);
15473  case X86::BI__builtin_ia32_kandnqi:
15474  case X86::BI__builtin_ia32_kandnhi:
15475  case X86::BI__builtin_ia32_kandnsi:
15476  case X86::BI__builtin_ia32_kandndi:
15477    return EmitX86MaskLogic(*this, Instruction::And, Ops, true);
15478  case X86::BI__builtin_ia32_korqi:
15479  case X86::BI__builtin_ia32_korhi:
15480  case X86::BI__builtin_ia32_korsi:
15481  case X86::BI__builtin_ia32_kordi:
15482    return EmitX86MaskLogic(*this, Instruction::Or, Ops);
15483  case X86::BI__builtin_ia32_kxnorqi:
15484  case X86::BI__builtin_ia32_kxnorhi:
15485  case X86::BI__builtin_ia32_kxnorsi:
15486  case X86::BI__builtin_ia32_kxnordi:
15487    return EmitX86MaskLogic(*this, Instruction::Xor, Ops, true);
15488  case X86::BI__builtin_ia32_kxorqi:
15489  case X86::BI__builtin_ia32_kxorhi:
15490  case X86::BI__builtin_ia32_kxorsi:
15491  case X86::BI__builtin_ia32_kxordi:
15492    return EmitX86MaskLogic(*this, Instruction::Xor, Ops);
15493  case X86::BI__builtin_ia32_knotqi:
15494  case X86::BI__builtin_ia32_knothi:
15495  case X86::BI__builtin_ia32_knotsi:
15496  case X86::BI__builtin_ia32_knotdi: {
15497    unsigned NumElts = Ops[0]->getType()->getIntegerBitWidth();
15498    Value *Res = getMaskVecValue(*this, Ops[0], NumElts);
15499    return Builder.CreateBitCast(Builder.CreateNot(Res),
15500                                 Ops[0]->getType());
15501  }
15502  case X86::BI__builtin_ia32_kmovb:
15503  case X86::BI__builtin_ia32_kmovw:
15504  case X86::BI__builtin_ia32_kmovd:
15505  case X86::BI__builtin_ia32_kmovq: {
15506    // Bitcast to vXi1 type and then back to integer. This gets the mask
15507    // register type into the IR, but might be optimized out depending on
15508    // what's around it.
15509    unsigned NumElts = Ops[0]->getType()->getIntegerBitWidth();
15510    Value *Res = getMaskVecValue(*this, Ops[0], NumElts);
15511    return Builder.CreateBitCast(Res, Ops[0]->getType());
15512  }
15513
15514  case X86::BI__builtin_ia32_kunpckdi:
15515  case X86::BI__builtin_ia32_kunpcksi:
15516  case X86::BI__builtin_ia32_kunpckhi: {
15517    unsigned NumElts = Ops[0]->getType()->getIntegerBitWidth();
15518    Value *LHS = getMaskVecValue(*this, Ops[0], NumElts);
15519    Value *RHS = getMaskVecValue(*this, Ops[1], NumElts);
15520    int Indices[64];
15521    for (unsigned i = 0; i != NumElts; ++i)
15522      Indices[i] = i;
15523
15524    // First extract half of each vector. This gives better codegen than
15525    // doing it in a single shuffle.
15526    LHS = Builder.CreateShuffleVector(LHS, LHS, ArrayRef(Indices, NumElts / 2));
15527    RHS = Builder.CreateShuffleVector(RHS, RHS, ArrayRef(Indices, NumElts / 2));
15528    // Concat the vectors.
15529    // NOTE: Operands are swapped to match the intrinsic definition.
15530    Value *Res =
15531        Builder.CreateShuffleVector(RHS, LHS, ArrayRef(Indices, NumElts));
15532    return Builder.CreateBitCast(Res, Ops[0]->getType());
15533  }
15534
15535  case X86::BI__builtin_ia32_vplzcntd_128:
15536  case X86::BI__builtin_ia32_vplzcntd_256:
15537  case X86::BI__builtin_ia32_vplzcntd_512:
15538  case X86::BI__builtin_ia32_vplzcntq_128:
15539  case X86::BI__builtin_ia32_vplzcntq_256:
15540  case X86::BI__builtin_ia32_vplzcntq_512: {
15541    Function *F = CGM.getIntrinsic(Intrinsic::ctlz, Ops[0]->getType());
15542    return Builder.CreateCall(F, {Ops[0], Builder.getInt1(false)});
15543  }
15544  case X86::BI__builtin_ia32_sqrtss:
15545  case X86::BI__builtin_ia32_sqrtsd: {
15546    Value *A = Builder.CreateExtractElement(Ops[0], (uint64_t)0);
15547    Function *F;
15548    if (Builder.getIsFPConstrained()) {
15549      CodeGenFunction::CGFPOptionsRAII FPOptsRAII(*this, E);
15550      F = CGM.getIntrinsic(Intrinsic::experimental_constrained_sqrt,
15551                           A->getType());
15552      A = Builder.CreateConstrainedFPCall(F, {A});
15553    } else {
15554      F = CGM.getIntrinsic(Intrinsic::sqrt, A->getType());
15555      A = Builder.CreateCall(F, {A});
15556    }
15557    return Builder.CreateInsertElement(Ops[0], A, (uint64_t)0);
15558  }
15559  case X86::BI__builtin_ia32_sqrtsh_round_mask:
15560  case X86::BI__builtin_ia32_sqrtsd_round_mask:
15561  case X86::BI__builtin_ia32_sqrtss_round_mask: {
15562    unsigned CC = cast<llvm::ConstantInt>(Ops[4])->getZExtValue();
15563    // Only lower to a generic sqrt when the rounding mode is 4 (AKA CUR_DIRECTION);
15564    // otherwise keep the target-specific intrinsic.
15565    if (CC != 4) {
15566      Intrinsic::ID IID;
15567
15568      switch (BuiltinID) {
15569      default:
15570        llvm_unreachable("Unsupported intrinsic!");
15571      case X86::BI__builtin_ia32_sqrtsh_round_mask:
15572        IID = Intrinsic::x86_avx512fp16_mask_sqrt_sh;
15573        break;
15574      case X86::BI__builtin_ia32_sqrtsd_round_mask:
15575        IID = Intrinsic::x86_avx512_mask_sqrt_sd;
15576        break;
15577      case X86::BI__builtin_ia32_sqrtss_round_mask:
15578        IID = Intrinsic::x86_avx512_mask_sqrt_ss;
15579        break;
15580      }
15581      return Builder.CreateCall(CGM.getIntrinsic(IID), Ops);
15582    }
15583    Value *A = Builder.CreateExtractElement(Ops[1], (uint64_t)0);
15584    Function *F;
15585    if (Builder.getIsFPConstrained()) {
15586      CodeGenFunction::CGFPOptionsRAII FPOptsRAII(*this, E);
15587      F = CGM.getIntrinsic(Intrinsic::experimental_constrained_sqrt,
15588                           A->getType());
15589      A = Builder.CreateConstrainedFPCall(F, A);
15590    } else {
15591      F = CGM.getIntrinsic(Intrinsic::sqrt, A->getType());
15592      A = Builder.CreateCall(F, A);
15593    }
15594    Value *Src = Builder.CreateExtractElement(Ops[2], (uint64_t)0);
15595    A = EmitX86ScalarSelect(*this, Ops[3], A, Src);
15596    return Builder.CreateInsertElement(Ops[0], A, (uint64_t)0);
15597  }
15598  case X86::BI__builtin_ia32_sqrtpd256:
15599  case X86::BI__builtin_ia32_sqrtpd:
15600  case X86::BI__builtin_ia32_sqrtps256:
15601  case X86::BI__builtin_ia32_sqrtps:
15602  case X86::BI__builtin_ia32_sqrtph256:
15603  case X86::BI__builtin_ia32_sqrtph:
15604  case X86::BI__builtin_ia32_sqrtph512:
15605  case X86::BI__builtin_ia32_sqrtps512:
15606  case X86::BI__builtin_ia32_sqrtpd512: {
15607    if (Ops.size() == 2) {
15608      unsigned CC = cast<llvm::ConstantInt>(Ops[1])->getZExtValue();
15609      // Only lower to a generic sqrt when the rounding mode is 4 (AKA CUR_DIRECTION);
15610      // otherwise keep the target-specific intrinsic.
15611      if (CC != 4) {
15612        Intrinsic::ID IID;
15613
15614        switch (BuiltinID) {
15615        default:
15616          llvm_unreachable("Unsupported intrinsic!");
15617        case X86::BI__builtin_ia32_sqrtph512:
15618          IID = Intrinsic::x86_avx512fp16_sqrt_ph_512;
15619          break;
15620        case X86::BI__builtin_ia32_sqrtps512:
15621          IID = Intrinsic::x86_avx512_sqrt_ps_512;
15622          break;
15623        case X86::BI__builtin_ia32_sqrtpd512:
15624          IID = Intrinsic::x86_avx512_sqrt_pd_512;
15625          break;
15626        }
15627        return Builder.CreateCall(CGM.getIntrinsic(IID), Ops);
15628      }
15629    }
15630    if (Builder.getIsFPConstrained()) {
15631      CodeGenFunction::CGFPOptionsRAII FPOptsRAII(*this, E);
15632      Function *F = CGM.getIntrinsic(Intrinsic::experimental_constrained_sqrt,
15633                                     Ops[0]->getType());
15634      return Builder.CreateConstrainedFPCall(F, Ops[0]);
15635    } else {
15636      Function *F = CGM.getIntrinsic(Intrinsic::sqrt, Ops[0]->getType());
15637      return Builder.CreateCall(F, Ops[0]);
15638    }
15639  }
15640
15641  case X86::BI__builtin_ia32_pmuludq128:
15642  case X86::BI__builtin_ia32_pmuludq256:
15643  case X86::BI__builtin_ia32_pmuludq512:
15644    return EmitX86Muldq(*this, /*IsSigned*/false, Ops);
15645
15646  case X86::BI__builtin_ia32_pmuldq128:
15647  case X86::BI__builtin_ia32_pmuldq256:
15648  case X86::BI__builtin_ia32_pmuldq512:
15649    return EmitX86Muldq(*this, /*IsSigned*/true, Ops);
15650
15651  case X86::BI__builtin_ia32_pternlogd512_mask:
15652  case X86::BI__builtin_ia32_pternlogq512_mask:
15653  case X86::BI__builtin_ia32_pternlogd128_mask:
15654  case X86::BI__builtin_ia32_pternlogd256_mask:
15655  case X86::BI__builtin_ia32_pternlogq128_mask:
15656  case X86::BI__builtin_ia32_pternlogq256_mask:
15657    return EmitX86Ternlog(*this, /*ZeroMask*/false, Ops);
15658
15659  case X86::BI__builtin_ia32_pternlogd512_maskz:
15660  case X86::BI__builtin_ia32_pternlogq512_maskz:
15661  case X86::BI__builtin_ia32_pternlogd128_maskz:
15662  case X86::BI__builtin_ia32_pternlogd256_maskz:
15663  case X86::BI__builtin_ia32_pternlogq128_maskz:
15664  case X86::BI__builtin_ia32_pternlogq256_maskz:
15665    return EmitX86Ternlog(*this, /*ZeroMask*/true, Ops);
15666
15667  case X86::BI__builtin_ia32_vpshldd128:
15668  case X86::BI__builtin_ia32_vpshldd256:
15669  case X86::BI__builtin_ia32_vpshldd512:
15670  case X86::BI__builtin_ia32_vpshldq128:
15671  case X86::BI__builtin_ia32_vpshldq256:
15672  case X86::BI__builtin_ia32_vpshldq512:
15673  case X86::BI__builtin_ia32_vpshldw128:
15674  case X86::BI__builtin_ia32_vpshldw256:
15675  case X86::BI__builtin_ia32_vpshldw512:
15676    return EmitX86FunnelShift(*this, Ops[0], Ops[1], Ops[2], false);
15677
15678  case X86::BI__builtin_ia32_vpshrdd128:
15679  case X86::BI__builtin_ia32_vpshrdd256:
15680  case X86::BI__builtin_ia32_vpshrdd512:
15681  case X86::BI__builtin_ia32_vpshrdq128:
15682  case X86::BI__builtin_ia32_vpshrdq256:
15683  case X86::BI__builtin_ia32_vpshrdq512:
15684  case X86::BI__builtin_ia32_vpshrdw128:
15685  case X86::BI__builtin_ia32_vpshrdw256:
15686  case X86::BI__builtin_ia32_vpshrdw512:
15687    // Ops 0 and 1 are swapped.
15688    return EmitX86FunnelShift(*this, Ops[1], Ops[0], Ops[2], true);
15689
15690  case X86::BI__builtin_ia32_vpshldvd128:
15691  case X86::BI__builtin_ia32_vpshldvd256:
15692  case X86::BI__builtin_ia32_vpshldvd512:
15693  case X86::BI__builtin_ia32_vpshldvq128:
15694  case X86::BI__builtin_ia32_vpshldvq256:
15695  case X86::BI__builtin_ia32_vpshldvq512:
15696  case X86::BI__builtin_ia32_vpshldvw128:
15697  case X86::BI__builtin_ia32_vpshldvw256:
15698  case X86::BI__builtin_ia32_vpshldvw512:
15699    return EmitX86FunnelShift(*this, Ops[0], Ops[1], Ops[2], false);
15700
15701  case X86::BI__builtin_ia32_vpshrdvd128:
15702  case X86::BI__builtin_ia32_vpshrdvd256:
15703  case X86::BI__builtin_ia32_vpshrdvd512:
15704  case X86::BI__builtin_ia32_vpshrdvq128:
15705  case X86::BI__builtin_ia32_vpshrdvq256:
15706  case X86::BI__builtin_ia32_vpshrdvq512:
15707  case X86::BI__builtin_ia32_vpshrdvw128:
15708  case X86::BI__builtin_ia32_vpshrdvw256:
15709  case X86::BI__builtin_ia32_vpshrdvw512:
15710    // Ops 0 and 1 are swapped.
15711    return EmitX86FunnelShift(*this, Ops[1], Ops[0], Ops[2], true);
15712
15713  // Reductions
15714  case X86::BI__builtin_ia32_reduce_fadd_pd512:
15715  case X86::BI__builtin_ia32_reduce_fadd_ps512:
15716  case X86::BI__builtin_ia32_reduce_fadd_ph512:
15717  case X86::BI__builtin_ia32_reduce_fadd_ph256:
15718  case X86::BI__builtin_ia32_reduce_fadd_ph128: {
15719    Function *F =
15720        CGM.getIntrinsic(Intrinsic::vector_reduce_fadd, Ops[1]->getType());
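    // Allow reassociation so the reduce intrinsic can be lowered as an
    // unordered (tree) reduction.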
15721    IRBuilder<>::FastMathFlagGuard FMFGuard(Builder);
15722    Builder.getFastMathFlags().setAllowReassoc();
15723    return Builder.CreateCall(F, {Ops[0], Ops[1]});
15724  }
15725  case X86::BI__builtin_ia32_reduce_fmul_pd512:
15726  case X86::BI__builtin_ia32_reduce_fmul_ps512:
15727  case X86::BI__builtin_ia32_reduce_fmul_ph512:
15728  case X86::BI__builtin_ia32_reduce_fmul_ph256:
15729  case X86::BI__builtin_ia32_reduce_fmul_ph128: {
15730    Function *F =
15731        CGM.getIntrinsic(Intrinsic::vector_reduce_fmul, Ops[1]->getType());
15732    IRBuilder<>::FastMathFlagGuard FMFGuard(Builder);
15733    Builder.getFastMathFlags().setAllowReassoc();
15734    return Builder.CreateCall(F, {Ops[0], Ops[1]});
15735  }
15736  case X86::BI__builtin_ia32_reduce_fmax_pd512:
15737  case X86::BI__builtin_ia32_reduce_fmax_ps512:
15738  case X86::BI__builtin_ia32_reduce_fmax_ph512:
15739  case X86::BI__builtin_ia32_reduce_fmax_ph256:
15740  case X86::BI__builtin_ia32_reduce_fmax_ph128: {
15741    Function *F =
15742        CGM.getIntrinsic(Intrinsic::vector_reduce_fmax, Ops[0]->getType());
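    // Assume no NaN inputs so the reduction can lower to plain compares.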
15743    IRBuilder<>::FastMathFlagGuard FMFGuard(Builder);
15744    Builder.getFastMathFlags().setNoNaNs();
15745    return Builder.CreateCall(F, {Ops[0]});
15746  }
15747  case X86::BI__builtin_ia32_reduce_fmin_pd512:
15748  case X86::BI__builtin_ia32_reduce_fmin_ps512:
15749  case X86::BI__builtin_ia32_reduce_fmin_ph512:
15750  case X86::BI__builtin_ia32_reduce_fmin_ph256:
15751  case X86::BI__builtin_ia32_reduce_fmin_ph128: {
15752    Function *F =
15753        CGM.getIntrinsic(Intrinsic::vector_reduce_fmin, Ops[0]->getType());
15754    IRBuilder<>::FastMathFlagGuard FMFGuard(Builder);
15755    Builder.getFastMathFlags().setNoNaNs();
15756    return Builder.CreateCall(F, {Ops[0]});
15757  }
15758
15759  // 3DNow!
15760  case X86::BI__builtin_ia32_pswapdsf:
15761  case X86::BI__builtin_ia32_pswapdsi: {
15762    llvm::Type *MMXTy = llvm::Type::getX86_MMXTy(getLLVMContext());
15763    Ops[0] = Builder.CreateBitCast(Ops[0], MMXTy, "cast");
15764    llvm::Function *F = CGM.getIntrinsic(Intrinsic::x86_3dnowa_pswapd);
15765    return Builder.CreateCall(F, Ops, "pswapd");
15766  }
15767  case X86::BI__builtin_ia32_rdrand16_step:
15768  case X86::BI__builtin_ia32_rdrand32_step:
15769  case X86::BI__builtin_ia32_rdrand64_step:
15770  case X86::BI__builtin_ia32_rdseed16_step:
15771  case X86::BI__builtin_ia32_rdseed32_step:
15772  case X86::BI__builtin_ia32_rdseed64_step: {
15773    Intrinsic::ID ID;
15774    switch (BuiltinID) {
15775    default: llvm_unreachable("Unsupported intrinsic!");
15776    case X86::BI__builtin_ia32_rdrand16_step:
15777      ID = Intrinsic::x86_rdrand_16;
15778      break;
15779    case X86::BI__builtin_ia32_rdrand32_step:
15780      ID = Intrinsic::x86_rdrand_32;
15781      break;
15782    case X86::BI__builtin_ia32_rdrand64_step:
15783      ID = Intrinsic::x86_rdrand_64;
15784      break;
15785    case X86::BI__builtin_ia32_rdseed16_step:
15786      ID = Intrinsic::x86_rdseed_16;
15787      break;
15788    case X86::BI__builtin_ia32_rdseed32_step:
15789      ID = Intrinsic::x86_rdseed_32;
15790      break;
15791    case X86::BI__builtin_ia32_rdseed64_step:
15792      ID = Intrinsic::x86_rdseed_64;
15793      break;
15794    }
15795
15796    Value *Call = Builder.CreateCall(CGM.getIntrinsic(ID));
15797    Builder.CreateDefaultAlignedStore(Builder.CreateExtractValue(Call, 0),
15798                                      Ops[0]);
15799    return Builder.CreateExtractValue(Call, 1);
15800  }
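  // addcarryx/subborrow lower to the llvm.x86.addcarry/subborrow intrinsics,
  // which return a {flag, result} pair: the arithmetic result is stored
  // through the out-pointer (Ops[3]) and the carry/borrow flag is returned.
  // Illustrative usage:
  //   unsigned int sum;
  //   unsigned char c = _addcarryx_u32(carry_in, a, b, &sum);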
15801  case X86::BI__builtin_ia32_addcarryx_u32:
15802  case X86::BI__builtin_ia32_addcarryx_u64:
15803  case X86::BI__builtin_ia32_subborrow_u32:
15804  case X86::BI__builtin_ia32_subborrow_u64: {
15805    Intrinsic::ID IID;
15806    switch (BuiltinID) {
15807    default: llvm_unreachable("Unsupported intrinsic!");
15808    case X86::BI__builtin_ia32_addcarryx_u32:
15809      IID = Intrinsic::x86_addcarry_32;
15810      break;
15811    case X86::BI__builtin_ia32_addcarryx_u64:
15812      IID = Intrinsic::x86_addcarry_64;
15813      break;
15814    case X86::BI__builtin_ia32_subborrow_u32:
15815      IID = Intrinsic::x86_subborrow_32;
15816      break;
15817    case X86::BI__builtin_ia32_subborrow_u64:
15818      IID = Intrinsic::x86_subborrow_64;
15819      break;
15820    }
15821
15822    Value *Call = Builder.CreateCall(CGM.getIntrinsic(IID),
15823                                     { Ops[0], Ops[1], Ops[2] });
15824    Builder.CreateDefaultAlignedStore(Builder.CreateExtractValue(Call, 1),
15825                                      Ops[3]);
15826    return Builder.CreateExtractValue(Call, 0);
15827  }
15828
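  // vfpclass builtins: the fpclass intrinsics produce one i1 per element; the
  // result is merged with the incoming mask (MaskIn) and packed into an
  // integer bitmask by EmitX86MaskedCompareResult.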
15829  case X86::BI__builtin_ia32_fpclassps128_mask:
15830  case X86::BI__builtin_ia32_fpclassps256_mask:
15831  case X86::BI__builtin_ia32_fpclassps512_mask:
15832  case X86::BI__builtin_ia32_fpclassph128_mask:
15833  case X86::BI__builtin_ia32_fpclassph256_mask:
15834  case X86::BI__builtin_ia32_fpclassph512_mask:
15835  case X86::BI__builtin_ia32_fpclasspd128_mask:
15836  case X86::BI__builtin_ia32_fpclasspd256_mask:
15837  case X86::BI__builtin_ia32_fpclasspd512_mask: {
15838    unsigned NumElts =
15839        cast<llvm::FixedVectorType>(Ops[0]->getType())->getNumElements();
15840    Value *MaskIn = Ops[2];
15841    Ops.erase(&Ops[2]);
15842
15843    Intrinsic::ID ID;
15844    switch (BuiltinID) {
15845    default: llvm_unreachable("Unsupported intrinsic!");
15846    case X86::BI__builtin_ia32_fpclassph128_mask:
15847      ID = Intrinsic::x86_avx512fp16_fpclass_ph_128;
15848      break;
15849    case X86::BI__builtin_ia32_fpclassph256_mask:
15850      ID = Intrinsic::x86_avx512fp16_fpclass_ph_256;
15851      break;
15852    case X86::BI__builtin_ia32_fpclassph512_mask:
15853      ID = Intrinsic::x86_avx512fp16_fpclass_ph_512;
15854      break;
15855    case X86::BI__builtin_ia32_fpclassps128_mask:
15856      ID = Intrinsic::x86_avx512_fpclass_ps_128;
15857      break;
15858    case X86::BI__builtin_ia32_fpclassps256_mask:
15859      ID = Intrinsic::x86_avx512_fpclass_ps_256;
15860      break;
15861    case X86::BI__builtin_ia32_fpclassps512_mask:
15862      ID = Intrinsic::x86_avx512_fpclass_ps_512;
15863      break;
15864    case X86::BI__builtin_ia32_fpclasspd128_mask:
15865      ID = Intrinsic::x86_avx512_fpclass_pd_128;
15866      break;
15867    case X86::BI__builtin_ia32_fpclasspd256_mask:
15868      ID = Intrinsic::x86_avx512_fpclass_pd_256;
15869      break;
15870    case X86::BI__builtin_ia32_fpclasspd512_mask:
15871      ID = Intrinsic::x86_avx512_fpclass_pd_512;
15872      break;
15873    }
15874
15875    Value *Fpclass = Builder.CreateCall(CGM.getIntrinsic(ID), Ops);
15876    return EmitX86MaskedCompareResult(*this, Fpclass, NumElts, MaskIn);
15877  }
15878
15879  case X86::BI__builtin_ia32_vp2intersect_q_512:
15880  case X86::BI__builtin_ia32_vp2intersect_q_256:
15881  case X86::BI__builtin_ia32_vp2intersect_q_128:
15882  case X86::BI__builtin_ia32_vp2intersect_d_512:
15883  case X86::BI__builtin_ia32_vp2intersect_d_256:
15884  case X86::BI__builtin_ia32_vp2intersect_d_128: {
15885    unsigned NumElts =
15886        cast<llvm::FixedVectorType>(Ops[0]->getType())->getNumElements();
15887    Intrinsic::ID ID;
15888
15889    switch (BuiltinID) {
15890    default: llvm_unreachable("Unsupported intrinsic!");
15891    case X86::BI__builtin_ia32_vp2intersect_q_512:
15892      ID = Intrinsic::x86_avx512_vp2intersect_q_512;
15893      break;
15894    case X86::BI__builtin_ia32_vp2intersect_q_256:
15895      ID = Intrinsic::x86_avx512_vp2intersect_q_256;
15896      break;
15897    case X86::BI__builtin_ia32_vp2intersect_q_128:
15898      ID = Intrinsic::x86_avx512_vp2intersect_q_128;
15899      break;
15900    case X86::BI__builtin_ia32_vp2intersect_d_512:
15901      ID = Intrinsic::x86_avx512_vp2intersect_d_512;
15902      break;
15903    case X86::BI__builtin_ia32_vp2intersect_d_256:
15904      ID = Intrinsic::x86_avx512_vp2intersect_d_256;
15905      break;
15906    case X86::BI__builtin_ia32_vp2intersect_d_128:
15907      ID = Intrinsic::x86_avx512_vp2intersect_d_128;
15908      break;
15909    }
15910
15911    Value *Call = Builder.CreateCall(CGM.getIntrinsic(ID), {Ops[0], Ops[1]});
15912    Value *Result = Builder.CreateExtractValue(Call, 0);
15913    Result = EmitX86MaskedCompareResult(*this, Result, NumElts, nullptr);
15914    Builder.CreateDefaultAlignedStore(Result, Ops[2]);
15915
15916    Result = Builder.CreateExtractValue(Call, 1);
15917    Result = EmitX86MaskedCompareResult(*this, Result, NumElts, nullptr);
15918    return Builder.CreateDefaultAlignedStore(Result, Ops[3]);
15919  }
15920
15921  case X86::BI__builtin_ia32_vpmultishiftqb128:
15922  case X86::BI__builtin_ia32_vpmultishiftqb256:
15923  case X86::BI__builtin_ia32_vpmultishiftqb512: {
15924    Intrinsic::ID ID;
15925    switch (BuiltinID) {
15926    default: llvm_unreachable("Unsupported intrinsic!");
15927    case X86::BI__builtin_ia32_vpmultishiftqb128:
15928      ID = Intrinsic::x86_avx512_pmultishift_qb_128;
15929      break;
15930    case X86::BI__builtin_ia32_vpmultishiftqb256:
15931      ID = Intrinsic::x86_avx512_pmultishift_qb_256;
15932      break;
15933    case X86::BI__builtin_ia32_vpmultishiftqb512:
15934      ID = Intrinsic::x86_avx512_pmultishift_qb_512;
15935      break;
15936    }
15937
15938    return Builder.CreateCall(CGM.getIntrinsic(ID), Ops);
15939  }
15940
15941  case X86::BI__builtin_ia32_vpshufbitqmb128_mask:
15942  case X86::BI__builtin_ia32_vpshufbitqmb256_mask:
15943  case X86::BI__builtin_ia32_vpshufbitqmb512_mask: {
15944    unsigned NumElts =
15945        cast<llvm::FixedVectorType>(Ops[0]->getType())->getNumElements();
15946    Value *MaskIn = Ops[2];
15947    Ops.erase(&Ops[2]);
15948
15949    Intrinsic::ID ID;
15950    switch (BuiltinID) {
15951    default: llvm_unreachable("Unsupported intrinsic!");
15952    case X86::BI__builtin_ia32_vpshufbitqmb128_mask:
15953      ID = Intrinsic::x86_avx512_vpshufbitqmb_128;
15954      break;
15955    case X86::BI__builtin_ia32_vpshufbitqmb256_mask:
15956      ID = Intrinsic::x86_avx512_vpshufbitqmb_256;
15957      break;
15958    case X86::BI__builtin_ia32_vpshufbitqmb512_mask:
15959      ID = Intrinsic::x86_avx512_vpshufbitqmb_512;
15960      break;
15961    }
15962
15963    Value *Shufbit = Builder.CreateCall(CGM.getIntrinsic(ID), Ops);
15964    return EmitX86MaskedCompareResult(*this, Shufbit, NumElts, MaskIn);
15965  }
15966
15967  // Packed comparison intrinsics
15968  case X86::BI__builtin_ia32_cmpeqps:
15969  case X86::BI__builtin_ia32_cmpeqpd:
15970    return getVectorFCmpIR(CmpInst::FCMP_OEQ, /*IsSignaling*/false);
15971  case X86::BI__builtin_ia32_cmpltps:
15972  case X86::BI__builtin_ia32_cmpltpd:
15973    return getVectorFCmpIR(CmpInst::FCMP_OLT, /*IsSignaling*/true);
15974  case X86::BI__builtin_ia32_cmpleps:
15975  case X86::BI__builtin_ia32_cmplepd:
15976    return getVectorFCmpIR(CmpInst::FCMP_OLE, /*IsSignaling*/true);
15977  case X86::BI__builtin_ia32_cmpunordps:
15978  case X86::BI__builtin_ia32_cmpunordpd:
15979    return getVectorFCmpIR(CmpInst::FCMP_UNO, /*IsSignaling*/false);
15980  case X86::BI__builtin_ia32_cmpneqps:
15981  case X86::BI__builtin_ia32_cmpneqpd:
15982    return getVectorFCmpIR(CmpInst::FCMP_UNE, /*IsSignaling*/false);
15983  case X86::BI__builtin_ia32_cmpnltps:
15984  case X86::BI__builtin_ia32_cmpnltpd:
15985    return getVectorFCmpIR(CmpInst::FCMP_UGE, /*IsSignaling*/true);
15986  case X86::BI__builtin_ia32_cmpnleps:
15987  case X86::BI__builtin_ia32_cmpnlepd:
15988    return getVectorFCmpIR(CmpInst::FCMP_UGT, /*IsSignaling*/true);
15989  case X86::BI__builtin_ia32_cmpordps:
15990  case X86::BI__builtin_ia32_cmpordpd:
15991    return getVectorFCmpIR(CmpInst::FCMP_ORD, /*IsSignaling*/false);
15992  case X86::BI__builtin_ia32_cmpph128_mask:
15993  case X86::BI__builtin_ia32_cmpph256_mask:
15994  case X86::BI__builtin_ia32_cmpph512_mask:
15995  case X86::BI__builtin_ia32_cmpps128_mask:
15996  case X86::BI__builtin_ia32_cmpps256_mask:
15997  case X86::BI__builtin_ia32_cmpps512_mask:
15998  case X86::BI__builtin_ia32_cmppd128_mask:
15999  case X86::BI__builtin_ia32_cmppd256_mask:
16000  case X86::BI__builtin_ia32_cmppd512_mask:
16001    IsMaskFCmp = true;
16002    [[fallthrough]];
16003  case X86::BI__builtin_ia32_cmpps:
16004  case X86::BI__builtin_ia32_cmpps256:
16005  case X86::BI__builtin_ia32_cmppd:
16006  case X86::BI__builtin_ia32_cmppd256: {
16007    // Lower vector comparisons to fcmp instructions, ignoring the requested
16008    // signalling behaviour and rounding mode.
16009    // This is only possible if the fp-model is not strict and FENV_ACCESS is
16010    // off.
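    // For example (illustrative), with a non-strict fp-model
    //   _mm_cmp_ps(a, b, _CMP_LT_OS)   // CC == 1
    // lowers to a plain IR compare with the signaling request dropped:
    //   %cmp = fcmp olt <4 x float> %a, %b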
16011
16012    // The third argument is the comparison condition, an integer in the
16013    // range [0, 31].
16014    unsigned CC = cast<llvm::ConstantInt>(Ops[2])->getZExtValue() & 0x1f;
16015
16016    // Lowering to IR fcmp instruction.
16017    // Ignoring requested signaling behaviour,
16018    // e.g. both _CMP_GT_OS & _CMP_GT_OQ are translated to FCMP_OGT.
16019    FCmpInst::Predicate Pred;
16020    bool IsSignaling;
16021    // Predicates for 16-31 repeat the 0-15 predicates. Only the signalling
16022    // behavior is inverted. We'll handle that after the switch.
16023    switch (CC & 0xf) {
16024    case 0x00: Pred = FCmpInst::FCMP_OEQ;   IsSignaling = false; break;
16025    case 0x01: Pred = FCmpInst::FCMP_OLT;   IsSignaling = true;  break;
16026    case 0x02: Pred = FCmpInst::FCMP_OLE;   IsSignaling = true;  break;
16027    case 0x03: Pred = FCmpInst::FCMP_UNO;   IsSignaling = false; break;
16028    case 0x04: Pred = FCmpInst::FCMP_UNE;   IsSignaling = false; break;
16029    case 0x05: Pred = FCmpInst::FCMP_UGE;   IsSignaling = true;  break;
16030    case 0x06: Pred = FCmpInst::FCMP_UGT;   IsSignaling = true;  break;
16031    case 0x07: Pred = FCmpInst::FCMP_ORD;   IsSignaling = false; break;
16032    case 0x08: Pred = FCmpInst::FCMP_UEQ;   IsSignaling = false; break;
16033    case 0x09: Pred = FCmpInst::FCMP_ULT;   IsSignaling = true;  break;
16034    case 0x0a: Pred = FCmpInst::FCMP_ULE;   IsSignaling = true;  break;
16035    case 0x0b: Pred = FCmpInst::FCMP_FALSE; IsSignaling = false; break;
16036    case 0x0c: Pred = FCmpInst::FCMP_ONE;   IsSignaling = false; break;
16037    case 0x0d: Pred = FCmpInst::FCMP_OGE;   IsSignaling = true;  break;
16038    case 0x0e: Pred = FCmpInst::FCMP_OGT;   IsSignaling = true;  break;
16039    case 0x0f: Pred = FCmpInst::FCMP_TRUE;  IsSignaling = false; break;
16040    default: llvm_unreachable("Unhandled CC");
16041    }
16042
16043    // Invert the signalling behavior for 16-31.
16044    if (CC & 0x10)
16045      IsSignaling = !IsSignaling;
16046
16047    // If we're using constrained intrinsics and either the predicate is
16048    // always-true/always-false or the builtin is mask-enabled, there is no
16049    // generic compare intrinsic we can use, so fall back to the legacy
16050    // X86-specific intrinsic.
16052    if (Builder.getIsFPConstrained() &&
16053        (Pred == FCmpInst::FCMP_TRUE || Pred == FCmpInst::FCMP_FALSE ||
16054         IsMaskFCmp)) {
16055
16056      Intrinsic::ID IID;
16057      switch (BuiltinID) {
16058      default: llvm_unreachable("Unexpected builtin");
16059      case X86::BI__builtin_ia32_cmpps:
16060        IID = Intrinsic::x86_sse_cmp_ps;
16061        break;
16062      case X86::BI__builtin_ia32_cmpps256:
16063        IID = Intrinsic::x86_avx_cmp_ps_256;
16064        break;
16065      case X86::BI__builtin_ia32_cmppd:
16066        IID = Intrinsic::x86_sse2_cmp_pd;
16067        break;
16068      case X86::BI__builtin_ia32_cmppd256:
16069        IID = Intrinsic::x86_avx_cmp_pd_256;
16070        break;
16071      case X86::BI__builtin_ia32_cmpph128_mask:
16072        IID = Intrinsic::x86_avx512fp16_mask_cmp_ph_128;
16073        break;
16074      case X86::BI__builtin_ia32_cmpph256_mask:
16075        IID = Intrinsic::x86_avx512fp16_mask_cmp_ph_256;
16076        break;
16077      case X86::BI__builtin_ia32_cmpph512_mask:
16078        IID = Intrinsic::x86_avx512fp16_mask_cmp_ph_512;
16079        break;
16080      case X86::BI__builtin_ia32_cmpps512_mask:
16081        IID = Intrinsic::x86_avx512_mask_cmp_ps_512;
16082        break;
16083      case X86::BI__builtin_ia32_cmppd512_mask:
16084        IID = Intrinsic::x86_avx512_mask_cmp_pd_512;
16085        break;
16086      case X86::BI__builtin_ia32_cmpps128_mask:
16087        IID = Intrinsic::x86_avx512_mask_cmp_ps_128;
16088        break;
16089      case X86::BI__builtin_ia32_cmpps256_mask:
16090        IID = Intrinsic::x86_avx512_mask_cmp_ps_256;
16091        break;
16092      case X86::BI__builtin_ia32_cmppd128_mask:
16093        IID = Intrinsic::x86_avx512_mask_cmp_pd_128;
16094        break;
16095      case X86::BI__builtin_ia32_cmppd256_mask:
16096        IID = Intrinsic::x86_avx512_mask_cmp_pd_256;
16097        break;
16098      }
16099
16100      Function *Intr = CGM.getIntrinsic(IID);
16101      if (IsMaskFCmp) {
16102        unsigned NumElts =
16103            cast<llvm::FixedVectorType>(Ops[0]->getType())->getNumElements();
16104        Ops[3] = getMaskVecValue(*this, Ops[3], NumElts);
16105        Value *Cmp = Builder.CreateCall(Intr, Ops);
16106        return EmitX86MaskedCompareResult(*this, Cmp, NumElts, nullptr);
16107      }
16108
16109      return Builder.CreateCall(Intr, Ops);
16110    }
16111
16112    // Builtins with the _mask suffix return an integer bitmask; builtins
16113    // without it return a vector of integers of the same width as the inputs.
16114    if (IsMaskFCmp) {
16115      // We ignore SAE if strict FP is disabled; precise exception behavior is
16116      // only kept under strict FP.
16117      // NOTE: If strict FP ever does go through here, a CGFPOptionsRAII
16118      // object will be required.
16119      unsigned NumElts =
16120          cast<llvm::FixedVectorType>(Ops[0]->getType())->getNumElements();
16121      Value *Cmp;
16122      if (IsSignaling)
16123        Cmp = Builder.CreateFCmpS(Pred, Ops[0], Ops[1]);
16124      else
16125        Cmp = Builder.CreateFCmp(Pred, Ops[0], Ops[1]);
16126      return EmitX86MaskedCompareResult(*this, Cmp, NumElts, Ops[3]);
16127    }
16128
16129    return getVectorFCmpIR(Pred, IsSignaling);
16130  }
16131
16132  // SSE scalar comparison intrinsics
16133  case X86::BI__builtin_ia32_cmpeqss:
16134    return getCmpIntrinsicCall(Intrinsic::x86_sse_cmp_ss, 0);
16135  case X86::BI__builtin_ia32_cmpltss:
16136    return getCmpIntrinsicCall(Intrinsic::x86_sse_cmp_ss, 1);
16137  case X86::BI__builtin_ia32_cmpless:
16138    return getCmpIntrinsicCall(Intrinsic::x86_sse_cmp_ss, 2);
16139  case X86::BI__builtin_ia32_cmpunordss:
16140    return getCmpIntrinsicCall(Intrinsic::x86_sse_cmp_ss, 3);
16141  case X86::BI__builtin_ia32_cmpneqss:
16142    return getCmpIntrinsicCall(Intrinsic::x86_sse_cmp_ss, 4);
16143  case X86::BI__builtin_ia32_cmpnltss:
16144    return getCmpIntrinsicCall(Intrinsic::x86_sse_cmp_ss, 5);
16145  case X86::BI__builtin_ia32_cmpnless:
16146    return getCmpIntrinsicCall(Intrinsic::x86_sse_cmp_ss, 6);
16147  case X86::BI__builtin_ia32_cmpordss:
16148    return getCmpIntrinsicCall(Intrinsic::x86_sse_cmp_ss, 7);
16149  case X86::BI__builtin_ia32_cmpeqsd:
16150    return getCmpIntrinsicCall(Intrinsic::x86_sse2_cmp_sd, 0);
16151  case X86::BI__builtin_ia32_cmpltsd:
16152    return getCmpIntrinsicCall(Intrinsic::x86_sse2_cmp_sd, 1);
16153  case X86::BI__builtin_ia32_cmplesd:
16154    return getCmpIntrinsicCall(Intrinsic::x86_sse2_cmp_sd, 2);
16155  case X86::BI__builtin_ia32_cmpunordsd:
16156    return getCmpIntrinsicCall(Intrinsic::x86_sse2_cmp_sd, 3);
16157  case X86::BI__builtin_ia32_cmpneqsd:
16158    return getCmpIntrinsicCall(Intrinsic::x86_sse2_cmp_sd, 4);
16159  case X86::BI__builtin_ia32_cmpnltsd:
16160    return getCmpIntrinsicCall(Intrinsic::x86_sse2_cmp_sd, 5);
16161  case X86::BI__builtin_ia32_cmpnlesd:
16162    return getCmpIntrinsicCall(Intrinsic::x86_sse2_cmp_sd, 6);
16163  case X86::BI__builtin_ia32_cmpordsd:
16164    return getCmpIntrinsicCall(Intrinsic::x86_sse2_cmp_sd, 7);
16165
16166  // F16C half-to-float conversion intrinsics
16167  case X86::BI__builtin_ia32_vcvtph2ps:
16168  case X86::BI__builtin_ia32_vcvtph2ps256:
16169  case X86::BI__builtin_ia32_vcvtph2ps_mask:
16170  case X86::BI__builtin_ia32_vcvtph2ps256_mask:
16171  case X86::BI__builtin_ia32_vcvtph2ps512_mask: {
16172    CodeGenFunction::CGFPOptionsRAII FPOptsRAII(*this, E);
16173    return EmitX86CvtF16ToFloatExpr(*this, Ops, ConvertType(E->getType()));
16174  }
16175
16176  // AVX512 bf16 intrinsics
16177  case X86::BI__builtin_ia32_cvtneps2bf16_128_mask: {
16178    Ops[2] = getMaskVecValue(
16179        *this, Ops[2],
16180        cast<llvm::FixedVectorType>(Ops[0]->getType())->getNumElements());
16181    Intrinsic::ID IID = Intrinsic::x86_avx512bf16_mask_cvtneps2bf16_128;
16182    return Builder.CreateCall(CGM.getIntrinsic(IID), Ops);
16183  }
16184  case X86::BI__builtin_ia32_cvtsbf162ss_32:
16185    return Builder.CreateFPExt(Ops[0], Builder.getFloatTy());
16186
16187  case X86::BI__builtin_ia32_cvtneps2bf16_256_mask:
16188  case X86::BI__builtin_ia32_cvtneps2bf16_512_mask: {
16189    Intrinsic::ID IID;
16190    switch (BuiltinID) {
16191    default: llvm_unreachable("Unsupported intrinsic!");
16192    case X86::BI__builtin_ia32_cvtneps2bf16_256_mask:
16193      IID = Intrinsic::x86_avx512bf16_cvtneps2bf16_256;
16194      break;
16195    case X86::BI__builtin_ia32_cvtneps2bf16_512_mask:
16196      IID = Intrinsic::x86_avx512bf16_cvtneps2bf16_512;
16197      break;
16198    }
16199    Value *Res = Builder.CreateCall(CGM.getIntrinsic(IID), Ops[0]);
16200    return EmitX86Select(*this, Ops[2], Res, Ops[1]);
16201  }
16202
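  // __cpuid/__cpuidex are emitted as inline asm: the leaf (and, for __cpuidex,
  // the subleaf) go in eax/ecx, and the four outputs eax, ebx, ecx and edx are
  // stored into the int[4] array passed as the first argument. Illustrative
  // usage:
  //   int info[4];
  //   __cpuid(info, 1);   // info[0..3] = eax, ebx, ecx, edx for leaf 1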
16203  case X86::BI__cpuid:
16204  case X86::BI__cpuidex: {
16205    Value *FuncId = EmitScalarExpr(E->getArg(1));
16206    Value *SubFuncId = BuiltinID == X86::BI__cpuidex
16207                           ? EmitScalarExpr(E->getArg(2))
16208                           : llvm::ConstantInt::get(Int32Ty, 0);
16209
16210    llvm::StructType *CpuidRetTy =
16211        llvm::StructType::get(Int32Ty, Int32Ty, Int32Ty, Int32Ty);
16212    llvm::FunctionType *FTy =
16213        llvm::FunctionType::get(CpuidRetTy, {Int32Ty, Int32Ty}, false);
16214
16215    StringRef Asm, Constraints;
16216    if (getTarget().getTriple().getArch() == llvm::Triple::x86) {
16217      Asm = "cpuid";
16218      Constraints = "={ax},={bx},={cx},={dx},{ax},{cx}";
16219    } else {
16220      // x86-64 uses %rbx as the base register, so preserve it.
16221      Asm = "xchgq %rbx, ${1:q}\n"
16222            "cpuid\n"
16223            "xchgq %rbx, ${1:q}";
16224      Constraints = "={ax},=r,={cx},={dx},0,2";
16225    }
16226
16227    llvm::InlineAsm *IA = llvm::InlineAsm::get(FTy, Asm, Constraints,
16228                                               /*hasSideEffects=*/false);
16229    Value *IACall = Builder.CreateCall(IA, {FuncId, SubFuncId});
16230    Value *BasePtr = EmitScalarExpr(E->getArg(0));
16231    Value *Store = nullptr;
16232    for (unsigned i = 0; i < 4; i++) {
16233      Value *Extracted = Builder.CreateExtractValue(IACall, i);
16234      Value *StorePtr = Builder.CreateConstInBoundsGEP1_32(Int32Ty, BasePtr, i);
16235      Store = Builder.CreateAlignedStore(Extracted, StorePtr, getIntAlign());
16236    }
16237
16238    // Return the last store instruction to signal that we have emitted the
16239    // intrinsic.
16240    return Store;
16241  }
16242
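  // __emul/__emulu perform a signed/unsigned 32 x 32 -> 64-bit multiply: both
  // operands are widened to 64 bits first, so the product cannot overflow.
  // Illustrative usage:
  //   __int64 p = __emul(a, b);   // a and b are 32-bit integers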
16243  case X86::BI__emul:
16244  case X86::BI__emulu: {
16245    llvm::Type *Int64Ty = llvm::IntegerType::get(getLLVMContext(), 64);
16246    bool isSigned = (BuiltinID == X86::BI__emul);
16247    Value *LHS = Builder.CreateIntCast(Ops[0], Int64Ty, isSigned);
16248    Value *RHS = Builder.CreateIntCast(Ops[1], Int64Ty, isSigned);
16249    return Builder.CreateMul(LHS, RHS, "", !isSigned, isSigned);
16250  }
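  // __mulh/__umulh return the high 64 bits of the 128-bit product; _mul128 and
  // _umul128 additionally store the high half through their third argument and
  // return the low half.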
16251  case X86::BI__mulh:
16252  case X86::BI__umulh:
16253  case X86::BI_mul128:
16254  case X86::BI_umul128: {
16255    llvm::Type *ResType = ConvertType(E->getType());
16256    llvm::Type *Int128Ty = llvm::IntegerType::get(getLLVMContext(), 128);
16257
16258    bool IsSigned = (BuiltinID == X86::BI__mulh || BuiltinID == X86::BI_mul128);
16259    Value *LHS = Builder.CreateIntCast(Ops[0], Int128Ty, IsSigned);
16260    Value *RHS = Builder.CreateIntCast(Ops[1], Int128Ty, IsSigned);
16261
16262    Value *MulResult, *HigherBits;
16263    if (IsSigned) {
16264      MulResult = Builder.CreateNSWMul(LHS, RHS);
16265      HigherBits = Builder.CreateAShr(MulResult, 64);
16266    } else {
16267      MulResult = Builder.CreateNUWMul(LHS, RHS);
16268      HigherBits = Builder.CreateLShr(MulResult, 64);
16269    }
16270    HigherBits = Builder.CreateIntCast(HigherBits, ResType, IsSigned);
16271
16272    if (BuiltinID == X86::BI__mulh || BuiltinID == X86::BI__umulh)
16273      return HigherBits;
16274
16275    Address HighBitsAddress = EmitPointerWithAlignment(E->getArg(2));
16276    Builder.CreateStore(HigherBits, HighBitsAddress);
16277    return Builder.CreateIntCast(MulResult, ResType, IsSigned);
16278  }
16279
16280  case X86::BI__faststorefence: {
16281    return Builder.CreateFence(llvm::AtomicOrdering::SequentiallyConsistent,
16282                               llvm::SyncScope::System);
16283  }
16284  case X86::BI__shiftleft128:
16285  case X86::BI__shiftright128: {
16286    llvm::Function *F = CGM.getIntrinsic(
16287        BuiltinID == X86::BI__shiftleft128 ? Intrinsic::fshl : Intrinsic::fshr,
16288        Int64Ty);
16289    // Flip low/high ops and zero-extend amount to matching type.
16290    // shiftleft128(Low, High, Amt) -> fshl(High, Low, Amt)
16291    // shiftright128(Low, High, Amt) -> fshr(High, Low, Amt)
16292    std::swap(Ops[0], Ops[1]);
16293    Ops[2] = Builder.CreateZExt(Ops[2], Int64Ty);
16294    return Builder.CreateCall(F, Ops);
16295  }
16296  case X86::BI_ReadWriteBarrier:
16297  case X86::BI_ReadBarrier:
16298  case X86::BI_WriteBarrier: {
16299    return Builder.CreateFence(llvm::AtomicOrdering::SequentiallyConsistent,
16300                               llvm::SyncScope::SingleThread);
16301  }
16302
16303  case X86::BI_AddressOfReturnAddress: {
16304    Function *F =
16305        CGM.getIntrinsic(Intrinsic::addressofreturnaddress, AllocaInt8PtrTy);
16306    return Builder.CreateCall(F);
16307  }
16308  case X86::BI__stosb: {
16309    // We treat __stosb as a volatile memset - it may not generate a "rep stosb"
16310    // instruction, but it will create a memset that won't be optimized away.
16311    return Builder.CreateMemSet(Ops[0], Ops[1], Ops[2], Align(1), true);
16312  }
16313  case X86::BI__ud2:
16314    // llvm.trap makes a ud2a instruction on x86.
16315    return EmitTrapCall(Intrinsic::trap);
16316  case X86::BI__int2c: {
16317    // This syscall signals a driver assertion failure in x86 NT kernels.
16318    llvm::FunctionType *FTy = llvm::FunctionType::get(VoidTy, false);
16319    llvm::InlineAsm *IA =
16320        llvm::InlineAsm::get(FTy, "int $$0x2c", "", /*hasSideEffects=*/true);
16321    llvm::AttributeList NoReturnAttr = llvm::AttributeList::get(
16322        getLLVMContext(), llvm::AttributeList::FunctionIndex,
16323        llvm::Attribute::NoReturn);
16324    llvm::CallInst *CI = Builder.CreateCall(IA);
16325    CI->setAttributes(NoReturnAttr);
16326    return CI;
16327  }
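  // __readfs*/__readgs* read from the FS/GS segments. LLVM models the x86
  // segment overrides as address spaces (256 for GS, 257 for FS), so the
  // integer offset is converted to a pointer in the matching address space and
  // read with a volatile load.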
16328  case X86::BI__readfsbyte:
16329  case X86::BI__readfsword:
16330  case X86::BI__readfsdword:
16331  case X86::BI__readfsqword: {
16332    llvm::Type *IntTy = ConvertType(E->getType());
16333    Value *Ptr = Builder.CreateIntToPtr(
16334        Ops[0], llvm::PointerType::get(getLLVMContext(), 257));
16335    LoadInst *Load = Builder.CreateAlignedLoad(
16336        IntTy, Ptr, getContext().getTypeAlignInChars(E->getType()));
16337    Load->setVolatile(true);
16338    return Load;
16339  }
16340  case X86::BI__readgsbyte:
16341  case X86::BI__readgsword:
16342  case X86::BI__readgsdword:
16343  case X86::BI__readgsqword: {
16344    llvm::Type *IntTy = ConvertType(E->getType());
16345    Value *Ptr = Builder.CreateIntToPtr(
16346        Ops[0], llvm::PointerType::get(getLLVMContext(), 256));
16347    LoadInst *Load = Builder.CreateAlignedLoad(
16348        IntTy, Ptr, getContext().getTypeAlignInChars(E->getType()));
16349    Load->setVolatile(true);
16350    return Load;
16351  }
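  // encodekey128/256: the intrinsic's first result (an i32) is returned to the
  // caller, and the remaining 128-bit pieces of the key handle are stored
  // back-to-back, 16 bytes apart, into the output handle buffer.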
16352  case X86::BI__builtin_ia32_encodekey128_u32: {
16353    Intrinsic::ID IID = Intrinsic::x86_encodekey128;
16354
16355    Value *Call = Builder.CreateCall(CGM.getIntrinsic(IID), {Ops[0], Ops[1]});
16356
16357    for (int i = 0; i < 3; ++i) {
16358      Value *Extract = Builder.CreateExtractValue(Call, i + 1);
16359      Value *Ptr = Builder.CreateConstGEP1_32(Int8Ty, Ops[2], i * 16);
16360      Builder.CreateAlignedStore(Extract, Ptr, Align(1));
16361    }
16362
16363    return Builder.CreateExtractValue(Call, 0);
16364  }
16365  case X86::BI__builtin_ia32_encodekey256_u32: {
16366    Intrinsic::ID IID = Intrinsic::x86_encodekey256;
16367
16368    Value *Call =
16369        Builder.CreateCall(CGM.getIntrinsic(IID), {Ops[0], Ops[1], Ops[2]});
16370
16371    for (int i = 0; i < 4; ++i) {
16372      Value *Extract = Builder.CreateExtractValue(Call, i + 1);
16373      Value *Ptr = Builder.CreateConstGEP1_32(Int8Ty, Ops[3], i * 16);
16374      Builder.CreateAlignedStore(Extract, Ptr, Align(1));
16375    }
16376
16377    return Builder.CreateExtractValue(Call, 0);
16378  }
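  // Key Locker AES builtins: the intrinsics return a success flag plus the
  // processed block(s). Control flow is emitted so that on success the result
  // is stored to the destination and on failure zeroes are stored instead; the
  // flag is returned in either case.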
16379  case X86::BI__builtin_ia32_aesenc128kl_u8:
16380  case X86::BI__builtin_ia32_aesdec128kl_u8:
16381  case X86::BI__builtin_ia32_aesenc256kl_u8:
16382  case X86::BI__builtin_ia32_aesdec256kl_u8: {
16383    Intrinsic::ID IID;
16384    StringRef BlockName;
16385    switch (BuiltinID) {
16386    default:
16387      llvm_unreachable("Unexpected builtin");
16388    case X86::BI__builtin_ia32_aesenc128kl_u8:
16389      IID = Intrinsic::x86_aesenc128kl;
16390      BlockName = "aesenc128kl";
16391      break;
16392    case X86::BI__builtin_ia32_aesdec128kl_u8:
16393      IID = Intrinsic::x86_aesdec128kl;
16394      BlockName = "aesdec128kl";
16395      break;
16396    case X86::BI__builtin_ia32_aesenc256kl_u8:
16397      IID = Intrinsic::x86_aesenc256kl;
16398      BlockName = "aesenc256kl";
16399      break;
16400    case X86::BI__builtin_ia32_aesdec256kl_u8:
16401      IID = Intrinsic::x86_aesdec256kl;
16402      BlockName = "aesdec256kl";
16403      break;
16404    }
16405
16406    Value *Call = Builder.CreateCall(CGM.getIntrinsic(IID), {Ops[1], Ops[2]});
16407
16408    BasicBlock *NoError =
16409        createBasicBlock(BlockName + "_no_error", this->CurFn);
16410    BasicBlock *Error = createBasicBlock(BlockName + "_error", this->CurFn);
16411    BasicBlock *End = createBasicBlock(BlockName + "_end", this->CurFn);
16412
16413    Value *Ret = Builder.CreateExtractValue(Call, 0);
16414    Value *Succ = Builder.CreateTrunc(Ret, Builder.getInt1Ty());
16415    Value *Out = Builder.CreateExtractValue(Call, 1);
16416    Builder.CreateCondBr(Succ, NoError, Error);
16417
16418    Builder.SetInsertPoint(NoError);
16419    Builder.CreateDefaultAlignedStore(Out, Ops[0]);
16420    Builder.CreateBr(End);
16421
16422    Builder.SetInsertPoint(Error);
16423    Constant *Zero = llvm::Constant::getNullValue(Out->getType());
16424    Builder.CreateDefaultAlignedStore(Zero, Ops[0]);
16425    Builder.CreateBr(End);
16426
16427    Builder.SetInsertPoint(End);
16428    return Builder.CreateExtractValue(Call, 0);
16429  }
16430  case X86::BI__builtin_ia32_aesencwide128kl_u8:
16431  case X86::BI__builtin_ia32_aesdecwide128kl_u8:
16432  case X86::BI__builtin_ia32_aesencwide256kl_u8:
16433  case X86::BI__builtin_ia32_aesdecwide256kl_u8: {
16434    Intrinsic::ID IID;
16435    StringRef BlockName;
16436    switch (BuiltinID) {
16437    case X86::BI__builtin_ia32_aesencwide128kl_u8:
16438      IID = Intrinsic::x86_aesencwide128kl;
16439      BlockName = "aesencwide128kl";
16440      break;
16441    case X86::BI__builtin_ia32_aesdecwide128kl_u8:
16442      IID = Intrinsic::x86_aesdecwide128kl;
16443      BlockName = "aesdecwide128kl";
16444      break;
16445    case X86::BI__builtin_ia32_aesencwide256kl_u8:
16446      IID = Intrinsic::x86_aesencwide256kl;
16447      BlockName = "aesencwide256kl";
16448      break;
16449    case X86::BI__builtin_ia32_aesdecwide256kl_u8:
16450      IID = Intrinsic::x86_aesdecwide256kl;
16451      BlockName = "aesdecwide256kl";
16452      break;
16453    }
16454
16455    llvm::Type *Ty = FixedVectorType::get(Builder.getInt64Ty(), 2);
16456    Value *InOps[9];
16457    InOps[0] = Ops[2];
16458    for (int i = 0; i != 8; ++i) {
16459      Value *Ptr = Builder.CreateConstGEP1_32(Ty, Ops[1], i);
16460      InOps[i + 1] = Builder.CreateAlignedLoad(Ty, Ptr, Align(16));
16461    }
16462
16463    Value *Call = Builder.CreateCall(CGM.getIntrinsic(IID), InOps);
16464
16465    BasicBlock *NoError =
16466        createBasicBlock(BlockName + "_no_error", this->CurFn);
16467    BasicBlock *Error = createBasicBlock(BlockName + "_error", this->CurFn);
16468    BasicBlock *End = createBasicBlock(BlockName + "_end", this->CurFn);
16469
16470    Value *Ret = Builder.CreateExtractValue(Call, 0);
16471    Value *Succ = Builder.CreateTrunc(Ret, Builder.getInt1Ty());
16472    Builder.CreateCondBr(Succ, NoError, Error);
16473
16474    Builder.SetInsertPoint(NoError);
16475    for (int i = 0; i != 8; ++i) {
16476      Value *Extract = Builder.CreateExtractValue(Call, i + 1);
16477      Value *Ptr = Builder.CreateConstGEP1_32(Extract->getType(), Ops[0], i);
16478      Builder.CreateAlignedStore(Extract, Ptr, Align(16));
16479    }
16480    Builder.CreateBr(End);
16481
16482    Builder.SetInsertPoint(Error);
16483    for (int i = 0; i != 8; ++i) {
16484      Value *Out = Builder.CreateExtractValue(Call, i + 1);
16485      Constant *Zero = llvm::Constant::getNullValue(Out->getType());
16486      Value *Ptr = Builder.CreateConstGEP1_32(Out->getType(), Ops[0], i);
16487      Builder.CreateAlignedStore(Zero, Ptr, Align(16));
16488    }
16489    Builder.CreateBr(End);
16490
16491    Builder.SetInsertPoint(End);
16492    return Builder.CreateExtractValue(Call, 0);
16493  }
16494  case X86::BI__builtin_ia32_vfcmaddcph512_mask:
16495    IsConjFMA = true;
16496    [[fallthrough]];
16497  case X86::BI__builtin_ia32_vfmaddcph512_mask: {
16498    Intrinsic::ID IID = IsConjFMA
16499                            ? Intrinsic::x86_avx512fp16_mask_vfcmadd_cph_512
16500                            : Intrinsic::x86_avx512fp16_mask_vfmadd_cph_512;
16501    Value *Call = Builder.CreateCall(CGM.getIntrinsic(IID), Ops);
16502    return EmitX86Select(*this, Ops[3], Call, Ops[0]);
16503  }
16504  case X86::BI__builtin_ia32_vfcmaddcsh_round_mask:
16505    IsConjFMA = true;
16506    [[fallthrough]];
16507  case X86::BI__builtin_ia32_vfmaddcsh_round_mask: {
16508    Intrinsic::ID IID = IsConjFMA ? Intrinsic::x86_avx512fp16_mask_vfcmadd_csh
16509                                  : Intrinsic::x86_avx512fp16_mask_vfmadd_csh;
16510    Value *Call = Builder.CreateCall(CGM.getIntrinsic(IID), Ops);
16511    Value *And = Builder.CreateAnd(Ops[3], llvm::ConstantInt::get(Int8Ty, 1));
16512    return EmitX86Select(*this, And, Call, Ops[0]);
16513  }
16514  case X86::BI__builtin_ia32_vfcmaddcsh_round_mask3:
16515    IsConjFMA = true;
16516    [[fallthrough]];
16517  case X86::BI__builtin_ia32_vfmaddcsh_round_mask3: {
16518    Intrinsic::ID IID = IsConjFMA ? Intrinsic::x86_avx512fp16_mask_vfcmadd_csh
16519                                  : Intrinsic::x86_avx512fp16_mask_vfmadd_csh;
16520    Value *Call = Builder.CreateCall(CGM.getIntrinsic(IID), Ops);
16521    static constexpr int Mask[] = {0, 5, 6, 7};
16522    return Builder.CreateShuffleVector(Call, Ops[2], Mask);
16523  }
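  // __builtin_ia32_prefetchi lowers to llvm.prefetch(ptr, /*rw=*/0, locality,
  // /*cache type=*/0), i.e. a read prefetch into the instruction cache with
  // the requested locality hint.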
16524  case X86::BI__builtin_ia32_prefetchi:
16525    return Builder.CreateCall(
16526        CGM.getIntrinsic(Intrinsic::prefetch, Ops[0]->getType()),
16527        {Ops[0], llvm::ConstantInt::get(Int32Ty, 0), Ops[1],
16528         llvm::ConstantInt::get(Int32Ty, 0)});
16529  }
16530}
16531
16532Value *CodeGenFunction::EmitPPCBuiltinExpr(unsigned BuiltinID,
16533                                           const CallExpr *E) {
16534  // Do not emit the builtin arguments directly in the argument list of a
16535  // function call, because the evaluation order of function arguments is
16536  // unspecified in C++. This matters when testing, to ensure the arguments are
16537  // emitted in the same order every time. E.g.:
16538  // Instead of:
16539  //   return Builder.CreateFDiv(EmitScalarExpr(E->getArg(0)),
16540  //                             EmitScalarExpr(E->getArg(1)), "swdiv");
16541  // Use:
16542  //   Value *Op0 = EmitScalarExpr(E->getArg(0));
16543  //   Value *Op1 = EmitScalarExpr(E->getArg(1));
16544  //   return Builder.CreateFDiv(Op0, Op1, "swdiv");
16545
16546  Intrinsic::ID ID = Intrinsic::not_intrinsic;
16547
16548  switch (BuiltinID) {
16549  default: return nullptr;
16550
16551  // __builtin_ppc_get_timebase is GCC 4.8+'s PowerPC-specific name for what we
16552  // call __builtin_readcyclecounter.
16553  case PPC::BI__builtin_ppc_get_timebase:
16554    return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::readcyclecounter));
16555
16556  // vec_ld, vec_xl_be, vec_lvsl, vec_lvsr
16557  case PPC::BI__builtin_altivec_lvx:
16558  case PPC::BI__builtin_altivec_lvxl:
16559  case PPC::BI__builtin_altivec_lvebx:
16560  case PPC::BI__builtin_altivec_lvehx:
16561  case PPC::BI__builtin_altivec_lvewx:
16562  case PPC::BI__builtin_altivec_lvsl:
16563  case PPC::BI__builtin_altivec_lvsr:
16564  case PPC::BI__builtin_vsx_lxvd2x:
16565  case PPC::BI__builtin_vsx_lxvw4x:
16566  case PPC::BI__builtin_vsx_lxvd2x_be:
16567  case PPC::BI__builtin_vsx_lxvw4x_be:
16568  case PPC::BI__builtin_vsx_lxvl:
16569  case PPC::BI__builtin_vsx_lxvll:
16570  {
16571    SmallVector<Value *, 2> Ops;
16572    Ops.push_back(EmitScalarExpr(E->getArg(0)));
16573    Ops.push_back(EmitScalarExpr(E->getArg(1)));
16574    if (!(BuiltinID == PPC::BI__builtin_vsx_lxvl ||
16575          BuiltinID == PPC::BI__builtin_vsx_lxvll)) {
16576      Ops[0] = Builder.CreateGEP(Int8Ty, Ops[1], Ops[0]);
16577      Ops.pop_back();
16578    }
16579
16580    switch (BuiltinID) {
16581    default: llvm_unreachable("Unsupported ld/lvsl/lvsr intrinsic!");
16582    case PPC::BI__builtin_altivec_lvx:
16583      ID = Intrinsic::ppc_altivec_lvx;
16584      break;
16585    case PPC::BI__builtin_altivec_lvxl:
16586      ID = Intrinsic::ppc_altivec_lvxl;
16587      break;
16588    case PPC::BI__builtin_altivec_lvebx:
16589      ID = Intrinsic::ppc_altivec_lvebx;
16590      break;
16591    case PPC::BI__builtin_altivec_lvehx:
16592      ID = Intrinsic::ppc_altivec_lvehx;
16593      break;
16594    case PPC::BI__builtin_altivec_lvewx:
16595      ID = Intrinsic::ppc_altivec_lvewx;
16596      break;
16597    case PPC::BI__builtin_altivec_lvsl:
16598      ID = Intrinsic::ppc_altivec_lvsl;
16599      break;
16600    case PPC::BI__builtin_altivec_lvsr:
16601      ID = Intrinsic::ppc_altivec_lvsr;
16602      break;
16603    case PPC::BI__builtin_vsx_lxvd2x:
16604      ID = Intrinsic::ppc_vsx_lxvd2x;
16605      break;
16606    case PPC::BI__builtin_vsx_lxvw4x:
16607      ID = Intrinsic::ppc_vsx_lxvw4x;
16608      break;
16609    case PPC::BI__builtin_vsx_lxvd2x_be:
16610      ID = Intrinsic::ppc_vsx_lxvd2x_be;
16611      break;
16612    case PPC::BI__builtin_vsx_lxvw4x_be:
16613      ID = Intrinsic::ppc_vsx_lxvw4x_be;
16614      break;
16615    case PPC::BI__builtin_vsx_lxvl:
16616      ID = Intrinsic::ppc_vsx_lxvl;
16617      break;
16618    case PPC::BI__builtin_vsx_lxvll:
16619      ID = Intrinsic::ppc_vsx_lxvll;
16620      break;
16621    }
16622    llvm::Function *F = CGM.getIntrinsic(ID);
16623    return Builder.CreateCall(F, Ops, "");
16624  }
16625
16626  // vec_st, vec_xst_be
16627  case PPC::BI__builtin_altivec_stvx:
16628  case PPC::BI__builtin_altivec_stvxl:
16629  case PPC::BI__builtin_altivec_stvebx:
16630  case PPC::BI__builtin_altivec_stvehx:
16631  case PPC::BI__builtin_altivec_stvewx:
16632  case PPC::BI__builtin_vsx_stxvd2x:
16633  case PPC::BI__builtin_vsx_stxvw4x:
16634  case PPC::BI__builtin_vsx_stxvd2x_be:
16635  case PPC::BI__builtin_vsx_stxvw4x_be:
16636  case PPC::BI__builtin_vsx_stxvl:
16637  case PPC::BI__builtin_vsx_stxvll:
16638  {
16639    SmallVector<Value *, 3> Ops;
16640    Ops.push_back(EmitScalarExpr(E->getArg(0)));
16641    Ops.push_back(EmitScalarExpr(E->getArg(1)));
16642    Ops.push_back(EmitScalarExpr(E->getArg(2)));
16643    if (!(BuiltinID == PPC::BI__builtin_vsx_stxvl ||
16644          BuiltinID == PPC::BI__builtin_vsx_stxvll)) {
16645      Ops[1] = Builder.CreateGEP(Int8Ty, Ops[2], Ops[1]);
16646      Ops.pop_back();
16647    }
16648
16649    switch (BuiltinID) {
16650    default: llvm_unreachable("Unsupported st intrinsic!");
16651    case PPC::BI__builtin_altivec_stvx:
16652      ID = Intrinsic::ppc_altivec_stvx;
16653      break;
16654    case PPC::BI__builtin_altivec_stvxl:
16655      ID = Intrinsic::ppc_altivec_stvxl;
16656      break;
16657    case PPC::BI__builtin_altivec_stvebx:
16658      ID = Intrinsic::ppc_altivec_stvebx;
16659      break;
16660    case PPC::BI__builtin_altivec_stvehx:
16661      ID = Intrinsic::ppc_altivec_stvehx;
16662      break;
16663    case PPC::BI__builtin_altivec_stvewx:
16664      ID = Intrinsic::ppc_altivec_stvewx;
16665      break;
16666    case PPC::BI__builtin_vsx_stxvd2x:
16667      ID = Intrinsic::ppc_vsx_stxvd2x;
16668      break;
16669    case PPC::BI__builtin_vsx_stxvw4x:
16670      ID = Intrinsic::ppc_vsx_stxvw4x;
16671      break;
16672    case PPC::BI__builtin_vsx_stxvd2x_be:
16673      ID = Intrinsic::ppc_vsx_stxvd2x_be;
16674      break;
16675    case PPC::BI__builtin_vsx_stxvw4x_be:
16676      ID = Intrinsic::ppc_vsx_stxvw4x_be;
16677      break;
16678    case PPC::BI__builtin_vsx_stxvl:
16679      ID = Intrinsic::ppc_vsx_stxvl;
16680      break;
16681    case PPC::BI__builtin_vsx_stxvll:
16682      ID = Intrinsic::ppc_vsx_stxvll;
16683      break;
16684    }
16685    llvm::Function *F = CGM.getIntrinsic(ID);
16686    return Builder.CreateCall(F, Ops, "");
16687  }
16688  case PPC::BI__builtin_vsx_ldrmb: {
16689    // This essentially boils down to performing an unaligned VMX load sequence
16690    // that avoids crossing a page boundary, and then shuffling the elements
16691    // into the right side of the vector register.
16692    Value *Op0 = EmitScalarExpr(E->getArg(0));
16693    Value *Op1 = EmitScalarExpr(E->getArg(1));
16694    int64_t NumBytes = cast<ConstantInt>(Op1)->getZExtValue();
16695    llvm::Type *ResTy = ConvertType(E->getType());
16696    bool IsLE = getTarget().isLittleEndian();
16697
16698    // If the user wants the entire vector, just load the entire vector.
16699    if (NumBytes == 16) {
16700      Value *LD =
16701          Builder.CreateLoad(Address(Op0, ResTy, CharUnits::fromQuantity(1)));
16702      if (!IsLE)
16703        return LD;
16704
16705      // Reverse the bytes on LE.
16706      SmallVector<int, 16> RevMask;
16707      for (int Idx = 0; Idx < 16; Idx++)
16708        RevMask.push_back(15 - Idx);
16709      return Builder.CreateShuffleVector(LD, LD, RevMask);
16710    }
16711
16712    llvm::Function *Lvx = CGM.getIntrinsic(Intrinsic::ppc_altivec_lvx);
16713    llvm::Function *Lvs = CGM.getIntrinsic(IsLE ? Intrinsic::ppc_altivec_lvsr
16714                                                : Intrinsic::ppc_altivec_lvsl);
16715    llvm::Function *Vperm = CGM.getIntrinsic(Intrinsic::ppc_altivec_vperm);
16716    Value *HiMem = Builder.CreateGEP(
16717        Int8Ty, Op0, ConstantInt::get(Op1->getType(), NumBytes - 1));
16718    Value *LoLd = Builder.CreateCall(Lvx, Op0, "ld.lo");
16719    Value *HiLd = Builder.CreateCall(Lvx, HiMem, "ld.hi");
16720    Value *Mask1 = Builder.CreateCall(Lvs, Op0, "mask1");
16721
16722    Op0 = IsLE ? HiLd : LoLd;
16723    Op1 = IsLE ? LoLd : HiLd;
16724    Value *AllElts = Builder.CreateCall(Vperm, {Op0, Op1, Mask1}, "shuffle1");
16725    Constant *Zero = llvm::Constant::getNullValue(IsLE ? ResTy : AllElts->getType());
16726
16727    if (IsLE) {
16728      SmallVector<int, 16> Consts;
16729      for (int Idx = 0; Idx < 16; Idx++) {
16730        int Val = (NumBytes - Idx - 1 >= 0) ? (NumBytes - Idx - 1)
16731                                            : 16 - (NumBytes - Idx);
16732        Consts.push_back(Val);
16733      }
16734      return Builder.CreateShuffleVector(Builder.CreateBitCast(AllElts, ResTy),
16735                                         Zero, Consts);
16736    }
16737    SmallVector<Constant *, 16> Consts;
16738    for (int Idx = 0; Idx < 16; Idx++)
16739      Consts.push_back(Builder.getInt8(NumBytes + Idx));
16740    Value *Mask2 = ConstantVector::get(Consts);
16741    return Builder.CreateBitCast(
16742        Builder.CreateCall(Vperm, {Zero, AllElts, Mask2}, "shuffle2"), ResTy);
16743  }
16744  case PPC::BI__builtin_vsx_strmb: {
16745    Value *Op0 = EmitScalarExpr(E->getArg(0));
16746    Value *Op1 = EmitScalarExpr(E->getArg(1));
16747    Value *Op2 = EmitScalarExpr(E->getArg(2));
16748    int64_t NumBytes = cast<ConstantInt>(Op1)->getZExtValue();
16749    bool IsLE = getTarget().isLittleEndian();
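    // Break the store into the largest power-of-two chunks that still fit
    // (8, then 4, 2 and 1 bytes). StoreSubVec extracts the element of the
    // requested width at the given position and byte-swaps it on LE so that
    // the bytes land in memory in big-endian order.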
16750    auto StoreSubVec = [&](unsigned Width, unsigned Offset, unsigned EltNo) {
16751      // If storing the whole vector, simply store it on BE; on LE, reverse the
16752      // bytes first and then store.
16753      if (Width == 16) {
16754        Value *StVec = Op2;
16755        if (IsLE) {
16756          SmallVector<int, 16> RevMask;
16757          for (int Idx = 0; Idx < 16; Idx++)
16758            RevMask.push_back(15 - Idx);
16759          StVec = Builder.CreateShuffleVector(Op2, Op2, RevMask);
16760        }
16761        return Builder.CreateStore(
16762            StVec, Address(Op0, Op2->getType(), CharUnits::fromQuantity(1)));
16763      }
16764      auto *ConvTy = Int64Ty;
16765      unsigned NumElts = 0;
16766      switch (Width) {
16767      default:
16768        llvm_unreachable("width for stores must be a power of 2");
16769      case 8:
16770        ConvTy = Int64Ty;
16771        NumElts = 2;
16772        break;
16773      case 4:
16774        ConvTy = Int32Ty;
16775        NumElts = 4;
16776        break;
16777      case 2:
16778        ConvTy = Int16Ty;
16779        NumElts = 8;
16780        break;
16781      case 1:
16782        ConvTy = Int8Ty;
16783        NumElts = 16;
16784        break;
16785      }
16786      Value *Vec = Builder.CreateBitCast(
16787          Op2, llvm::FixedVectorType::get(ConvTy, NumElts));
16788      Value *Ptr =
16789          Builder.CreateGEP(Int8Ty, Op0, ConstantInt::get(Int64Ty, Offset));
16790      Value *Elt = Builder.CreateExtractElement(Vec, EltNo);
16791      if (IsLE && Width > 1) {
16792        Function *F = CGM.getIntrinsic(Intrinsic::bswap, ConvTy);
16793        Elt = Builder.CreateCall(F, Elt);
16794      }
16795      return Builder.CreateStore(
16796          Elt, Address(Ptr, ConvTy, CharUnits::fromQuantity(1)));
16797    };
16798    unsigned Stored = 0;
16799    unsigned RemainingBytes = NumBytes;
16800    Value *Result;
16801    if (NumBytes == 16)
16802      return StoreSubVec(16, 0, 0);
16803    if (NumBytes >= 8) {
16804      Result = StoreSubVec(8, NumBytes - 8, IsLE ? 0 : 1);
16805      RemainingBytes -= 8;
16806      Stored += 8;
16807    }
16808    if (RemainingBytes >= 4) {
16809      Result = StoreSubVec(4, NumBytes - Stored - 4,
16810                           IsLE ? (Stored >> 2) : 3 - (Stored >> 2));
16811      RemainingBytes -= 4;
16812      Stored += 4;
16813    }
16814    if (RemainingBytes >= 2) {
16815      Result = StoreSubVec(2, NumBytes - Stored - 2,
16816                           IsLE ? (Stored >> 1) : 7 - (Stored >> 1));
16817      RemainingBytes -= 2;
16818      Stored += 2;
16819    }
16820    if (RemainingBytes)
16821      Result =
16822          StoreSubVec(1, NumBytes - Stored - 1, IsLE ? Stored : 15 - Stored);
16823    return Result;
16824  }
16825  // Square root
16826  case PPC::BI__builtin_vsx_xvsqrtsp:
16827  case PPC::BI__builtin_vsx_xvsqrtdp: {
16828    llvm::Type *ResultType = ConvertType(E->getType());
16829    Value *X = EmitScalarExpr(E->getArg(0));
16830    if (Builder.getIsFPConstrained()) {
16831      llvm::Function *F = CGM.getIntrinsic(
16832          Intrinsic::experimental_constrained_sqrt, ResultType);
16833      return Builder.CreateConstrainedFPCall(F, X);
16834    } else {
16835      llvm::Function *F = CGM.getIntrinsic(Intrinsic::sqrt, ResultType);
16836      return Builder.CreateCall(F, X);
16837    }
16838  }
16839  // Count leading zeros
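  // The vclz/vctz builtins map to llvm.ctlz/cttz with the second operand
  // (is_zero_poison) set to false, so a zero input yields the element width
  // rather than poison.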
16840  case PPC::BI__builtin_altivec_vclzb:
16841  case PPC::BI__builtin_altivec_vclzh:
16842  case PPC::BI__builtin_altivec_vclzw:
16843  case PPC::BI__builtin_altivec_vclzd: {
16844    llvm::Type *ResultType = ConvertType(E->getType());
16845    Value *X = EmitScalarExpr(E->getArg(0));
16846    Value *Undef = ConstantInt::get(Builder.getInt1Ty(), false);
16847    Function *F = CGM.getIntrinsic(Intrinsic::ctlz, ResultType);
16848    return Builder.CreateCall(F, {X, Undef});
16849  }
16850  case PPC::BI__builtin_altivec_vctzb:
16851  case PPC::BI__builtin_altivec_vctzh:
16852  case PPC::BI__builtin_altivec_vctzw:
16853  case PPC::BI__builtin_altivec_vctzd: {
16854    llvm::Type *ResultType = ConvertType(E->getType());
16855    Value *X = EmitScalarExpr(E->getArg(0));
16856    Value *Undef = ConstantInt::get(Builder.getInt1Ty(), false);
16857    Function *F = CGM.getIntrinsic(Intrinsic::cttz, ResultType);
16858    return Builder.CreateCall(F, {X, Undef});
16859  }
16860  case PPC::BI__builtin_altivec_vinsd:
16861  case PPC::BI__builtin_altivec_vinsw:
16862  case PPC::BI__builtin_altivec_vinsd_elt:
16863  case PPC::BI__builtin_altivec_vinsw_elt: {
16864    llvm::Type *ResultType = ConvertType(E->getType());
16865    Value *Op0 = EmitScalarExpr(E->getArg(0));
16866    Value *Op1 = EmitScalarExpr(E->getArg(1));
16867    Value *Op2 = EmitScalarExpr(E->getArg(2));
16868
16869    bool IsUnaligned = (BuiltinID == PPC::BI__builtin_altivec_vinsw ||
16870                        BuiltinID == PPC::BI__builtin_altivec_vinsd);
16871
16872    bool Is32bit = (BuiltinID == PPC::BI__builtin_altivec_vinsw ||
16873                    BuiltinID == PPC::BI__builtin_altivec_vinsw_elt);
16874
16875    // The third argument must be a compile time constant.
16876    ConstantInt *ArgCI = dyn_cast<ConstantInt>(Op2);
16877    assert(ArgCI &&
16878           "Third Arg to vinsw/vinsd intrinsic must be a constant integer!");
16879
16880    // The valid range for the third argument depends on the input type and
16881    // the builtin called.
16882    int ValidMaxValue = 0;
16883    if (IsUnaligned)
16884      ValidMaxValue = (Is32bit) ? 12 : 8;
16885    else
16886      ValidMaxValue = (Is32bit) ? 3 : 1;
16887
16888    // Get value of third argument.
16889    int64_t ConstArg = ArgCI->getSExtValue();
16890
16891    // Compose range checking error message.
16892    std::string RangeErrMsg = IsUnaligned ? "byte" : "element";
16893    RangeErrMsg += " number " + llvm::to_string(ConstArg);
16894    RangeErrMsg += " is outside of the valid range [0, ";
16895    RangeErrMsg += llvm::to_string(ValidMaxValue) + "]";
16896
16897    // Issue error if third argument is not within the valid range.
16898    if (ConstArg < 0 || ConstArg > ValidMaxValue)
16899      CGM.Error(E->getExprLoc(), RangeErrMsg);
16900
16901    // Input to vec_replace_elt is an element index, convert to byte index.
16902    if (!IsUnaligned) {
16903      ConstArg *= Is32bit ? 4 : 8;
16904      // Fix the constant according to endianness.
16905      if (getTarget().isLittleEndian())
16906        ConstArg = (Is32bit ? 12 : 8) - ConstArg;
16907    }
16908
16909    ID = Is32bit ? Intrinsic::ppc_altivec_vinsw : Intrinsic::ppc_altivec_vinsd;
16910    Op2 = ConstantInt::getSigned(Int32Ty, ConstArg);
16911    // Casting input to vector int as per intrinsic definition.
16912    Op0 =
16913        Is32bit
16914            ? Builder.CreateBitCast(Op0, llvm::FixedVectorType::get(Int32Ty, 4))
16915            : Builder.CreateBitCast(Op0,
16916                                    llvm::FixedVectorType::get(Int64Ty, 2));
16917    return Builder.CreateBitCast(
16918        Builder.CreateCall(CGM.getIntrinsic(ID), {Op0, Op1, Op2}), ResultType);
16919  }
16920  case PPC::BI__builtin_altivec_vpopcntb:
16921  case PPC::BI__builtin_altivec_vpopcnth:
16922  case PPC::BI__builtin_altivec_vpopcntw:
16923  case PPC::BI__builtin_altivec_vpopcntd: {
16924    llvm::Type *ResultType = ConvertType(E->getType());
16925    Value *X = EmitScalarExpr(E->getArg(0));
16926    llvm::Function *F = CGM.getIntrinsic(Intrinsic::ctpop, ResultType);
16927    return Builder.CreateCall(F, X);
16928  }
16929  case PPC::BI__builtin_altivec_vadduqm:
16930  case PPC::BI__builtin_altivec_vsubuqm: {
16931    Value *Op0 = EmitScalarExpr(E->getArg(0));
16932    Value *Op1 = EmitScalarExpr(E->getArg(1));
16933    llvm::Type *Int128Ty = llvm::IntegerType::get(getLLVMContext(), 128);
16934    Op0 = Builder.CreateBitCast(Op0, llvm::FixedVectorType::get(Int128Ty, 1));
16935    Op1 = Builder.CreateBitCast(Op1, llvm::FixedVectorType::get(Int128Ty, 1));
16936    if (BuiltinID == PPC::BI__builtin_altivec_vadduqm)
16937      return Builder.CreateAdd(Op0, Op1, "vadduqm");
16938    else
16939      return Builder.CreateSub(Op0, Op1, "vsubuqm");
16940  }
16941  case PPC::BI__builtin_altivec_vaddcuq_c:
16942  case PPC::BI__builtin_altivec_vsubcuq_c: {
16943    SmallVector<Value *, 2> Ops;
16944    Value *Op0 = EmitScalarExpr(E->getArg(0));
16945    Value *Op1 = EmitScalarExpr(E->getArg(1));
16946    llvm::Type *V1I128Ty = llvm::FixedVectorType::get(
16947        llvm::IntegerType::get(getLLVMContext(), 128), 1);
16948    Ops.push_back(Builder.CreateBitCast(Op0, V1I128Ty));
16949    Ops.push_back(Builder.CreateBitCast(Op1, V1I128Ty));
16950    ID = (BuiltinID == PPC::BI__builtin_altivec_vaddcuq_c)
16951             ? Intrinsic::ppc_altivec_vaddcuq
16952             : Intrinsic::ppc_altivec_vsubcuq;
16953    return Builder.CreateCall(CGM.getIntrinsic(ID), Ops, "");
16954  }
16955  case PPC::BI__builtin_altivec_vaddeuqm_c:
16956  case PPC::BI__builtin_altivec_vaddecuq_c:
16957  case PPC::BI__builtin_altivec_vsubeuqm_c:
16958  case PPC::BI__builtin_altivec_vsubecuq_c: {
16959    SmallVector<Value *, 3> Ops;
16960    Value *Op0 = EmitScalarExpr(E->getArg(0));
16961    Value *Op1 = EmitScalarExpr(E->getArg(1));
16962    Value *Op2 = EmitScalarExpr(E->getArg(2));
16963    llvm::Type *V1I128Ty = llvm::FixedVectorType::get(
16964        llvm::IntegerType::get(getLLVMContext(), 128), 1);
16965    Ops.push_back(Builder.CreateBitCast(Op0, V1I128Ty));
16966    Ops.push_back(Builder.CreateBitCast(Op1, V1I128Ty));
16967    Ops.push_back(Builder.CreateBitCast(Op2, V1I128Ty));
16968    switch (BuiltinID) {
16969    default:
16970      llvm_unreachable("Unsupported intrinsic!");
16971    case PPC::BI__builtin_altivec_vaddeuqm_c:
16972      ID = Intrinsic::ppc_altivec_vaddeuqm;
16973      break;
16974    case PPC::BI__builtin_altivec_vaddecuq_c:
16975      ID = Intrinsic::ppc_altivec_vaddecuq;
16976      break;
16977    case PPC::BI__builtin_altivec_vsubeuqm_c:
16978      ID = Intrinsic::ppc_altivec_vsubeuqm;
16979      break;
16980    case PPC::BI__builtin_altivec_vsubecuq_c:
16981      ID = Intrinsic::ppc_altivec_vsubecuq;
16982      break;
16983    }
16984    return Builder.CreateCall(CGM.getIntrinsic(ID), Ops, "");
16985  }
16986  // Rotate and insert under mask operation.
16987  // __rldimi(rs, is, shift, mask)
16988  // (rotl64(rs, shift) & mask) | (is & ~mask)
16989  // __rlwimi(rs, is, shift, mask)
16990  // (rotl(rs, shift) & mask) | (is & ~mask)
16991  case PPC::BI__builtin_ppc_rldimi:
16992  case PPC::BI__builtin_ppc_rlwimi: {
16993    Value *Op0 = EmitScalarExpr(E->getArg(0));
16994    Value *Op1 = EmitScalarExpr(E->getArg(1));
16995    Value *Op2 = EmitScalarExpr(E->getArg(2));
16996    Value *Op3 = EmitScalarExpr(E->getArg(3));
16997    llvm::Type *Ty = Op0->getType();
16998    Function *F = CGM.getIntrinsic(Intrinsic::fshl, Ty);
16999    if (BuiltinID == PPC::BI__builtin_ppc_rldimi)
17000      Op2 = Builder.CreateZExt(Op2, Int64Ty);
17001    Value *Shift = Builder.CreateCall(F, {Op0, Op0, Op2});
17002    Value *X = Builder.CreateAnd(Shift, Op3);
17003    Value *Y = Builder.CreateAnd(Op1, Builder.CreateNot(Op3));
17004    return Builder.CreateOr(X, Y);
17005  }
17006  // Rotate and insert under mask operation.
17007  // __rlwnm(rs, shift, mask)
17008  // rotl(rs, shift) & mask
17009  case PPC::BI__builtin_ppc_rlwnm: {
17010    Value *Op0 = EmitScalarExpr(E->getArg(0));
17011    Value *Op1 = EmitScalarExpr(E->getArg(1));
17012    Value *Op2 = EmitScalarExpr(E->getArg(2));
17013    llvm::Type *Ty = Op0->getType();
17014    Function *F = CGM.getIntrinsic(Intrinsic::fshl, Ty);
17015    Value *Shift = Builder.CreateCall(F, {Op0, Op0, Op1});
17016    return Builder.CreateAnd(Shift, Op2);
17017  }
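  // poppar4/8 compute the parity of a word/doubleword: take the population
  // count and keep only its low bit.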
17018  case PPC::BI__builtin_ppc_poppar4:
17019  case PPC::BI__builtin_ppc_poppar8: {
17020    Value *Op0 = EmitScalarExpr(E->getArg(0));
17021    llvm::Type *ArgType = Op0->getType();
17022    Function *F = CGM.getIntrinsic(Intrinsic::ctpop, ArgType);
17023    Value *Tmp = Builder.CreateCall(F, Op0);
17024
17025    llvm::Type *ResultType = ConvertType(E->getType());
17026    Value *Result = Builder.CreateAnd(Tmp, llvm::ConstantInt::get(ArgType, 1));
17027    if (Result->getType() != ResultType)
17028      Result = Builder.CreateIntCast(Result, ResultType, /*isSigned*/true,
17029                                     "cast");
17030    return Result;
17031  }
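  // __builtin_ppc_cmpb compares corresponding bytes of the two operands,
  // producing 0xff for bytes that are equal and 0x00 otherwise. On 64-bit
  // targets this maps directly to the ppc.cmpb intrinsic; on 32-bit targets it
  // is expanded into two 32-bit halves as shown below.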
17032  case PPC::BI__builtin_ppc_cmpb: {
17033    Value *Op0 = EmitScalarExpr(E->getArg(0));
17034    Value *Op1 = EmitScalarExpr(E->getArg(1));
17035    if (getTarget().getTriple().isPPC64()) {
17036      Function *F =
17037          CGM.getIntrinsic(Intrinsic::ppc_cmpb, {Int64Ty, Int64Ty, Int64Ty});
17038      return Builder.CreateCall(F, {Op0, Op1}, "cmpb");
17039    }
17040    // For 32-bit targets, emit the code as below:
17041    // %conv = trunc i64 %a to i32
17042    // %conv1 = trunc i64 %b to i32
17043    // %shr = lshr i64 %a, 32
17044    // %conv2 = trunc i64 %shr to i32
17045    // %shr3 = lshr i64 %b, 32
17046    // %conv4 = trunc i64 %shr3 to i32
17047    // %0 = tail call i32 @llvm.ppc.cmpb32(i32 %conv, i32 %conv1)
17048    // %conv5 = zext i32 %0 to i64
17049    // %1 = tail call i32 @llvm.ppc.cmpb32(i32 %conv2, i32 %conv4)
17050    // %conv614 = zext i32 %1 to i64
17051    // %shl = shl nuw i64 %conv614, 32
17052    // %or = or i64 %shl, %conv5
17053    // ret i64 %or
17054    Function *F =
17055        CGM.getIntrinsic(Intrinsic::ppc_cmpb, {Int32Ty, Int32Ty, Int32Ty});
17056    Value *ArgOneLo = Builder.CreateTrunc(Op0, Int32Ty);
17057    Value *ArgTwoLo = Builder.CreateTrunc(Op1, Int32Ty);
17058    Constant *ShiftAmt = ConstantInt::get(Int64Ty, 32);
17059    Value *ArgOneHi =
17060        Builder.CreateTrunc(Builder.CreateLShr(Op0, ShiftAmt), Int32Ty);
17061    Value *ArgTwoHi =
17062        Builder.CreateTrunc(Builder.CreateLShr(Op1, ShiftAmt), Int32Ty);
17063    Value *ResLo = Builder.CreateZExt(
17064        Builder.CreateCall(F, {ArgOneLo, ArgTwoLo}, "cmpb"), Int64Ty);
17065    Value *ResHiShift = Builder.CreateZExt(
17066        Builder.CreateCall(F, {ArgOneHi, ArgTwoHi}, "cmpb"), Int64Ty);
17067    Value *ResHi = Builder.CreateShl(ResHiShift, ShiftAmt);
17068    return Builder.CreateOr(ResLo, ResHi);
17069  }
17070  // Copy sign
17071  case PPC::BI__builtin_vsx_xvcpsgnsp:
17072  case PPC::BI__builtin_vsx_xvcpsgndp: {
17073    llvm::Type *ResultType = ConvertType(E->getType());
17074    Value *X = EmitScalarExpr(E->getArg(0));
17075    Value *Y = EmitScalarExpr(E->getArg(1));
17076    ID = Intrinsic::copysign;
17077    llvm::Function *F = CGM.getIntrinsic(ID, ResultType);
17078    return Builder.CreateCall(F, {X, Y});
17079  }
17080  // Rounding/truncation
17081  case PPC::BI__builtin_vsx_xvrspip:
17082  case PPC::BI__builtin_vsx_xvrdpip:
17083  case PPC::BI__builtin_vsx_xvrdpim:
17084  case PPC::BI__builtin_vsx_xvrspim:
17085  case PPC::BI__builtin_vsx_xvrdpi:
17086  case PPC::BI__builtin_vsx_xvrspi:
17087  case PPC::BI__builtin_vsx_xvrdpic:
17088  case PPC::BI__builtin_vsx_xvrspic:
17089  case PPC::BI__builtin_vsx_xvrdpiz:
17090  case PPC::BI__builtin_vsx_xvrspiz: {
17091    llvm::Type *ResultType = ConvertType(E->getType());
17092    Value *X = EmitScalarExpr(E->getArg(0));
17093    if (BuiltinID == PPC::BI__builtin_vsx_xvrdpim ||
17094        BuiltinID == PPC::BI__builtin_vsx_xvrspim)
17095      ID = Builder.getIsFPConstrained()
17096               ? Intrinsic::experimental_constrained_floor
17097               : Intrinsic::floor;
17098    else if (BuiltinID == PPC::BI__builtin_vsx_xvrdpi ||
17099             BuiltinID == PPC::BI__builtin_vsx_xvrspi)
17100      ID = Builder.getIsFPConstrained()
17101               ? Intrinsic::experimental_constrained_round
17102               : Intrinsic::round;
17103    else if (BuiltinID == PPC::BI__builtin_vsx_xvrdpic ||
17104             BuiltinID == PPC::BI__builtin_vsx_xvrspic)
17105      ID = Builder.getIsFPConstrained()
17106               ? Intrinsic::experimental_constrained_rint
17107               : Intrinsic::rint;
17108    else if (BuiltinID == PPC::BI__builtin_vsx_xvrdpip ||
17109             BuiltinID == PPC::BI__builtin_vsx_xvrspip)
17110      ID = Builder.getIsFPConstrained()
17111               ? Intrinsic::experimental_constrained_ceil
17112               : Intrinsic::ceil;
17113    else if (BuiltinID == PPC::BI__builtin_vsx_xvrdpiz ||
17114             BuiltinID == PPC::BI__builtin_vsx_xvrspiz)
17115      ID = Builder.getIsFPConstrained()
17116               ? Intrinsic::experimental_constrained_trunc
17117               : Intrinsic::trunc;
17118    llvm::Function *F = CGM.getIntrinsic(ID, ResultType);
17119    return Builder.getIsFPConstrained() ? Builder.CreateConstrainedFPCall(F, X)
17120                                        : Builder.CreateCall(F, X);
17121  }
17122
17123  // Absolute value
17124  case PPC::BI__builtin_vsx_xvabsdp:
17125  case PPC::BI__builtin_vsx_xvabssp: {
17126    llvm::Type *ResultType = ConvertType(E->getType());
17127    Value *X = EmitScalarExpr(E->getArg(0));
17128    llvm::Function *F = CGM.getIntrinsic(Intrinsic::fabs, ResultType);
17129    return Builder.CreateCall(F, X);
17130  }
17131
17132  // Fastmath by default
17133  case PPC::BI__builtin_ppc_recipdivf:
17134  case PPC::BI__builtin_ppc_recipdivd:
17135  case PPC::BI__builtin_ppc_rsqrtf:
17136  case PPC::BI__builtin_ppc_rsqrtd: {
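    // These builtins are emitted with fast-math flags forced on: save the
    // caller's flags, set fast, and restore them before returning.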
17137    FastMathFlags FMF = Builder.getFastMathFlags();
17138    Builder.getFastMathFlags().setFast();
17139    llvm::Type *ResultType = ConvertType(E->getType());
17140    Value *X = EmitScalarExpr(E->getArg(0));
17141
17142    if (BuiltinID == PPC::BI__builtin_ppc_recipdivf ||
17143        BuiltinID == PPC::BI__builtin_ppc_recipdivd) {
17144      Value *Y = EmitScalarExpr(E->getArg(1));
17145      Value *FDiv = Builder.CreateFDiv(X, Y, "recipdiv");
17146      Builder.getFastMathFlags() &= (FMF);
17147      return FDiv;
17148    }
17149    auto *One = ConstantFP::get(ResultType, 1.0);
17150    llvm::Function *F = CGM.getIntrinsic(Intrinsic::sqrt, ResultType);
17151    Value *FDiv = Builder.CreateFDiv(One, Builder.CreateCall(F, X), "rsqrt");
17152    Builder.getFastMathFlags() &= (FMF);
17153    return FDiv;
17154  }
17155  case PPC::BI__builtin_ppc_alignx: {
17156    Value *Op0 = EmitScalarExpr(E->getArg(0));
17157    Value *Op1 = EmitScalarExpr(E->getArg(1));
17158    ConstantInt *AlignmentCI = cast<ConstantInt>(Op0);
17159    if (AlignmentCI->getValue().ugt(llvm::Value::MaximumAlignment))
17160      AlignmentCI = ConstantInt::get(AlignmentCI->getIntegerType(),
17161                                     llvm::Value::MaximumAlignment);
17162
17163    emitAlignmentAssumption(Op1, E->getArg(1),
17164                            /*The expr loc is sufficient.*/ SourceLocation(),
17165                            AlignmentCI, nullptr);
17166    return Op1;
17167  }
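  // Rotate doubleword left, then AND with mask.
  // __rdlam(rs, shift, mask)
  // rotl64(rs, shift) & mask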
17168  case PPC::BI__builtin_ppc_rdlam: {
17169    Value *Op0 = EmitScalarExpr(E->getArg(0));
17170    Value *Op1 = EmitScalarExpr(E->getArg(1));
17171    Value *Op2 = EmitScalarExpr(E->getArg(2));
17172    llvm::Type *Ty = Op0->getType();
17173    Value *ShiftAmt = Builder.CreateIntCast(Op1, Ty, false);
17174    Function *F = CGM.getIntrinsic(Intrinsic::fshl, Ty);
17175    Value *Rotate = Builder.CreateCall(F, {Op0, Op0, ShiftAmt});
17176    return Builder.CreateAnd(Rotate, Op2);
17177  }
17178  case PPC::BI__builtin_ppc_load2r: {
17179    Function *F = CGM.getIntrinsic(Intrinsic::ppc_load2r);
17180    Value *Op0 = EmitScalarExpr(E->getArg(0));
17181    Value *LoadIntrinsic = Builder.CreateCall(F, {Op0});
17182    return Builder.CreateTrunc(LoadIntrinsic, Int16Ty);
17183  }
17184  // FMA variations
17185  case PPC::BI__builtin_ppc_fnmsub:
17186  case PPC::BI__builtin_ppc_fnmsubs:
17187  case PPC::BI__builtin_vsx_xvmaddadp:
17188  case PPC::BI__builtin_vsx_xvmaddasp:
17189  case PPC::BI__builtin_vsx_xvnmaddadp:
17190  case PPC::BI__builtin_vsx_xvnmaddasp:
17191  case PPC::BI__builtin_vsx_xvmsubadp:
17192  case PPC::BI__builtin_vsx_xvmsubasp:
17193  case PPC::BI__builtin_vsx_xvnmsubadp:
17194  case PPC::BI__builtin_vsx_xvnmsubasp: {
17195    llvm::Type *ResultType = ConvertType(E->getType());
17196    Value *X = EmitScalarExpr(E->getArg(0));
17197    Value *Y = EmitScalarExpr(E->getArg(1));
17198    Value *Z = EmitScalarExpr(E->getArg(2));
17199    llvm::Function *F;
17200    if (Builder.getIsFPConstrained())
17201      F = CGM.getIntrinsic(Intrinsic::experimental_constrained_fma, ResultType);
17202    else
17203      F = CGM.getIntrinsic(Intrinsic::fma, ResultType);
17204    switch (BuiltinID) {
17205      case PPC::BI__builtin_vsx_xvmaddadp:
17206      case PPC::BI__builtin_vsx_xvmaddasp:
17207        if (Builder.getIsFPConstrained())
17208          return Builder.CreateConstrainedFPCall(F, {X, Y, Z});
17209        else
17210          return Builder.CreateCall(F, {X, Y, Z});
17211      case PPC::BI__builtin_vsx_xvnmaddadp:
17212      case PPC::BI__builtin_vsx_xvnmaddasp:
17213        if (Builder.getIsFPConstrained())
17214          return Builder.CreateFNeg(
17215              Builder.CreateConstrainedFPCall(F, {X, Y, Z}), "neg");
17216        else
17217          return Builder.CreateFNeg(Builder.CreateCall(F, {X, Y, Z}), "neg");
17218      case PPC::BI__builtin_vsx_xvmsubadp:
17219      case PPC::BI__builtin_vsx_xvmsubasp:
17220        if (Builder.getIsFPConstrained())
17221          return Builder.CreateConstrainedFPCall(
17222              F, {X, Y, Builder.CreateFNeg(Z, "neg")});
17223        else
17224          return Builder.CreateCall(F, {X, Y, Builder.CreateFNeg(Z, "neg")});
17225      case PPC::BI__builtin_ppc_fnmsub:
17226      case PPC::BI__builtin_ppc_fnmsubs:
17227      case PPC::BI__builtin_vsx_xvnmsubadp:
17228      case PPC::BI__builtin_vsx_xvnmsubasp:
17229        if (Builder.getIsFPConstrained())
17230          return Builder.CreateFNeg(
17231              Builder.CreateConstrainedFPCall(
17232                  F, {X, Y, Builder.CreateFNeg(Z, "neg")}),
17233              "neg");
17234        else
17235          return Builder.CreateCall(
17236              CGM.getIntrinsic(Intrinsic::ppc_fnmsub, ResultType), {X, Y, Z});
17237      }
17238    llvm_unreachable("Unknown FMA operation");
17239    return nullptr; // Suppress no-return warning
17240  }
17241
17242  case PPC::BI__builtin_vsx_insertword: {
17243    Value *Op0 = EmitScalarExpr(E->getArg(0));
17244    Value *Op1 = EmitScalarExpr(E->getArg(1));
17245    Value *Op2 = EmitScalarExpr(E->getArg(2));
17246    llvm::Function *F = CGM.getIntrinsic(Intrinsic::ppc_vsx_xxinsertw);
17247
    // The third argument is a compile time constant int. It must be clamped
    // to the range [0, 12].
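    // (The index is a byte offset into the 16-byte vector register, which is
    // why the largest valid insert position is 12.)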
17250    ConstantInt *ArgCI = dyn_cast<ConstantInt>(Op2);
17251    assert(ArgCI &&
17252           "Third arg to xxinsertw intrinsic must be constant integer");
17253    const int64_t MaxIndex = 12;
17254    int64_t Index = std::clamp(ArgCI->getSExtValue(), (int64_t)0, MaxIndex);
17255
    // The builtin semantics don't exactly match the xxinsertw instruction's
    // semantics (which ppc_vsx_xxinsertw follows). The builtin extracts the
17258    // word from the first argument, and inserts it in the second argument. The
17259    // instruction extracts the word from its second input register and inserts
17260    // it into its first input register, so swap the first and second arguments.
17261    std::swap(Op0, Op1);
17262
17263    // Need to cast the second argument from a vector of unsigned int to a
17264    // vector of long long.
17265    Op1 = Builder.CreateBitCast(Op1, llvm::FixedVectorType::get(Int64Ty, 2));
17266
17267    if (getTarget().isLittleEndian()) {
17268      // Reverse the double words in the vector we will extract from.
17269      Op0 = Builder.CreateBitCast(Op0, llvm::FixedVectorType::get(Int64Ty, 2));
17270      Op0 = Builder.CreateShuffleVector(Op0, Op0, ArrayRef<int>{1, 0});
17271
17272      // Reverse the index.
17273      Index = MaxIndex - Index;
17274    }
17275
17276    // Intrinsic expects the first arg to be a vector of int.
17277    Op0 = Builder.CreateBitCast(Op0, llvm::FixedVectorType::get(Int32Ty, 4));
17278    Op2 = ConstantInt::getSigned(Int32Ty, Index);
17279    return Builder.CreateCall(F, {Op0, Op1, Op2});
17280  }
17281
17282  case PPC::BI__builtin_vsx_extractuword: {
17283    Value *Op0 = EmitScalarExpr(E->getArg(0));
17284    Value *Op1 = EmitScalarExpr(E->getArg(1));
17285    llvm::Function *F = CGM.getIntrinsic(Intrinsic::ppc_vsx_xxextractuw);
17286
17287    // Intrinsic expects the first argument to be a vector of doublewords.
17288    Op0 = Builder.CreateBitCast(Op0, llvm::FixedVectorType::get(Int64Ty, 2));
17289
17290    // The second argument is a compile time constant int that needs to
17291    // be clamped to the range [0, 12].
17292    ConstantInt *ArgCI = dyn_cast<ConstantInt>(Op1);
17293    assert(ArgCI &&
17294           "Second Arg to xxextractuw intrinsic must be a constant integer!");
17295    const int64_t MaxIndex = 12;
17296    int64_t Index = std::clamp(ArgCI->getSExtValue(), (int64_t)0, MaxIndex);
17297
17298    if (getTarget().isLittleEndian()) {
17299      // Reverse the index.
17300      Index = MaxIndex - Index;
17301      Op1 = ConstantInt::getSigned(Int32Ty, Index);
17302
      // Emit the call, then reverse the double words of the result vector.
17304      Value *Call = Builder.CreateCall(F, {Op0, Op1});
17305
17306      Value *ShuffleCall =
17307          Builder.CreateShuffleVector(Call, Call, ArrayRef<int>{1, 0});
17308      return ShuffleCall;
17309    } else {
17310      Op1 = ConstantInt::getSigned(Int32Ty, Index);
17311      return Builder.CreateCall(F, {Op0, Op1});
17312    }
17313  }
17314
17315  case PPC::BI__builtin_vsx_xxpermdi: {
17316    Value *Op0 = EmitScalarExpr(E->getArg(0));
17317    Value *Op1 = EmitScalarExpr(E->getArg(1));
17318    Value *Op2 = EmitScalarExpr(E->getArg(2));
17319    ConstantInt *ArgCI = dyn_cast<ConstantInt>(Op2);
17320    assert(ArgCI && "Third arg must be constant integer!");
17321
17322    unsigned Index = ArgCI->getZExtValue();
17323    Op0 = Builder.CreateBitCast(Op0, llvm::FixedVectorType::get(Int64Ty, 2));
17324    Op1 = Builder.CreateBitCast(Op1, llvm::FixedVectorType::get(Int64Ty, 2));
17325
17326    // Account for endianness by treating this as just a shuffle. So we use the
17327    // same indices for both LE and BE in order to produce expected results in
17328    // both cases.
17329    int ElemIdx0 = (Index & 2) >> 1;
17330    int ElemIdx1 = 2 + (Index & 1);
17331
17332    int ShuffleElts[2] = {ElemIdx0, ElemIdx1};
17333    Value *ShuffleCall = Builder.CreateShuffleVector(Op0, Op1, ShuffleElts);
17334    QualType BIRetType = E->getType();
17335    auto RetTy = ConvertType(BIRetType);
17336    return Builder.CreateBitCast(ShuffleCall, RetTy);
17337  }
17338
17339  case PPC::BI__builtin_vsx_xxsldwi: {
17340    Value *Op0 = EmitScalarExpr(E->getArg(0));
17341    Value *Op1 = EmitScalarExpr(E->getArg(1));
17342    Value *Op2 = EmitScalarExpr(E->getArg(2));
17343    ConstantInt *ArgCI = dyn_cast<ConstantInt>(Op2);
17344    assert(ArgCI && "Third argument must be a compile time constant");
17345    unsigned Index = ArgCI->getZExtValue() & 0x3;
17346    Op0 = Builder.CreateBitCast(Op0, llvm::FixedVectorType::get(Int32Ty, 4));
17347    Op1 = Builder.CreateBitCast(Op1, llvm::FixedVectorType::get(Int32Ty, 4));
17348
17349    // Create a shuffle mask
17350    int ElemIdx0;
17351    int ElemIdx1;
17352    int ElemIdx2;
17353    int ElemIdx3;
17354    if (getTarget().isLittleEndian()) {
17355      // Little endian element N comes from element 8+N-Index of the
17356      // concatenated wide vector (of course, using modulo arithmetic on
17357      // the total number of elements).
17358      ElemIdx0 = (8 - Index) % 8;
17359      ElemIdx1 = (9 - Index) % 8;
17360      ElemIdx2 = (10 - Index) % 8;
17361      ElemIdx3 = (11 - Index) % 8;
17362    } else {
17363      // Big endian ElemIdx<N> = Index + N
17364      ElemIdx0 = Index;
17365      ElemIdx1 = Index + 1;
17366      ElemIdx2 = Index + 2;
17367      ElemIdx3 = Index + 3;
17368    }
17369
17370    int ShuffleElts[4] = {ElemIdx0, ElemIdx1, ElemIdx2, ElemIdx3};
17371    Value *ShuffleCall = Builder.CreateShuffleVector(Op0, Op1, ShuffleElts);
17372    QualType BIRetType = E->getType();
17373    auto RetTy = ConvertType(BIRetType);
17374    return Builder.CreateBitCast(ShuffleCall, RetTy);
17375  }
17376
17377  case PPC::BI__builtin_pack_vector_int128: {
17378    Value *Op0 = EmitScalarExpr(E->getArg(0));
17379    Value *Op1 = EmitScalarExpr(E->getArg(1));
17380    bool isLittleEndian = getTarget().isLittleEndian();
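    // The first argument becomes the most significant doubleword of the
    // resulting __int128: element 1 on little-endian targets, element 0 on
    // big-endian targets.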
17381    Value *PoisonValue =
17382        llvm::PoisonValue::get(llvm::FixedVectorType::get(Op0->getType(), 2));
17383    Value *Res = Builder.CreateInsertElement(
17384        PoisonValue, Op0, (uint64_t)(isLittleEndian ? 1 : 0));
17385    Res = Builder.CreateInsertElement(Res, Op1,
17386                                      (uint64_t)(isLittleEndian ? 0 : 1));
17387    return Builder.CreateBitCast(Res, ConvertType(E->getType()));
17388  }
17389
17390  case PPC::BI__builtin_unpack_vector_int128: {
17391    Value *Op0 = EmitScalarExpr(E->getArg(0));
17392    Value *Op1 = EmitScalarExpr(E->getArg(1));
17393    ConstantInt *Index = cast<ConstantInt>(Op1);
17394    Value *Unpacked = Builder.CreateBitCast(
17395        Op0, llvm::FixedVectorType::get(ConvertType(E->getType()), 2));
17396
17397    if (getTarget().isLittleEndian())
17398      Index =
17399          ConstantInt::get(Index->getIntegerType(), 1 - Index->getZExtValue());
17400
17401    return Builder.CreateExtractElement(Unpacked, Index);
17402  }
17403
17404  case PPC::BI__builtin_ppc_sthcx: {
17405    llvm::Function *F = CGM.getIntrinsic(Intrinsic::ppc_sthcx);
17406    Value *Op0 = EmitScalarExpr(E->getArg(0));
17407    Value *Op1 = Builder.CreateSExt(EmitScalarExpr(E->getArg(1)), Int32Ty);
17408    return Builder.CreateCall(F, {Op0, Op1});
17409  }
17410
  // The PPC MMA builtins take a pointer to a __vector_quad as an argument.
  // Some of the MMA instructions accumulate their result into an existing
  // accumulator whereas the others generate a new accumulator. So we need
  // custom code generation to expand a builtin call with a pointer into a
  // load of the accumulator (if the corresponding instruction accumulates its
  // result), followed by the call to the intrinsic and a store of the result.
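  // As a rough illustration, an accumulating call such as
  // __builtin_mma_xvf32gerpp(&acc, a, b) is expanded to something like:
  //   %acc.old = load <512 x i1>, ptr %accp
  //   %acc.new = call <512 x i1> @llvm.ppc.mma.xvf32gerpp(<512 x i1> %acc.old,
  //                                                       <16 x i8> %a,
  //                                                       <16 x i8> %b)
  //   store <512 x i1> %acc.new, ptr %accp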
17417#define CUSTOM_BUILTIN(Name, Intr, Types, Accumulate, Feature) \
17418  case PPC::BI__builtin_##Name:
17419#include "clang/Basic/BuiltinsPPC.def"
17420  {
17421    SmallVector<Value *, 4> Ops;
17422    for (unsigned i = 0, e = E->getNumArgs(); i != e; i++)
17423      if (E->getArg(i)->getType()->isArrayType())
17424        Ops.push_back(EmitArrayToPointerDecay(E->getArg(i)).getPointer());
17425      else
17426        Ops.push_back(EmitScalarExpr(E->getArg(i)));
    // The first argument of these builtins is a pointer used to store their
    // result. However, the LLVM intrinsics return their result as multiple
    // return values. So, here we emit code extracting these values from the
    // intrinsic results and storing them using that pointer.
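    // For example (illustrative), __builtin_mma_disassemble_acc(vp, &acc)
    // loads the <512 x i1> accumulator, calls @llvm.ppc.mma.disassemble.acc,
    // and stores the four returned <16 x i8> values at vp+0, vp+16, vp+32 and
    // vp+48.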
17431    if (BuiltinID == PPC::BI__builtin_mma_disassemble_acc ||
17432        BuiltinID == PPC::BI__builtin_vsx_disassemble_pair ||
17433        BuiltinID == PPC::BI__builtin_mma_disassemble_pair) {
17434      unsigned NumVecs = 2;
17435      auto Intrinsic = Intrinsic::ppc_vsx_disassemble_pair;
17436      if (BuiltinID == PPC::BI__builtin_mma_disassemble_acc) {
17437        NumVecs = 4;
17438        Intrinsic = Intrinsic::ppc_mma_disassemble_acc;
17439      }
17440      llvm::Function *F = CGM.getIntrinsic(Intrinsic);
17441      Address Addr = EmitPointerWithAlignment(E->getArg(1));
17442      Value *Vec = Builder.CreateLoad(Addr);
17443      Value *Call = Builder.CreateCall(F, {Vec});
17444      llvm::Type *VTy = llvm::FixedVectorType::get(Int8Ty, 16);
17445      Value *Ptr = Ops[0];
17446      for (unsigned i=0; i<NumVecs; i++) {
17447        Value *Vec = Builder.CreateExtractValue(Call, i);
17448        llvm::ConstantInt* Index = llvm::ConstantInt::get(IntTy, i);
17449        Value *GEP = Builder.CreateInBoundsGEP(VTy, Ptr, Index);
17450        Builder.CreateAlignedStore(Vec, GEP, MaybeAlign(16));
17451      }
17452      return Call;
17453    }
17454    if (BuiltinID == PPC::BI__builtin_vsx_build_pair ||
17455        BuiltinID == PPC::BI__builtin_mma_build_acc) {
17456      // Reverse the order of the operands for LE, so the
17457      // same builtin call can be used on both LE and BE
17458      // without the need for the programmer to swap operands.
17459      // The operands are reversed starting from the second argument,
17460      // the first operand is the pointer to the pair/accumulator
17461      // that is being built.
17462      if (getTarget().isLittleEndian())
17463        std::reverse(Ops.begin() + 1, Ops.end());
17464    }
17465    bool Accumulate;
17466    switch (BuiltinID) {
17467  #define CUSTOM_BUILTIN(Name, Intr, Types, Acc, Feature) \
17468    case PPC::BI__builtin_##Name: \
17469      ID = Intrinsic::ppc_##Intr; \
17470      Accumulate = Acc; \
17471      break;
17472  #include "clang/Basic/BuiltinsPPC.def"
17473    }
17474    if (BuiltinID == PPC::BI__builtin_vsx_lxvp ||
17475        BuiltinID == PPC::BI__builtin_vsx_stxvp ||
17476        BuiltinID == PPC::BI__builtin_mma_lxvp ||
17477        BuiltinID == PPC::BI__builtin_mma_stxvp) {
17478      if (BuiltinID == PPC::BI__builtin_vsx_lxvp ||
17479          BuiltinID == PPC::BI__builtin_mma_lxvp) {
17480        Ops[0] = Builder.CreateGEP(Int8Ty, Ops[1], Ops[0]);
17481      } else {
17482        Ops[1] = Builder.CreateGEP(Int8Ty, Ops[2], Ops[1]);
17483      }
17484      Ops.pop_back();
17485      llvm::Function *F = CGM.getIntrinsic(ID);
17486      return Builder.CreateCall(F, Ops, "");
17487    }
17488    SmallVector<Value*, 4> CallOps;
17489    if (Accumulate) {
17490      Address Addr = EmitPointerWithAlignment(E->getArg(0));
17491      Value *Acc = Builder.CreateLoad(Addr);
17492      CallOps.push_back(Acc);
17493    }
17494    for (unsigned i=1; i<Ops.size(); i++)
17495      CallOps.push_back(Ops[i]);
17496    llvm::Function *F = CGM.getIntrinsic(ID);
17497    Value *Call = Builder.CreateCall(F, CallOps);
17498    return Builder.CreateAlignedStore(Call, Ops[0], MaybeAlign(64));
17499  }
17500
17501  case PPC::BI__builtin_ppc_compare_and_swap:
17502  case PPC::BI__builtin_ppc_compare_and_swaplp: {
17503    Address Addr = EmitPointerWithAlignment(E->getArg(0));
17504    Address OldValAddr = EmitPointerWithAlignment(E->getArg(1));
17505    Value *OldVal = Builder.CreateLoad(OldValAddr);
17506    QualType AtomicTy = E->getArg(0)->getType()->getPointeeType();
17507    LValue LV = MakeAddrLValue(Addr, AtomicTy);
17508    Value *Op2 = EmitScalarExpr(E->getArg(2));
17509    auto Pair = EmitAtomicCompareExchange(
17510        LV, RValue::get(OldVal), RValue::get(Op2), E->getExprLoc(),
17511        llvm::AtomicOrdering::Monotonic, llvm::AtomicOrdering::Monotonic, true);
    // Unlike C11's atomic_compare_exchange, according to
    // https://www.ibm.com/docs/en/xl-c-and-cpp-aix/16.1?topic=functions-compare-swap-compare-swaplp
    // > In either case, the contents of the memory location specified by addr
    // > are copied into the memory location specified by old_val_addr.
    // However, it does not specify whether the store to OldValAddr is atomic
    // or which memory order to use. Following XL's codegen, treat it as a
    // normal store.
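    // Roughly, for the 32-bit variant (a hedged sketch, not the exact IR):
    //   %pair   = cmpxchg weak ptr %addr, i32 %expected, i32 %new monotonic monotonic
    //   %loaded = extractvalue { i32, i1 } %pair, 0
    //   store i32 %loaded, ptr %old_val_addr
    //   %ok     = extractvalue { i32, i1 } %pair, 1
    //   %ret    = zext i1 %ok to i32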
17519    Value *LoadedVal = Pair.first.getScalarVal();
17520    Builder.CreateStore(LoadedVal, OldValAddr);
17521    return Builder.CreateZExt(Pair.second, Builder.getInt32Ty());
17522  }
17523  case PPC::BI__builtin_ppc_fetch_and_add:
17524  case PPC::BI__builtin_ppc_fetch_and_addlp: {
17525    return MakeBinaryAtomicValue(*this, AtomicRMWInst::Add, E,
17526                                 llvm::AtomicOrdering::Monotonic);
17527  }
17528  case PPC::BI__builtin_ppc_fetch_and_and:
17529  case PPC::BI__builtin_ppc_fetch_and_andlp: {
17530    return MakeBinaryAtomicValue(*this, AtomicRMWInst::And, E,
17531                                 llvm::AtomicOrdering::Monotonic);
17532  }
17533
17534  case PPC::BI__builtin_ppc_fetch_and_or:
17535  case PPC::BI__builtin_ppc_fetch_and_orlp: {
17536    return MakeBinaryAtomicValue(*this, AtomicRMWInst::Or, E,
17537                                 llvm::AtomicOrdering::Monotonic);
17538  }
17539  case PPC::BI__builtin_ppc_fetch_and_swap:
17540  case PPC::BI__builtin_ppc_fetch_and_swaplp: {
17541    return MakeBinaryAtomicValue(*this, AtomicRMWInst::Xchg, E,
17542                                 llvm::AtomicOrdering::Monotonic);
17543  }
17544  case PPC::BI__builtin_ppc_ldarx:
17545  case PPC::BI__builtin_ppc_lwarx:
17546  case PPC::BI__builtin_ppc_lharx:
17547  case PPC::BI__builtin_ppc_lbarx:
17548    return emitPPCLoadReserveIntrinsic(*this, BuiltinID, E);
17549  case PPC::BI__builtin_ppc_mfspr: {
17550    Value *Op0 = EmitScalarExpr(E->getArg(0));
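    // The SPR value is 32 bits wide on 32-bit targets and 64 bits wide on
    // 64-bit targets; use the pointer width to pick the matching return type.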
17551    llvm::Type *RetType = CGM.getDataLayout().getTypeSizeInBits(VoidPtrTy) == 32
17552                              ? Int32Ty
17553                              : Int64Ty;
17554    Function *F = CGM.getIntrinsic(Intrinsic::ppc_mfspr, RetType);
17555    return Builder.CreateCall(F, {Op0});
17556  }
17557  case PPC::BI__builtin_ppc_mtspr: {
17558    Value *Op0 = EmitScalarExpr(E->getArg(0));
17559    Value *Op1 = EmitScalarExpr(E->getArg(1));
17560    llvm::Type *RetType = CGM.getDataLayout().getTypeSizeInBits(VoidPtrTy) == 32
17561                              ? Int32Ty
17562                              : Int64Ty;
17563    Function *F = CGM.getIntrinsic(Intrinsic::ppc_mtspr, RetType);
17564    return Builder.CreateCall(F, {Op0, Op1});
17565  }
17566  case PPC::BI__builtin_ppc_popcntb: {
17567    Value *ArgValue = EmitScalarExpr(E->getArg(0));
17568    llvm::Type *ArgType = ArgValue->getType();
17569    Function *F = CGM.getIntrinsic(Intrinsic::ppc_popcntb, {ArgType, ArgType});
17570    return Builder.CreateCall(F, {ArgValue}, "popcntb");
17571  }
17572  case PPC::BI__builtin_ppc_mtfsf: {
17573    // The builtin takes a uint32 that needs to be cast to an
17574    // f64 to be passed to the intrinsic.
17575    Value *Op0 = EmitScalarExpr(E->getArg(0));
17576    Value *Op1 = EmitScalarExpr(E->getArg(1));
17577    Value *Cast = Builder.CreateUIToFP(Op1, DoubleTy);
17578    llvm::Function *F = CGM.getIntrinsic(Intrinsic::ppc_mtfsf);
17579    return Builder.CreateCall(F, {Op0, Cast}, "");
17580  }
17581
17582  case PPC::BI__builtin_ppc_swdiv_nochk:
17583  case PPC::BI__builtin_ppc_swdivs_nochk: {
17584    Value *Op0 = EmitScalarExpr(E->getArg(0));
17585    Value *Op1 = EmitScalarExpr(E->getArg(1));
17586    FastMathFlags FMF = Builder.getFastMathFlags();
17587    Builder.getFastMathFlags().setFast();
17588    Value *FDiv = Builder.CreateFDiv(Op0, Op1, "swdiv_nochk");
17589    Builder.getFastMathFlags() &= (FMF);
17590    return FDiv;
17591  }
17592  case PPC::BI__builtin_ppc_fric:
17593    return RValue::get(emitUnaryMaybeConstrainedFPBuiltin(
17594                           *this, E, Intrinsic::rint,
17595                           Intrinsic::experimental_constrained_rint))
17596        .getScalarVal();
17597  case PPC::BI__builtin_ppc_frim:
17598  case PPC::BI__builtin_ppc_frims:
17599    return RValue::get(emitUnaryMaybeConstrainedFPBuiltin(
17600                           *this, E, Intrinsic::floor,
17601                           Intrinsic::experimental_constrained_floor))
17602        .getScalarVal();
17603  case PPC::BI__builtin_ppc_frin:
17604  case PPC::BI__builtin_ppc_frins:
17605    return RValue::get(emitUnaryMaybeConstrainedFPBuiltin(
17606                           *this, E, Intrinsic::round,
17607                           Intrinsic::experimental_constrained_round))
17608        .getScalarVal();
17609  case PPC::BI__builtin_ppc_frip:
17610  case PPC::BI__builtin_ppc_frips:
17611    return RValue::get(emitUnaryMaybeConstrainedFPBuiltin(
17612                           *this, E, Intrinsic::ceil,
17613                           Intrinsic::experimental_constrained_ceil))
17614        .getScalarVal();
17615  case PPC::BI__builtin_ppc_friz:
17616  case PPC::BI__builtin_ppc_frizs:
17617    return RValue::get(emitUnaryMaybeConstrainedFPBuiltin(
17618                           *this, E, Intrinsic::trunc,
17619                           Intrinsic::experimental_constrained_trunc))
17620        .getScalarVal();
17621  case PPC::BI__builtin_ppc_fsqrt:
17622  case PPC::BI__builtin_ppc_fsqrts:
17623    return RValue::get(emitUnaryMaybeConstrainedFPBuiltin(
17624                           *this, E, Intrinsic::sqrt,
17625                           Intrinsic::experimental_constrained_sqrt))
17626        .getScalarVal();
17627  case PPC::BI__builtin_ppc_test_data_class: {
17628    Value *Op0 = EmitScalarExpr(E->getArg(0));
17629    Value *Op1 = EmitScalarExpr(E->getArg(1));
17630    return Builder.CreateCall(
17631        CGM.getIntrinsic(Intrinsic::ppc_test_data_class, Op0->getType()),
17632        {Op0, Op1}, "test_data_class");
17633  }
17634  case PPC::BI__builtin_ppc_maxfe: {
17635    Value *Op0 = EmitScalarExpr(E->getArg(0));
17636    Value *Op1 = EmitScalarExpr(E->getArg(1));
17637    Value *Op2 = EmitScalarExpr(E->getArg(2));
17638    Value *Op3 = EmitScalarExpr(E->getArg(3));
17639    return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::ppc_maxfe),
17640                              {Op0, Op1, Op2, Op3});
17641  }
17642  case PPC::BI__builtin_ppc_maxfl: {
17643    Value *Op0 = EmitScalarExpr(E->getArg(0));
17644    Value *Op1 = EmitScalarExpr(E->getArg(1));
17645    Value *Op2 = EmitScalarExpr(E->getArg(2));
17646    Value *Op3 = EmitScalarExpr(E->getArg(3));
17647    return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::ppc_maxfl),
17648                              {Op0, Op1, Op2, Op3});
17649  }
17650  case PPC::BI__builtin_ppc_maxfs: {
17651    Value *Op0 = EmitScalarExpr(E->getArg(0));
17652    Value *Op1 = EmitScalarExpr(E->getArg(1));
17653    Value *Op2 = EmitScalarExpr(E->getArg(2));
17654    Value *Op3 = EmitScalarExpr(E->getArg(3));
17655    return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::ppc_maxfs),
17656                              {Op0, Op1, Op2, Op3});
17657  }
17658  case PPC::BI__builtin_ppc_minfe: {
17659    Value *Op0 = EmitScalarExpr(E->getArg(0));
17660    Value *Op1 = EmitScalarExpr(E->getArg(1));
17661    Value *Op2 = EmitScalarExpr(E->getArg(2));
17662    Value *Op3 = EmitScalarExpr(E->getArg(3));
17663    return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::ppc_minfe),
17664                              {Op0, Op1, Op2, Op3});
17665  }
17666  case PPC::BI__builtin_ppc_minfl: {
17667    Value *Op0 = EmitScalarExpr(E->getArg(0));
17668    Value *Op1 = EmitScalarExpr(E->getArg(1));
17669    Value *Op2 = EmitScalarExpr(E->getArg(2));
17670    Value *Op3 = EmitScalarExpr(E->getArg(3));
17671    return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::ppc_minfl),
17672                              {Op0, Op1, Op2, Op3});
17673  }
17674  case PPC::BI__builtin_ppc_minfs: {
17675    Value *Op0 = EmitScalarExpr(E->getArg(0));
17676    Value *Op1 = EmitScalarExpr(E->getArg(1));
17677    Value *Op2 = EmitScalarExpr(E->getArg(2));
17678    Value *Op3 = EmitScalarExpr(E->getArg(3));
17679    return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::ppc_minfs),
17680                              {Op0, Op1, Op2, Op3});
17681  }
17682  case PPC::BI__builtin_ppc_swdiv:
17683  case PPC::BI__builtin_ppc_swdivs: {
17684    Value *Op0 = EmitScalarExpr(E->getArg(0));
17685    Value *Op1 = EmitScalarExpr(E->getArg(1));
17686    return Builder.CreateFDiv(Op0, Op1, "swdiv");
17687  }
17688  case PPC::BI__builtin_ppc_set_fpscr_rn:
17689    return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::ppc_setrnd),
17690                              {EmitScalarExpr(E->getArg(0))});
17691  case PPC::BI__builtin_ppc_mffs:
17692    return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::ppc_readflm));
17693  }
17694}
17695
17696namespace {
// If \p E is not a null pointer, insert an address space cast to match the
// return type of \p E if necessary.
17699Value *EmitAMDGPUDispatchPtr(CodeGenFunction &CGF,
17700                             const CallExpr *E = nullptr) {
17701  auto *F = CGF.CGM.getIntrinsic(Intrinsic::amdgcn_dispatch_ptr);
17702  auto *Call = CGF.Builder.CreateCall(F);
17703  Call->addRetAttr(
17704      Attribute::getWithDereferenceableBytes(Call->getContext(), 64));
17705  Call->addRetAttr(Attribute::getWithAlignment(Call->getContext(), Align(4)));
17706  if (!E)
17707    return Call;
17708  QualType BuiltinRetType = E->getType();
17709  auto *RetTy = cast<llvm::PointerType>(CGF.ConvertType(BuiltinRetType));
17710  if (RetTy == Call->getType())
17711    return Call;
17712  return CGF.Builder.CreateAddrSpaceCast(Call, RetTy);
17713}
17714
17715Value *EmitAMDGPUImplicitArgPtr(CodeGenFunction &CGF) {
17716  auto *F = CGF.CGM.getIntrinsic(Intrinsic::amdgcn_implicitarg_ptr);
17717  auto *Call = CGF.Builder.CreateCall(F);
17718  Call->addRetAttr(
17719      Attribute::getWithDereferenceableBytes(Call->getContext(), 256));
17720  Call->addRetAttr(Attribute::getWithAlignment(Call->getContext(), Align(8)));
17721  return Call;
17722}
17723
// \p Index is 0, 1, and 2 for the x, y, and z dimensions, respectively.
17725/// Emit code based on Code Object ABI version.
17726/// COV_4    : Emit code to use dispatch ptr
17727/// COV_5    : Emit code to use implicitarg ptr
17728/// COV_NONE : Emit code to load a global variable "__oclc_ABI_version"
17729///            and use its value for COV_4 or COV_5 approach. It is used for
17730///            compiling device libraries in an ABI-agnostic way.
17731///
/// Note: "__oclc_ABI_version" is supposed to be emitted and initialized by
17733///       clang during compilation of user code.
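///
/// A rough sketch of what this helper emits (offsets as understood here;
/// treat as illustrative):
///   COV_5:    load i16 from (implicitarg_ptr + 12 + Index * 2)
///   COV_4:    load i16 from (dispatch_ptr    +  4 + Index * 2)
///   COV_NONE: select between the two addresses based on __oclc_ABI_version,
///             then load i16.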
17734Value *EmitAMDGPUWorkGroupSize(CodeGenFunction &CGF, unsigned Index) {
17735  llvm::LoadInst *LD;
17736
17737  auto Cov = CGF.getTarget().getTargetOpts().CodeObjectVersion;
17738
17739  if (Cov == CodeObjectVersionKind::COV_None) {
17740    StringRef Name = "__oclc_ABI_version";
17741    auto *ABIVersionC = CGF.CGM.getModule().getNamedGlobal(Name);
17742    if (!ABIVersionC)
17743      ABIVersionC = new llvm::GlobalVariable(
17744          CGF.CGM.getModule(), CGF.Int32Ty, false,
17745          llvm::GlobalValue::ExternalLinkage, nullptr, Name, nullptr,
17746          llvm::GlobalVariable::NotThreadLocal,
17747          CGF.CGM.getContext().getTargetAddressSpace(LangAS::opencl_constant));
17748
17749    // This load will be eliminated by the IPSCCP because it is constant
17750    // weak_odr without externally_initialized. Either changing it to weak or
17751    // adding externally_initialized will keep the load.
17752    Value *ABIVersion = CGF.Builder.CreateAlignedLoad(CGF.Int32Ty, ABIVersionC,
17753                                                      CGF.CGM.getIntAlign());
17754
17755    Value *IsCOV5 = CGF.Builder.CreateICmpSGE(
17756        ABIVersion,
17757        llvm::ConstantInt::get(CGF.Int32Ty, CodeObjectVersionKind::COV_5));
17758
17759    // Indexing the implicit kernarg segment.
17760    Value *ImplicitGEP = CGF.Builder.CreateConstGEP1_32(
17761        CGF.Int8Ty, EmitAMDGPUImplicitArgPtr(CGF), 12 + Index * 2);
17762
17763    // Indexing the HSA kernel_dispatch_packet struct.
17764    Value *DispatchGEP = CGF.Builder.CreateConstGEP1_32(
17765        CGF.Int8Ty, EmitAMDGPUDispatchPtr(CGF), 4 + Index * 2);
17766
17767    auto Result = CGF.Builder.CreateSelect(IsCOV5, ImplicitGEP, DispatchGEP);
17768    LD = CGF.Builder.CreateLoad(
17769        Address(Result, CGF.Int16Ty, CharUnits::fromQuantity(2)));
17770  } else {
17771    Value *GEP = nullptr;
17772    if (Cov == CodeObjectVersionKind::COV_5) {
17773      // Indexing the implicit kernarg segment.
17774      GEP = CGF.Builder.CreateConstGEP1_32(
17775          CGF.Int8Ty, EmitAMDGPUImplicitArgPtr(CGF), 12 + Index * 2);
17776    } else {
17777      // Indexing the HSA kernel_dispatch_packet struct.
17778      GEP = CGF.Builder.CreateConstGEP1_32(
17779          CGF.Int8Ty, EmitAMDGPUDispatchPtr(CGF), 4 + Index * 2);
17780    }
17781    LD = CGF.Builder.CreateLoad(
17782        Address(GEP, CGF.Int16Ty, CharUnits::fromQuantity(2)));
17783  }
17784
17785  llvm::MDBuilder MDHelper(CGF.getLLVMContext());
17786  llvm::MDNode *RNode = MDHelper.createRange(APInt(16, 1),
17787      APInt(16, CGF.getTarget().getMaxOpenCLWorkGroupSize() + 1));
17788  LD->setMetadata(llvm::LLVMContext::MD_range, RNode);
17789  LD->setMetadata(llvm::LLVMContext::MD_noundef,
17790                  llvm::MDNode::get(CGF.getLLVMContext(), std::nullopt));
17791  LD->setMetadata(llvm::LLVMContext::MD_invariant_load,
17792                  llvm::MDNode::get(CGF.getLLVMContext(), std::nullopt));
17793  return LD;
17794}
17795
// \p Index is 0, 1, and 2 for the x, y, and z dimensions, respectively.
17797Value *EmitAMDGPUGridSize(CodeGenFunction &CGF, unsigned Index) {
17798  const unsigned XOffset = 12;
17799  auto *DP = EmitAMDGPUDispatchPtr(CGF);
17800  // Indexing the HSA kernel_dispatch_packet struct.
17801  auto *Offset = llvm::ConstantInt::get(CGF.Int32Ty, XOffset + Index * 4);
17802  auto *GEP = CGF.Builder.CreateGEP(CGF.Int8Ty, DP, Offset);
17803  auto *LD = CGF.Builder.CreateLoad(
17804      Address(GEP, CGF.Int32Ty, CharUnits::fromQuantity(4)));
17805  LD->setMetadata(llvm::LLVMContext::MD_invariant_load,
17806                  llvm::MDNode::get(CGF.getLLVMContext(), std::nullopt));
17807  return LD;
17808}
17809} // namespace
17810
17811// For processing memory ordering and memory scope arguments of various
17812// amdgcn builtins.
// \p Order takes a C++11 compatible memory-ordering specifier and converts
// it into LLVM's memory ordering specifier using the atomic C ABI, and writes
// it to \p AO. \p Scope takes a const char * and converts it into an
// AMDGCN-specific SyncScopeID and writes it to \p SSID.
17817void CodeGenFunction::ProcessOrderScopeAMDGCN(Value *Order, Value *Scope,
17818                                              llvm::AtomicOrdering &AO,
17819                                              llvm::SyncScope::ID &SSID) {
17820  int ord = cast<llvm::ConstantInt>(Order)->getZExtValue();
17821
17822  // Map C11/C++11 memory ordering to LLVM memory ordering
17823  assert(llvm::isValidAtomicOrderingCABI(ord));
17824  switch (static_cast<llvm::AtomicOrderingCABI>(ord)) {
17825  case llvm::AtomicOrderingCABI::acquire:
17826  case llvm::AtomicOrderingCABI::consume:
17827    AO = llvm::AtomicOrdering::Acquire;
17828    break;
17829  case llvm::AtomicOrderingCABI::release:
17830    AO = llvm::AtomicOrdering::Release;
17831    break;
17832  case llvm::AtomicOrderingCABI::acq_rel:
17833    AO = llvm::AtomicOrdering::AcquireRelease;
17834    break;
17835  case llvm::AtomicOrderingCABI::seq_cst:
17836    AO = llvm::AtomicOrdering::SequentiallyConsistent;
17837    break;
17838  case llvm::AtomicOrderingCABI::relaxed:
17839    AO = llvm::AtomicOrdering::Monotonic;
17840    break;
17841  }
17842
17843  StringRef scp;
17844  llvm::getConstantStringInfo(Scope, scp);
17845  SSID = getLLVMContext().getOrInsertSyncScopeID(scp);
17846}
17847
17848llvm::Value *CodeGenFunction::EmitScalarOrConstFoldImmArg(unsigned ICEArguments,
17849                                                          unsigned Idx,
17850                                                          const CallExpr *E) {
17851  llvm::Value *Arg = nullptr;
17852  if ((ICEArguments & (1 << Idx)) == 0) {
17853    Arg = EmitScalarExpr(E->getArg(Idx));
17854  } else {
17855    // If this is required to be a constant, constant fold it so that we
17856    // know that the generated intrinsic gets a ConstantInt.
17857    std::optional<llvm::APSInt> Result =
17858        E->getArg(Idx)->getIntegerConstantExpr(getContext());
17859    assert(Result && "Expected argument to be a constant");
17860    Arg = llvm::ConstantInt::get(getLLVMContext(), *Result);
17861  }
17862  return Arg;
17863}
17864
17865Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
17866                                              const CallExpr *E) {
17867  llvm::AtomicOrdering AO = llvm::AtomicOrdering::SequentiallyConsistent;
17868  llvm::SyncScope::ID SSID;
17869  switch (BuiltinID) {
17870  case AMDGPU::BI__builtin_amdgcn_div_scale:
17871  case AMDGPU::BI__builtin_amdgcn_div_scalef: {
    // Translate from the intrinsic's struct return to the builtin's out
17873    // argument.
17874
17875    Address FlagOutPtr = EmitPointerWithAlignment(E->getArg(3));
17876
17877    llvm::Value *X = EmitScalarExpr(E->getArg(0));
17878    llvm::Value *Y = EmitScalarExpr(E->getArg(1));
17879    llvm::Value *Z = EmitScalarExpr(E->getArg(2));
17880
17881    llvm::Function *Callee = CGM.getIntrinsic(Intrinsic::amdgcn_div_scale,
17882                                           X->getType());
17883
17884    llvm::Value *Tmp = Builder.CreateCall(Callee, {X, Y, Z});
17885
17886    llvm::Value *Result = Builder.CreateExtractValue(Tmp, 0);
17887    llvm::Value *Flag = Builder.CreateExtractValue(Tmp, 1);
17888
17889    llvm::Type *RealFlagType = FlagOutPtr.getElementType();
17890
17891    llvm::Value *FlagExt = Builder.CreateZExt(Flag, RealFlagType);
17892    Builder.CreateStore(FlagExt, FlagOutPtr);
17893    return Result;
17894  }
17895  case AMDGPU::BI__builtin_amdgcn_div_fmas:
17896  case AMDGPU::BI__builtin_amdgcn_div_fmasf: {
17897    llvm::Value *Src0 = EmitScalarExpr(E->getArg(0));
17898    llvm::Value *Src1 = EmitScalarExpr(E->getArg(1));
17899    llvm::Value *Src2 = EmitScalarExpr(E->getArg(2));
17900    llvm::Value *Src3 = EmitScalarExpr(E->getArg(3));
17901
17902    llvm::Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_div_fmas,
17903                                      Src0->getType());
17904    llvm::Value *Src3ToBool = Builder.CreateIsNotNull(Src3);
17905    return Builder.CreateCall(F, {Src0, Src1, Src2, Src3ToBool});
17906  }
17907
17908  case AMDGPU::BI__builtin_amdgcn_ds_swizzle:
17909    return emitBinaryBuiltin(*this, E, Intrinsic::amdgcn_ds_swizzle);
17910  case AMDGPU::BI__builtin_amdgcn_mov_dpp8:
17911    return emitBinaryBuiltin(*this, E, Intrinsic::amdgcn_mov_dpp8);
17912  case AMDGPU::BI__builtin_amdgcn_mov_dpp:
17913  case AMDGPU::BI__builtin_amdgcn_update_dpp: {
17914    llvm::SmallVector<llvm::Value *, 6> Args;
17915    // Find out if any arguments are required to be integer constant
17916    // expressions.
17917    unsigned ICEArguments = 0;
17918    ASTContext::GetBuiltinTypeError Error;
17919    getContext().GetBuiltinType(BuiltinID, Error, &ICEArguments);
17920    assert(Error == ASTContext::GE_None && "Should not codegen an error");
17921    for (unsigned I = 0; I != E->getNumArgs(); ++I) {
17922      Args.push_back(EmitScalarOrConstFoldImmArg(ICEArguments, I, E));
17923    }
17924    assert(Args.size() == 5 || Args.size() == 6);
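    // __builtin_amdgcn_mov_dpp has no 'old' operand, so prepend a poison value
    // to reuse the llvm.amdgcn.update.dpp intrinsic for both builtins.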
17925    if (Args.size() == 5)
17926      Args.insert(Args.begin(), llvm::PoisonValue::get(Args[0]->getType()));
17927    Function *F =
17928        CGM.getIntrinsic(Intrinsic::amdgcn_update_dpp, Args[0]->getType());
17929    return Builder.CreateCall(F, Args);
17930  }
17931  case AMDGPU::BI__builtin_amdgcn_div_fixup:
17932  case AMDGPU::BI__builtin_amdgcn_div_fixupf:
17933  case AMDGPU::BI__builtin_amdgcn_div_fixuph:
17934    return emitTernaryBuiltin(*this, E, Intrinsic::amdgcn_div_fixup);
17935  case AMDGPU::BI__builtin_amdgcn_trig_preop:
17936  case AMDGPU::BI__builtin_amdgcn_trig_preopf:
17937    return emitFPIntBuiltin(*this, E, Intrinsic::amdgcn_trig_preop);
17938  case AMDGPU::BI__builtin_amdgcn_rcp:
17939  case AMDGPU::BI__builtin_amdgcn_rcpf:
17940  case AMDGPU::BI__builtin_amdgcn_rcph:
17941    return emitUnaryBuiltin(*this, E, Intrinsic::amdgcn_rcp);
17942  case AMDGPU::BI__builtin_amdgcn_sqrt:
17943  case AMDGPU::BI__builtin_amdgcn_sqrtf:
17944  case AMDGPU::BI__builtin_amdgcn_sqrth:
17945    return emitUnaryBuiltin(*this, E, Intrinsic::amdgcn_sqrt);
17946  case AMDGPU::BI__builtin_amdgcn_rsq:
17947  case AMDGPU::BI__builtin_amdgcn_rsqf:
17948  case AMDGPU::BI__builtin_amdgcn_rsqh:
17949    return emitUnaryBuiltin(*this, E, Intrinsic::amdgcn_rsq);
17950  case AMDGPU::BI__builtin_amdgcn_rsq_clamp:
17951  case AMDGPU::BI__builtin_amdgcn_rsq_clampf:
17952    return emitUnaryBuiltin(*this, E, Intrinsic::amdgcn_rsq_clamp);
17953  case AMDGPU::BI__builtin_amdgcn_sinf:
17954  case AMDGPU::BI__builtin_amdgcn_sinh:
17955    return emitUnaryBuiltin(*this, E, Intrinsic::amdgcn_sin);
17956  case AMDGPU::BI__builtin_amdgcn_cosf:
17957  case AMDGPU::BI__builtin_amdgcn_cosh:
17958    return emitUnaryBuiltin(*this, E, Intrinsic::amdgcn_cos);
17959  case AMDGPU::BI__builtin_amdgcn_dispatch_ptr:
17960    return EmitAMDGPUDispatchPtr(*this, E);
17961  case AMDGPU::BI__builtin_amdgcn_logf:
17962    return emitUnaryBuiltin(*this, E, Intrinsic::amdgcn_log);
17963  case AMDGPU::BI__builtin_amdgcn_exp2f:
17964    return emitUnaryBuiltin(*this, E, Intrinsic::amdgcn_exp2);
17965  case AMDGPU::BI__builtin_amdgcn_log_clampf:
17966    return emitUnaryBuiltin(*this, E, Intrinsic::amdgcn_log_clamp);
17967  case AMDGPU::BI__builtin_amdgcn_ldexp:
17968  case AMDGPU::BI__builtin_amdgcn_ldexpf: {
17969    llvm::Value *Src0 = EmitScalarExpr(E->getArg(0));
17970    llvm::Value *Src1 = EmitScalarExpr(E->getArg(1));
17971    llvm::Function *F =
17972        CGM.getIntrinsic(Intrinsic::ldexp, {Src0->getType(), Src1->getType()});
17973    return Builder.CreateCall(F, {Src0, Src1});
17974  }
17975  case AMDGPU::BI__builtin_amdgcn_ldexph: {
    // The raw instruction has different behavior for out-of-bounds exponent
    // values (implicit truncation instead of saturating to short_min/short_max).
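    // Truncate the builtin's i32 exponent to i16 so the call matches the
    // llvm.ldexp.f16.i16 signature requested below.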
17978    llvm::Value *Src0 = EmitScalarExpr(E->getArg(0));
17979    llvm::Value *Src1 = EmitScalarExpr(E->getArg(1));
17980    llvm::Function *F =
17981        CGM.getIntrinsic(Intrinsic::ldexp, {Src0->getType(), Int16Ty});
17982    return Builder.CreateCall(F, {Src0, Builder.CreateTrunc(Src1, Int16Ty)});
17983  }
17984  case AMDGPU::BI__builtin_amdgcn_frexp_mant:
17985  case AMDGPU::BI__builtin_amdgcn_frexp_mantf:
17986  case AMDGPU::BI__builtin_amdgcn_frexp_manth:
17987    return emitUnaryBuiltin(*this, E, Intrinsic::amdgcn_frexp_mant);
17988  case AMDGPU::BI__builtin_amdgcn_frexp_exp:
17989  case AMDGPU::BI__builtin_amdgcn_frexp_expf: {
17990    Value *Src0 = EmitScalarExpr(E->getArg(0));
17991    Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_frexp_exp,
17992                                { Builder.getInt32Ty(), Src0->getType() });
17993    return Builder.CreateCall(F, Src0);
17994  }
17995  case AMDGPU::BI__builtin_amdgcn_frexp_exph: {
17996    Value *Src0 = EmitScalarExpr(E->getArg(0));
17997    Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_frexp_exp,
17998                                { Builder.getInt16Ty(), Src0->getType() });
17999    return Builder.CreateCall(F, Src0);
18000  }
18001  case AMDGPU::BI__builtin_amdgcn_fract:
18002  case AMDGPU::BI__builtin_amdgcn_fractf:
18003  case AMDGPU::BI__builtin_amdgcn_fracth:
18004    return emitUnaryBuiltin(*this, E, Intrinsic::amdgcn_fract);
18005  case AMDGPU::BI__builtin_amdgcn_lerp:
18006    return emitTernaryBuiltin(*this, E, Intrinsic::amdgcn_lerp);
18007  case AMDGPU::BI__builtin_amdgcn_ubfe:
18008    return emitTernaryBuiltin(*this, E, Intrinsic::amdgcn_ubfe);
18009  case AMDGPU::BI__builtin_amdgcn_sbfe:
18010    return emitTernaryBuiltin(*this, E, Intrinsic::amdgcn_sbfe);
18011  case AMDGPU::BI__builtin_amdgcn_ballot_w32:
18012  case AMDGPU::BI__builtin_amdgcn_ballot_w64: {
18013    llvm::Type *ResultType = ConvertType(E->getType());
18014    llvm::Value *Src = EmitScalarExpr(E->getArg(0));
18015    Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_ballot, { ResultType });
18016    return Builder.CreateCall(F, { Src });
18017  }
18018  case AMDGPU::BI__builtin_amdgcn_uicmp:
18019  case AMDGPU::BI__builtin_amdgcn_uicmpl:
18020  case AMDGPU::BI__builtin_amdgcn_sicmp:
18021  case AMDGPU::BI__builtin_amdgcn_sicmpl: {
18022    llvm::Value *Src0 = EmitScalarExpr(E->getArg(0));
18023    llvm::Value *Src1 = EmitScalarExpr(E->getArg(1));
18024    llvm::Value *Src2 = EmitScalarExpr(E->getArg(2));
18025
18026    // FIXME-GFX10: How should 32 bit mask be handled?
18027    Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_icmp,
18028      { Builder.getInt64Ty(), Src0->getType() });
18029    return Builder.CreateCall(F, { Src0, Src1, Src2 });
18030  }
18031  case AMDGPU::BI__builtin_amdgcn_fcmp:
18032  case AMDGPU::BI__builtin_amdgcn_fcmpf: {
18033    llvm::Value *Src0 = EmitScalarExpr(E->getArg(0));
18034    llvm::Value *Src1 = EmitScalarExpr(E->getArg(1));
18035    llvm::Value *Src2 = EmitScalarExpr(E->getArg(2));
18036
18037    // FIXME-GFX10: How should 32 bit mask be handled?
18038    Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_fcmp,
18039      { Builder.getInt64Ty(), Src0->getType() });
18040    return Builder.CreateCall(F, { Src0, Src1, Src2 });
18041  }
18042  case AMDGPU::BI__builtin_amdgcn_class:
18043  case AMDGPU::BI__builtin_amdgcn_classf:
18044  case AMDGPU::BI__builtin_amdgcn_classh:
18045    return emitFPIntBuiltin(*this, E, Intrinsic::amdgcn_class);
18046  case AMDGPU::BI__builtin_amdgcn_fmed3f:
18047  case AMDGPU::BI__builtin_amdgcn_fmed3h:
18048    return emitTernaryBuiltin(*this, E, Intrinsic::amdgcn_fmed3);
18049  case AMDGPU::BI__builtin_amdgcn_ds_append:
18050  case AMDGPU::BI__builtin_amdgcn_ds_consume: {
18051    Intrinsic::ID Intrin = BuiltinID == AMDGPU::BI__builtin_amdgcn_ds_append ?
18052      Intrinsic::amdgcn_ds_append : Intrinsic::amdgcn_ds_consume;
18053    Value *Src0 = EmitScalarExpr(E->getArg(0));
18054    Function *F = CGM.getIntrinsic(Intrin, { Src0->getType() });
18055    return Builder.CreateCall(F, { Src0, Builder.getFalse() });
18056  }
18057  case AMDGPU::BI__builtin_amdgcn_ds_faddf:
18058  case AMDGPU::BI__builtin_amdgcn_ds_fminf:
18059  case AMDGPU::BI__builtin_amdgcn_ds_fmaxf: {
18060    Intrinsic::ID Intrin;
18061    switch (BuiltinID) {
18062    case AMDGPU::BI__builtin_amdgcn_ds_faddf:
18063      Intrin = Intrinsic::amdgcn_ds_fadd;
18064      break;
18065    case AMDGPU::BI__builtin_amdgcn_ds_fminf:
18066      Intrin = Intrinsic::amdgcn_ds_fmin;
18067      break;
18068    case AMDGPU::BI__builtin_amdgcn_ds_fmaxf:
18069      Intrin = Intrinsic::amdgcn_ds_fmax;
18070      break;
18071    }
18072    llvm::Value *Src0 = EmitScalarExpr(E->getArg(0));
18073    llvm::Value *Src1 = EmitScalarExpr(E->getArg(1));
18074    llvm::Value *Src2 = EmitScalarExpr(E->getArg(2));
18075    llvm::Value *Src3 = EmitScalarExpr(E->getArg(3));
18076    llvm::Value *Src4 = EmitScalarExpr(E->getArg(4));
18077    llvm::Function *F = CGM.getIntrinsic(Intrin, { Src1->getType() });
18078    llvm::FunctionType *FTy = F->getFunctionType();
18079    llvm::Type *PTy = FTy->getParamType(0);
18080    Src0 = Builder.CreatePointerBitCastOrAddrSpaceCast(Src0, PTy);
18081    return Builder.CreateCall(F, { Src0, Src1, Src2, Src3, Src4 });
18082  }
18083  case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64:
18084  case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
18085  case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2f16:
18086  case AMDGPU::BI__builtin_amdgcn_global_atomic_fmin_f64:
18087  case AMDGPU::BI__builtin_amdgcn_global_atomic_fmax_f64:
18088  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f64:
18089  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmin_f64:
18090  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmax_f64:
18091  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f32:
18092  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2f16: {
18093    Intrinsic::ID IID;
18094    llvm::Type *ArgTy = llvm::Type::getDoubleTy(getLLVMContext());
18095    switch (BuiltinID) {
18096    case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f32:
18097      ArgTy = llvm::Type::getFloatTy(getLLVMContext());
18098      IID = Intrinsic::amdgcn_global_atomic_fadd;
18099      break;
18100    case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2f16:
18101      ArgTy = llvm::FixedVectorType::get(
18102          llvm::Type::getHalfTy(getLLVMContext()), 2);
18103      IID = Intrinsic::amdgcn_global_atomic_fadd;
18104      break;
18105    case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_f64:
18106      IID = Intrinsic::amdgcn_global_atomic_fadd;
18107      break;
18108    case AMDGPU::BI__builtin_amdgcn_global_atomic_fmin_f64:
18109      IID = Intrinsic::amdgcn_global_atomic_fmin;
18110      break;
18111    case AMDGPU::BI__builtin_amdgcn_global_atomic_fmax_f64:
18112      IID = Intrinsic::amdgcn_global_atomic_fmax;
18113      break;
18114    case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f64:
18115      IID = Intrinsic::amdgcn_flat_atomic_fadd;
18116      break;
18117    case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmin_f64:
18118      IID = Intrinsic::amdgcn_flat_atomic_fmin;
18119      break;
18120    case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmax_f64:
18121      IID = Intrinsic::amdgcn_flat_atomic_fmax;
18122      break;
18123    case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f32:
18124      ArgTy = llvm::Type::getFloatTy(getLLVMContext());
18125      IID = Intrinsic::amdgcn_flat_atomic_fadd;
18126      break;
18127    case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2f16:
18128      ArgTy = llvm::FixedVectorType::get(
18129          llvm::Type::getHalfTy(getLLVMContext()), 2);
18130      IID = Intrinsic::amdgcn_flat_atomic_fadd;
18131      break;
18132    }
18133    llvm::Value *Addr = EmitScalarExpr(E->getArg(0));
18134    llvm::Value *Val = EmitScalarExpr(E->getArg(1));
18135    llvm::Function *F =
18136        CGM.getIntrinsic(IID, {ArgTy, Addr->getType(), Val->getType()});
18137    return Builder.CreateCall(F, {Addr, Val});
18138  }
18139  case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2bf16:
18140  case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2bf16: {
18141    Intrinsic::ID IID;
18142    switch (BuiltinID) {
18143    case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2bf16:
18144      IID = Intrinsic::amdgcn_global_atomic_fadd_v2bf16;
18145      break;
18146    case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2bf16:
18147      IID = Intrinsic::amdgcn_flat_atomic_fadd_v2bf16;
18148      break;
18149    }
18150    llvm::Value *Addr = EmitScalarExpr(E->getArg(0));
18151    llvm::Value *Val = EmitScalarExpr(E->getArg(1));
18152    llvm::Function *F = CGM.getIntrinsic(IID, {Addr->getType()});
18153    return Builder.CreateCall(F, {Addr, Val});
18154  }
18155  case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f64:
18156  case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f32:
18157  case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2f16: {
18158    Intrinsic::ID IID;
18159    llvm::Type *ArgTy;
18160    switch (BuiltinID) {
18161    case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f32:
18162      ArgTy = llvm::Type::getFloatTy(getLLVMContext());
18163      IID = Intrinsic::amdgcn_ds_fadd;
18164      break;
18165    case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f64:
18166      ArgTy = llvm::Type::getDoubleTy(getLLVMContext());
18167      IID = Intrinsic::amdgcn_ds_fadd;
18168      break;
18169    case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2f16:
18170      ArgTy = llvm::FixedVectorType::get(
18171          llvm::Type::getHalfTy(getLLVMContext()), 2);
18172      IID = Intrinsic::amdgcn_ds_fadd;
18173      break;
18174    }
18175    llvm::Value *Addr = EmitScalarExpr(E->getArg(0));
18176    llvm::Value *Val = EmitScalarExpr(E->getArg(1));
18177    llvm::Constant *ZeroI32 = llvm::ConstantInt::getIntegerValue(
18178        llvm::Type::getInt32Ty(getLLVMContext()), APInt(32, 0, true));
18179    llvm::Constant *ZeroI1 = llvm::ConstantInt::getIntegerValue(
18180        llvm::Type::getInt1Ty(getLLVMContext()), APInt(1, 0));
18181    llvm::Function *F = CGM.getIntrinsic(IID, {ArgTy});
18182    return Builder.CreateCall(F, {Addr, Val, ZeroI32, ZeroI32, ZeroI1});
18183  }
18184  case AMDGPU::BI__builtin_amdgcn_global_load_tr_i32:
18185  case AMDGPU::BI__builtin_amdgcn_global_load_tr_v2i32:
18186  case AMDGPU::BI__builtin_amdgcn_global_load_tr_v4f16:
18187  case AMDGPU::BI__builtin_amdgcn_global_load_tr_v4i16:
18188  case AMDGPU::BI__builtin_amdgcn_global_load_tr_v8f16:
18189  case AMDGPU::BI__builtin_amdgcn_global_load_tr_v8i16: {
18190
18191    llvm::Type *ArgTy;
18192    switch (BuiltinID) {
18193    case AMDGPU::BI__builtin_amdgcn_global_load_tr_i32:
18194      ArgTy = llvm::Type::getInt32Ty(getLLVMContext());
18195      break;
18196    case AMDGPU::BI__builtin_amdgcn_global_load_tr_v2i32:
18197      ArgTy = llvm::FixedVectorType::get(
18198          llvm::Type::getInt32Ty(getLLVMContext()), 2);
18199      break;
18200    case AMDGPU::BI__builtin_amdgcn_global_load_tr_v4f16:
18201      ArgTy = llvm::FixedVectorType::get(
18202          llvm::Type::getHalfTy(getLLVMContext()), 4);
18203      break;
18204    case AMDGPU::BI__builtin_amdgcn_global_load_tr_v4i16:
18205      ArgTy = llvm::FixedVectorType::get(
18206          llvm::Type::getInt16Ty(getLLVMContext()), 4);
18207      break;
18208    case AMDGPU::BI__builtin_amdgcn_global_load_tr_v8f16:
18209      ArgTy = llvm::FixedVectorType::get(
18210          llvm::Type::getHalfTy(getLLVMContext()), 8);
18211      break;
18212    case AMDGPU::BI__builtin_amdgcn_global_load_tr_v8i16:
18213      ArgTy = llvm::FixedVectorType::get(
18214          llvm::Type::getInt16Ty(getLLVMContext()), 8);
18215      break;
18216    }
18217
18218    llvm::Value *Addr = EmitScalarExpr(E->getArg(0));
18219    llvm::Function *F =
18220        CGM.getIntrinsic(Intrinsic::amdgcn_global_load_tr, {ArgTy});
18221    return Builder.CreateCall(F, {Addr});
18222  }
18223  case AMDGPU::BI__builtin_amdgcn_read_exec:
18224    return EmitAMDGCNBallotForExec(*this, E, Int64Ty, Int64Ty, false);
18225  case AMDGPU::BI__builtin_amdgcn_read_exec_lo:
18226    return EmitAMDGCNBallotForExec(*this, E, Int32Ty, Int32Ty, false);
18227  case AMDGPU::BI__builtin_amdgcn_read_exec_hi:
18228    return EmitAMDGCNBallotForExec(*this, E, Int64Ty, Int64Ty, true);
18229  case AMDGPU::BI__builtin_amdgcn_image_bvh_intersect_ray:
18230  case AMDGPU::BI__builtin_amdgcn_image_bvh_intersect_ray_h:
18231  case AMDGPU::BI__builtin_amdgcn_image_bvh_intersect_ray_l:
18232  case AMDGPU::BI__builtin_amdgcn_image_bvh_intersect_ray_lh: {
18233    llvm::Value *NodePtr = EmitScalarExpr(E->getArg(0));
18234    llvm::Value *RayExtent = EmitScalarExpr(E->getArg(1));
18235    llvm::Value *RayOrigin = EmitScalarExpr(E->getArg(2));
18236    llvm::Value *RayDir = EmitScalarExpr(E->getArg(3));
18237    llvm::Value *RayInverseDir = EmitScalarExpr(E->getArg(4));
18238    llvm::Value *TextureDescr = EmitScalarExpr(E->getArg(5));
18239
18240    // The builtins take these arguments as vec4 where the last element is
18241    // ignored. The intrinsic takes them as vec3.
18242    RayOrigin = Builder.CreateShuffleVector(RayOrigin, RayOrigin,
18243                                            ArrayRef<int>{0, 1, 2});
18244    RayDir =
18245        Builder.CreateShuffleVector(RayDir, RayDir, ArrayRef<int>{0, 1, 2});
18246    RayInverseDir = Builder.CreateShuffleVector(RayInverseDir, RayInverseDir,
18247                                                ArrayRef<int>{0, 1, 2});
18248
18249    Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_image_bvh_intersect_ray,
18250                                   {NodePtr->getType(), RayDir->getType()});
18251    return Builder.CreateCall(F, {NodePtr, RayExtent, RayOrigin, RayDir,
18252                                  RayInverseDir, TextureDescr});
18253  }
18254
18255  case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_rtn: {
18256    SmallVector<Value *, 4> Args;
18257    for (int i = 0, e = E->getNumArgs(); i != e; ++i)
18258      Args.push_back(EmitScalarExpr(E->getArg(i)));
18259
18260    Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_ds_bvh_stack_rtn);
18261    Value *Call = Builder.CreateCall(F, Args);
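    // The intrinsic returns its two results as a struct; repack them into the
    // two-element vector type of the builtin's result.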
18262    Value *Rtn = Builder.CreateExtractValue(Call, 0);
18263    Value *A = Builder.CreateExtractValue(Call, 1);
18264    llvm::Type *RetTy = ConvertType(E->getType());
18265    Value *I0 = Builder.CreateInsertElement(PoisonValue::get(RetTy), Rtn,
18266                                            (uint64_t)0);
18267    return Builder.CreateInsertElement(I0, A, 1);
18268  }
18269
18270  case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32:
18271  case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w32:
18272  case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64:
18273  case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w64:
18274  case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w32:
18275  case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w32:
18276  case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w64:
18277  case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w64:
18278  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32:
18279  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64:
18280  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w32:
18281  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w64:
18282  case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32:
18283  case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64:
18284  case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32:
18285  case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64:
18286  case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32_gfx12:
18287  case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64_gfx12:
18288  case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w32_gfx12:
18289  case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w64_gfx12:
18290  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32_gfx12:
18291  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64_gfx12:
18292  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w32_gfx12:
18293  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w64_gfx12:
18294  case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32_gfx12:
18295  case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64_gfx12:
18296  case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12:
18297  case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64_gfx12:
18298  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32_gfx12:
18299  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w64_gfx12:
18300  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32_gfx12:
18301  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w64_gfx12:
18302  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32_gfx12:
18303  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w64_gfx12:
18304  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32_gfx12:
18305  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w64_gfx12:
18306  case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x32_iu4_w32_gfx12:
18307  case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x32_iu4_w64_gfx12:
18308  case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_f16_w32:
18309  case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_f16_w64:
18310  case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf16_w32:
18311  case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf16_w64:
18312  case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x32_f16_w32:
18313  case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x32_f16_w64:
18314  case AMDGPU::BI__builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w32:
18315  case AMDGPU::BI__builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w64:
18316  case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu8_w32:
18317  case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu8_w64:
18318  case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu4_w32:
18319  case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu4_w64:
18320  case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x64_iu4_w32:
18321  case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x64_iu4_w64:
18322  case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w32:
18323  case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w64:
18324  case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w32:
18325  case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w64:
18326  case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w32:
18327  case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w64:
18328  case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w32:
18329  case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w64: {
18330
    // These operations perform a matrix multiplication and accumulation of
    // the form:
    //             D = A * B + C
    // We need to specify one type for the A/B matrices and one for the C/D
    // matrices. Sparse matrix operations can have different types for A and B,
    // as well as an additional type for the sparsity index.
    // The destination type must be listed before the types used for the
    // source operands.
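    // For example, wmma_f32_16x16x16_f16 takes (A, B, C) and is overloaded on
    // the C/D type (argument 2) and the A/B type (argument 0), so it records
    // {2, 0}.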
18338    SmallVector<unsigned, 2> ArgsForMatchingMatrixTypes;
    // On GFX12, the intrinsics with a 16-bit accumulator use a packed layout.
    // There is no need for the variable opsel argument, so always set it to
    // "false".
18342    bool AppendFalseForOpselArg = false;
18343    unsigned BuiltinWMMAOp;
18344
18345    switch (BuiltinID) {
18346    case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w32:
18347    case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w64:
18348    case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w32_gfx12:
18349    case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w64_gfx12:
18350      ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
18351      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_f16;
18352      break;
18353    case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32:
18354    case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64:
18355    case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32_gfx12:
18356    case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64_gfx12:
18357      ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
18358      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_bf16;
18359      break;
18360    case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w32_gfx12:
18361    case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w64_gfx12:
18362      AppendFalseForOpselArg = true;
18363      LLVM_FALLTHROUGH;
18364    case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w32:
18365    case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w64:
18366      ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
18367      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x16_f16;
18368      break;
18369    case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32_gfx12:
18370    case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64_gfx12:
18371      AppendFalseForOpselArg = true;
18372      LLVM_FALLTHROUGH;
18373    case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32:
18374    case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64:
18375      ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
18376      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16;
18377      break;
18378    case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w32:
18379    case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w64:
18380      ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
18381      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x16_f16_tied;
18382      break;
18383    case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w32:
18384    case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w64:
18385      ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
18386      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16_tied;
18387      break;
18388    case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32:
18389    case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64:
18390    case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12:
18391    case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64_gfx12:
18392      ArgsForMatchingMatrixTypes = {4, 1}; // CD, AB
18393      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_i32_16x16x16_iu8;
18394      break;
18395    case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32:
18396    case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64:
18397    case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32_gfx12:
18398    case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64_gfx12:
18399      ArgsForMatchingMatrixTypes = {4, 1}; // CD, AB
18400      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_i32_16x16x16_iu4;
18401      break;
18402    case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32_gfx12:
18403    case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w64_gfx12:
18404      ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
18405      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_fp8_fp8;
18406      break;
18407    case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32_gfx12:
18408    case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w64_gfx12:
18409      ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
18410      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_fp8_bf8;
18411      break;
18412    case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32_gfx12:
18413    case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w64_gfx12:
18414      ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
18415      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_bf8_fp8;
18416      break;
18417    case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32_gfx12:
18418    case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w64_gfx12:
18419      ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB
18420      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_bf8_bf8;
18421      break;
18422    case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x32_iu4_w32_gfx12:
18423    case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x32_iu4_w64_gfx12:
18424      ArgsForMatchingMatrixTypes = {4, 1}; // CD, AB
18425      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_i32_16x16x32_iu4;
18426      break;
18427    case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_f16_w32:
18428    case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_f16_w64:
18429      ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
18430      BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_f16;
18431      break;
18432    case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf16_w32:
18433    case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf16_w64:
18434      ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
18435      BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16;
18436      break;
18437    case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x32_f16_w32:
18438    case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x32_f16_w64:
18439      ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
18440      BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f16_16x16x32_f16;
18441      break;
18442    case AMDGPU::BI__builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w32:
18443    case AMDGPU::BI__builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w64:
18444      ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
18445      BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16;
18446      break;
18447    case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu8_w32:
18448    case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu8_w64:
18449      ArgsForMatchingMatrixTypes = {4, 1, 3, 5}; // CD, A, B, Index
18450      BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8;
18451      break;
18452    case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu4_w32:
18453    case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu4_w64:
18454      ArgsForMatchingMatrixTypes = {4, 1, 3, 5}; // CD, A, B, Index
18455      BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4;
18456      break;
18457    case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x64_iu4_w32:
18458    case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x64_iu4_w64:
18459      ArgsForMatchingMatrixTypes = {4, 1, 3, 5}; // CD, A, B, Index
18460      BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4;
18461      break;
18462    case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w32:
18463    case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w64:
18464      ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
18465      BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8;
18466      break;
18467    case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w32:
18468    case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w64:
18469      ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
18470      BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8;
18471      break;
18472    case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w32:
18473    case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w64:
18474      ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
18475      BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8;
18476      break;
18477    case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w32:
18478    case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w64:
18479      ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
18480      BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8;
18481      break;
18482    }
18483
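    // Collect the builtin's arguments and use the indices recorded above to
    // pick the types the intrinsic is overloaded on.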
18484    SmallVector<Value *, 6> Args;
18485    for (int i = 0, e = E->getNumArgs(); i != e; ++i)
18486      Args.push_back(EmitScalarExpr(E->getArg(i)));
18487    if (AppendFalseForOpselArg)
18488      Args.push_back(Builder.getFalse());
18489
18490    SmallVector<llvm::Type *, 6> ArgTypes;
18491    for (auto ArgIdx : ArgsForMatchingMatrixTypes)
18492      ArgTypes.push_back(Args[ArgIdx]->getType());
18493
18494    Function *F = CGM.getIntrinsic(BuiltinWMMAOp, ArgTypes);
18495    return Builder.CreateCall(F, Args);
18496  }
18497
18498  // amdgcn workitem
18499  case AMDGPU::BI__builtin_amdgcn_workitem_id_x:
18500    return emitRangedBuiltin(*this, Intrinsic::amdgcn_workitem_id_x, 0, 1024);
18501  case AMDGPU::BI__builtin_amdgcn_workitem_id_y:
18502    return emitRangedBuiltin(*this, Intrinsic::amdgcn_workitem_id_y, 0, 1024);
18503  case AMDGPU::BI__builtin_amdgcn_workitem_id_z:
18504    return emitRangedBuiltin(*this, Intrinsic::amdgcn_workitem_id_z, 0, 1024);
18505
18506  // amdgcn workgroup size
18507  case AMDGPU::BI__builtin_amdgcn_workgroup_size_x:
18508    return EmitAMDGPUWorkGroupSize(*this, 0);
18509  case AMDGPU::BI__builtin_amdgcn_workgroup_size_y:
18510    return EmitAMDGPUWorkGroupSize(*this, 1);
18511  case AMDGPU::BI__builtin_amdgcn_workgroup_size_z:
18512    return EmitAMDGPUWorkGroupSize(*this, 2);
18513
18514  // amdgcn grid size
18515  case AMDGPU::BI__builtin_amdgcn_grid_size_x:
18516    return EmitAMDGPUGridSize(*this, 0);
18517  case AMDGPU::BI__builtin_amdgcn_grid_size_y:
18518    return EmitAMDGPUGridSize(*this, 1);
18519  case AMDGPU::BI__builtin_amdgcn_grid_size_z:
18520    return EmitAMDGPUGridSize(*this, 2);
18521
18522  // r600 intrinsics
18523  case AMDGPU::BI__builtin_r600_recipsqrt_ieee:
18524  case AMDGPU::BI__builtin_r600_recipsqrt_ieeef:
18525    return emitUnaryBuiltin(*this, E, Intrinsic::r600_recipsqrt_ieee);
18526  case AMDGPU::BI__builtin_r600_read_tidig_x:
18527    return emitRangedBuiltin(*this, Intrinsic::r600_read_tidig_x, 0, 1024);
18528  case AMDGPU::BI__builtin_r600_read_tidig_y:
18529    return emitRangedBuiltin(*this, Intrinsic::r600_read_tidig_y, 0, 1024);
18530  case AMDGPU::BI__builtin_r600_read_tidig_z:
18531    return emitRangedBuiltin(*this, Intrinsic::r600_read_tidig_z, 0, 1024);
18532  case AMDGPU::BI__builtin_amdgcn_alignbit: {
18533    llvm::Value *Src0 = EmitScalarExpr(E->getArg(0));
18534    llvm::Value *Src1 = EmitScalarExpr(E->getArg(1));
18535    llvm::Value *Src2 = EmitScalarExpr(E->getArg(2));
18536    Function *F = CGM.getIntrinsic(Intrinsic::fshr, Src0->getType());
18537    return Builder.CreateCall(F, { Src0, Src1, Src2 });
18538  }
18539  case AMDGPU::BI__builtin_amdgcn_fence: {
18540    ProcessOrderScopeAMDGCN(EmitScalarExpr(E->getArg(0)),
18541                            EmitScalarExpr(E->getArg(1)), AO, SSID);
18542    return Builder.CreateFence(AO, SSID);
18543  }
18544  case AMDGPU::BI__builtin_amdgcn_atomic_inc32:
18545  case AMDGPU::BI__builtin_amdgcn_atomic_inc64:
18546  case AMDGPU::BI__builtin_amdgcn_atomic_dec32:
18547  case AMDGPU::BI__builtin_amdgcn_atomic_dec64: {
18548    llvm::AtomicRMWInst::BinOp BinOp;
18549    switch (BuiltinID) {
18550    case AMDGPU::BI__builtin_amdgcn_atomic_inc32:
18551    case AMDGPU::BI__builtin_amdgcn_atomic_inc64:
18552      BinOp = llvm::AtomicRMWInst::UIncWrap;
18553      break;
18554    case AMDGPU::BI__builtin_amdgcn_atomic_dec32:
18555    case AMDGPU::BI__builtin_amdgcn_atomic_dec64:
18556      BinOp = llvm::AtomicRMWInst::UDecWrap;
18557      break;
18558    }
18559
18560    Address Ptr = CheckAtomicAlignment(*this, E);
18561    Value *Val = EmitScalarExpr(E->getArg(1));
18562
18563    ProcessOrderScopeAMDGCN(EmitScalarExpr(E->getArg(2)),
18564                            EmitScalarExpr(E->getArg(3)), AO, SSID);
18565
18566    QualType PtrTy = E->getArg(0)->IgnoreImpCasts()->getType();
18567    bool Volatile =
18568        PtrTy->castAs<PointerType>()->getPointeeType().isVolatileQualified();
18569
18570    llvm::AtomicRMWInst *RMW =
18571        Builder.CreateAtomicRMW(BinOp, Ptr, Val, AO, SSID);
18572    if (Volatile)
18573      RMW->setVolatile(true);
18574    return RMW;
18575  }
18576  case AMDGPU::BI__builtin_amdgcn_s_sendmsg_rtn:
18577  case AMDGPU::BI__builtin_amdgcn_s_sendmsg_rtnl: {
18578    llvm::Value *Arg = EmitScalarExpr(E->getArg(0));
18579    llvm::Type *ResultType = ConvertType(E->getType());
    // s_sendmsg_rtn is mangled using the return type only.
18581    Function *F =
18582        CGM.getIntrinsic(Intrinsic::amdgcn_s_sendmsg_rtn, {ResultType});
18583    return Builder.CreateCall(F, {Arg});
18584  }
18585  default:
18586    return nullptr;
18587  }
18588}
18589
18590/// Handle a SystemZ function in which the final argument is a pointer
18591/// to an int that receives the post-instruction CC value.  At the LLVM level
18592/// this is represented as a function that returns a {result, cc} pair.
18593static Value *EmitSystemZIntrinsicWithCC(CodeGenFunction &CGF,
18594                                         unsigned IntrinsicID,
18595                                         const CallExpr *E) {
18596  unsigned NumArgs = E->getNumArgs() - 1;
18597  SmallVector<Value *, 8> Args(NumArgs);
18598  for (unsigned I = 0; I < NumArgs; ++I)
18599    Args[I] = CGF.EmitScalarExpr(E->getArg(I));
18600  Address CCPtr = CGF.EmitPointerWithAlignment(E->getArg(NumArgs));
18601  Function *F = CGF.CGM.getIntrinsic(IntrinsicID);
18602  Value *Call = CGF.Builder.CreateCall(F, Args);
18603  Value *CC = CGF.Builder.CreateExtractValue(Call, 1);
18604  CGF.Builder.CreateStore(CC, CCPtr);
18605  return CGF.Builder.CreateExtractValue(Call, 0);
18606}
18607
18608Value *CodeGenFunction::EmitSystemZBuiltinExpr(unsigned BuiltinID,
18609                                               const CallExpr *E) {
18610  switch (BuiltinID) {
18611  case SystemZ::BI__builtin_tbegin: {
18612    Value *TDB = EmitScalarExpr(E->getArg(0));
18613    Value *Control = llvm::ConstantInt::get(Int32Ty, 0xff0c);
18614    Function *F = CGM.getIntrinsic(Intrinsic::s390_tbegin);
18615    return Builder.CreateCall(F, {TDB, Control});
18616  }
18617  case SystemZ::BI__builtin_tbegin_nofloat: {
18618    Value *TDB = EmitScalarExpr(E->getArg(0));
18619    Value *Control = llvm::ConstantInt::get(Int32Ty, 0xff0c);
18620    Function *F = CGM.getIntrinsic(Intrinsic::s390_tbegin_nofloat);
18621    return Builder.CreateCall(F, {TDB, Control});
18622  }
18623  case SystemZ::BI__builtin_tbeginc: {
18624    Value *TDB = llvm::ConstantPointerNull::get(Int8PtrTy);
18625    Value *Control = llvm::ConstantInt::get(Int32Ty, 0xff08);
18626    Function *F = CGM.getIntrinsic(Intrinsic::s390_tbeginc);
18627    return Builder.CreateCall(F, {TDB, Control});
18628  }
18629  case SystemZ::BI__builtin_tabort: {
18630    Value *Data = EmitScalarExpr(E->getArg(0));
18631    Function *F = CGM.getIntrinsic(Intrinsic::s390_tabort);
18632    return Builder.CreateCall(F, Builder.CreateSExt(Data, Int64Ty, "tabort"));
18633  }
18634  case SystemZ::BI__builtin_non_tx_store: {
18635    Value *Address = EmitScalarExpr(E->getArg(0));
18636    Value *Data = EmitScalarExpr(E->getArg(1));
18637    Function *F = CGM.getIntrinsic(Intrinsic::s390_ntstg);
18638    return Builder.CreateCall(F, {Data, Address});
18639  }
18640
  // Vector builtins.  Note that most vector builtins are mapped automatically
  // to target-specific LLVM intrinsics.  The ones handled specially here can
  // be represented via standard LLVM IR, which is preferable because it
  // enables common LLVM optimizations.
18645
18646  case SystemZ::BI__builtin_s390_vpopctb:
18647  case SystemZ::BI__builtin_s390_vpopcth:
18648  case SystemZ::BI__builtin_s390_vpopctf:
18649  case SystemZ::BI__builtin_s390_vpopctg: {
18650    llvm::Type *ResultType = ConvertType(E->getType());
18651    Value *X = EmitScalarExpr(E->getArg(0));
18652    Function *F = CGM.getIntrinsic(Intrinsic::ctpop, ResultType);
18653    return Builder.CreateCall(F, X);
18654  }
18655
18656  case SystemZ::BI__builtin_s390_vclzb:
18657  case SystemZ::BI__builtin_s390_vclzh:
18658  case SystemZ::BI__builtin_s390_vclzf:
18659  case SystemZ::BI__builtin_s390_vclzg: {
18660    llvm::Type *ResultType = ConvertType(E->getType());
18661    Value *X = EmitScalarExpr(E->getArg(0));
18662    Value *Undef = ConstantInt::get(Builder.getInt1Ty(), false);
18663    Function *F = CGM.getIntrinsic(Intrinsic::ctlz, ResultType);
18664    return Builder.CreateCall(F, {X, Undef});
18665  }
18666
18667  case SystemZ::BI__builtin_s390_vctzb:
18668  case SystemZ::BI__builtin_s390_vctzh:
18669  case SystemZ::BI__builtin_s390_vctzf:
18670  case SystemZ::BI__builtin_s390_vctzg: {
18671    llvm::Type *ResultType = ConvertType(E->getType());
18672    Value *X = EmitScalarExpr(E->getArg(0));
18673    Value *Undef = ConstantInt::get(Builder.getInt1Ty(), false);
18674    Function *F = CGM.getIntrinsic(Intrinsic::cttz, ResultType);
18675    return Builder.CreateCall(F, {X, Undef});
18676  }
18677
18678  case SystemZ::BI__builtin_s390_verllb:
18679  case SystemZ::BI__builtin_s390_verllh:
18680  case SystemZ::BI__builtin_s390_verllf:
18681  case SystemZ::BI__builtin_s390_verllg: {
18682    llvm::Type *ResultType = ConvertType(E->getType());
18683    llvm::Value *Src = EmitScalarExpr(E->getArg(0));
18684    llvm::Value *Amt = EmitScalarExpr(E->getArg(1));
    // Splat the scalar rotate amount to the vector type.
18686    unsigned NumElts = cast<llvm::FixedVectorType>(ResultType)->getNumElements();
18687    Amt = Builder.CreateIntCast(Amt, ResultType->getScalarType(), false);
18688    Amt = Builder.CreateVectorSplat(NumElts, Amt);
18689    Function *F = CGM.getIntrinsic(Intrinsic::fshl, ResultType);
18690    return Builder.CreateCall(F, { Src, Src, Amt });
18691  }
18692
18693  case SystemZ::BI__builtin_s390_verllvb:
18694  case SystemZ::BI__builtin_s390_verllvh:
18695  case SystemZ::BI__builtin_s390_verllvf:
18696  case SystemZ::BI__builtin_s390_verllvg: {
18697    llvm::Type *ResultType = ConvertType(E->getType());
18698    llvm::Value *Src = EmitScalarExpr(E->getArg(0));
18699    llvm::Value *Amt = EmitScalarExpr(E->getArg(1));
18700    Function *F = CGM.getIntrinsic(Intrinsic::fshl, ResultType);
18701    return Builder.CreateCall(F, { Src, Src, Amt });
18702  }
18703
18704  case SystemZ::BI__builtin_s390_vfsqsb:
18705  case SystemZ::BI__builtin_s390_vfsqdb: {
18706    llvm::Type *ResultType = ConvertType(E->getType());
18707    Value *X = EmitScalarExpr(E->getArg(0));
18708    if (Builder.getIsFPConstrained()) {
18709      Function *F = CGM.getIntrinsic(Intrinsic::experimental_constrained_sqrt, ResultType);
18710      return Builder.CreateConstrainedFPCall(F, { X });
18711    } else {
18712      Function *F = CGM.getIntrinsic(Intrinsic::sqrt, ResultType);
18713      return Builder.CreateCall(F, X);
18714    }
18715  }
18716  case SystemZ::BI__builtin_s390_vfmasb:
18717  case SystemZ::BI__builtin_s390_vfmadb: {
18718    llvm::Type *ResultType = ConvertType(E->getType());
18719    Value *X = EmitScalarExpr(E->getArg(0));
18720    Value *Y = EmitScalarExpr(E->getArg(1));
18721    Value *Z = EmitScalarExpr(E->getArg(2));
18722    if (Builder.getIsFPConstrained()) {
18723      Function *F = CGM.getIntrinsic(Intrinsic::experimental_constrained_fma, ResultType);
18724      return Builder.CreateConstrainedFPCall(F, {X, Y, Z});
18725    } else {
18726      Function *F = CGM.getIntrinsic(Intrinsic::fma, ResultType);
18727      return Builder.CreateCall(F, {X, Y, Z});
18728    }
18729  }
18730  case SystemZ::BI__builtin_s390_vfmssb:
18731  case SystemZ::BI__builtin_s390_vfmsdb: {
18732    llvm::Type *ResultType = ConvertType(E->getType());
18733    Value *X = EmitScalarExpr(E->getArg(0));
18734    Value *Y = EmitScalarExpr(E->getArg(1));
18735    Value *Z = EmitScalarExpr(E->getArg(2));
18736    if (Builder.getIsFPConstrained()) {
18737      Function *F = CGM.getIntrinsic(Intrinsic::experimental_constrained_fma, ResultType);
18738      return Builder.CreateConstrainedFPCall(F, {X, Y, Builder.CreateFNeg(Z, "neg")});
18739    } else {
18740      Function *F = CGM.getIntrinsic(Intrinsic::fma, ResultType);
18741      return Builder.CreateCall(F, {X, Y, Builder.CreateFNeg(Z, "neg")});
18742    }
18743  }
18744  case SystemZ::BI__builtin_s390_vfnmasb:
18745  case SystemZ::BI__builtin_s390_vfnmadb: {
18746    llvm::Type *ResultType = ConvertType(E->getType());
18747    Value *X = EmitScalarExpr(E->getArg(0));
18748    Value *Y = EmitScalarExpr(E->getArg(1));
18749    Value *Z = EmitScalarExpr(E->getArg(2));
18750    if (Builder.getIsFPConstrained()) {
18751      Function *F = CGM.getIntrinsic(Intrinsic::experimental_constrained_fma, ResultType);
18752      return Builder.CreateFNeg(Builder.CreateConstrainedFPCall(F, {X, Y,  Z}), "neg");
18753    } else {
18754      Function *F = CGM.getIntrinsic(Intrinsic::fma, ResultType);
18755      return Builder.CreateFNeg(Builder.CreateCall(F, {X, Y, Z}), "neg");
18756    }
18757  }
18758  case SystemZ::BI__builtin_s390_vfnmssb:
18759  case SystemZ::BI__builtin_s390_vfnmsdb: {
18760    llvm::Type *ResultType = ConvertType(E->getType());
18761    Value *X = EmitScalarExpr(E->getArg(0));
18762    Value *Y = EmitScalarExpr(E->getArg(1));
18763    Value *Z = EmitScalarExpr(E->getArg(2));
18764    if (Builder.getIsFPConstrained()) {
18765      Function *F = CGM.getIntrinsic(Intrinsic::experimental_constrained_fma, ResultType);
18766      Value *NegZ = Builder.CreateFNeg(Z, "sub");
18767      return Builder.CreateFNeg(Builder.CreateConstrainedFPCall(F, {X, Y, NegZ}));
18768    } else {
18769      Function *F = CGM.getIntrinsic(Intrinsic::fma, ResultType);
18770      Value *NegZ = Builder.CreateFNeg(Z, "neg");
18771      return Builder.CreateFNeg(Builder.CreateCall(F, {X, Y, NegZ}));
18772    }
18773  }
18774  case SystemZ::BI__builtin_s390_vflpsb:
18775  case SystemZ::BI__builtin_s390_vflpdb: {
18776    llvm::Type *ResultType = ConvertType(E->getType());
18777    Value *X = EmitScalarExpr(E->getArg(0));
18778    Function *F = CGM.getIntrinsic(Intrinsic::fabs, ResultType);
18779    return Builder.CreateCall(F, X);
18780  }
18781  case SystemZ::BI__builtin_s390_vflnsb:
18782  case SystemZ::BI__builtin_s390_vflndb: {
18783    llvm::Type *ResultType = ConvertType(E->getType());
18784    Value *X = EmitScalarExpr(E->getArg(0));
18785    Function *F = CGM.getIntrinsic(Intrinsic::fabs, ResultType);
18786    return Builder.CreateFNeg(Builder.CreateCall(F, X), "neg");
18787  }
18788  case SystemZ::BI__builtin_s390_vfisb:
18789  case SystemZ::BI__builtin_s390_vfidb: {
18790    llvm::Type *ResultType = ConvertType(E->getType());
18791    Value *X = EmitScalarExpr(E->getArg(0));
18792    // Constant-fold the M4 and M5 mask arguments.
18793    llvm::APSInt M4 = *E->getArg(1)->getIntegerConstantExpr(getContext());
18794    llvm::APSInt M5 = *E->getArg(2)->getIntegerConstantExpr(getContext());
    // Check whether this instance can be represented via an LLVM standard
    // intrinsic.  We only support some combinations of M4 and M5.
18797    Intrinsic::ID ID = Intrinsic::not_intrinsic;
18798    Intrinsic::ID CI;
18799    switch (M4.getZExtValue()) {
18800    default: break;
18801    case 0:  // IEEE-inexact exception allowed
18802      switch (M5.getZExtValue()) {
18803      default: break;
18804      case 0: ID = Intrinsic::rint;
18805              CI = Intrinsic::experimental_constrained_rint; break;
18806      }
18807      break;
18808    case 4:  // IEEE-inexact exception suppressed
18809      switch (M5.getZExtValue()) {
18810      default: break;
18811      case 0: ID = Intrinsic::nearbyint;
18812              CI = Intrinsic::experimental_constrained_nearbyint; break;
18813      case 1: ID = Intrinsic::round;
18814              CI = Intrinsic::experimental_constrained_round; break;
18815      case 5: ID = Intrinsic::trunc;
18816              CI = Intrinsic::experimental_constrained_trunc; break;
18817      case 6: ID = Intrinsic::ceil;
18818              CI = Intrinsic::experimental_constrained_ceil; break;
18819      case 7: ID = Intrinsic::floor;
18820              CI = Intrinsic::experimental_constrained_floor; break;
18821      }
18822      break;
18823    }
18824    if (ID != Intrinsic::not_intrinsic) {
18825      if (Builder.getIsFPConstrained()) {
18826        Function *F = CGM.getIntrinsic(CI, ResultType);
18827        return Builder.CreateConstrainedFPCall(F, X);
18828      } else {
18829        Function *F = CGM.getIntrinsic(ID, ResultType);
18830        return Builder.CreateCall(F, X);
18831      }
18832    }
18833    switch (BuiltinID) { // FIXME: constrained version?
18834      case SystemZ::BI__builtin_s390_vfisb: ID = Intrinsic::s390_vfisb; break;
18835      case SystemZ::BI__builtin_s390_vfidb: ID = Intrinsic::s390_vfidb; break;
18836      default: llvm_unreachable("Unknown BuiltinID");
18837    }
18838    Function *F = CGM.getIntrinsic(ID);
18839    Value *M4Value = llvm::ConstantInt::get(getLLVMContext(), M4);
18840    Value *M5Value = llvm::ConstantInt::get(getLLVMContext(), M5);
18841    return Builder.CreateCall(F, {X, M4Value, M5Value});
18842  }
18843  case SystemZ::BI__builtin_s390_vfmaxsb:
18844  case SystemZ::BI__builtin_s390_vfmaxdb: {
18845    llvm::Type *ResultType = ConvertType(E->getType());
18846    Value *X = EmitScalarExpr(E->getArg(0));
18847    Value *Y = EmitScalarExpr(E->getArg(1));
18848    // Constant-fold the M4 mask argument.
18849    llvm::APSInt M4 = *E->getArg(2)->getIntegerConstantExpr(getContext());
    // Check whether this instance can be represented via an LLVM standard
    // intrinsic.  We only support some values of M4.
18852    Intrinsic::ID ID = Intrinsic::not_intrinsic;
18853    Intrinsic::ID CI;
18854    switch (M4.getZExtValue()) {
18855    default: break;
18856    case 4: ID = Intrinsic::maxnum;
18857            CI = Intrinsic::experimental_constrained_maxnum; break;
18858    }
18859    if (ID != Intrinsic::not_intrinsic) {
18860      if (Builder.getIsFPConstrained()) {
18861        Function *F = CGM.getIntrinsic(CI, ResultType);
18862        return Builder.CreateConstrainedFPCall(F, {X, Y});
18863      } else {
18864        Function *F = CGM.getIntrinsic(ID, ResultType);
18865        return Builder.CreateCall(F, {X, Y});
18866      }
18867    }
18868    switch (BuiltinID) {
18869      case SystemZ::BI__builtin_s390_vfmaxsb: ID = Intrinsic::s390_vfmaxsb; break;
18870      case SystemZ::BI__builtin_s390_vfmaxdb: ID = Intrinsic::s390_vfmaxdb; break;
18871      default: llvm_unreachable("Unknown BuiltinID");
18872    }
18873    Function *F = CGM.getIntrinsic(ID);
18874    Value *M4Value = llvm::ConstantInt::get(getLLVMContext(), M4);
18875    return Builder.CreateCall(F, {X, Y, M4Value});
18876  }
18877  case SystemZ::BI__builtin_s390_vfminsb:
18878  case SystemZ::BI__builtin_s390_vfmindb: {
18879    llvm::Type *ResultType = ConvertType(E->getType());
18880    Value *X = EmitScalarExpr(E->getArg(0));
18881    Value *Y = EmitScalarExpr(E->getArg(1));
18882    // Constant-fold the M4 mask argument.
18883    llvm::APSInt M4 = *E->getArg(2)->getIntegerConstantExpr(getContext());
    // Check whether this instance can be represented via an LLVM standard
    // intrinsic.  We only support some values of M4.
18886    Intrinsic::ID ID = Intrinsic::not_intrinsic;
18887    Intrinsic::ID CI;
18888    switch (M4.getZExtValue()) {
18889    default: break;
18890    case 4: ID = Intrinsic::minnum;
18891            CI = Intrinsic::experimental_constrained_minnum; break;
18892    }
18893    if (ID != Intrinsic::not_intrinsic) {
18894      if (Builder.getIsFPConstrained()) {
18895        Function *F = CGM.getIntrinsic(CI, ResultType);
18896        return Builder.CreateConstrainedFPCall(F, {X, Y});
18897      } else {
18898        Function *F = CGM.getIntrinsic(ID, ResultType);
18899        return Builder.CreateCall(F, {X, Y});
18900      }
18901    }
18902    switch (BuiltinID) {
18903      case SystemZ::BI__builtin_s390_vfminsb: ID = Intrinsic::s390_vfminsb; break;
18904      case SystemZ::BI__builtin_s390_vfmindb: ID = Intrinsic::s390_vfmindb; break;
18905      default: llvm_unreachable("Unknown BuiltinID");
18906    }
18907    Function *F = CGM.getIntrinsic(ID);
18908    Value *M4Value = llvm::ConstantInt::get(getLLVMContext(), M4);
18909    return Builder.CreateCall(F, {X, Y, M4Value});
18910  }
18911
18912  case SystemZ::BI__builtin_s390_vlbrh:
18913  case SystemZ::BI__builtin_s390_vlbrf:
18914  case SystemZ::BI__builtin_s390_vlbrg: {
18915    llvm::Type *ResultType = ConvertType(E->getType());
18916    Value *X = EmitScalarExpr(E->getArg(0));
18917    Function *F = CGM.getIntrinsic(Intrinsic::bswap, ResultType);
18918    return Builder.CreateCall(F, X);
18919  }
18920
18921  // Vector intrinsics that output the post-instruction CC value.
18922
18923#define INTRINSIC_WITH_CC(NAME) \
18924    case SystemZ::BI__builtin_##NAME: \
18925      return EmitSystemZIntrinsicWithCC(*this, Intrinsic::NAME, E)
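  // For example, INTRINSIC_WITH_CC(s390_vpkshs) expands to a case that lowers
  // __builtin_s390_vpkshs via Intrinsic::s390_vpkshs.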
18926
18927  INTRINSIC_WITH_CC(s390_vpkshs);
18928  INTRINSIC_WITH_CC(s390_vpksfs);
18929  INTRINSIC_WITH_CC(s390_vpksgs);
18930
18931  INTRINSIC_WITH_CC(s390_vpklshs);
18932  INTRINSIC_WITH_CC(s390_vpklsfs);
18933  INTRINSIC_WITH_CC(s390_vpklsgs);
18934
18935  INTRINSIC_WITH_CC(s390_vceqbs);
18936  INTRINSIC_WITH_CC(s390_vceqhs);
18937  INTRINSIC_WITH_CC(s390_vceqfs);
18938  INTRINSIC_WITH_CC(s390_vceqgs);
18939
18940  INTRINSIC_WITH_CC(s390_vchbs);
18941  INTRINSIC_WITH_CC(s390_vchhs);
18942  INTRINSIC_WITH_CC(s390_vchfs);
18943  INTRINSIC_WITH_CC(s390_vchgs);
18944
18945  INTRINSIC_WITH_CC(s390_vchlbs);
18946  INTRINSIC_WITH_CC(s390_vchlhs);
18947  INTRINSIC_WITH_CC(s390_vchlfs);
18948  INTRINSIC_WITH_CC(s390_vchlgs);
18949
18950  INTRINSIC_WITH_CC(s390_vfaebs);
18951  INTRINSIC_WITH_CC(s390_vfaehs);
18952  INTRINSIC_WITH_CC(s390_vfaefs);
18953
18954  INTRINSIC_WITH_CC(s390_vfaezbs);
18955  INTRINSIC_WITH_CC(s390_vfaezhs);
18956  INTRINSIC_WITH_CC(s390_vfaezfs);
18957
18958  INTRINSIC_WITH_CC(s390_vfeebs);
18959  INTRINSIC_WITH_CC(s390_vfeehs);
18960  INTRINSIC_WITH_CC(s390_vfeefs);
18961
18962  INTRINSIC_WITH_CC(s390_vfeezbs);
18963  INTRINSIC_WITH_CC(s390_vfeezhs);
18964  INTRINSIC_WITH_CC(s390_vfeezfs);
18965
18966  INTRINSIC_WITH_CC(s390_vfenebs);
18967  INTRINSIC_WITH_CC(s390_vfenehs);
18968  INTRINSIC_WITH_CC(s390_vfenefs);
18969
18970  INTRINSIC_WITH_CC(s390_vfenezbs);
18971  INTRINSIC_WITH_CC(s390_vfenezhs);
18972  INTRINSIC_WITH_CC(s390_vfenezfs);
18973
18974  INTRINSIC_WITH_CC(s390_vistrbs);
18975  INTRINSIC_WITH_CC(s390_vistrhs);
18976  INTRINSIC_WITH_CC(s390_vistrfs);
18977
18978  INTRINSIC_WITH_CC(s390_vstrcbs);
18979  INTRINSIC_WITH_CC(s390_vstrchs);
18980  INTRINSIC_WITH_CC(s390_vstrcfs);
18981
18982  INTRINSIC_WITH_CC(s390_vstrczbs);
18983  INTRINSIC_WITH_CC(s390_vstrczhs);
18984  INTRINSIC_WITH_CC(s390_vstrczfs);
18985
18986  INTRINSIC_WITH_CC(s390_vfcesbs);
18987  INTRINSIC_WITH_CC(s390_vfcedbs);
18988  INTRINSIC_WITH_CC(s390_vfchsbs);
18989  INTRINSIC_WITH_CC(s390_vfchdbs);
18990  INTRINSIC_WITH_CC(s390_vfchesbs);
18991  INTRINSIC_WITH_CC(s390_vfchedbs);
18992
18993  INTRINSIC_WITH_CC(s390_vftcisb);
18994  INTRINSIC_WITH_CC(s390_vftcidb);
18995
18996  INTRINSIC_WITH_CC(s390_vstrsb);
18997  INTRINSIC_WITH_CC(s390_vstrsh);
18998  INTRINSIC_WITH_CC(s390_vstrsf);
18999
19000  INTRINSIC_WITH_CC(s390_vstrszb);
19001  INTRINSIC_WITH_CC(s390_vstrszh);
19002  INTRINSIC_WITH_CC(s390_vstrszf);
19003
19004#undef INTRINSIC_WITH_CC
19005
19006  default:
19007    return nullptr;
19008  }
19009}
19010
19011namespace {
// Helper classes for mapping MMA builtins to particular LLVM intrinsic
// variants.
19013struct NVPTXMmaLdstInfo {
19014  unsigned NumResults;  // Number of elements to load/store
  // Intrinsic IDs for the col/row variants; 0 if a particular layout is
  // unsupported.
19016  unsigned IID_col;
19017  unsigned IID_row;
19018};
19019
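// Helper macros for building the {NumResults, col-variant, row-variant}
// entries of the load/store table below.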
19020#define MMA_INTR(geom_op_type, layout) \
19021  Intrinsic::nvvm_wmma_##geom_op_type##_##layout##_stride
19022#define MMA_LDST(n, geom_op_type)                                              \
19023  { n, MMA_INTR(geom_op_type, col), MMA_INTR(geom_op_type, row) }
19024
19025static NVPTXMmaLdstInfo getNVPTXMmaLdstInfo(unsigned BuiltinID) {
19026  switch (BuiltinID) {
19027  // FP MMA loads
19028  case NVPTX::BI__hmma_m16n16k16_ld_a:
19029    return MMA_LDST(8, m16n16k16_load_a_f16);
19030  case NVPTX::BI__hmma_m16n16k16_ld_b:
19031    return MMA_LDST(8, m16n16k16_load_b_f16);
19032  case NVPTX::BI__hmma_m16n16k16_ld_c_f16:
19033    return MMA_LDST(4, m16n16k16_load_c_f16);
19034  case NVPTX::BI__hmma_m16n16k16_ld_c_f32:
19035    return MMA_LDST(8, m16n16k16_load_c_f32);
19036  case NVPTX::BI__hmma_m32n8k16_ld_a:
19037    return MMA_LDST(8, m32n8k16_load_a_f16);
19038  case NVPTX::BI__hmma_m32n8k16_ld_b:
19039    return MMA_LDST(8, m32n8k16_load_b_f16);
19040  case NVPTX::BI__hmma_m32n8k16_ld_c_f16:
19041    return MMA_LDST(4, m32n8k16_load_c_f16);
19042  case NVPTX::BI__hmma_m32n8k16_ld_c_f32:
19043    return MMA_LDST(8, m32n8k16_load_c_f32);
19044  case NVPTX::BI__hmma_m8n32k16_ld_a:
19045    return MMA_LDST(8, m8n32k16_load_a_f16);
19046  case NVPTX::BI__hmma_m8n32k16_ld_b:
19047    return MMA_LDST(8, m8n32k16_load_b_f16);
19048  case NVPTX::BI__hmma_m8n32k16_ld_c_f16:
19049    return MMA_LDST(4, m8n32k16_load_c_f16);
19050  case NVPTX::BI__hmma_m8n32k16_ld_c_f32:
19051    return MMA_LDST(8, m8n32k16_load_c_f32);
19052
19053  // Integer MMA loads
19054  case NVPTX::BI__imma_m16n16k16_ld_a_s8:
19055    return MMA_LDST(2, m16n16k16_load_a_s8);
19056  case NVPTX::BI__imma_m16n16k16_ld_a_u8:
19057    return MMA_LDST(2, m16n16k16_load_a_u8);
19058  case NVPTX::BI__imma_m16n16k16_ld_b_s8:
19059    return MMA_LDST(2, m16n16k16_load_b_s8);
19060  case NVPTX::BI__imma_m16n16k16_ld_b_u8:
19061    return MMA_LDST(2, m16n16k16_load_b_u8);
19062  case NVPTX::BI__imma_m16n16k16_ld_c:
19063    return MMA_LDST(8, m16n16k16_load_c_s32);
19064  case NVPTX::BI__imma_m32n8k16_ld_a_s8:
19065    return MMA_LDST(4, m32n8k16_load_a_s8);
19066  case NVPTX::BI__imma_m32n8k16_ld_a_u8:
19067    return MMA_LDST(4, m32n8k16_load_a_u8);
19068  case NVPTX::BI__imma_m32n8k16_ld_b_s8:
19069    return MMA_LDST(1, m32n8k16_load_b_s8);
19070  case NVPTX::BI__imma_m32n8k16_ld_b_u8:
19071    return MMA_LDST(1, m32n8k16_load_b_u8);
19072  case NVPTX::BI__imma_m32n8k16_ld_c:
19073    return MMA_LDST(8, m32n8k16_load_c_s32);
19074  case NVPTX::BI__imma_m8n32k16_ld_a_s8:
19075    return MMA_LDST(1, m8n32k16_load_a_s8);
19076  case NVPTX::BI__imma_m8n32k16_ld_a_u8:
19077    return MMA_LDST(1, m8n32k16_load_a_u8);
19078  case NVPTX::BI__imma_m8n32k16_ld_b_s8:
19079    return MMA_LDST(4, m8n32k16_load_b_s8);
19080  case NVPTX::BI__imma_m8n32k16_ld_b_u8:
19081    return MMA_LDST(4, m8n32k16_load_b_u8);
19082  case NVPTX::BI__imma_m8n32k16_ld_c:
19083    return MMA_LDST(8, m8n32k16_load_c_s32);
19084
19085  // Sub-integer MMA loads.
19086  // Only row/col layout is supported by A/B fragments.
19087  case NVPTX::BI__imma_m8n8k32_ld_a_s4:
19088    return {1, 0, MMA_INTR(m8n8k32_load_a_s4, row)};
19089  case NVPTX::BI__imma_m8n8k32_ld_a_u4:
19090    return {1, 0, MMA_INTR(m8n8k32_load_a_u4, row)};
19091  case NVPTX::BI__imma_m8n8k32_ld_b_s4:
19092    return {1, MMA_INTR(m8n8k32_load_b_s4, col), 0};
19093  case NVPTX::BI__imma_m8n8k32_ld_b_u4:
19094    return {1, MMA_INTR(m8n8k32_load_b_u4, col), 0};
19095  case NVPTX::BI__imma_m8n8k32_ld_c:
19096    return MMA_LDST(2, m8n8k32_load_c_s32);
19097  case NVPTX::BI__bmma_m8n8k128_ld_a_b1:
19098    return {1, 0, MMA_INTR(m8n8k128_load_a_b1, row)};
19099  case NVPTX::BI__bmma_m8n8k128_ld_b_b1:
19100    return {1, MMA_INTR(m8n8k128_load_b_b1, col), 0};
19101  case NVPTX::BI__bmma_m8n8k128_ld_c:
19102    return MMA_LDST(2, m8n8k128_load_c_s32);
19103
19104  // Double MMA loads
19105  case NVPTX::BI__dmma_m8n8k4_ld_a:
19106    return MMA_LDST(1, m8n8k4_load_a_f64);
19107  case NVPTX::BI__dmma_m8n8k4_ld_b:
19108    return MMA_LDST(1, m8n8k4_load_b_f64);
19109  case NVPTX::BI__dmma_m8n8k4_ld_c:
19110    return MMA_LDST(2, m8n8k4_load_c_f64);
19111
19112  // Alternate float MMA loads
19113  case NVPTX::BI__mma_bf16_m16n16k16_ld_a:
19114    return MMA_LDST(4, m16n16k16_load_a_bf16);
19115  case NVPTX::BI__mma_bf16_m16n16k16_ld_b:
19116    return MMA_LDST(4, m16n16k16_load_b_bf16);
19117  case NVPTX::BI__mma_bf16_m8n32k16_ld_a:
19118    return MMA_LDST(2, m8n32k16_load_a_bf16);
19119  case NVPTX::BI__mma_bf16_m8n32k16_ld_b:
19120    return MMA_LDST(8, m8n32k16_load_b_bf16);
19121  case NVPTX::BI__mma_bf16_m32n8k16_ld_a:
19122    return MMA_LDST(8, m32n8k16_load_a_bf16);
19123  case NVPTX::BI__mma_bf16_m32n8k16_ld_b:
19124    return MMA_LDST(2, m32n8k16_load_b_bf16);
19125  case NVPTX::BI__mma_tf32_m16n16k8_ld_a:
19126    return MMA_LDST(4, m16n16k8_load_a_tf32);
19127  case NVPTX::BI__mma_tf32_m16n16k8_ld_b:
19128    return MMA_LDST(4, m16n16k8_load_b_tf32);
19129  case NVPTX::BI__mma_tf32_m16n16k8_ld_c:
19130    return MMA_LDST(8, m16n16k8_load_c_f32);
19131
  // NOTE: We need to follow the inconsistent naming scheme used by NVCC.
  // Unlike PTX and LLVM IR, where stores always use fragment D, NVCC builtins
  // always use fragment C for both loads and stores.
19135  // FP MMA stores.
19136  case NVPTX::BI__hmma_m16n16k16_st_c_f16:
19137    return MMA_LDST(4, m16n16k16_store_d_f16);
19138  case NVPTX::BI__hmma_m16n16k16_st_c_f32:
19139    return MMA_LDST(8, m16n16k16_store_d_f32);
19140  case NVPTX::BI__hmma_m32n8k16_st_c_f16:
19141    return MMA_LDST(4, m32n8k16_store_d_f16);
19142  case NVPTX::BI__hmma_m32n8k16_st_c_f32:
19143    return MMA_LDST(8, m32n8k16_store_d_f32);
19144  case NVPTX::BI__hmma_m8n32k16_st_c_f16:
19145    return MMA_LDST(4, m8n32k16_store_d_f16);
19146  case NVPTX::BI__hmma_m8n32k16_st_c_f32:
19147    return MMA_LDST(8, m8n32k16_store_d_f32);
19148
19149  // Integer and sub-integer MMA stores.
19150  // Another naming quirk. Unlike other MMA builtins that use PTX types in the
19151  // name, integer loads/stores use LLVM's i32.
19152  case NVPTX::BI__imma_m16n16k16_st_c_i32:
19153    return MMA_LDST(8, m16n16k16_store_d_s32);
19154  case NVPTX::BI__imma_m32n8k16_st_c_i32:
19155    return MMA_LDST(8, m32n8k16_store_d_s32);
19156  case NVPTX::BI__imma_m8n32k16_st_c_i32:
19157    return MMA_LDST(8, m8n32k16_store_d_s32);
19158  case NVPTX::BI__imma_m8n8k32_st_c_i32:
19159    return MMA_LDST(2, m8n8k32_store_d_s32);
19160  case NVPTX::BI__bmma_m8n8k128_st_c_i32:
19161    return MMA_LDST(2, m8n8k128_store_d_s32);
19162
19163  // Double MMA store
19164  case NVPTX::BI__dmma_m8n8k4_st_c_f64:
19165    return MMA_LDST(2, m8n8k4_store_d_f64);
19166
19167  // Alternate float MMA store
19168  case NVPTX::BI__mma_m16n16k8_st_c_f32:
19169    return MMA_LDST(8, m16n16k8_store_d_f32);
19170
19171  default:
19172    llvm_unreachable("Unknown MMA builtin");
19173  }
19174}
19175#undef MMA_LDST
19176#undef MMA_INTR
19177
19178
19179struct NVPTXMmaInfo {
19180  unsigned NumEltsA;
19181  unsigned NumEltsB;
19182  unsigned NumEltsC;
19183  unsigned NumEltsD;
19184
19185  // Variants are ordered by layout-A/layout-B/satf, where 'row' has priority
19186  // over 'col' for layout. The index of non-satf variants is expected to match
19187  // the undocumented layout constants used by CUDA's mma.hpp.
19188  std::array<unsigned, 8> Variants;
19189
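  // Returns the intrinsic that matches Layout and Satf for valid combinations
  // of Layout and Satf, or 0 otherwise.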
19190  unsigned getMMAIntrinsic(int Layout, bool Satf) {
19191    unsigned Index = Layout + 4 * Satf;
19192    if (Index >= Variants.size())
19193      return 0;
19194    return Variants[Index];
19195  }
19196};
19197
// Map an MMA builtin ID to the NVPTXMmaInfo describing its fragment sizes and
// intrinsic variants.
19200static NVPTXMmaInfo getNVPTXMmaInfo(unsigned BuiltinID) {
19201  // clang-format off
19202#define MMA_VARIANTS(geom, type)                                    \
19203      Intrinsic::nvvm_wmma_##geom##_mma_row_row_##type,             \
19204      Intrinsic::nvvm_wmma_##geom##_mma_row_col_##type,             \
19205      Intrinsic::nvvm_wmma_##geom##_mma_col_row_##type,             \
19206      Intrinsic::nvvm_wmma_##geom##_mma_col_col_##type
19207#define MMA_SATF_VARIANTS(geom, type)                               \
19208      MMA_VARIANTS(geom, type),                                     \
19209      Intrinsic::nvvm_wmma_##geom##_mma_row_row_##type##_satfinite, \
19210      Intrinsic::nvvm_wmma_##geom##_mma_row_col_##type##_satfinite, \
19211      Intrinsic::nvvm_wmma_##geom##_mma_col_row_##type##_satfinite, \
19212      Intrinsic::nvvm_wmma_##geom##_mma_col_col_##type##_satfinite
19213// Sub-integer MMA only supports row.col layout.
19214#define MMA_VARIANTS_I4(geom, type) \
19215      0, \
19216      Intrinsic::nvvm_wmma_##geom##_mma_row_col_##type,             \
19217      0, \
19218      0, \
19219      0, \
19220      Intrinsic::nvvm_wmma_##geom##_mma_row_col_##type##_satfinite, \
19221      0, \
19222      0
19223// b1 MMA does not support .satfinite.
19224#define MMA_VARIANTS_B1_XOR(geom, type) \
19225      0, \
19226      Intrinsic::nvvm_wmma_##geom##_mma_xor_popc_row_col_##type,             \
19227      0, \
19228      0, \
19229      0, \
19230      0, \
19231      0, \
19232      0
19233#define MMA_VARIANTS_B1_AND(geom, type) \
19234      0, \
19235      Intrinsic::nvvm_wmma_##geom##_mma_and_popc_row_col_##type,             \
19236      0, \
19237      0, \
19238      0, \
19239      0, \
19240      0, \
19241      0
19242  // clang-format on
19243  switch (BuiltinID) {
19244  // FP MMA
  // Note that the 'type' argument of MMA_SATF_VARIANTS uses D_C notation,
  // while the NumEltsN fields of the return value are ordered as A, B, C, D.
19247  case NVPTX::BI__hmma_m16n16k16_mma_f16f16:
19248    return {8, 8, 4, 4, {{MMA_SATF_VARIANTS(m16n16k16, f16_f16)}}};
19249  case NVPTX::BI__hmma_m16n16k16_mma_f32f16:
19250    return {8, 8, 4, 8, {{MMA_SATF_VARIANTS(m16n16k16, f32_f16)}}};
19251  case NVPTX::BI__hmma_m16n16k16_mma_f16f32:
19252    return {8, 8, 8, 4, {{MMA_SATF_VARIANTS(m16n16k16, f16_f32)}}};
19253  case NVPTX::BI__hmma_m16n16k16_mma_f32f32:
19254    return {8, 8, 8, 8, {{MMA_SATF_VARIANTS(m16n16k16, f32_f32)}}};
19255  case NVPTX::BI__hmma_m32n8k16_mma_f16f16:
19256    return {8, 8, 4, 4, {{MMA_SATF_VARIANTS(m32n8k16, f16_f16)}}};
19257  case NVPTX::BI__hmma_m32n8k16_mma_f32f16:
19258    return {8, 8, 4, 8, {{MMA_SATF_VARIANTS(m32n8k16, f32_f16)}}};
19259  case NVPTX::BI__hmma_m32n8k16_mma_f16f32:
19260    return {8, 8, 8, 4, {{MMA_SATF_VARIANTS(m32n8k16, f16_f32)}}};
19261  case NVPTX::BI__hmma_m32n8k16_mma_f32f32:
19262    return {8, 8, 8, 8, {{MMA_SATF_VARIANTS(m32n8k16, f32_f32)}}};
19263  case NVPTX::BI__hmma_m8n32k16_mma_f16f16:
19264    return {8, 8, 4, 4, {{MMA_SATF_VARIANTS(m8n32k16, f16_f16)}}};
19265  case NVPTX::BI__hmma_m8n32k16_mma_f32f16:
19266    return {8, 8, 4, 8, {{MMA_SATF_VARIANTS(m8n32k16, f32_f16)}}};
19267  case NVPTX::BI__hmma_m8n32k16_mma_f16f32:
19268    return {8, 8, 8, 4, {{MMA_SATF_VARIANTS(m8n32k16, f16_f32)}}};
19269  case NVPTX::BI__hmma_m8n32k16_mma_f32f32:
19270    return {8, 8, 8, 8, {{MMA_SATF_VARIANTS(m8n32k16, f32_f32)}}};
19271
19272  // Integer MMA
19273  case NVPTX::BI__imma_m16n16k16_mma_s8:
19274    return {2, 2, 8, 8, {{MMA_SATF_VARIANTS(m16n16k16, s8)}}};
19275  case NVPTX::BI__imma_m16n16k16_mma_u8:
19276    return {2, 2, 8, 8, {{MMA_SATF_VARIANTS(m16n16k16, u8)}}};
19277  case NVPTX::BI__imma_m32n8k16_mma_s8:
19278    return {4, 1, 8, 8, {{MMA_SATF_VARIANTS(m32n8k16, s8)}}};
19279  case NVPTX::BI__imma_m32n8k16_mma_u8:
19280    return {4, 1, 8, 8, {{MMA_SATF_VARIANTS(m32n8k16, u8)}}};
19281  case NVPTX::BI__imma_m8n32k16_mma_s8:
19282    return {1, 4, 8, 8, {{MMA_SATF_VARIANTS(m8n32k16, s8)}}};
19283  case NVPTX::BI__imma_m8n32k16_mma_u8:
19284    return {1, 4, 8, 8, {{MMA_SATF_VARIANTS(m8n32k16, u8)}}};
19285
19286  // Sub-integer MMA
19287  case NVPTX::BI__imma_m8n8k32_mma_s4:
19288    return {1, 1, 2, 2, {{MMA_VARIANTS_I4(m8n8k32, s4)}}};
19289  case NVPTX::BI__imma_m8n8k32_mma_u4:
19290    return {1, 1, 2, 2, {{MMA_VARIANTS_I4(m8n8k32, u4)}}};
19291  case NVPTX::BI__bmma_m8n8k128_mma_xor_popc_b1:
19292    return {1, 1, 2, 2, {{MMA_VARIANTS_B1_XOR(m8n8k128, b1)}}};
19293  case NVPTX::BI__bmma_m8n8k128_mma_and_popc_b1:
19294    return {1, 1, 2, 2, {{MMA_VARIANTS_B1_AND(m8n8k128, b1)}}};
19295
19296  // Double MMA
19297  case NVPTX::BI__dmma_m8n8k4_mma_f64:
19298    return {1, 1, 2, 2, {{MMA_VARIANTS(m8n8k4, f64)}}};
19299
19300  // Alternate FP MMA
19301  case NVPTX::BI__mma_bf16_m16n16k16_mma_f32:
19302    return {4, 4, 8, 8, {{MMA_VARIANTS(m16n16k16, bf16)}}};
19303  case NVPTX::BI__mma_bf16_m8n32k16_mma_f32:
19304    return {2, 8, 8, 8, {{MMA_VARIANTS(m8n32k16, bf16)}}};
19305  case NVPTX::BI__mma_bf16_m32n8k16_mma_f32:
19306    return {8, 2, 8, 8, {{MMA_VARIANTS(m32n8k16, bf16)}}};
19307  case NVPTX::BI__mma_tf32_m16n16k8_mma_f32:
19308    return {4, 4, 8, 8, {{MMA_VARIANTS(m16n16k8, tf32)}}};
19309  default:
19310    llvm_unreachable("Unexpected builtin ID.");
19311  }
19312#undef MMA_VARIANTS
19313#undef MMA_SATF_VARIANTS
19314#undef MMA_VARIANTS_I4
19315#undef MMA_VARIANTS_B1_AND
19316#undef MMA_VARIANTS_B1_XOR
19317}
19318
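// Emit a call to an NVVM ldg/ldu intrinsic, passing the natural alignment of
// the pointee type as the explicit alignment operand.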
19319static Value *MakeLdgLdu(unsigned IntrinsicID, CodeGenFunction &CGF,
19320                         const CallExpr *E) {
19321  Value *Ptr = CGF.EmitScalarExpr(E->getArg(0));
19322  QualType ArgType = E->getArg(0)->getType();
19323  clang::CharUnits Align = CGF.CGM.getNaturalPointeeTypeAlignment(ArgType);
19324  llvm::Type *ElemTy = CGF.ConvertTypeForMem(ArgType->getPointeeType());
19325  return CGF.Builder.CreateCall(
19326      CGF.CGM.getIntrinsic(IntrinsicID, {ElemTy, Ptr->getType()}),
19327      {Ptr, ConstantInt::get(CGF.Builder.getInt32Ty(), Align.getQuantity())});
19328}
19329
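// Emit a call to an NVVM scoped atomic intrinsic, overloaded on the pointee
// type of the first argument.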
19330static Value *MakeScopedAtomic(unsigned IntrinsicID, CodeGenFunction &CGF,
19331                               const CallExpr *E) {
19332  Value *Ptr = CGF.EmitScalarExpr(E->getArg(0));
19333  llvm::Type *ElemTy =
19334      CGF.ConvertTypeForMem(E->getArg(0)->getType()->getPointeeType());
19335  return CGF.Builder.CreateCall(
19336      CGF.CGM.getIntrinsic(IntrinsicID, {ElemTy, Ptr->getType()}),
19337      {Ptr, CGF.EmitScalarExpr(E->getArg(1))});
19338}
19339
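// Emit a cp.async intrinsic.  Builtins that take a third argument map to the
// IntrinsicIDS variant; the two-argument form maps to IntrinsicID.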
19340static Value *MakeCpAsync(unsigned IntrinsicID, unsigned IntrinsicIDS,
19341                          CodeGenFunction &CGF, const CallExpr *E,
19342                          int SrcSize) {
19343  return E->getNumArgs() == 3
19344             ? CGF.Builder.CreateCall(CGF.CGM.getIntrinsic(IntrinsicIDS),
19345                                      {CGF.EmitScalarExpr(E->getArg(0)),
19346                                       CGF.EmitScalarExpr(E->getArg(1)),
19347                                       CGF.EmitScalarExpr(E->getArg(2))})
19348             : CGF.Builder.CreateCall(CGF.CGM.getIntrinsic(IntrinsicID),
19349                                      {CGF.EmitScalarExpr(E->getArg(0)),
19350                                       CGF.EmitScalarExpr(E->getArg(1))});
19351}
19352
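// Emit a builtin that operates on the native half type, diagnosing targets
// that lack native half support and bitcasting arguments to the parameter
// types the intrinsic expects.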
19353static Value *MakeHalfType(unsigned IntrinsicID, unsigned BuiltinID,
19354                           const CallExpr *E, CodeGenFunction &CGF) {
19355  auto &C = CGF.CGM.getContext();
19356  if (!(C.getLangOpts().NativeHalfType ||
19357        !C.getTargetInfo().useFP16ConversionIntrinsics())) {
19358    CGF.CGM.Error(E->getExprLoc(), C.BuiltinInfo.getName(BuiltinID).str() +
19359                                       " requires native half type support.");
19360    return nullptr;
19361  }
19362
19363  if (IntrinsicID == Intrinsic::nvvm_ldg_global_f ||
19364      IntrinsicID == Intrinsic::nvvm_ldu_global_f)
19365    return MakeLdgLdu(IntrinsicID, CGF, E);
19366
19367  SmallVector<Value *, 16> Args;
19368  auto *F = CGF.CGM.getIntrinsic(IntrinsicID);
19369  auto *FTy = F->getFunctionType();
19370  unsigned ICEArguments = 0;
19371  ASTContext::GetBuiltinTypeError Error;
19372  C.GetBuiltinType(BuiltinID, Error, &ICEArguments);
19373  assert(Error == ASTContext::GE_None && "Should not codegen an error");
19374  for (unsigned i = 0, e = E->getNumArgs(); i != e; ++i) {
19375    assert((ICEArguments & (1 << i)) == 0);
19376    auto *ArgValue = CGF.EmitScalarExpr(E->getArg(i));
19377    auto *PTy = FTy->getParamType(i);
19378    if (PTy != ArgValue->getType())
19379      ArgValue = CGF.Builder.CreateBitCast(ArgValue, PTy);
19380    Args.push_back(ArgValue);
19381  }
19382
19383  return CGF.Builder.CreateCall(F, Args);
19384}
19385} // namespace
19386
19387Value *CodeGenFunction::EmitNVPTXBuiltinExpr(unsigned BuiltinID,
19388                                             const CallExpr *E) {
19389  switch (BuiltinID) {
19390  case NVPTX::BI__nvvm_atom_add_gen_i:
19391  case NVPTX::BI__nvvm_atom_add_gen_l:
19392  case NVPTX::BI__nvvm_atom_add_gen_ll:
19393    return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::Add, E);
19394
19395  case NVPTX::BI__nvvm_atom_sub_gen_i:
19396  case NVPTX::BI__nvvm_atom_sub_gen_l:
19397  case NVPTX::BI__nvvm_atom_sub_gen_ll:
19398    return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::Sub, E);
19399
19400  case NVPTX::BI__nvvm_atom_and_gen_i:
19401  case NVPTX::BI__nvvm_atom_and_gen_l:
19402  case NVPTX::BI__nvvm_atom_and_gen_ll:
19403    return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::And, E);
19404
19405  case NVPTX::BI__nvvm_atom_or_gen_i:
19406  case NVPTX::BI__nvvm_atom_or_gen_l:
19407  case NVPTX::BI__nvvm_atom_or_gen_ll:
19408    return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::Or, E);
19409
19410  case NVPTX::BI__nvvm_atom_xor_gen_i:
19411  case NVPTX::BI__nvvm_atom_xor_gen_l:
19412  case NVPTX::BI__nvvm_atom_xor_gen_ll:
19413    return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::Xor, E);
19414
19415  case NVPTX::BI__nvvm_atom_xchg_gen_i:
19416  case NVPTX::BI__nvvm_atom_xchg_gen_l:
19417  case NVPTX::BI__nvvm_atom_xchg_gen_ll:
19418    return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::Xchg, E);
19419
19420  case NVPTX::BI__nvvm_atom_max_gen_i:
19421  case NVPTX::BI__nvvm_atom_max_gen_l:
19422  case NVPTX::BI__nvvm_atom_max_gen_ll:
19423    return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::Max, E);
19424
19425  case NVPTX::BI__nvvm_atom_max_gen_ui:
19426  case NVPTX::BI__nvvm_atom_max_gen_ul:
19427  case NVPTX::BI__nvvm_atom_max_gen_ull:
19428    return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::UMax, E);
19429
19430  case NVPTX::BI__nvvm_atom_min_gen_i:
19431  case NVPTX::BI__nvvm_atom_min_gen_l:
19432  case NVPTX::BI__nvvm_atom_min_gen_ll:
19433    return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::Min, E);
19434
19435  case NVPTX::BI__nvvm_atom_min_gen_ui:
19436  case NVPTX::BI__nvvm_atom_min_gen_ul:
19437  case NVPTX::BI__nvvm_atom_min_gen_ull:
19438    return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::UMin, E);
19439
19440  case NVPTX::BI__nvvm_atom_cas_gen_i:
19441  case NVPTX::BI__nvvm_atom_cas_gen_l:
19442  case NVPTX::BI__nvvm_atom_cas_gen_ll:
19443    // __nvvm_atom_cas_gen_* should return the old value rather than the
19444    // success flag.
19445    return MakeAtomicCmpXchgValue(*this, E, /*ReturnBool=*/false);
19446
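  // Floating-point add uses the target-independent atomicrmw fadd with
  // sequentially consistent ordering.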
19447  case NVPTX::BI__nvvm_atom_add_gen_f:
19448  case NVPTX::BI__nvvm_atom_add_gen_d: {
19449    Address DestAddr = EmitPointerWithAlignment(E->getArg(0));
19450    Value *Val = EmitScalarExpr(E->getArg(1));
19451
19452    return Builder.CreateAtomicRMW(llvm::AtomicRMWInst::FAdd, DestAddr, Val,
19453                                   AtomicOrdering::SequentiallyConsistent);
19454  }
19455
19456  case NVPTX::BI__nvvm_atom_inc_gen_ui: {
19457    Value *Ptr = EmitScalarExpr(E->getArg(0));
19458    Value *Val = EmitScalarExpr(E->getArg(1));
19459    Function *FnALI32 =
19460        CGM.getIntrinsic(Intrinsic::nvvm_atomic_load_inc_32, Ptr->getType());
19461    return Builder.CreateCall(FnALI32, {Ptr, Val});
19462  }
19463
19464  case NVPTX::BI__nvvm_atom_dec_gen_ui: {
19465    Value *Ptr = EmitScalarExpr(E->getArg(0));
19466    Value *Val = EmitScalarExpr(E->getArg(1));
19467    Function *FnALD32 =
19468        CGM.getIntrinsic(Intrinsic::nvvm_atomic_load_dec_32, Ptr->getType());
19469    return Builder.CreateCall(FnALD32, {Ptr, Val});
19470  }
19471
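  // All integer-typed ldg variants funnel into MakeLdgLdu with the
  // nvvm_ldg_global_i intrinsic; the floating-point variants below use
  // nvvm_ldg_global_f, and the ldu builtins follow the same pattern.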
19472  case NVPTX::BI__nvvm_ldg_c:
19473  case NVPTX::BI__nvvm_ldg_sc:
19474  case NVPTX::BI__nvvm_ldg_c2:
19475  case NVPTX::BI__nvvm_ldg_sc2:
19476  case NVPTX::BI__nvvm_ldg_c4:
19477  case NVPTX::BI__nvvm_ldg_sc4:
19478  case NVPTX::BI__nvvm_ldg_s:
19479  case NVPTX::BI__nvvm_ldg_s2:
19480  case NVPTX::BI__nvvm_ldg_s4:
19481  case NVPTX::BI__nvvm_ldg_i:
19482  case NVPTX::BI__nvvm_ldg_i2:
19483  case NVPTX::BI__nvvm_ldg_i4:
19484  case NVPTX::BI__nvvm_ldg_l:
19485  case NVPTX::BI__nvvm_ldg_l2:
19486  case NVPTX::BI__nvvm_ldg_ll:
19487  case NVPTX::BI__nvvm_ldg_ll2:
19488  case NVPTX::BI__nvvm_ldg_uc:
19489  case NVPTX::BI__nvvm_ldg_uc2:
19490  case NVPTX::BI__nvvm_ldg_uc4:
19491  case NVPTX::BI__nvvm_ldg_us:
19492  case NVPTX::BI__nvvm_ldg_us2:
19493  case NVPTX::BI__nvvm_ldg_us4:
19494  case NVPTX::BI__nvvm_ldg_ui:
19495  case NVPTX::BI__nvvm_ldg_ui2:
19496  case NVPTX::BI__nvvm_ldg_ui4:
19497  case NVPTX::BI__nvvm_ldg_ul:
19498  case NVPTX::BI__nvvm_ldg_ul2:
19499  case NVPTX::BI__nvvm_ldg_ull:
19500  case NVPTX::BI__nvvm_ldg_ull2:
19501    // PTX Interoperability section 2.2: "For a vector with an even number of
19502    // elements, its alignment is set to number of elements times the alignment
19503    // of its member: n*alignof(t)."
19504    return MakeLdgLdu(Intrinsic::nvvm_ldg_global_i, *this, E);
19505  case NVPTX::BI__nvvm_ldg_f:
19506  case NVPTX::BI__nvvm_ldg_f2:
19507  case NVPTX::BI__nvvm_ldg_f4:
19508  case NVPTX::BI__nvvm_ldg_d:
19509  case NVPTX::BI__nvvm_ldg_d2:
19510    return MakeLdgLdu(Intrinsic::nvvm_ldg_global_f, *this, E);
19511
19512  case NVPTX::BI__nvvm_ldu_c:
19513  case NVPTX::BI__nvvm_ldu_sc:
19514  case NVPTX::BI__nvvm_ldu_c2:
19515  case NVPTX::BI__nvvm_ldu_sc2:
19516  case NVPTX::BI__nvvm_ldu_c4:
19517  case NVPTX::BI__nvvm_ldu_sc4:
19518  case NVPTX::BI__nvvm_ldu_s:
19519  case NVPTX::BI__nvvm_ldu_s2:
19520  case NVPTX::BI__nvvm_ldu_s4:
19521  case NVPTX::BI__nvvm_ldu_i:
19522  case NVPTX::BI__nvvm_ldu_i2:
19523  case NVPTX::BI__nvvm_ldu_i4:
19524  case NVPTX::BI__nvvm_ldu_l:
19525  case NVPTX::BI__nvvm_ldu_l2:
19526  case NVPTX::BI__nvvm_ldu_ll:
19527  case NVPTX::BI__nvvm_ldu_ll2:
19528  case NVPTX::BI__nvvm_ldu_uc:
19529  case NVPTX::BI__nvvm_ldu_uc2:
19530  case NVPTX::BI__nvvm_ldu_uc4:
19531  case NVPTX::BI__nvvm_ldu_us:
19532  case NVPTX::BI__nvvm_ldu_us2:
19533  case NVPTX::BI__nvvm_ldu_us4:
19534  case NVPTX::BI__nvvm_ldu_ui:
19535  case NVPTX::BI__nvvm_ldu_ui2:
19536  case NVPTX::BI__nvvm_ldu_ui4:
19537  case NVPTX::BI__nvvm_ldu_ul:
19538  case NVPTX::BI__nvvm_ldu_ul2:
19539  case NVPTX::BI__nvvm_ldu_ull:
19540  case NVPTX::BI__nvvm_ldu_ull2:
19541    return MakeLdgLdu(Intrinsic::nvvm_ldu_global_i, *this, E);
19542  case NVPTX::BI__nvvm_ldu_f:
19543  case NVPTX::BI__nvvm_ldu_f2:
19544  case NVPTX::BI__nvvm_ldu_f4:
19545  case NVPTX::BI__nvvm_ldu_d:
19546  case NVPTX::BI__nvvm_ldu_d2:
19547    return MakeLdgLdu(Intrinsic::nvvm_ldu_global_f, *this, E);
19548
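  // CTA- and system-scoped atomics are emitted through the NVVM scoped-atomic
  // intrinsics via MakeScopedAtomic rather than atomicrmw.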
19549  case NVPTX::BI__nvvm_atom_cta_add_gen_i:
19550  case NVPTX::BI__nvvm_atom_cta_add_gen_l:
19551  case NVPTX::BI__nvvm_atom_cta_add_gen_ll:
19552    return MakeScopedAtomic(Intrinsic::nvvm_atomic_add_gen_i_cta, *this, E);
19553  case NVPTX::BI__nvvm_atom_sys_add_gen_i:
19554  case NVPTX::BI__nvvm_atom_sys_add_gen_l:
19555  case NVPTX::BI__nvvm_atom_sys_add_gen_ll:
19556    return MakeScopedAtomic(Intrinsic::nvvm_atomic_add_gen_i_sys, *this, E);
19557  case NVPTX::BI__nvvm_atom_cta_add_gen_f:
19558  case NVPTX::BI__nvvm_atom_cta_add_gen_d:
19559    return MakeScopedAtomic(Intrinsic::nvvm_atomic_add_gen_f_cta, *this, E);
19560  case NVPTX::BI__nvvm_atom_sys_add_gen_f:
19561  case NVPTX::BI__nvvm_atom_sys_add_gen_d:
19562    return MakeScopedAtomic(Intrinsic::nvvm_atomic_add_gen_f_sys, *this, E);
19563  case NVPTX::BI__nvvm_atom_cta_xchg_gen_i:
19564  case NVPTX::BI__nvvm_atom_cta_xchg_gen_l:
19565  case NVPTX::BI__nvvm_atom_cta_xchg_gen_ll:
19566    return MakeScopedAtomic(Intrinsic::nvvm_atomic_exch_gen_i_cta, *this, E);
19567  case NVPTX::BI__nvvm_atom_sys_xchg_gen_i:
19568  case NVPTX::BI__nvvm_atom_sys_xchg_gen_l:
19569  case NVPTX::BI__nvvm_atom_sys_xchg_gen_ll:
19570    return MakeScopedAtomic(Intrinsic::nvvm_atomic_exch_gen_i_sys, *this, E);
19571  case NVPTX::BI__nvvm_atom_cta_max_gen_i:
19572  case NVPTX::BI__nvvm_atom_cta_max_gen_ui:
19573  case NVPTX::BI__nvvm_atom_cta_max_gen_l:
19574  case NVPTX::BI__nvvm_atom_cta_max_gen_ul:
19575  case NVPTX::BI__nvvm_atom_cta_max_gen_ll:
19576  case NVPTX::BI__nvvm_atom_cta_max_gen_ull:
19577    return MakeScopedAtomic(Intrinsic::nvvm_atomic_max_gen_i_cta, *this, E);
19578  case NVPTX::BI__nvvm_atom_sys_max_gen_i:
19579  case NVPTX::BI__nvvm_atom_sys_max_gen_ui:
19580  case NVPTX::BI__nvvm_atom_sys_max_gen_l:
19581  case NVPTX::BI__nvvm_atom_sys_max_gen_ul:
19582  case NVPTX::BI__nvvm_atom_sys_max_gen_ll:
19583  case NVPTX::BI__nvvm_atom_sys_max_gen_ull:
19584    return MakeScopedAtomic(Intrinsic::nvvm_atomic_max_gen_i_sys, *this, E);
19585  case NVPTX::BI__nvvm_atom_cta_min_gen_i:
19586  case NVPTX::BI__nvvm_atom_cta_min_gen_ui:
19587  case NVPTX::BI__nvvm_atom_cta_min_gen_l:
19588  case NVPTX::BI__nvvm_atom_cta_min_gen_ul:
19589  case NVPTX::BI__nvvm_atom_cta_min_gen_ll:
19590  case NVPTX::BI__nvvm_atom_cta_min_gen_ull:
19591    return MakeScopedAtomic(Intrinsic::nvvm_atomic_min_gen_i_cta, *this, E);
19592  case NVPTX::BI__nvvm_atom_sys_min_gen_i:
19593  case NVPTX::BI__nvvm_atom_sys_min_gen_ui:
19594  case NVPTX::BI__nvvm_atom_sys_min_gen_l:
19595  case NVPTX::BI__nvvm_atom_sys_min_gen_ul:
19596  case NVPTX::BI__nvvm_atom_sys_min_gen_ll:
19597  case NVPTX::BI__nvvm_atom_sys_min_gen_ull:
19598    return MakeScopedAtomic(Intrinsic::nvvm_atomic_min_gen_i_sys, *this, E);
19599  case NVPTX::BI__nvvm_atom_cta_inc_gen_ui:
19600    return MakeScopedAtomic(Intrinsic::nvvm_atomic_inc_gen_i_cta, *this, E);
19601  case NVPTX::BI__nvvm_atom_cta_dec_gen_ui:
19602    return MakeScopedAtomic(Intrinsic::nvvm_atomic_dec_gen_i_cta, *this, E);
19603  case NVPTX::BI__nvvm_atom_sys_inc_gen_ui:
19604    return MakeScopedAtomic(Intrinsic::nvvm_atomic_inc_gen_i_sys, *this, E);
19605  case NVPTX::BI__nvvm_atom_sys_dec_gen_ui:
19606    return MakeScopedAtomic(Intrinsic::nvvm_atomic_dec_gen_i_sys, *this, E);
19607  case NVPTX::BI__nvvm_atom_cta_and_gen_i:
19608  case NVPTX::BI__nvvm_atom_cta_and_gen_l:
19609  case NVPTX::BI__nvvm_atom_cta_and_gen_ll:
19610    return MakeScopedAtomic(Intrinsic::nvvm_atomic_and_gen_i_cta, *this, E);
19611  case NVPTX::BI__nvvm_atom_sys_and_gen_i:
19612  case NVPTX::BI__nvvm_atom_sys_and_gen_l:
19613  case NVPTX::BI__nvvm_atom_sys_and_gen_ll:
19614    return MakeScopedAtomic(Intrinsic::nvvm_atomic_and_gen_i_sys, *this, E);
19615  case NVPTX::BI__nvvm_atom_cta_or_gen_i:
19616  case NVPTX::BI__nvvm_atom_cta_or_gen_l:
19617  case NVPTX::BI__nvvm_atom_cta_or_gen_ll:
19618    return MakeScopedAtomic(Intrinsic::nvvm_atomic_or_gen_i_cta, *this, E);
19619  case NVPTX::BI__nvvm_atom_sys_or_gen_i:
19620  case NVPTX::BI__nvvm_atom_sys_or_gen_l:
19621  case NVPTX::BI__nvvm_atom_sys_or_gen_ll:
19622    return MakeScopedAtomic(Intrinsic::nvvm_atomic_or_gen_i_sys, *this, E);
19623  case NVPTX::BI__nvvm_atom_cta_xor_gen_i:
19624  case NVPTX::BI__nvvm_atom_cta_xor_gen_l:
19625  case NVPTX::BI__nvvm_atom_cta_xor_gen_ll:
19626    return MakeScopedAtomic(Intrinsic::nvvm_atomic_xor_gen_i_cta, *this, E);
19627  case NVPTX::BI__nvvm_atom_sys_xor_gen_i:
19628  case NVPTX::BI__nvvm_atom_sys_xor_gen_l:
19629  case NVPTX::BI__nvvm_atom_sys_xor_gen_ll:
19630    return MakeScopedAtomic(Intrinsic::nvvm_atomic_xor_gen_i_sys, *this, E);
19631  case NVPTX::BI__nvvm_atom_cta_cas_gen_i:
19632  case NVPTX::BI__nvvm_atom_cta_cas_gen_l:
19633  case NVPTX::BI__nvvm_atom_cta_cas_gen_ll: {
19634    Value *Ptr = EmitScalarExpr(E->getArg(0));
19635    llvm::Type *ElemTy =
19636        ConvertTypeForMem(E->getArg(0)->getType()->getPointeeType());
19637    return Builder.CreateCall(
19638        CGM.getIntrinsic(
19639            Intrinsic::nvvm_atomic_cas_gen_i_cta, {ElemTy, Ptr->getType()}),
19640        {Ptr, EmitScalarExpr(E->getArg(1)), EmitScalarExpr(E->getArg(2))});
19641  }
19642  case NVPTX::BI__nvvm_atom_sys_cas_gen_i:
19643  case NVPTX::BI__nvvm_atom_sys_cas_gen_l:
19644  case NVPTX::BI__nvvm_atom_sys_cas_gen_ll: {
19645    Value *Ptr = EmitScalarExpr(E->getArg(0));
19646    llvm::Type *ElemTy =
19647        ConvertTypeForMem(E->getArg(0)->getType()->getPointeeType());
19648    return Builder.CreateCall(
19649        CGM.getIntrinsic(
19650            Intrinsic::nvvm_atomic_cas_gen_i_sys, {ElemTy, Ptr->getType()}),
19651        {Ptr, EmitScalarExpr(E->getArg(1)), EmitScalarExpr(E->getArg(2))});
19652  }
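  // match.all returns a {value, predicate} pair: store the predicate through
  // the pointer argument and return the matched value.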
19653  case NVPTX::BI__nvvm_match_all_sync_i32p:
19654  case NVPTX::BI__nvvm_match_all_sync_i64p: {
19655    Value *Mask = EmitScalarExpr(E->getArg(0));
19656    Value *Val = EmitScalarExpr(E->getArg(1));
19657    Address PredOutPtr = EmitPointerWithAlignment(E->getArg(2));
19658    Value *ResultPair = Builder.CreateCall(
19659        CGM.getIntrinsic(BuiltinID == NVPTX::BI__nvvm_match_all_sync_i32p
19660                             ? Intrinsic::nvvm_match_all_sync_i32p
19661                             : Intrinsic::nvvm_match_all_sync_i64p),
19662        {Mask, Val});
19663    Value *Pred = Builder.CreateZExt(Builder.CreateExtractValue(ResultPair, 1),
19664                                     PredOutPtr.getElementType());
19665    Builder.CreateStore(Pred, PredOutPtr);
19666    return Builder.CreateExtractValue(ResultPair, 0);
19667  }
19668
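  // The MMA/WMMA fragment loads below share one code path: getNVPTXMmaLdstInfo
  // selects the row- or column-major intrinsic and the number of result
  // registers, which are then stored out to the destination fragment.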
19669  // FP MMA loads
19670  case NVPTX::BI__hmma_m16n16k16_ld_a:
19671  case NVPTX::BI__hmma_m16n16k16_ld_b:
19672  case NVPTX::BI__hmma_m16n16k16_ld_c_f16:
19673  case NVPTX::BI__hmma_m16n16k16_ld_c_f32:
19674  case NVPTX::BI__hmma_m32n8k16_ld_a:
19675  case NVPTX::BI__hmma_m32n8k16_ld_b:
19676  case NVPTX::BI__hmma_m32n8k16_ld_c_f16:
19677  case NVPTX::BI__hmma_m32n8k16_ld_c_f32:
19678  case NVPTX::BI__hmma_m8n32k16_ld_a:
19679  case NVPTX::BI__hmma_m8n32k16_ld_b:
19680  case NVPTX::BI__hmma_m8n32k16_ld_c_f16:
19681  case NVPTX::BI__hmma_m8n32k16_ld_c_f32:
19682  // Integer MMA loads.
19683  case NVPTX::BI__imma_m16n16k16_ld_a_s8:
19684  case NVPTX::BI__imma_m16n16k16_ld_a_u8:
19685  case NVPTX::BI__imma_m16n16k16_ld_b_s8:
19686  case NVPTX::BI__imma_m16n16k16_ld_b_u8:
19687  case NVPTX::BI__imma_m16n16k16_ld_c:
19688  case NVPTX::BI__imma_m32n8k16_ld_a_s8:
19689  case NVPTX::BI__imma_m32n8k16_ld_a_u8:
19690  case NVPTX::BI__imma_m32n8k16_ld_b_s8:
19691  case NVPTX::BI__imma_m32n8k16_ld_b_u8:
19692  case NVPTX::BI__imma_m32n8k16_ld_c:
19693  case NVPTX::BI__imma_m8n32k16_ld_a_s8:
19694  case NVPTX::BI__imma_m8n32k16_ld_a_u8:
19695  case NVPTX::BI__imma_m8n32k16_ld_b_s8:
19696  case NVPTX::BI__imma_m8n32k16_ld_b_u8:
19697  case NVPTX::BI__imma_m8n32k16_ld_c:
19698  // Sub-integer MMA loads.
19699  case NVPTX::BI__imma_m8n8k32_ld_a_s4:
19700  case NVPTX::BI__imma_m8n8k32_ld_a_u4:
19701  case NVPTX::BI__imma_m8n8k32_ld_b_s4:
19702  case NVPTX::BI__imma_m8n8k32_ld_b_u4:
19703  case NVPTX::BI__imma_m8n8k32_ld_c:
19704  case NVPTX::BI__bmma_m8n8k128_ld_a_b1:
19705  case NVPTX::BI__bmma_m8n8k128_ld_b_b1:
19706  case NVPTX::BI__bmma_m8n8k128_ld_c:
19707  // Double MMA loads.
19708  case NVPTX::BI__dmma_m8n8k4_ld_a:
19709  case NVPTX::BI__dmma_m8n8k4_ld_b:
19710  case NVPTX::BI__dmma_m8n8k4_ld_c:
19711  // Alternate float MMA loads.
19712  case NVPTX::BI__mma_bf16_m16n16k16_ld_a:
19713  case NVPTX::BI__mma_bf16_m16n16k16_ld_b:
19714  case NVPTX::BI__mma_bf16_m8n32k16_ld_a:
19715  case NVPTX::BI__mma_bf16_m8n32k16_ld_b:
19716  case NVPTX::BI__mma_bf16_m32n8k16_ld_a:
19717  case NVPTX::BI__mma_bf16_m32n8k16_ld_b:
19718  case NVPTX::BI__mma_tf32_m16n16k8_ld_a:
19719  case NVPTX::BI__mma_tf32_m16n16k8_ld_b:
19720  case NVPTX::BI__mma_tf32_m16n16k8_ld_c: {
19721    Address Dst = EmitPointerWithAlignment(E->getArg(0));
19722    Value *Src = EmitScalarExpr(E->getArg(1));
19723    Value *Ldm = EmitScalarExpr(E->getArg(2));
19724    std::optional<llvm::APSInt> isColMajorArg =
19725        E->getArg(3)->getIntegerConstantExpr(getContext());
19726    if (!isColMajorArg)
19727      return nullptr;
19728    bool isColMajor = isColMajorArg->getSExtValue();
19729    NVPTXMmaLdstInfo II = getNVPTXMmaLdstInfo(BuiltinID);
19730    unsigned IID = isColMajor ? II.IID_col : II.IID_row;
19731    if (IID == 0)
19732      return nullptr;
19733
19734    Value *Result =
19735        Builder.CreateCall(CGM.getIntrinsic(IID, Src->getType()), {Src, Ldm});
19736
19737    // Save returned values.
19738    assert(II.NumResults);
19739    if (II.NumResults == 1) {
19740      Builder.CreateAlignedStore(Result, Dst.getPointer(),
19741                                 CharUnits::fromQuantity(4));
19742    } else {
19743      for (unsigned i = 0; i < II.NumResults; ++i) {
19744        Builder.CreateAlignedStore(
19745            Builder.CreateBitCast(Builder.CreateExtractValue(Result, i),
19746                                  Dst.getElementType()),
19747            Builder.CreateGEP(Dst.getElementType(), Dst.getPointer(),
19748                              llvm::ConstantInt::get(IntTy, i)),
19749            CharUnits::fromQuantity(4));
19750      }
19751    }
19752    return Result;
19753  }
19754
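  // MMA fragment stores: read the fragment's registers from the source,
  // bitcast them to the intrinsic's parameter type, and pass them along with
  // the destination pointer and the leading dimension.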
19755  case NVPTX::BI__hmma_m16n16k16_st_c_f16:
19756  case NVPTX::BI__hmma_m16n16k16_st_c_f32:
19757  case NVPTX::BI__hmma_m32n8k16_st_c_f16:
19758  case NVPTX::BI__hmma_m32n8k16_st_c_f32:
19759  case NVPTX::BI__hmma_m8n32k16_st_c_f16:
19760  case NVPTX::BI__hmma_m8n32k16_st_c_f32:
19761  case NVPTX::BI__imma_m16n16k16_st_c_i32:
19762  case NVPTX::BI__imma_m32n8k16_st_c_i32:
19763  case NVPTX::BI__imma_m8n32k16_st_c_i32:
19764  case NVPTX::BI__imma_m8n8k32_st_c_i32:
19765  case NVPTX::BI__bmma_m8n8k128_st_c_i32:
19766  case NVPTX::BI__dmma_m8n8k4_st_c_f64:
19767  case NVPTX::BI__mma_m16n16k8_st_c_f32: {
19768    Value *Dst = EmitScalarExpr(E->getArg(0));
19769    Address Src = EmitPointerWithAlignment(E->getArg(1));
19770    Value *Ldm = EmitScalarExpr(E->getArg(2));
19771    std::optional<llvm::APSInt> isColMajorArg =
19772        E->getArg(3)->getIntegerConstantExpr(getContext());
19773    if (!isColMajorArg)
19774      return nullptr;
19775    bool isColMajor = isColMajorArg->getSExtValue();
19776    NVPTXMmaLdstInfo II = getNVPTXMmaLdstInfo(BuiltinID);
19777    unsigned IID = isColMajor ? II.IID_col : II.IID_row;
19778    if (IID == 0)
19779      return nullptr;
19780    Function *Intrinsic =
19781        CGM.getIntrinsic(IID, Dst->getType());
19782    llvm::Type *ParamType = Intrinsic->getFunctionType()->getParamType(1);
19783    SmallVector<Value *, 10> Values = {Dst};
19784    for (unsigned i = 0; i < II.NumResults; ++i) {
19785      Value *V = Builder.CreateAlignedLoad(
19786          Src.getElementType(),
19787          Builder.CreateGEP(Src.getElementType(), Src.getPointer(),
19788                            llvm::ConstantInt::get(IntTy, i)),
19789          CharUnits::fromQuantity(4));
19790      Values.push_back(Builder.CreateBitCast(V, ParamType));
19791    }
19792    Values.push_back(Ldm);
19793    Value *Result = Builder.CreateCall(Intrinsic, Values);
19794    return Result;
19795  }
19796
19797  // BI__hmma_m16n16k16_mma_<Dtype><CType>(d, a, b, c, layout, satf) -->
19798  // Intrinsic::nvvm_wmma_m16n16k16_mma_sync<layout A,B><DType><CType><Satf>
19799  case NVPTX::BI__hmma_m16n16k16_mma_f16f16:
19800  case NVPTX::BI__hmma_m16n16k16_mma_f32f16:
19801  case NVPTX::BI__hmma_m16n16k16_mma_f32f32:
19802  case NVPTX::BI__hmma_m16n16k16_mma_f16f32:
19803  case NVPTX::BI__hmma_m32n8k16_mma_f16f16:
19804  case NVPTX::BI__hmma_m32n8k16_mma_f32f16:
19805  case NVPTX::BI__hmma_m32n8k16_mma_f32f32:
19806  case NVPTX::BI__hmma_m32n8k16_mma_f16f32:
19807  case NVPTX::BI__hmma_m8n32k16_mma_f16f16:
19808  case NVPTX::BI__hmma_m8n32k16_mma_f32f16:
19809  case NVPTX::BI__hmma_m8n32k16_mma_f32f32:
19810  case NVPTX::BI__hmma_m8n32k16_mma_f16f32:
19811  case NVPTX::BI__imma_m16n16k16_mma_s8:
19812  case NVPTX::BI__imma_m16n16k16_mma_u8:
19813  case NVPTX::BI__imma_m32n8k16_mma_s8:
19814  case NVPTX::BI__imma_m32n8k16_mma_u8:
19815  case NVPTX::BI__imma_m8n32k16_mma_s8:
19816  case NVPTX::BI__imma_m8n32k16_mma_u8:
19817  case NVPTX::BI__imma_m8n8k32_mma_s4:
19818  case NVPTX::BI__imma_m8n8k32_mma_u4:
19819  case NVPTX::BI__bmma_m8n8k128_mma_xor_popc_b1:
19820  case NVPTX::BI__bmma_m8n8k128_mma_and_popc_b1:
19821  case NVPTX::BI__dmma_m8n8k4_mma_f64:
19822  case NVPTX::BI__mma_bf16_m16n16k16_mma_f32:
19823  case NVPTX::BI__mma_bf16_m8n32k16_mma_f32:
19824  case NVPTX::BI__mma_bf16_m32n8k16_mma_f32:
19825  case NVPTX::BI__mma_tf32_m16n16k8_mma_f32: {
19826    Address Dst = EmitPointerWithAlignment(E->getArg(0));
19827    Address SrcA = EmitPointerWithAlignment(E->getArg(1));
19828    Address SrcB = EmitPointerWithAlignment(E->getArg(2));
19829    Address SrcC = EmitPointerWithAlignment(E->getArg(3));
19830    std::optional<llvm::APSInt> LayoutArg =
19831        E->getArg(4)->getIntegerConstantExpr(getContext());
19832    if (!LayoutArg)
19833      return nullptr;
19834    int Layout = LayoutArg->getSExtValue();
19835    if (Layout < 0 || Layout > 3)
19836      return nullptr;
19837    llvm::APSInt SatfArg;
19838    if (BuiltinID == NVPTX::BI__bmma_m8n8k128_mma_xor_popc_b1 ||
19839        BuiltinID == NVPTX::BI__bmma_m8n8k128_mma_and_popc_b1)
      SatfArg = 0; // The .b1 variants do not take a satf argument.
19841    else if (std::optional<llvm::APSInt> OptSatfArg =
19842                 E->getArg(5)->getIntegerConstantExpr(getContext()))
19843      SatfArg = *OptSatfArg;
19844    else
19845      return nullptr;
19846    bool Satf = SatfArg.getSExtValue();
19847    NVPTXMmaInfo MI = getNVPTXMmaInfo(BuiltinID);
19848    unsigned IID = MI.getMMAIntrinsic(Layout, Satf);
19849    if (IID == 0)  // Unsupported combination of Layout/Satf.
19850      return nullptr;
19851
19852    SmallVector<Value *, 24> Values;
19853    Function *Intrinsic = CGM.getIntrinsic(IID);
19854    llvm::Type *AType = Intrinsic->getFunctionType()->getParamType(0);
19855    // Load A
19856    for (unsigned i = 0; i < MI.NumEltsA; ++i) {
19857      Value *V = Builder.CreateAlignedLoad(
19858          SrcA.getElementType(),
19859          Builder.CreateGEP(SrcA.getElementType(), SrcA.getPointer(),
19860                            llvm::ConstantInt::get(IntTy, i)),
19861          CharUnits::fromQuantity(4));
19862      Values.push_back(Builder.CreateBitCast(V, AType));
19863    }
19864    // Load B
19865    llvm::Type *BType = Intrinsic->getFunctionType()->getParamType(MI.NumEltsA);
19866    for (unsigned i = 0; i < MI.NumEltsB; ++i) {
19867      Value *V = Builder.CreateAlignedLoad(
19868          SrcB.getElementType(),
19869          Builder.CreateGEP(SrcB.getElementType(), SrcB.getPointer(),
19870                            llvm::ConstantInt::get(IntTy, i)),
19871          CharUnits::fromQuantity(4));
19872      Values.push_back(Builder.CreateBitCast(V, BType));
19873    }
19874    // Load C
19875    llvm::Type *CType =
19876        Intrinsic->getFunctionType()->getParamType(MI.NumEltsA + MI.NumEltsB);
19877    for (unsigned i = 0; i < MI.NumEltsC; ++i) {
19878      Value *V = Builder.CreateAlignedLoad(
19879          SrcC.getElementType(),
19880          Builder.CreateGEP(SrcC.getElementType(), SrcC.getPointer(),
19881                            llvm::ConstantInt::get(IntTy, i)),
19882          CharUnits::fromQuantity(4));
19883      Values.push_back(Builder.CreateBitCast(V, CType));
19884    }
19885    Value *Result = Builder.CreateCall(Intrinsic, Values);
19886    llvm::Type *DType = Dst.getElementType();
19887    for (unsigned i = 0; i < MI.NumEltsD; ++i)
19888      Builder.CreateAlignedStore(
19889          Builder.CreateBitCast(Builder.CreateExtractValue(Result, i), DType),
19890          Builder.CreateGEP(Dst.getElementType(), Dst.getPointer(),
19891                            llvm::ConstantInt::get(IntTy, i)),
19892          CharUnits::fromQuantity(4));
19893    return Result;
19894  }
  // The following builtins require native half type support; MakeHalfType
  // reports an error when the target cannot provide it.
19896  case NVPTX::BI__nvvm_ex2_approx_f16:
19897    return MakeHalfType(Intrinsic::nvvm_ex2_approx_f16, BuiltinID, E, *this);
19898  case NVPTX::BI__nvvm_ex2_approx_f16x2:
19899    return MakeHalfType(Intrinsic::nvvm_ex2_approx_f16x2, BuiltinID, E, *this);
19900  case NVPTX::BI__nvvm_ff2f16x2_rn:
19901    return MakeHalfType(Intrinsic::nvvm_ff2f16x2_rn, BuiltinID, E, *this);
19902  case NVPTX::BI__nvvm_ff2f16x2_rn_relu:
19903    return MakeHalfType(Intrinsic::nvvm_ff2f16x2_rn_relu, BuiltinID, E, *this);
19904  case NVPTX::BI__nvvm_ff2f16x2_rz:
19905    return MakeHalfType(Intrinsic::nvvm_ff2f16x2_rz, BuiltinID, E, *this);
19906  case NVPTX::BI__nvvm_ff2f16x2_rz_relu:
19907    return MakeHalfType(Intrinsic::nvvm_ff2f16x2_rz_relu, BuiltinID, E, *this);
19908  case NVPTX::BI__nvvm_fma_rn_f16:
19909    return MakeHalfType(Intrinsic::nvvm_fma_rn_f16, BuiltinID, E, *this);
19910  case NVPTX::BI__nvvm_fma_rn_f16x2:
19911    return MakeHalfType(Intrinsic::nvvm_fma_rn_f16x2, BuiltinID, E, *this);
19912  case NVPTX::BI__nvvm_fma_rn_ftz_f16:
19913    return MakeHalfType(Intrinsic::nvvm_fma_rn_ftz_f16, BuiltinID, E, *this);
19914  case NVPTX::BI__nvvm_fma_rn_ftz_f16x2:
19915    return MakeHalfType(Intrinsic::nvvm_fma_rn_ftz_f16x2, BuiltinID, E, *this);
19916  case NVPTX::BI__nvvm_fma_rn_ftz_relu_f16:
19917    return MakeHalfType(Intrinsic::nvvm_fma_rn_ftz_relu_f16, BuiltinID, E,
19918                        *this);
19919  case NVPTX::BI__nvvm_fma_rn_ftz_relu_f16x2:
19920    return MakeHalfType(Intrinsic::nvvm_fma_rn_ftz_relu_f16x2, BuiltinID, E,
19921                        *this);
19922  case NVPTX::BI__nvvm_fma_rn_ftz_sat_f16:
19923    return MakeHalfType(Intrinsic::nvvm_fma_rn_ftz_sat_f16, BuiltinID, E,
19924                        *this);
19925  case NVPTX::BI__nvvm_fma_rn_ftz_sat_f16x2:
19926    return MakeHalfType(Intrinsic::nvvm_fma_rn_ftz_sat_f16x2, BuiltinID, E,
19927                        *this);
19928  case NVPTX::BI__nvvm_fma_rn_relu_f16:
19929    return MakeHalfType(Intrinsic::nvvm_fma_rn_relu_f16, BuiltinID, E, *this);
19930  case NVPTX::BI__nvvm_fma_rn_relu_f16x2:
19931    return MakeHalfType(Intrinsic::nvvm_fma_rn_relu_f16x2, BuiltinID, E, *this);
19932  case NVPTX::BI__nvvm_fma_rn_sat_f16:
19933    return MakeHalfType(Intrinsic::nvvm_fma_rn_sat_f16, BuiltinID, E, *this);
19934  case NVPTX::BI__nvvm_fma_rn_sat_f16x2:
19935    return MakeHalfType(Intrinsic::nvvm_fma_rn_sat_f16x2, BuiltinID, E, *this);
19936  case NVPTX::BI__nvvm_fmax_f16:
19937    return MakeHalfType(Intrinsic::nvvm_fmax_f16, BuiltinID, E, *this);
19938  case NVPTX::BI__nvvm_fmax_f16x2:
19939    return MakeHalfType(Intrinsic::nvvm_fmax_f16x2, BuiltinID, E, *this);
19940  case NVPTX::BI__nvvm_fmax_ftz_f16:
19941    return MakeHalfType(Intrinsic::nvvm_fmax_ftz_f16, BuiltinID, E, *this);
19942  case NVPTX::BI__nvvm_fmax_ftz_f16x2:
19943    return MakeHalfType(Intrinsic::nvvm_fmax_ftz_f16x2, BuiltinID, E, *this);
19944  case NVPTX::BI__nvvm_fmax_ftz_nan_f16:
19945    return MakeHalfType(Intrinsic::nvvm_fmax_ftz_nan_f16, BuiltinID, E, *this);
19946  case NVPTX::BI__nvvm_fmax_ftz_nan_f16x2:
19947    return MakeHalfType(Intrinsic::nvvm_fmax_ftz_nan_f16x2, BuiltinID, E,
19948                        *this);
19949  case NVPTX::BI__nvvm_fmax_ftz_nan_xorsign_abs_f16:
19950    return MakeHalfType(Intrinsic::nvvm_fmax_ftz_nan_xorsign_abs_f16, BuiltinID,
19951                        E, *this);
19952  case NVPTX::BI__nvvm_fmax_ftz_nan_xorsign_abs_f16x2:
19953    return MakeHalfType(Intrinsic::nvvm_fmax_ftz_nan_xorsign_abs_f16x2,
19954                        BuiltinID, E, *this);
19955  case NVPTX::BI__nvvm_fmax_ftz_xorsign_abs_f16:
19956    return MakeHalfType(Intrinsic::nvvm_fmax_ftz_xorsign_abs_f16, BuiltinID, E,
19957                        *this);
19958  case NVPTX::BI__nvvm_fmax_ftz_xorsign_abs_f16x2:
19959    return MakeHalfType(Intrinsic::nvvm_fmax_ftz_xorsign_abs_f16x2, BuiltinID,
19960                        E, *this);
19961  case NVPTX::BI__nvvm_fmax_nan_f16:
19962    return MakeHalfType(Intrinsic::nvvm_fmax_nan_f16, BuiltinID, E, *this);
19963  case NVPTX::BI__nvvm_fmax_nan_f16x2:
19964    return MakeHalfType(Intrinsic::nvvm_fmax_nan_f16x2, BuiltinID, E, *this);
19965  case NVPTX::BI__nvvm_fmax_nan_xorsign_abs_f16:
19966    return MakeHalfType(Intrinsic::nvvm_fmax_nan_xorsign_abs_f16, BuiltinID, E,
19967                        *this);
19968  case NVPTX::BI__nvvm_fmax_nan_xorsign_abs_f16x2:
19969    return MakeHalfType(Intrinsic::nvvm_fmax_nan_xorsign_abs_f16x2, BuiltinID,
19970                        E, *this);
19971  case NVPTX::BI__nvvm_fmax_xorsign_abs_f16:
19972    return MakeHalfType(Intrinsic::nvvm_fmax_xorsign_abs_f16, BuiltinID, E,
19973                        *this);
19974  case NVPTX::BI__nvvm_fmax_xorsign_abs_f16x2:
19975    return MakeHalfType(Intrinsic::nvvm_fmax_xorsign_abs_f16x2, BuiltinID, E,
19976                        *this);
19977  case NVPTX::BI__nvvm_fmin_f16:
19978    return MakeHalfType(Intrinsic::nvvm_fmin_f16, BuiltinID, E, *this);
19979  case NVPTX::BI__nvvm_fmin_f16x2:
19980    return MakeHalfType(Intrinsic::nvvm_fmin_f16x2, BuiltinID, E, *this);
19981  case NVPTX::BI__nvvm_fmin_ftz_f16:
19982    return MakeHalfType(Intrinsic::nvvm_fmin_ftz_f16, BuiltinID, E, *this);
19983  case NVPTX::BI__nvvm_fmin_ftz_f16x2:
19984    return MakeHalfType(Intrinsic::nvvm_fmin_ftz_f16x2, BuiltinID, E, *this);
19985  case NVPTX::BI__nvvm_fmin_ftz_nan_f16:
19986    return MakeHalfType(Intrinsic::nvvm_fmin_ftz_nan_f16, BuiltinID, E, *this);
19987  case NVPTX::BI__nvvm_fmin_ftz_nan_f16x2:
19988    return MakeHalfType(Intrinsic::nvvm_fmin_ftz_nan_f16x2, BuiltinID, E,
19989                        *this);
19990  case NVPTX::BI__nvvm_fmin_ftz_nan_xorsign_abs_f16:
19991    return MakeHalfType(Intrinsic::nvvm_fmin_ftz_nan_xorsign_abs_f16, BuiltinID,
19992                        E, *this);
19993  case NVPTX::BI__nvvm_fmin_ftz_nan_xorsign_abs_f16x2:
19994    return MakeHalfType(Intrinsic::nvvm_fmin_ftz_nan_xorsign_abs_f16x2,
19995                        BuiltinID, E, *this);
19996  case NVPTX::BI__nvvm_fmin_ftz_xorsign_abs_f16:
19997    return MakeHalfType(Intrinsic::nvvm_fmin_ftz_xorsign_abs_f16, BuiltinID, E,
19998                        *this);
19999  case NVPTX::BI__nvvm_fmin_ftz_xorsign_abs_f16x2:
20000    return MakeHalfType(Intrinsic::nvvm_fmin_ftz_xorsign_abs_f16x2, BuiltinID,
20001                        E, *this);
20002  case NVPTX::BI__nvvm_fmin_nan_f16:
20003    return MakeHalfType(Intrinsic::nvvm_fmin_nan_f16, BuiltinID, E, *this);
20004  case NVPTX::BI__nvvm_fmin_nan_f16x2:
20005    return MakeHalfType(Intrinsic::nvvm_fmin_nan_f16x2, BuiltinID, E, *this);
20006  case NVPTX::BI__nvvm_fmin_nan_xorsign_abs_f16:
20007    return MakeHalfType(Intrinsic::nvvm_fmin_nan_xorsign_abs_f16, BuiltinID, E,
20008                        *this);
20009  case NVPTX::BI__nvvm_fmin_nan_xorsign_abs_f16x2:
20010    return MakeHalfType(Intrinsic::nvvm_fmin_nan_xorsign_abs_f16x2, BuiltinID,
20011                        E, *this);
20012  case NVPTX::BI__nvvm_fmin_xorsign_abs_f16:
20013    return MakeHalfType(Intrinsic::nvvm_fmin_xorsign_abs_f16, BuiltinID, E,
20014                        *this);
20015  case NVPTX::BI__nvvm_fmin_xorsign_abs_f16x2:
20016    return MakeHalfType(Intrinsic::nvvm_fmin_xorsign_abs_f16x2, BuiltinID, E,
20017                        *this);
  case NVPTX::BI__nvvm_ldg_h:
  case NVPTX::BI__nvvm_ldg_h2:
    return MakeHalfType(Intrinsic::nvvm_ldg_global_f, BuiltinID, E, *this);
  case NVPTX::BI__nvvm_ldu_h:
  case NVPTX::BI__nvvm_ldu_h2:
    return MakeHalfType(Intrinsic::nvvm_ldu_global_f, BuiltinID, E, *this);
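  // Asynchronous global->shared copies. MakeCpAsync picks between the plain
  // intrinsic and its "_s" variant (which takes an explicit source size),
  // presumably based on how many arguments the builtin call provides.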
20027  case NVPTX::BI__nvvm_cp_async_ca_shared_global_4:
20028    return MakeCpAsync(Intrinsic::nvvm_cp_async_ca_shared_global_4,
20029                       Intrinsic::nvvm_cp_async_ca_shared_global_4_s, *this, E,
20030                       4);
20031  case NVPTX::BI__nvvm_cp_async_ca_shared_global_8:
20032    return MakeCpAsync(Intrinsic::nvvm_cp_async_ca_shared_global_8,
20033                       Intrinsic::nvvm_cp_async_ca_shared_global_8_s, *this, E,
20034                       8);
20035  case NVPTX::BI__nvvm_cp_async_ca_shared_global_16:
20036    return MakeCpAsync(Intrinsic::nvvm_cp_async_ca_shared_global_16,
20037                       Intrinsic::nvvm_cp_async_ca_shared_global_16_s, *this, E,
20038                       16);
20039  case NVPTX::BI__nvvm_cp_async_cg_shared_global_16:
20040    return MakeCpAsync(Intrinsic::nvvm_cp_async_cg_shared_global_16,
20041                       Intrinsic::nvvm_cp_async_cg_shared_global_16_s, *this, E,
20042                       16);
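  // Cluster-related special registers map one-to-one onto the corresponding
  // read_ptx_sreg intrinsics.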
20043  case NVPTX::BI__nvvm_read_ptx_sreg_clusterid_x:
20044    return Builder.CreateCall(
20045        CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_clusterid_x));
20046  case NVPTX::BI__nvvm_read_ptx_sreg_clusterid_y:
20047    return Builder.CreateCall(
20048        CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_clusterid_y));
20049  case NVPTX::BI__nvvm_read_ptx_sreg_clusterid_z:
20050    return Builder.CreateCall(
20051        CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_clusterid_z));
20052  case NVPTX::BI__nvvm_read_ptx_sreg_clusterid_w:
20053    return Builder.CreateCall(
20054        CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_clusterid_w));
20055  case NVPTX::BI__nvvm_read_ptx_sreg_nclusterid_x:
20056    return Builder.CreateCall(
20057        CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_nclusterid_x));
20058  case NVPTX::BI__nvvm_read_ptx_sreg_nclusterid_y:
20059    return Builder.CreateCall(
20060        CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_nclusterid_y));
20061  case NVPTX::BI__nvvm_read_ptx_sreg_nclusterid_z:
20062    return Builder.CreateCall(
20063        CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_nclusterid_z));
20064  case NVPTX::BI__nvvm_read_ptx_sreg_nclusterid_w:
20065    return Builder.CreateCall(
20066        CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_nclusterid_w));
20067  case NVPTX::BI__nvvm_read_ptx_sreg_cluster_ctaid_x:
20068    return Builder.CreateCall(
20069        CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_cluster_ctaid_x));
20070  case NVPTX::BI__nvvm_read_ptx_sreg_cluster_ctaid_y:
20071    return Builder.CreateCall(
20072        CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_cluster_ctaid_y));
20073  case NVPTX::BI__nvvm_read_ptx_sreg_cluster_ctaid_z:
20074    return Builder.CreateCall(
20075        CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_cluster_ctaid_z));
20076  case NVPTX::BI__nvvm_read_ptx_sreg_cluster_ctaid_w:
20077    return Builder.CreateCall(
20078        CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_cluster_ctaid_w));
20079  case NVPTX::BI__nvvm_read_ptx_sreg_cluster_nctaid_x:
20080    return Builder.CreateCall(
20081        CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_cluster_nctaid_x));
20082  case NVPTX::BI__nvvm_read_ptx_sreg_cluster_nctaid_y:
20083    return Builder.CreateCall(
20084        CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_cluster_nctaid_y));
20085  case NVPTX::BI__nvvm_read_ptx_sreg_cluster_nctaid_z:
20086    return Builder.CreateCall(
20087        CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_cluster_nctaid_z));
20088  case NVPTX::BI__nvvm_read_ptx_sreg_cluster_nctaid_w:
20089    return Builder.CreateCall(
20090        CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_cluster_nctaid_w));
20091  case NVPTX::BI__nvvm_read_ptx_sreg_cluster_ctarank:
20092    return Builder.CreateCall(
20093        CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_cluster_ctarank));
20094  case NVPTX::BI__nvvm_read_ptx_sreg_cluster_nctarank:
20095    return Builder.CreateCall(
20096        CGM.getIntrinsic(Intrinsic::nvvm_read_ptx_sreg_cluster_nctarank));
20097  case NVPTX::BI__nvvm_is_explicit_cluster:
20098    return Builder.CreateCall(
20099        CGM.getIntrinsic(Intrinsic::nvvm_is_explicit_cluster));
20100  case NVPTX::BI__nvvm_isspacep_shared_cluster:
20101    return Builder.CreateCall(
20102        CGM.getIntrinsic(Intrinsic::nvvm_isspacep_shared_cluster),
20103        EmitScalarExpr(E->getArg(0)));
20104  case NVPTX::BI__nvvm_mapa:
20105    return Builder.CreateCall(
20106        CGM.getIntrinsic(Intrinsic::nvvm_mapa),
20107        {EmitScalarExpr(E->getArg(0)), EmitScalarExpr(E->getArg(1))});
20108  case NVPTX::BI__nvvm_mapa_shared_cluster:
20109    return Builder.CreateCall(
20110        CGM.getIntrinsic(Intrinsic::nvvm_mapa_shared_cluster),
20111        {EmitScalarExpr(E->getArg(0)), EmitScalarExpr(E->getArg(1))});
20112  case NVPTX::BI__nvvm_getctarank:
20113    return Builder.CreateCall(
20114        CGM.getIntrinsic(Intrinsic::nvvm_getctarank),
20115        EmitScalarExpr(E->getArg(0)));
20116  case NVPTX::BI__nvvm_getctarank_shared_cluster:
20117    return Builder.CreateCall(
20118        CGM.getIntrinsic(Intrinsic::nvvm_getctarank_shared_cluster),
20119        EmitScalarExpr(E->getArg(0)));
20120  case NVPTX::BI__nvvm_barrier_cluster_arrive:
20121    return Builder.CreateCall(
20122        CGM.getIntrinsic(Intrinsic::nvvm_barrier_cluster_arrive));
20123  case NVPTX::BI__nvvm_barrier_cluster_arrive_relaxed:
20124    return Builder.CreateCall(
20125        CGM.getIntrinsic(Intrinsic::nvvm_barrier_cluster_arrive_relaxed));
20126  case NVPTX::BI__nvvm_barrier_cluster_wait:
20127    return Builder.CreateCall(
20128        CGM.getIntrinsic(Intrinsic::nvvm_barrier_cluster_wait));
20129  case NVPTX::BI__nvvm_fence_sc_cluster:
20130    return Builder.CreateCall(
20131        CGM.getIntrinsic(Intrinsic::nvvm_fence_sc_cluster));
20132  default:
20133    return nullptr;
20134  }
20135}
20136
20137namespace {
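// Collects the pieces shared by __builtin_is_aligned, __builtin_align_up and
// __builtin_align_down: the source value (decayed to a pointer if it is an
// array), an integer type wide enough for its address, the requested
// alignment, and the alignment-1 mask.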
20138struct BuiltinAlignArgs {
20139  llvm::Value *Src = nullptr;
20140  llvm::Type *SrcType = nullptr;
20141  llvm::Value *Alignment = nullptr;
20142  llvm::Value *Mask = nullptr;
20143  llvm::IntegerType *IntType = nullptr;
20144
20145  BuiltinAlignArgs(const CallExpr *E, CodeGenFunction &CGF) {
20146    QualType AstType = E->getArg(0)->getType();
20147    if (AstType->isArrayType())
20148      Src = CGF.EmitArrayToPointerDecay(E->getArg(0)).getPointer();
20149    else
20150      Src = CGF.EmitScalarExpr(E->getArg(0));
20151    SrcType = Src->getType();
20152    if (SrcType->isPointerTy()) {
20153      IntType = IntegerType::get(
20154          CGF.getLLVMContext(),
20155          CGF.CGM.getDataLayout().getIndexTypeSizeInBits(SrcType));
20156    } else {
20157      assert(SrcType->isIntegerTy());
20158      IntType = cast<llvm::IntegerType>(SrcType);
20159    }
20160    Alignment = CGF.EmitScalarExpr(E->getArg(1));
20161    Alignment = CGF.Builder.CreateZExtOrTrunc(Alignment, IntType, "alignment");
20162    auto *One = llvm::ConstantInt::get(IntType, 1);
20163    Mask = CGF.Builder.CreateSub(Alignment, One, "mask");
20164  }
20165};
20166} // namespace
20167
20168/// Generate (x & (y-1)) == 0.
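/// For example (illustrative only, assuming a 64-bit address space and after
/// the constant alignment folds), __builtin_is_aligned(p, 64) becomes roughly:
///   %src_addr   = ptrtoint ptr %p to i64
///   %set_bits   = and i64 %src_addr, 63
///   %is_aligned = icmp eq i64 %set_bits, 0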
20169RValue CodeGenFunction::EmitBuiltinIsAligned(const CallExpr *E) {
20170  BuiltinAlignArgs Args(E, *this);
20171  llvm::Value *SrcAddress = Args.Src;
20172  if (Args.SrcType->isPointerTy())
20173    SrcAddress =
20174        Builder.CreateBitOrPointerCast(Args.Src, Args.IntType, "src_addr");
20175  return RValue::get(Builder.CreateICmpEQ(
20176      Builder.CreateAnd(SrcAddress, Args.Mask, "set_bits"),
20177      llvm::Constant::getNullValue(Args.IntType), "is_aligned"));
20178}
20179
20180/// Generate (x & ~(y-1)) to align down or ((x+(y-1)) & ~(y-1)) to align up.
/// Note: For pointer types we can avoid ptrtoint/inttoptr pairs by using the
/// llvm.ptrmask intrinsic, preceded by a GEP in the align_up case.
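/// For example (illustrative only, assuming 64-bit pointers, a constant
/// alignment, and ignoring the overflow-checked GEP path),
/// __builtin_align_up(p, 64) is emitted roughly as:
///   %over_boundary  = getelementptr inbounds i8, ptr %p, i64 63
///   %aligned_result = call ptr @llvm.ptrmask.p0.i64(ptr %over_boundary, i64 -64)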
20183RValue CodeGenFunction::EmitBuiltinAlignTo(const CallExpr *E, bool AlignUp) {
20184  BuiltinAlignArgs Args(E, *this);
20185  llvm::Value *SrcForMask = Args.Src;
20186  if (AlignUp) {
20187    // When aligning up we have to first add the mask to ensure we go over the
20188    // next alignment value and then align down to the next valid multiple.
20189    // By adding the mask, we ensure that align_up on an already aligned
20190    // value will not change the value.
20191    if (Args.Src->getType()->isPointerTy()) {
20192      if (getLangOpts().isSignedOverflowDefined())
20193        SrcForMask =
20194            Builder.CreateGEP(Int8Ty, SrcForMask, Args.Mask, "over_boundary");
20195      else
20196        SrcForMask = EmitCheckedInBoundsGEP(Int8Ty, SrcForMask, Args.Mask,
20197                                            /*SignedIndices=*/true,
20198                                            /*isSubtraction=*/false,
20199                                            E->getExprLoc(), "over_boundary");
20200    } else {
20201      SrcForMask = Builder.CreateAdd(SrcForMask, Args.Mask, "over_boundary");
20202    }
20203  }
20204  // Invert the mask to only clear the lower bits.
20205  llvm::Value *InvertedMask = Builder.CreateNot(Args.Mask, "inverted_mask");
20206  llvm::Value *Result = nullptr;
20207  if (Args.Src->getType()->isPointerTy()) {
20208    Result = Builder.CreateIntrinsic(
20209        Intrinsic::ptrmask, {Args.SrcType, Args.IntType},
20210        {SrcForMask, InvertedMask}, nullptr, "aligned_result");
20211  } else {
20212    Result = Builder.CreateAnd(SrcForMask, InvertedMask, "aligned_result");
20213  }
20214  assert(Result->getType() == Args.SrcType);
20215  return RValue::get(Result);
20216}
20217
20218Value *CodeGenFunction::EmitWebAssemblyBuiltinExpr(unsigned BuiltinID,
20219                                                   const CallExpr *E) {
20220  switch (BuiltinID) {
20221  case WebAssembly::BI__builtin_wasm_memory_size: {
20222    llvm::Type *ResultType = ConvertType(E->getType());
20223    Value *I = EmitScalarExpr(E->getArg(0));
20224    Function *Callee =
20225        CGM.getIntrinsic(Intrinsic::wasm_memory_size, ResultType);
20226    return Builder.CreateCall(Callee, I);
20227  }
20228  case WebAssembly::BI__builtin_wasm_memory_grow: {
20229    llvm::Type *ResultType = ConvertType(E->getType());
20230    Value *Args[] = {EmitScalarExpr(E->getArg(0)),
20231                     EmitScalarExpr(E->getArg(1))};
20232    Function *Callee =
20233        CGM.getIntrinsic(Intrinsic::wasm_memory_grow, ResultType);
20234    return Builder.CreateCall(Callee, Args);
20235  }
20236  case WebAssembly::BI__builtin_wasm_tls_size: {
20237    llvm::Type *ResultType = ConvertType(E->getType());
20238    Function *Callee = CGM.getIntrinsic(Intrinsic::wasm_tls_size, ResultType);
20239    return Builder.CreateCall(Callee);
20240  }
20241  case WebAssembly::BI__builtin_wasm_tls_align: {
20242    llvm::Type *ResultType = ConvertType(E->getType());
20243    Function *Callee = CGM.getIntrinsic(Intrinsic::wasm_tls_align, ResultType);
20244    return Builder.CreateCall(Callee);
20245  }
20246  case WebAssembly::BI__builtin_wasm_tls_base: {
20247    Function *Callee = CGM.getIntrinsic(Intrinsic::wasm_tls_base);
20248    return Builder.CreateCall(Callee);
20249  }
20250  case WebAssembly::BI__builtin_wasm_throw: {
20251    Value *Tag = EmitScalarExpr(E->getArg(0));
20252    Value *Obj = EmitScalarExpr(E->getArg(1));
20253    Function *Callee = CGM.getIntrinsic(Intrinsic::wasm_throw);
20254    return Builder.CreateCall(Callee, {Tag, Obj});
20255  }
20256  case WebAssembly::BI__builtin_wasm_rethrow: {
20257    Function *Callee = CGM.getIntrinsic(Intrinsic::wasm_rethrow);
20258    return Builder.CreateCall(Callee);
20259  }
20260  case WebAssembly::BI__builtin_wasm_memory_atomic_wait32: {
20261    Value *Addr = EmitScalarExpr(E->getArg(0));
20262    Value *Expected = EmitScalarExpr(E->getArg(1));
20263    Value *Timeout = EmitScalarExpr(E->getArg(2));
20264    Function *Callee = CGM.getIntrinsic(Intrinsic::wasm_memory_atomic_wait32);
20265    return Builder.CreateCall(Callee, {Addr, Expected, Timeout});
20266  }
20267  case WebAssembly::BI__builtin_wasm_memory_atomic_wait64: {
20268    Value *Addr = EmitScalarExpr(E->getArg(0));
20269    Value *Expected = EmitScalarExpr(E->getArg(1));
20270    Value *Timeout = EmitScalarExpr(E->getArg(2));
20271    Function *Callee = CGM.getIntrinsic(Intrinsic::wasm_memory_atomic_wait64);
20272    return Builder.CreateCall(Callee, {Addr, Expected, Timeout});
20273  }
20274  case WebAssembly::BI__builtin_wasm_memory_atomic_notify: {
20275    Value *Addr = EmitScalarExpr(E->getArg(0));
20276    Value *Count = EmitScalarExpr(E->getArg(1));
20277    Function *Callee = CGM.getIntrinsic(Intrinsic::wasm_memory_atomic_notify);
20278    return Builder.CreateCall(Callee, {Addr, Count});
20279  }
20280  case WebAssembly::BI__builtin_wasm_trunc_s_i32_f32:
20281  case WebAssembly::BI__builtin_wasm_trunc_s_i32_f64:
20282  case WebAssembly::BI__builtin_wasm_trunc_s_i64_f32:
20283  case WebAssembly::BI__builtin_wasm_trunc_s_i64_f64: {
20284    Value *Src = EmitScalarExpr(E->getArg(0));
20285    llvm::Type *ResT = ConvertType(E->getType());
20286    Function *Callee =
20287        CGM.getIntrinsic(Intrinsic::wasm_trunc_signed, {ResT, Src->getType()});
20288    return Builder.CreateCall(Callee, {Src});
20289  }
20290  case WebAssembly::BI__builtin_wasm_trunc_u_i32_f32:
20291  case WebAssembly::BI__builtin_wasm_trunc_u_i32_f64:
20292  case WebAssembly::BI__builtin_wasm_trunc_u_i64_f32:
20293  case WebAssembly::BI__builtin_wasm_trunc_u_i64_f64: {
20294    Value *Src = EmitScalarExpr(E->getArg(0));
20295    llvm::Type *ResT = ConvertType(E->getType());
20296    Function *Callee = CGM.getIntrinsic(Intrinsic::wasm_trunc_unsigned,
20297                                        {ResT, Src->getType()});
20298    return Builder.CreateCall(Callee, {Src});
20299  }
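  // The saturating conversions use the target-independent llvm.fptosi.sat /
  // llvm.fptoui.sat intrinsics rather than WebAssembly-specific ones.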
20300  case WebAssembly::BI__builtin_wasm_trunc_saturate_s_i32_f32:
20301  case WebAssembly::BI__builtin_wasm_trunc_saturate_s_i32_f64:
20302  case WebAssembly::BI__builtin_wasm_trunc_saturate_s_i64_f32:
20303  case WebAssembly::BI__builtin_wasm_trunc_saturate_s_i64_f64:
20304  case WebAssembly::BI__builtin_wasm_trunc_saturate_s_i32x4_f32x4: {
20305    Value *Src = EmitScalarExpr(E->getArg(0));
20306    llvm::Type *ResT = ConvertType(E->getType());
20307    Function *Callee =
20308        CGM.getIntrinsic(Intrinsic::fptosi_sat, {ResT, Src->getType()});
20309    return Builder.CreateCall(Callee, {Src});
20310  }
20311  case WebAssembly::BI__builtin_wasm_trunc_saturate_u_i32_f32:
20312  case WebAssembly::BI__builtin_wasm_trunc_saturate_u_i32_f64:
20313  case WebAssembly::BI__builtin_wasm_trunc_saturate_u_i64_f32:
20314  case WebAssembly::BI__builtin_wasm_trunc_saturate_u_i64_f64:
20315  case WebAssembly::BI__builtin_wasm_trunc_saturate_u_i32x4_f32x4: {
20316    Value *Src = EmitScalarExpr(E->getArg(0));
20317    llvm::Type *ResT = ConvertType(E->getType());
20318    Function *Callee =
20319        CGM.getIntrinsic(Intrinsic::fptoui_sat, {ResT, Src->getType()});
20320    return Builder.CreateCall(Callee, {Src});
20321  }
20322  case WebAssembly::BI__builtin_wasm_min_f32:
20323  case WebAssembly::BI__builtin_wasm_min_f64:
20324  case WebAssembly::BI__builtin_wasm_min_f32x4:
20325  case WebAssembly::BI__builtin_wasm_min_f64x2: {
20326    Value *LHS = EmitScalarExpr(E->getArg(0));
20327    Value *RHS = EmitScalarExpr(E->getArg(1));
20328    Function *Callee =
20329        CGM.getIntrinsic(Intrinsic::minimum, ConvertType(E->getType()));
20330    return Builder.CreateCall(Callee, {LHS, RHS});
20331  }
20332  case WebAssembly::BI__builtin_wasm_max_f32:
20333  case WebAssembly::BI__builtin_wasm_max_f64:
20334  case WebAssembly::BI__builtin_wasm_max_f32x4:
20335  case WebAssembly::BI__builtin_wasm_max_f64x2: {
20336    Value *LHS = EmitScalarExpr(E->getArg(0));
20337    Value *RHS = EmitScalarExpr(E->getArg(1));
20338    Function *Callee =
20339        CGM.getIntrinsic(Intrinsic::maximum, ConvertType(E->getType()));
20340    return Builder.CreateCall(Callee, {LHS, RHS});
20341  }
20342  case WebAssembly::BI__builtin_wasm_pmin_f32x4:
20343  case WebAssembly::BI__builtin_wasm_pmin_f64x2: {
20344    Value *LHS = EmitScalarExpr(E->getArg(0));
20345    Value *RHS = EmitScalarExpr(E->getArg(1));
20346    Function *Callee =
20347        CGM.getIntrinsic(Intrinsic::wasm_pmin, ConvertType(E->getType()));
20348    return Builder.CreateCall(Callee, {LHS, RHS});
20349  }
20350  case WebAssembly::BI__builtin_wasm_pmax_f32x4:
20351  case WebAssembly::BI__builtin_wasm_pmax_f64x2: {
20352    Value *LHS = EmitScalarExpr(E->getArg(0));
20353    Value *RHS = EmitScalarExpr(E->getArg(1));
20354    Function *Callee =
20355        CGM.getIntrinsic(Intrinsic::wasm_pmax, ConvertType(E->getType()));
20356    return Builder.CreateCall(Callee, {LHS, RHS});
20357  }
20358  case WebAssembly::BI__builtin_wasm_ceil_f32x4:
20359  case WebAssembly::BI__builtin_wasm_floor_f32x4:
20360  case WebAssembly::BI__builtin_wasm_trunc_f32x4:
20361  case WebAssembly::BI__builtin_wasm_nearest_f32x4:
20362  case WebAssembly::BI__builtin_wasm_ceil_f64x2:
20363  case WebAssembly::BI__builtin_wasm_floor_f64x2:
20364  case WebAssembly::BI__builtin_wasm_trunc_f64x2:
20365  case WebAssembly::BI__builtin_wasm_nearest_f64x2: {
20366    unsigned IntNo;
20367    switch (BuiltinID) {
20368    case WebAssembly::BI__builtin_wasm_ceil_f32x4:
20369    case WebAssembly::BI__builtin_wasm_ceil_f64x2:
20370      IntNo = Intrinsic::ceil;
20371      break;
20372    case WebAssembly::BI__builtin_wasm_floor_f32x4:
20373    case WebAssembly::BI__builtin_wasm_floor_f64x2:
20374      IntNo = Intrinsic::floor;
20375      break;
20376    case WebAssembly::BI__builtin_wasm_trunc_f32x4:
20377    case WebAssembly::BI__builtin_wasm_trunc_f64x2:
20378      IntNo = Intrinsic::trunc;
20379      break;
20380    case WebAssembly::BI__builtin_wasm_nearest_f32x4:
20381    case WebAssembly::BI__builtin_wasm_nearest_f64x2:
20382      IntNo = Intrinsic::nearbyint;
20383      break;
20384    default:
20385      llvm_unreachable("unexpected builtin ID");
20386    }
    Value *Val = EmitScalarExpr(E->getArg(0));
    Function *Callee = CGM.getIntrinsic(IntNo, ConvertType(E->getType()));
    return Builder.CreateCall(Callee, Val);
20390  }
20391  case WebAssembly::BI__builtin_wasm_ref_null_extern: {
20392    Function *Callee = CGM.getIntrinsic(Intrinsic::wasm_ref_null_extern);
20393    return Builder.CreateCall(Callee);
20394  }
20395  case WebAssembly::BI__builtin_wasm_ref_null_func: {
20396    Function *Callee = CGM.getIntrinsic(Intrinsic::wasm_ref_null_func);
20397    return Builder.CreateCall(Callee);
20398  }
20399  case WebAssembly::BI__builtin_wasm_swizzle_i8x16: {
20400    Value *Src = EmitScalarExpr(E->getArg(0));
20401    Value *Indices = EmitScalarExpr(E->getArg(1));
20402    Function *Callee = CGM.getIntrinsic(Intrinsic::wasm_swizzle);
20403    return Builder.CreateCall(Callee, {Src, Indices});
20404  }
20405  case WebAssembly::BI__builtin_wasm_add_sat_s_i8x16:
20406  case WebAssembly::BI__builtin_wasm_add_sat_u_i8x16:
20407  case WebAssembly::BI__builtin_wasm_add_sat_s_i16x8:
20408  case WebAssembly::BI__builtin_wasm_add_sat_u_i16x8:
20409  case WebAssembly::BI__builtin_wasm_sub_sat_s_i8x16:
20410  case WebAssembly::BI__builtin_wasm_sub_sat_u_i8x16:
20411  case WebAssembly::BI__builtin_wasm_sub_sat_s_i16x8:
20412  case WebAssembly::BI__builtin_wasm_sub_sat_u_i16x8: {
20413    unsigned IntNo;
20414    switch (BuiltinID) {
20415    case WebAssembly::BI__builtin_wasm_add_sat_s_i8x16:
20416    case WebAssembly::BI__builtin_wasm_add_sat_s_i16x8:
20417      IntNo = Intrinsic::sadd_sat;
20418      break;
20419    case WebAssembly::BI__builtin_wasm_add_sat_u_i8x16:
20420    case WebAssembly::BI__builtin_wasm_add_sat_u_i16x8:
20421      IntNo = Intrinsic::uadd_sat;
20422      break;
20423    case WebAssembly::BI__builtin_wasm_sub_sat_s_i8x16:
20424    case WebAssembly::BI__builtin_wasm_sub_sat_s_i16x8:
20425      IntNo = Intrinsic::wasm_sub_sat_signed;
20426      break;
20427    case WebAssembly::BI__builtin_wasm_sub_sat_u_i8x16:
20428    case WebAssembly::BI__builtin_wasm_sub_sat_u_i16x8:
20429      IntNo = Intrinsic::wasm_sub_sat_unsigned;
20430      break;
20431    default:
20432      llvm_unreachable("unexpected builtin ID");
20433    }
20434    Value *LHS = EmitScalarExpr(E->getArg(0));
20435    Value *RHS = EmitScalarExpr(E->getArg(1));
20436    Function *Callee = CGM.getIntrinsic(IntNo, ConvertType(E->getType()));
20437    return Builder.CreateCall(Callee, {LHS, RHS});
20438  }
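  // The integer abs and min/max builtins below are open-coded with compares
  // and selects rather than emitted as intrinsics.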
20439  case WebAssembly::BI__builtin_wasm_abs_i8x16:
20440  case WebAssembly::BI__builtin_wasm_abs_i16x8:
20441  case WebAssembly::BI__builtin_wasm_abs_i32x4:
20442  case WebAssembly::BI__builtin_wasm_abs_i64x2: {
20443    Value *Vec = EmitScalarExpr(E->getArg(0));
20444    Value *Neg = Builder.CreateNeg(Vec, "neg");
20445    Constant *Zero = llvm::Constant::getNullValue(Vec->getType());
20446    Value *ICmp = Builder.CreateICmpSLT(Vec, Zero, "abscond");
20447    return Builder.CreateSelect(ICmp, Neg, Vec, "abs");
20448  }
20449  case WebAssembly::BI__builtin_wasm_min_s_i8x16:
20450  case WebAssembly::BI__builtin_wasm_min_u_i8x16:
20451  case WebAssembly::BI__builtin_wasm_max_s_i8x16:
20452  case WebAssembly::BI__builtin_wasm_max_u_i8x16:
20453  case WebAssembly::BI__builtin_wasm_min_s_i16x8:
20454  case WebAssembly::BI__builtin_wasm_min_u_i16x8:
20455  case WebAssembly::BI__builtin_wasm_max_s_i16x8:
20456  case WebAssembly::BI__builtin_wasm_max_u_i16x8:
20457  case WebAssembly::BI__builtin_wasm_min_s_i32x4:
20458  case WebAssembly::BI__builtin_wasm_min_u_i32x4:
20459  case WebAssembly::BI__builtin_wasm_max_s_i32x4:
20460  case WebAssembly::BI__builtin_wasm_max_u_i32x4: {
20461    Value *LHS = EmitScalarExpr(E->getArg(0));
20462    Value *RHS = EmitScalarExpr(E->getArg(1));
20463    Value *ICmp;
20464    switch (BuiltinID) {
20465    case WebAssembly::BI__builtin_wasm_min_s_i8x16:
20466    case WebAssembly::BI__builtin_wasm_min_s_i16x8:
20467    case WebAssembly::BI__builtin_wasm_min_s_i32x4:
20468      ICmp = Builder.CreateICmpSLT(LHS, RHS);
20469      break;
20470    case WebAssembly::BI__builtin_wasm_min_u_i8x16:
20471    case WebAssembly::BI__builtin_wasm_min_u_i16x8:
20472    case WebAssembly::BI__builtin_wasm_min_u_i32x4:
20473      ICmp = Builder.CreateICmpULT(LHS, RHS);
20474      break;
20475    case WebAssembly::BI__builtin_wasm_max_s_i8x16:
20476    case WebAssembly::BI__builtin_wasm_max_s_i16x8:
20477    case WebAssembly::BI__builtin_wasm_max_s_i32x4:
20478      ICmp = Builder.CreateICmpSGT(LHS, RHS);
20479      break;
20480    case WebAssembly::BI__builtin_wasm_max_u_i8x16:
20481    case WebAssembly::BI__builtin_wasm_max_u_i16x8:
20482    case WebAssembly::BI__builtin_wasm_max_u_i32x4:
20483      ICmp = Builder.CreateICmpUGT(LHS, RHS);
20484      break;
20485    default:
20486      llvm_unreachable("unexpected builtin ID");
20487    }
20488    return Builder.CreateSelect(ICmp, LHS, RHS);
20489  }
20490  case WebAssembly::BI__builtin_wasm_avgr_u_i8x16:
20491  case WebAssembly::BI__builtin_wasm_avgr_u_i16x8: {
20492    Value *LHS = EmitScalarExpr(E->getArg(0));
20493    Value *RHS = EmitScalarExpr(E->getArg(1));
20494    Function *Callee = CGM.getIntrinsic(Intrinsic::wasm_avgr_unsigned,
20495                                        ConvertType(E->getType()));
20496    return Builder.CreateCall(Callee, {LHS, RHS});
20497  }
20498  case WebAssembly::BI__builtin_wasm_q15mulr_sat_s_i16x8: {
20499    Value *LHS = EmitScalarExpr(E->getArg(0));
20500    Value *RHS = EmitScalarExpr(E->getArg(1));
20501    Function *Callee = CGM.getIntrinsic(Intrinsic::wasm_q15mulr_sat_signed);
20502    return Builder.CreateCall(Callee, {LHS, RHS});
20503  }
20504  case WebAssembly::BI__builtin_wasm_extadd_pairwise_i8x16_s_i16x8:
20505  case WebAssembly::BI__builtin_wasm_extadd_pairwise_i8x16_u_i16x8:
20506  case WebAssembly::BI__builtin_wasm_extadd_pairwise_i16x8_s_i32x4:
20507  case WebAssembly::BI__builtin_wasm_extadd_pairwise_i16x8_u_i32x4: {
20508    Value *Vec = EmitScalarExpr(E->getArg(0));
20509    unsigned IntNo;
20510    switch (BuiltinID) {
20511    case WebAssembly::BI__builtin_wasm_extadd_pairwise_i8x16_s_i16x8:
20512    case WebAssembly::BI__builtin_wasm_extadd_pairwise_i16x8_s_i32x4:
20513      IntNo = Intrinsic::wasm_extadd_pairwise_signed;
20514      break;
20515    case WebAssembly::BI__builtin_wasm_extadd_pairwise_i8x16_u_i16x8:
20516    case WebAssembly::BI__builtin_wasm_extadd_pairwise_i16x8_u_i32x4:
20517      IntNo = Intrinsic::wasm_extadd_pairwise_unsigned;
20518      break;
20519    default:
20520      llvm_unreachable("unexpected builtin ID");
20521    }
20522
20523    Function *Callee = CGM.getIntrinsic(IntNo, ConvertType(E->getType()));
20524    return Builder.CreateCall(Callee, Vec);
20525  }
20526  case WebAssembly::BI__builtin_wasm_bitselect: {
20527    Value *V1 = EmitScalarExpr(E->getArg(0));
20528    Value *V2 = EmitScalarExpr(E->getArg(1));
20529    Value *C = EmitScalarExpr(E->getArg(2));
20530    Function *Callee =
20531        CGM.getIntrinsic(Intrinsic::wasm_bitselect, ConvertType(E->getType()));
20532    return Builder.CreateCall(Callee, {V1, V2, C});
20533  }
20534  case WebAssembly::BI__builtin_wasm_dot_s_i32x4_i16x8: {
20535    Value *LHS = EmitScalarExpr(E->getArg(0));
20536    Value *RHS = EmitScalarExpr(E->getArg(1));
20537    Function *Callee = CGM.getIntrinsic(Intrinsic::wasm_dot);
20538    return Builder.CreateCall(Callee, {LHS, RHS});
20539  }
20540  case WebAssembly::BI__builtin_wasm_popcnt_i8x16: {
20541    Value *Vec = EmitScalarExpr(E->getArg(0));
20542    Function *Callee =
20543        CGM.getIntrinsic(Intrinsic::ctpop, ConvertType(E->getType()));
20544    return Builder.CreateCall(Callee, {Vec});
20545  }
20546  case WebAssembly::BI__builtin_wasm_any_true_v128:
20547  case WebAssembly::BI__builtin_wasm_all_true_i8x16:
20548  case WebAssembly::BI__builtin_wasm_all_true_i16x8:
20549  case WebAssembly::BI__builtin_wasm_all_true_i32x4:
20550  case WebAssembly::BI__builtin_wasm_all_true_i64x2: {
20551    unsigned IntNo;
20552    switch (BuiltinID) {
20553    case WebAssembly::BI__builtin_wasm_any_true_v128:
20554      IntNo = Intrinsic::wasm_anytrue;
20555      break;
20556    case WebAssembly::BI__builtin_wasm_all_true_i8x16:
20557    case WebAssembly::BI__builtin_wasm_all_true_i16x8:
20558    case WebAssembly::BI__builtin_wasm_all_true_i32x4:
20559    case WebAssembly::BI__builtin_wasm_all_true_i64x2:
20560      IntNo = Intrinsic::wasm_alltrue;
20561      break;
20562    default:
20563      llvm_unreachable("unexpected builtin ID");
20564    }
20565    Value *Vec = EmitScalarExpr(E->getArg(0));
20566    Function *Callee = CGM.getIntrinsic(IntNo, Vec->getType());
20567    return Builder.CreateCall(Callee, {Vec});
20568  }
20569  case WebAssembly::BI__builtin_wasm_bitmask_i8x16:
20570  case WebAssembly::BI__builtin_wasm_bitmask_i16x8:
20571  case WebAssembly::BI__builtin_wasm_bitmask_i32x4:
20572  case WebAssembly::BI__builtin_wasm_bitmask_i64x2: {
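    // bitmask packs the top (sign) bit of each lane into the low bits of an
    // i32 result, e.g. lane i of an i8x16 input contributes bit i.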
20573    Value *Vec = EmitScalarExpr(E->getArg(0));
20574    Function *Callee =
20575        CGM.getIntrinsic(Intrinsic::wasm_bitmask, Vec->getType());
20576    return Builder.CreateCall(Callee, {Vec});
20577  }
20578  case WebAssembly::BI__builtin_wasm_abs_f32x4:
20579  case WebAssembly::BI__builtin_wasm_abs_f64x2: {
20580    Value *Vec = EmitScalarExpr(E->getArg(0));
20581    Function *Callee = CGM.getIntrinsic(Intrinsic::fabs, Vec->getType());
20582    return Builder.CreateCall(Callee, {Vec});
20583  }
20584  case WebAssembly::BI__builtin_wasm_sqrt_f32x4:
20585  case WebAssembly::BI__builtin_wasm_sqrt_f64x2: {
20586    Value *Vec = EmitScalarExpr(E->getArg(0));
20587    Function *Callee = CGM.getIntrinsic(Intrinsic::sqrt, Vec->getType());
20588    return Builder.CreateCall(Callee, {Vec});
20589  }
20590  case WebAssembly::BI__builtin_wasm_narrow_s_i8x16_i16x8:
20591  case WebAssembly::BI__builtin_wasm_narrow_u_i8x16_i16x8:
20592  case WebAssembly::BI__builtin_wasm_narrow_s_i16x8_i32x4:
20593  case WebAssembly::BI__builtin_wasm_narrow_u_i16x8_i32x4: {
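    // narrow converts the lanes of both inputs to the next narrower integer
    // type with signed/unsigned saturation; the low half of the result comes
    // from Low and the high half from High.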
20594    Value *Low = EmitScalarExpr(E->getArg(0));
20595    Value *High = EmitScalarExpr(E->getArg(1));
20596    unsigned IntNo;
20597    switch (BuiltinID) {
20598    case WebAssembly::BI__builtin_wasm_narrow_s_i8x16_i16x8:
20599    case WebAssembly::BI__builtin_wasm_narrow_s_i16x8_i32x4:
20600      IntNo = Intrinsic::wasm_narrow_signed;
20601      break;
20602    case WebAssembly::BI__builtin_wasm_narrow_u_i8x16_i16x8:
20603    case WebAssembly::BI__builtin_wasm_narrow_u_i16x8_i32x4:
20604      IntNo = Intrinsic::wasm_narrow_unsigned;
20605      break;
20606    default:
20607      llvm_unreachable("unexpected builtin ID");
20608    }
20609    Function *Callee =
20610        CGM.getIntrinsic(IntNo, {ConvertType(E->getType()), Low->getType()});
20611    return Builder.CreateCall(Callee, {Low, High});
20612  }
20613  case WebAssembly::BI__builtin_wasm_trunc_sat_s_zero_f64x2_i32x4:
20614  case WebAssembly::BI__builtin_wasm_trunc_sat_u_zero_f64x2_i32x4: {
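    // trunc_sat_*_zero converts the two f64 lanes to i32 with saturation and
    // zeroes the upper two result lanes. This is emitted as a saturating
    // fptosi/fptoui on <2 x double> followed by a shuffle with a zero vector
    // that widens the <2 x i32> result to <4 x i32>.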
20615    Value *Vec = EmitScalarExpr(E->getArg(0));
20616    unsigned IntNo;
20617    switch (BuiltinID) {
20618    case WebAssembly::BI__builtin_wasm_trunc_sat_s_zero_f64x2_i32x4:
20619      IntNo = Intrinsic::fptosi_sat;
20620      break;
20621    case WebAssembly::BI__builtin_wasm_trunc_sat_u_zero_f64x2_i32x4:
20622      IntNo = Intrinsic::fptoui_sat;
20623      break;
20624    default:
20625      llvm_unreachable("unexpected builtin ID");
20626    }
20627    llvm::Type *SrcT = Vec->getType();
20628    llvm::Type *TruncT = SrcT->getWithNewType(Builder.getInt32Ty());
20629    Function *Callee = CGM.getIntrinsic(IntNo, {TruncT, SrcT});
20630    Value *Trunc = Builder.CreateCall(Callee, Vec);
20631    Value *Splat = Constant::getNullValue(TruncT);
20632    return Builder.CreateShuffleVector(Trunc, Splat, ArrayRef<int>{0, 1, 2, 3});
20633  }
20634  case WebAssembly::BI__builtin_wasm_shuffle_i8x16: {
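    // The shuffle builtin takes 18 operands: the two input vectors followed
    // by 16 lane indices, each of which must be an integer constant
    // expression selecting one of the 32 bytes of the two inputs.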
20635    Value *Ops[18];
20636    size_t OpIdx = 0;
20637    Ops[OpIdx++] = EmitScalarExpr(E->getArg(0));
20638    Ops[OpIdx++] = EmitScalarExpr(E->getArg(1));
20639    while (OpIdx < 18) {
20640      std::optional<llvm::APSInt> LaneConst =
20641          E->getArg(OpIdx)->getIntegerConstantExpr(getContext());
20642      assert(LaneConst && "Constant arg isn't actually constant?");
20643      Ops[OpIdx++] = llvm::ConstantInt::get(getLLVMContext(), *LaneConst);
20644    }
20645    Function *Callee = CGM.getIntrinsic(Intrinsic::wasm_shuffle);
20646    return Builder.CreateCall(Callee, Ops);
20647  }
20648  case WebAssembly::BI__builtin_wasm_relaxed_madd_f32x4:
20649  case WebAssembly::BI__builtin_wasm_relaxed_nmadd_f32x4:
20650  case WebAssembly::BI__builtin_wasm_relaxed_madd_f64x2:
20651  case WebAssembly::BI__builtin_wasm_relaxed_nmadd_f64x2: {
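    // Relaxed multiply-add: madd computes roughly A * B + C and nmadd roughly
    // -(A * B) + C; whether the intermediate product is rounded or fused is
    // implementation-defined under the relaxed-SIMD proposal.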
20652    Value *A = EmitScalarExpr(E->getArg(0));
20653    Value *B = EmitScalarExpr(E->getArg(1));
20654    Value *C = EmitScalarExpr(E->getArg(2));
20655    unsigned IntNo;
20656    switch (BuiltinID) {
20657    case WebAssembly::BI__builtin_wasm_relaxed_madd_f32x4:
20658    case WebAssembly::BI__builtin_wasm_relaxed_madd_f64x2:
20659      IntNo = Intrinsic::wasm_relaxed_madd;
20660      break;
20661    case WebAssembly::BI__builtin_wasm_relaxed_nmadd_f32x4:
20662    case WebAssembly::BI__builtin_wasm_relaxed_nmadd_f64x2:
20663      IntNo = Intrinsic::wasm_relaxed_nmadd;
20664      break;
20665    default:
20666      llvm_unreachable("unexpected builtin ID");
20667    }
20668    Function *Callee = CGM.getIntrinsic(IntNo, A->getType());
20669    return Builder.CreateCall(Callee, {A, B, C});
20670  }
20671  case WebAssembly::BI__builtin_wasm_relaxed_laneselect_i8x16:
20672  case WebAssembly::BI__builtin_wasm_relaxed_laneselect_i16x8:
20673  case WebAssembly::BI__builtin_wasm_relaxed_laneselect_i32x4:
20674  case WebAssembly::BI__builtin_wasm_relaxed_laneselect_i64x2: {
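    // Lanewise select of A or B under mask C. Unlike bitselect, the result is
    // only fully specified when each mask lane is all-ones or all-zeros;
    // other mask values give implementation-defined results.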
20675    Value *A = EmitScalarExpr(E->getArg(0));
20676    Value *B = EmitScalarExpr(E->getArg(1));
20677    Value *C = EmitScalarExpr(E->getArg(2));
20678    Function *Callee =
20679        CGM.getIntrinsic(Intrinsic::wasm_relaxed_laneselect, A->getType());
20680    return Builder.CreateCall(Callee, {A, B, C});
20681  }
20682  case WebAssembly::BI__builtin_wasm_relaxed_swizzle_i8x16: {
20683    Value *Src = EmitScalarExpr(E->getArg(0));
20684    Value *Indices = EmitScalarExpr(E->getArg(1));
20685    Function *Callee = CGM.getIntrinsic(Intrinsic::wasm_relaxed_swizzle);
20686    return Builder.CreateCall(Callee, {Src, Indices});
20687  }
20688  case WebAssembly::BI__builtin_wasm_relaxed_min_f32x4:
20689  case WebAssembly::BI__builtin_wasm_relaxed_max_f32x4:
20690  case WebAssembly::BI__builtin_wasm_relaxed_min_f64x2:
20691  case WebAssembly::BI__builtin_wasm_relaxed_max_f64x2: {
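    // Relaxed lanewise min/max: like the regular f32x4/f64x2 min/max, except
    // that results involving NaN inputs or zeros of opposite sign are
    // implementation-defined.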
20692    Value *LHS = EmitScalarExpr(E->getArg(0));
20693    Value *RHS = EmitScalarExpr(E->getArg(1));
20694    unsigned IntNo;
20695    switch (BuiltinID) {
20696    case WebAssembly::BI__builtin_wasm_relaxed_min_f32x4:
20697    case WebAssembly::BI__builtin_wasm_relaxed_min_f64x2:
20698      IntNo = Intrinsic::wasm_relaxed_min;
20699      break;
20700    case WebAssembly::BI__builtin_wasm_relaxed_max_f32x4:
20701    case WebAssembly::BI__builtin_wasm_relaxed_max_f64x2:
20702      IntNo = Intrinsic::wasm_relaxed_max;
20703      break;
20704    default:
20705      llvm_unreachable("unexpected builtin ID");
20706    }
20707    Function *Callee = CGM.getIntrinsic(IntNo, LHS->getType());
20708    return Builder.CreateCall(Callee, {LHS, RHS});
20709  }
20710  case WebAssembly::BI__builtin_wasm_relaxed_trunc_s_i32x4_f32x4:
20711  case WebAssembly::BI__builtin_wasm_relaxed_trunc_u_i32x4_f32x4:
20712  case WebAssembly::BI__builtin_wasm_relaxed_trunc_s_zero_i32x4_f64x2:
20713  case WebAssembly::BI__builtin_wasm_relaxed_trunc_u_zero_i32x4_f64x2: {
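    // Relaxed float-to-int conversions: like trunc_sat, but results for NaN
    // and out-of-range inputs are implementation-defined. The _zero variants
    // convert f64x2 and zero the upper two i32 lanes.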
20714    Value *Vec = EmitScalarExpr(E->getArg(0));
20715    unsigned IntNo;
20716    switch (BuiltinID) {
20717    case WebAssembly::BI__builtin_wasm_relaxed_trunc_s_i32x4_f32x4:
20718      IntNo = Intrinsic::wasm_relaxed_trunc_signed;
20719      break;
20720    case WebAssembly::BI__builtin_wasm_relaxed_trunc_u_i32x4_f32x4:
20721      IntNo = Intrinsic::wasm_relaxed_trunc_unsigned;
20722      break;
20723    case WebAssembly::BI__builtin_wasm_relaxed_trunc_s_zero_i32x4_f64x2:
20724      IntNo = Intrinsic::wasm_relaxed_trunc_signed_zero;
20725      break;
20726    case WebAssembly::BI__builtin_wasm_relaxed_trunc_u_zero_i32x4_f64x2:
20727      IntNo = Intrinsic::wasm_relaxed_trunc_unsigned_zero;
20728      break;
20729    default:
20730      llvm_unreachable("unexpected builtin ID");
20731    }
20732    Function *Callee = CGM.getIntrinsic(IntNo);
20733    return Builder.CreateCall(Callee, {Vec});
20734  }
20735  case WebAssembly::BI__builtin_wasm_relaxed_q15mulr_s_i16x8: {
20736    Value *LHS = EmitScalarExpr(E->getArg(0));
20737    Value *RHS = EmitScalarExpr(E->getArg(1));
20738    Function *Callee = CGM.getIntrinsic(Intrinsic::wasm_relaxed_q15mulr_signed);
20739    return Builder.CreateCall(Callee, {LHS, RHS});
20740  }
20741  case WebAssembly::BI__builtin_wasm_relaxed_dot_i8x16_i7x16_s_i16x8: {
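    // Multiplies the signed i8 lanes of LHS by the low-7-bit lanes of RHS and
    // sums adjacent pairs of products into i16 lanes; results are
    // implementation-defined if an RHS lane has its top bit set.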
20742    Value *LHS = EmitScalarExpr(E->getArg(0));
20743    Value *RHS = EmitScalarExpr(E->getArg(1));
20744    Function *Callee =
20745        CGM.getIntrinsic(Intrinsic::wasm_relaxed_dot_i8x16_i7x16_signed);
20746    return Builder.CreateCall(Callee, {LHS, RHS});
20747  }
20748  case WebAssembly::BI__builtin_wasm_relaxed_dot_i8x16_i7x16_add_s_i32x4: {
20749    Value *LHS = EmitScalarExpr(E->getArg(0));
20750    Value *RHS = EmitScalarExpr(E->getArg(1));
20751    Value *Acc = EmitScalarExpr(E->getArg(2));
20752    Function *Callee =
20753        CGM.getIntrinsic(Intrinsic::wasm_relaxed_dot_i8x16_i7x16_add_signed);
20754    return Builder.CreateCall(Callee, {LHS, RHS, Acc});
20755  }
20756  case WebAssembly::BI__builtin_wasm_relaxed_dot_bf16x8_add_f32_f32x4: {
20757    Value *LHS = EmitScalarExpr(E->getArg(0));
20758    Value *RHS = EmitScalarExpr(E->getArg(1));
20759    Value *Acc = EmitScalarExpr(E->getArg(2));
20760    Function *Callee =
20761        CGM.getIntrinsic(Intrinsic::wasm_relaxed_dot_bf16x8_add_f32);
20762    return Builder.CreateCall(Callee, {LHS, RHS, Acc});
20763  }
20764  case WebAssembly::BI__builtin_wasm_table_get: {
20765    assert(E->getArg(0)->getType()->isArrayType());
20766    Value *Table = EmitArrayToPointerDecay(E->getArg(0)).getPointer();
20767    Value *Index = EmitScalarExpr(E->getArg(1));
20768    Function *Callee;
20769    if (E->getType().isWebAssemblyExternrefType())
20770      Callee = CGM.getIntrinsic(Intrinsic::wasm_table_get_externref);
20771    else if (E->getType().isWebAssemblyFuncrefType())
20772      Callee = CGM.getIntrinsic(Intrinsic::wasm_table_get_funcref);
20773    else
20774      llvm_unreachable(
20775          "Unexpected reference type for __builtin_wasm_table_get");
20776    return Builder.CreateCall(Callee, {Table, Index});
20777  }
20778  case WebAssembly::BI__builtin_wasm_table_set: {
20779    assert(E->getArg(0)->getType()->isArrayType());
20780    Value *Table = EmitArrayToPointerDecay(E->getArg(0)).getPointer();
20781    Value *Index = EmitScalarExpr(E->getArg(1));
20782    Value *Val = EmitScalarExpr(E->getArg(2));
20783    Function *Callee;
20784    if (E->getArg(2)->getType().isWebAssemblyExternrefType())
20785      Callee = CGM.getIntrinsic(Intrinsic::wasm_table_set_externref);
20786    else if (E->getArg(2)->getType().isWebAssemblyFuncrefType())
20787      Callee = CGM.getIntrinsic(Intrinsic::wasm_table_set_funcref);
20788    else
20789      llvm_unreachable(
20790          "Unexpected reference type for __builtin_wasm_table_set");
20791    return Builder.CreateCall(Callee, {Table, Index, Val});
20792  }
20793  case WebAssembly::BI__builtin_wasm_table_size: {
20794    assert(E->getArg(0)->getType()->isArrayType());
20795    Value *Value = EmitArrayToPointerDecay(E->getArg(0)).getPointer();
20796    Function *Callee = CGM.getIntrinsic(Intrinsic::wasm_table_size);
20797    return Builder.CreateCall(Callee, Value);
20798  }
20799  case WebAssembly::BI__builtin_wasm_table_grow: {
20800    assert(E->getArg(0)->getType()->isArrayType());
20801    Value *Table = EmitArrayToPointerDecay(E->getArg(0)).getPointer();
20802    Value *Val = EmitScalarExpr(E->getArg(1));
20803    Value *NElems = EmitScalarExpr(E->getArg(2));
20804
20805    Function *Callee;
    if (E->getArg(1)->getType().isWebAssemblyExternrefType())
      Callee = CGM.getIntrinsic(Intrinsic::wasm_table_grow_externref);
    else if (E->getArg(1)->getType().isWebAssemblyFuncrefType())
      Callee = CGM.getIntrinsic(Intrinsic::wasm_table_fill_funcref);
20810    else
20811      llvm_unreachable(
20812          "Unexpected reference type for __builtin_wasm_table_grow");
20813
20814    return Builder.CreateCall(Callee, {Table, Val, NElems});
20815  }
20816  case WebAssembly::BI__builtin_wasm_table_fill: {
20817    assert(E->getArg(0)->getType()->isArrayType());
20818    Value *Table = EmitArrayToPointerDecay(E->getArg(0)).getPointer();
20819    Value *Index = EmitScalarExpr(E->getArg(1));
20820    Value *Val = EmitScalarExpr(E->getArg(2));
20821    Value *NElems = EmitScalarExpr(E->getArg(3));
20822
20823    Function *Callee;
20824    if (E->getArg(2)->getType().isWebAssemblyExternrefType())
20825      Callee = CGM.getIntrinsic(Intrinsic::wasm_table_fill_externref);
20826    else if (E->getArg(2)->getType().isWebAssemblyFuncrefType())
20827      Callee = CGM.getIntrinsic(Intrinsic::wasm_table_fill_funcref);
20828    else
20829      llvm_unreachable(
20830          "Unexpected reference type for __builtin_wasm_table_fill");
20831
20832    return Builder.CreateCall(Callee, {Table, Index, Val, NElems});
20833  }
20834  case WebAssembly::BI__builtin_wasm_table_copy: {
20835    assert(E->getArg(0)->getType()->isArrayType());
20836    Value *TableX = EmitArrayToPointerDecay(E->getArg(0)).getPointer();
20837    Value *TableY = EmitArrayToPointerDecay(E->getArg(1)).getPointer();
20838    Value *DstIdx = EmitScalarExpr(E->getArg(2));
20839    Value *SrcIdx = EmitScalarExpr(E->getArg(3));
20840    Value *NElems = EmitScalarExpr(E->getArg(4));
20841
20842    Function *Callee = CGM.getIntrinsic(Intrinsic::wasm_table_copy);
20843
20844    return Builder.CreateCall(Callee, {TableX, TableY, SrcIdx, DstIdx, NElems});
20845  }
20846  default:
20847    return nullptr;
20848  }
20849}
20850
20851static std::pair<Intrinsic::ID, unsigned>
20852getIntrinsicForHexagonNonClangBuiltin(unsigned BuiltinID) {
20853  struct Info {
20854    unsigned BuiltinID;
20855    Intrinsic::ID IntrinsicID;
20856    unsigned VecLen;
20857  };
20858  static Info Infos[] = {
20859#define CUSTOM_BUILTIN_MAPPING(x,s) \
20860  { Hexagon::BI__builtin_HEXAGON_##x, Intrinsic::hexagon_##x, s },
20861    CUSTOM_BUILTIN_MAPPING(L2_loadrub_pci, 0)
20862    CUSTOM_BUILTIN_MAPPING(L2_loadrb_pci, 0)
20863    CUSTOM_BUILTIN_MAPPING(L2_loadruh_pci, 0)
20864    CUSTOM_BUILTIN_MAPPING(L2_loadrh_pci, 0)
20865    CUSTOM_BUILTIN_MAPPING(L2_loadri_pci, 0)
20866    CUSTOM_BUILTIN_MAPPING(L2_loadrd_pci, 0)
20867    CUSTOM_BUILTIN_MAPPING(L2_loadrub_pcr, 0)
20868    CUSTOM_BUILTIN_MAPPING(L2_loadrb_pcr, 0)
20869    CUSTOM_BUILTIN_MAPPING(L2_loadruh_pcr, 0)
20870    CUSTOM_BUILTIN_MAPPING(L2_loadrh_pcr, 0)
20871    CUSTOM_BUILTIN_MAPPING(L2_loadri_pcr, 0)
20872    CUSTOM_BUILTIN_MAPPING(L2_loadrd_pcr, 0)
20873    CUSTOM_BUILTIN_MAPPING(S2_storerb_pci, 0)
20874    CUSTOM_BUILTIN_MAPPING(S2_storerh_pci, 0)
20875    CUSTOM_BUILTIN_MAPPING(S2_storerf_pci, 0)
20876    CUSTOM_BUILTIN_MAPPING(S2_storeri_pci, 0)
20877    CUSTOM_BUILTIN_MAPPING(S2_storerd_pci, 0)
20878    CUSTOM_BUILTIN_MAPPING(S2_storerb_pcr, 0)
20879    CUSTOM_BUILTIN_MAPPING(S2_storerh_pcr, 0)
20880    CUSTOM_BUILTIN_MAPPING(S2_storerf_pcr, 0)
20881    CUSTOM_BUILTIN_MAPPING(S2_storeri_pcr, 0)
20882    CUSTOM_BUILTIN_MAPPING(S2_storerd_pcr, 0)
20883    // Legacy builtins that take a vector in place of a vector predicate.
20884    CUSTOM_BUILTIN_MAPPING(V6_vmaskedstoreq, 64)
20885    CUSTOM_BUILTIN_MAPPING(V6_vmaskedstorenq, 64)
20886    CUSTOM_BUILTIN_MAPPING(V6_vmaskedstorentq, 64)
20887    CUSTOM_BUILTIN_MAPPING(V6_vmaskedstorentnq, 64)
20888    CUSTOM_BUILTIN_MAPPING(V6_vmaskedstoreq_128B, 128)
20889    CUSTOM_BUILTIN_MAPPING(V6_vmaskedstorenq_128B, 128)
20890    CUSTOM_BUILTIN_MAPPING(V6_vmaskedstorentq_128B, 128)
20891    CUSTOM_BUILTIN_MAPPING(V6_vmaskedstorentnq_128B, 128)
20892#include "clang/Basic/BuiltinsHexagonMapCustomDep.def"
20893#undef CUSTOM_BUILTIN_MAPPING
20894  };
20895
20896  auto CmpInfo = [] (Info A, Info B) { return A.BuiltinID < B.BuiltinID; };
20897  static const bool SortOnce = (llvm::sort(Infos, CmpInfo), true);
20898  (void)SortOnce;
20899
20900  const Info *F = llvm::lower_bound(Infos, Info{BuiltinID, 0, 0}, CmpInfo);
20901  if (F == std::end(Infos) || F->BuiltinID != BuiltinID)
20902    return {Intrinsic::not_intrinsic, 0};
20903
20904  return {F->IntrinsicID, F->VecLen};
20905}
20906
20907Value *CodeGenFunction::EmitHexagonBuiltinExpr(unsigned BuiltinID,
20908                                               const CallExpr *E) {
20909  Intrinsic::ID ID;
20910  unsigned VecLen;
20911  std::tie(ID, VecLen) = getIntrinsicForHexagonNonClangBuiltin(BuiltinID);
20912
20913  auto MakeCircOp = [this, E](unsigned IntID, bool IsLoad) {
20914    // The base pointer is passed by address, so it needs to be loaded.
20915    Address A = EmitPointerWithAlignment(E->getArg(0));
20916    Address BP = Address(A.getPointer(), Int8PtrTy, A.getAlignment());
20917    llvm::Value *Base = Builder.CreateLoad(BP);
20918    // The treatment of both loads and stores is the same: the arguments for
20919    // the builtin are the same as the arguments for the intrinsic.
20920    // Load:
20921    //   builtin(Base, Inc, Mod, Start) -> intr(Base, Inc, Mod, Start)
20922    //   builtin(Base, Mod, Start)      -> intr(Base, Mod, Start)
20923    // Store:
20924    //   builtin(Base, Inc, Mod, Val, Start) -> intr(Base, Inc, Mod, Val, Start)
20925    //   builtin(Base, Mod, Val, Start)      -> intr(Base, Mod, Val, Start)
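    // Illustrative example (hypothetical values): a circular load is written
    // roughly as
    //   v = __builtin_HEXAGON_L2_loadri_pci(&Base, Inc, Mod, Start);
    // and the post-incremented base pointer is written back through the
    // first argument.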
20926    SmallVector<llvm::Value*,5> Ops = { Base };
20927    for (unsigned i = 1, e = E->getNumArgs(); i != e; ++i)
20928      Ops.push_back(EmitScalarExpr(E->getArg(i)));
20929
20930    llvm::Value *Result = Builder.CreateCall(CGM.getIntrinsic(IntID), Ops);
    // The load intrinsics generate two results (Value, NewBase), while the
    // store intrinsics generate one (NewBase). The new base address needs to
    // be stored back.
20933    llvm::Value *NewBase = IsLoad ? Builder.CreateExtractValue(Result, 1)
20934                                  : Result;
20935    llvm::Value *LV = EmitScalarExpr(E->getArg(0));
20936    Address Dest = EmitPointerWithAlignment(E->getArg(0));
20937    llvm::Value *RetVal =
20938        Builder.CreateAlignedStore(NewBase, LV, Dest.getAlignment());
20939    if (IsLoad)
20940      RetVal = Builder.CreateExtractValue(Result, 0);
20941    return RetVal;
20942  };
20943
  // Handle the conversion of bit-reverse load intrinsics to bitcode.
  // The intrinsic call emitted below only reads from memory; the write back
  // to memory is handled by the store instruction emitted afterwards.
20947  auto MakeBrevLd = [this, E](unsigned IntID, llvm::Type *DestTy) {
    // The intrinsic generates one result, which is the new value for the base
    // pointer. It needs to be returned. The loaded value is handed back
    // through the builtin's destination argument, which is passed by address,
    // so it needs to be stored explicitly.
20951    llvm::Value *BaseAddress = EmitScalarExpr(E->getArg(0));
20952
    // Expressions like &(*pt++) have side effects that occur on every
    // evaluation, and EmitPointerWithAlignment and EmitScalarExpr each
    // evaluate the expression once per call, so evaluate the destination
    // argument only once here.
20956    Address DestAddr = EmitPointerWithAlignment(E->getArg(1));
20957    DestAddr = Address(DestAddr.getPointer(), Int8Ty, DestAddr.getAlignment());
20958    llvm::Value *DestAddress = DestAddr.getPointer();
20959
    // The builtin operands are Base, Dest, Modifier; only Base and Modifier
    // are passed to the intrinsic, whose LLVM IR signature is
    // { ValueType, i8* } (i8*, i32).
20963    llvm::Value *Result = Builder.CreateCall(
20964        CGM.getIntrinsic(IntID), {BaseAddress, EmitScalarExpr(E->getArg(2))});
20965
20966    // The value needs to be stored as the variable is passed by reference.
20967    llvm::Value *DestVal = Builder.CreateExtractValue(Result, 0);
20968
    // The stored value needs to be truncated to fit the destination type.
    // While i32 and i64 are natively supported on Hexagon, i8 and i16 need
    // to be handled with stores of the respective destination type.
20972    DestVal = Builder.CreateTrunc(DestVal, DestTy);
20973
20974    Builder.CreateAlignedStore(DestVal, DestAddress, DestAddr.getAlignment());
20975    // The updated value of the base pointer is returned.
20976    return Builder.CreateExtractValue(Result, 1);
20977  };
20978
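  // Helpers to convert between an HVX vector and a vector-predicate register:
  // V2Q uses vandvrt with an all-ones scalar so that, roughly, each non-zero
  // byte sets its predicate bit, and Q2V uses vandqrt to expand the predicate
  // back into a byte mask.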
20979  auto V2Q = [this, VecLen] (llvm::Value *Vec) {
20980    Intrinsic::ID ID = VecLen == 128 ? Intrinsic::hexagon_V6_vandvrt_128B
20981                                     : Intrinsic::hexagon_V6_vandvrt;
20982    return Builder.CreateCall(CGM.getIntrinsic(ID),
20983                              {Vec, Builder.getInt32(-1)});
20984  };
20985  auto Q2V = [this, VecLen] (llvm::Value *Pred) {
20986    Intrinsic::ID ID = VecLen == 128 ? Intrinsic::hexagon_V6_vandqrt_128B
20987                                     : Intrinsic::hexagon_V6_vandqrt;
20988    return Builder.CreateCall(CGM.getIntrinsic(ID),
20989                              {Pred, Builder.getInt32(-1)});
20990  };
20991
20992  switch (BuiltinID) {
20993  // These intrinsics return a tuple {Vector, VectorPred} in LLVM IR,
20994  // and the corresponding C/C++ builtins use loads/stores to update
20995  // the predicate.
20996  case Hexagon::BI__builtin_HEXAGON_V6_vaddcarry:
20997  case Hexagon::BI__builtin_HEXAGON_V6_vaddcarry_128B:
20998  case Hexagon::BI__builtin_HEXAGON_V6_vsubcarry:
20999  case Hexagon::BI__builtin_HEXAGON_V6_vsubcarry_128B: {
21000    // Get the type from the 0-th argument.
21001    llvm::Type *VecType = ConvertType(E->getArg(0)->getType());
21002    Address PredAddr =
21003        EmitPointerWithAlignment(E->getArg(2)).withElementType(VecType);
21004    llvm::Value *PredIn = V2Q(Builder.CreateLoad(PredAddr));
21005    llvm::Value *Result = Builder.CreateCall(CGM.getIntrinsic(ID),
21006        {EmitScalarExpr(E->getArg(0)), EmitScalarExpr(E->getArg(1)), PredIn});
21007
21008    llvm::Value *PredOut = Builder.CreateExtractValue(Result, 1);
21009    Builder.CreateAlignedStore(Q2V(PredOut), PredAddr.getPointer(),
21010        PredAddr.getAlignment());
21011    return Builder.CreateExtractValue(Result, 0);
21012  }
  // These are identical to the builtins above, except they don't consume an
  // input carry and only generate a carry-out. Since they still produce two
  // outputs, generate the store of the predicate, but no load.
21016  case Hexagon::BI__builtin_HEXAGON_V6_vaddcarryo:
21017  case Hexagon::BI__builtin_HEXAGON_V6_vaddcarryo_128B:
21018  case Hexagon::BI__builtin_HEXAGON_V6_vsubcarryo:
21019  case Hexagon::BI__builtin_HEXAGON_V6_vsubcarryo_128B: {
21020    // Get the type from the 0-th argument.
21021    llvm::Type *VecType = ConvertType(E->getArg(0)->getType());
21022    Address PredAddr =
21023        EmitPointerWithAlignment(E->getArg(2)).withElementType(VecType);
21024    llvm::Value *Result = Builder.CreateCall(CGM.getIntrinsic(ID),
21025        {EmitScalarExpr(E->getArg(0)), EmitScalarExpr(E->getArg(1))});
21026
21027    llvm::Value *PredOut = Builder.CreateExtractValue(Result, 1);
21028    Builder.CreateAlignedStore(Q2V(PredOut), PredAddr.getPointer(),
21029        PredAddr.getAlignment());
21030    return Builder.CreateExtractValue(Result, 0);
21031  }
21032
21033  case Hexagon::BI__builtin_HEXAGON_V6_vmaskedstoreq:
21034  case Hexagon::BI__builtin_HEXAGON_V6_vmaskedstorenq:
21035  case Hexagon::BI__builtin_HEXAGON_V6_vmaskedstorentq:
21036  case Hexagon::BI__builtin_HEXAGON_V6_vmaskedstorentnq:
21037  case Hexagon::BI__builtin_HEXAGON_V6_vmaskedstoreq_128B:
21038  case Hexagon::BI__builtin_HEXAGON_V6_vmaskedstorenq_128B:
21039  case Hexagon::BI__builtin_HEXAGON_V6_vmaskedstorentq_128B:
21040  case Hexagon::BI__builtin_HEXAGON_V6_vmaskedstorentnq_128B: {
21041    SmallVector<llvm::Value*,4> Ops;
21042    const Expr *PredOp = E->getArg(0);
21043    // There will be an implicit cast to a boolean vector. Strip it.
21044    if (auto *Cast = dyn_cast<ImplicitCastExpr>(PredOp)) {
21045      if (Cast->getCastKind() == CK_BitCast)
21046        PredOp = Cast->getSubExpr();
21047      Ops.push_back(V2Q(EmitScalarExpr(PredOp)));
21048    }
21049    for (int i = 1, e = E->getNumArgs(); i != e; ++i)
21050      Ops.push_back(EmitScalarExpr(E->getArg(i)));
21051    return Builder.CreateCall(CGM.getIntrinsic(ID), Ops);
21052  }
21053
21054  case Hexagon::BI__builtin_HEXAGON_L2_loadrub_pci:
21055  case Hexagon::BI__builtin_HEXAGON_L2_loadrb_pci:
21056  case Hexagon::BI__builtin_HEXAGON_L2_loadruh_pci:
21057  case Hexagon::BI__builtin_HEXAGON_L2_loadrh_pci:
21058  case Hexagon::BI__builtin_HEXAGON_L2_loadri_pci:
21059  case Hexagon::BI__builtin_HEXAGON_L2_loadrd_pci:
21060  case Hexagon::BI__builtin_HEXAGON_L2_loadrub_pcr:
21061  case Hexagon::BI__builtin_HEXAGON_L2_loadrb_pcr:
21062  case Hexagon::BI__builtin_HEXAGON_L2_loadruh_pcr:
21063  case Hexagon::BI__builtin_HEXAGON_L2_loadrh_pcr:
21064  case Hexagon::BI__builtin_HEXAGON_L2_loadri_pcr:
21065  case Hexagon::BI__builtin_HEXAGON_L2_loadrd_pcr:
21066    return MakeCircOp(ID, /*IsLoad=*/true);
21067  case Hexagon::BI__builtin_HEXAGON_S2_storerb_pci:
21068  case Hexagon::BI__builtin_HEXAGON_S2_storerh_pci:
21069  case Hexagon::BI__builtin_HEXAGON_S2_storerf_pci:
21070  case Hexagon::BI__builtin_HEXAGON_S2_storeri_pci:
21071  case Hexagon::BI__builtin_HEXAGON_S2_storerd_pci:
21072  case Hexagon::BI__builtin_HEXAGON_S2_storerb_pcr:
21073  case Hexagon::BI__builtin_HEXAGON_S2_storerh_pcr:
21074  case Hexagon::BI__builtin_HEXAGON_S2_storerf_pcr:
21075  case Hexagon::BI__builtin_HEXAGON_S2_storeri_pcr:
21076  case Hexagon::BI__builtin_HEXAGON_S2_storerd_pcr:
21077    return MakeCircOp(ID, /*IsLoad=*/false);
21078  case Hexagon::BI__builtin_brev_ldub:
21079    return MakeBrevLd(Intrinsic::hexagon_L2_loadrub_pbr, Int8Ty);
21080  case Hexagon::BI__builtin_brev_ldb:
21081    return MakeBrevLd(Intrinsic::hexagon_L2_loadrb_pbr, Int8Ty);
21082  case Hexagon::BI__builtin_brev_lduh:
21083    return MakeBrevLd(Intrinsic::hexagon_L2_loadruh_pbr, Int16Ty);
21084  case Hexagon::BI__builtin_brev_ldh:
21085    return MakeBrevLd(Intrinsic::hexagon_L2_loadrh_pbr, Int16Ty);
21086  case Hexagon::BI__builtin_brev_ldw:
21087    return MakeBrevLd(Intrinsic::hexagon_L2_loadri_pbr, Int32Ty);
21088  case Hexagon::BI__builtin_brev_ldd:
21089    return MakeBrevLd(Intrinsic::hexagon_L2_loadrd_pbr, Int64Ty);
21090  } // switch
21091
21092  return nullptr;
21093}
21094
21095Value *CodeGenFunction::EmitRISCVBuiltinExpr(unsigned BuiltinID,
21096                                             const CallExpr *E,
21097                                             ReturnValueSlot ReturnValue) {
21098  SmallVector<Value *, 4> Ops;
21099  llvm::Type *ResultType = ConvertType(E->getType());
21100
21101  // Find out if any arguments are required to be integer constant expressions.
21102  unsigned ICEArguments = 0;
21103  ASTContext::GetBuiltinTypeError Error;
21104  getContext().GetBuiltinType(BuiltinID, Error, &ICEArguments);
21105  if (Error == ASTContext::GE_Missing_type) {
21106    // Vector intrinsics don't have a type string.
21107    assert(BuiltinID >= clang::RISCV::FirstRVVBuiltin &&
21108           BuiltinID <= clang::RISCV::LastRVVBuiltin);
21109    ICEArguments = 0;
21110    if (BuiltinID == RISCVVector::BI__builtin_rvv_vget_v ||
21111        BuiltinID == RISCVVector::BI__builtin_rvv_vset_v)
21112      ICEArguments = 1 << 1;
21113  } else {
21114    assert(Error == ASTContext::GE_None && "Unexpected error");
21115  }
21116
21117  if (BuiltinID == RISCV::BI__builtin_riscv_ntl_load)
21118    ICEArguments |= (1 << 1);
21119  if (BuiltinID == RISCV::BI__builtin_riscv_ntl_store)
21120    ICEArguments |= (1 << 2);
21121
21122  for (unsigned i = 0, e = E->getNumArgs(); i != e; i++) {
    // Handle aggregate arguments, namely RVV tuple types in segment
    // loads/stores.
21124    if (hasAggregateEvaluationKind(E->getArg(i)->getType())) {
21125      LValue L = EmitAggExprToLValue(E->getArg(i));
21126      llvm::Value *AggValue = Builder.CreateLoad(L.getAddress(*this));
21127      Ops.push_back(AggValue);
21128      continue;
21129    }
21130    Ops.push_back(EmitScalarOrConstFoldImmArg(ICEArguments, i, E));
21131  }
21132
21133  Intrinsic::ID ID = Intrinsic::not_intrinsic;
21134  unsigned NF = 1;
21135  // The 0th bit simulates the `vta` of RVV
21136  // The 1st bit simulates the `vma` of RVV
21137  constexpr unsigned RVV_VTA = 0x1;
21138  constexpr unsigned RVV_VMA = 0x2;
21139  int PolicyAttrs = 0;
21140  bool IsMasked = false;
21141
21142  // Required for overloaded intrinsics.
21143  llvm::SmallVector<llvm::Type *, 2> IntrinsicTypes;
21144  switch (BuiltinID) {
21145  default: llvm_unreachable("unexpected builtin ID");
21146  case RISCV::BI__builtin_riscv_orc_b_32:
21147  case RISCV::BI__builtin_riscv_orc_b_64:
21148  case RISCV::BI__builtin_riscv_clz_32:
21149  case RISCV::BI__builtin_riscv_clz_64:
21150  case RISCV::BI__builtin_riscv_ctz_32:
21151  case RISCV::BI__builtin_riscv_ctz_64:
21152  case RISCV::BI__builtin_riscv_clmul_32:
21153  case RISCV::BI__builtin_riscv_clmul_64:
21154  case RISCV::BI__builtin_riscv_clmulh_32:
21155  case RISCV::BI__builtin_riscv_clmulh_64:
21156  case RISCV::BI__builtin_riscv_clmulr_32:
21157  case RISCV::BI__builtin_riscv_clmulr_64:
21158  case RISCV::BI__builtin_riscv_xperm4_32:
21159  case RISCV::BI__builtin_riscv_xperm4_64:
21160  case RISCV::BI__builtin_riscv_xperm8_32:
21161  case RISCV::BI__builtin_riscv_xperm8_64:
21162  case RISCV::BI__builtin_riscv_brev8_32:
21163  case RISCV::BI__builtin_riscv_brev8_64:
21164  case RISCV::BI__builtin_riscv_zip_32:
21165  case RISCV::BI__builtin_riscv_unzip_32: {
21166    switch (BuiltinID) {
21167    default: llvm_unreachable("unexpected builtin ID");
21168    // Zbb
21169    case RISCV::BI__builtin_riscv_orc_b_32:
21170    case RISCV::BI__builtin_riscv_orc_b_64:
21171      ID = Intrinsic::riscv_orc_b;
21172      break;
21173    case RISCV::BI__builtin_riscv_clz_32:
21174    case RISCV::BI__builtin_riscv_clz_64: {
21175      Function *F = CGM.getIntrinsic(Intrinsic::ctlz, Ops[0]->getType());
21176      Value *Result = Builder.CreateCall(F, {Ops[0], Builder.getInt1(false)});
21177      if (Result->getType() != ResultType)
21178        Result = Builder.CreateIntCast(Result, ResultType, /*isSigned*/true,
21179                                       "cast");
21180      return Result;
21181    }
21182    case RISCV::BI__builtin_riscv_ctz_32:
21183    case RISCV::BI__builtin_riscv_ctz_64: {
21184      Function *F = CGM.getIntrinsic(Intrinsic::cttz, Ops[0]->getType());
21185      Value *Result = Builder.CreateCall(F, {Ops[0], Builder.getInt1(false)});
21186      if (Result->getType() != ResultType)
21187        Result = Builder.CreateIntCast(Result, ResultType, /*isSigned*/true,
21188                                       "cast");
21189      return Result;
21190    }
21191
21192    // Zbc
21193    case RISCV::BI__builtin_riscv_clmul_32:
21194    case RISCV::BI__builtin_riscv_clmul_64:
21195      ID = Intrinsic::riscv_clmul;
21196      break;
21197    case RISCV::BI__builtin_riscv_clmulh_32:
21198    case RISCV::BI__builtin_riscv_clmulh_64:
21199      ID = Intrinsic::riscv_clmulh;
21200      break;
21201    case RISCV::BI__builtin_riscv_clmulr_32:
21202    case RISCV::BI__builtin_riscv_clmulr_64:
21203      ID = Intrinsic::riscv_clmulr;
21204      break;
21205
21206    // Zbkx
21207    case RISCV::BI__builtin_riscv_xperm8_32:
21208    case RISCV::BI__builtin_riscv_xperm8_64:
21209      ID = Intrinsic::riscv_xperm8;
21210      break;
21211    case RISCV::BI__builtin_riscv_xperm4_32:
21212    case RISCV::BI__builtin_riscv_xperm4_64:
21213      ID = Intrinsic::riscv_xperm4;
21214      break;
21215
21216    // Zbkb
21217    case RISCV::BI__builtin_riscv_brev8_32:
21218    case RISCV::BI__builtin_riscv_brev8_64:
21219      ID = Intrinsic::riscv_brev8;
21220      break;
21221    case RISCV::BI__builtin_riscv_zip_32:
21222      ID = Intrinsic::riscv_zip;
21223      break;
21224    case RISCV::BI__builtin_riscv_unzip_32:
21225      ID = Intrinsic::riscv_unzip;
21226      break;
21227    }
21228
21229    IntrinsicTypes = {ResultType};
21230    break;
21231  }
21232
21233  // Zk builtins
21234
21235  // Zknh
21236  case RISCV::BI__builtin_riscv_sha256sig0:
21237    ID = Intrinsic::riscv_sha256sig0;
21238    break;
21239  case RISCV::BI__builtin_riscv_sha256sig1:
21240    ID = Intrinsic::riscv_sha256sig1;
21241    break;
21242  case RISCV::BI__builtin_riscv_sha256sum0:
21243    ID = Intrinsic::riscv_sha256sum0;
21244    break;
21245  case RISCV::BI__builtin_riscv_sha256sum1:
21246    ID = Intrinsic::riscv_sha256sum1;
21247    break;
21248
21249  // Zksed
21250  case RISCV::BI__builtin_riscv_sm4ks:
21251    ID = Intrinsic::riscv_sm4ks;
21252    break;
21253  case RISCV::BI__builtin_riscv_sm4ed:
21254    ID = Intrinsic::riscv_sm4ed;
21255    break;
21256
21257  // Zksh
21258  case RISCV::BI__builtin_riscv_sm3p0:
21259    ID = Intrinsic::riscv_sm3p0;
21260    break;
21261  case RISCV::BI__builtin_riscv_sm3p1:
21262    ID = Intrinsic::riscv_sm3p1;
21263    break;
21264
21265  // Zihintntl
21266  case RISCV::BI__builtin_riscv_ntl_load: {
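    // Zihintntl non-temporal load: emitted as an ordinary load carrying
    // !nontemporal metadata plus a "riscv-nontemporal-domain" node. For
    // illustration only, a call looks roughly like
    //   int x = __builtin_riscv_ntl_load(&g, __RISCV_NTLH_ALL);
    // where the domain argument is optional.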
21267    llvm::Type *ResTy = ConvertType(E->getType());
21268    unsigned DomainVal = 5; // Default __RISCV_NTLH_ALL
21269    if (Ops.size() == 2)
21270      DomainVal = cast<ConstantInt>(Ops[1])->getZExtValue();
21271
21272    llvm::MDNode *RISCVDomainNode = llvm::MDNode::get(
21273        getLLVMContext(),
21274        llvm::ConstantAsMetadata::get(Builder.getInt32(DomainVal)));
21275    llvm::MDNode *NontemporalNode = llvm::MDNode::get(
21276        getLLVMContext(), llvm::ConstantAsMetadata::get(Builder.getInt32(1)));
21277
    int Width;
    if (ResTy->isScalableTy()) {
      const ScalableVectorType *SVTy = cast<ScalableVectorType>(ResTy);
      llvm::Type *ScalarTy = ResTy->getScalarType();
      Width = ScalarTy->getPrimitiveSizeInBits() *
              SVTy->getElementCount().getKnownMinValue();
    } else {
      Width = ResTy->getPrimitiveSizeInBits();
    }
21286    LoadInst *Load = Builder.CreateLoad(
21287        Address(Ops[0], ResTy, CharUnits::fromQuantity(Width / 8)));
21288
21289    Load->setMetadata(llvm::LLVMContext::MD_nontemporal, NontemporalNode);
21290    Load->setMetadata(CGM.getModule().getMDKindID("riscv-nontemporal-domain"),
21291                      RISCVDomainNode);
21292
21293    return Load;
21294  }
21295  case RISCV::BI__builtin_riscv_ntl_store: {
21296    unsigned DomainVal = 5; // Default __RISCV_NTLH_ALL
21297    if (Ops.size() == 3)
21298      DomainVal = cast<ConstantInt>(Ops[2])->getZExtValue();
21299
21300    llvm::MDNode *RISCVDomainNode = llvm::MDNode::get(
21301        getLLVMContext(),
21302        llvm::ConstantAsMetadata::get(Builder.getInt32(DomainVal)));
21303    llvm::MDNode *NontemporalNode = llvm::MDNode::get(
21304        getLLVMContext(), llvm::ConstantAsMetadata::get(Builder.getInt32(1)));
21305
21306    StoreInst *Store = Builder.CreateDefaultAlignedStore(Ops[1], Ops[0]);
21307    Store->setMetadata(llvm::LLVMContext::MD_nontemporal, NontemporalNode);
21308    Store->setMetadata(CGM.getModule().getMDKindID("riscv-nontemporal-domain"),
21309                       RISCVDomainNode);
21310
21311    return Store;
21312  }
21313
21314  // Vector builtins are handled from here.
21315#include "clang/Basic/riscv_vector_builtin_cg.inc"
21316  // SiFive Vector builtins are handled from here.
21317#include "clang/Basic/riscv_sifive_vector_builtin_cg.inc"
21318  }
21319
21320  assert(ID != Intrinsic::not_intrinsic);
21321
21322  llvm::Function *F = CGM.getIntrinsic(ID, IntrinsicTypes);
21323  return Builder.CreateCall(F, Ops, "");
21324}
21325