1//===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// Custom DAG lowering for SI
11//
12//===----------------------------------------------------------------------===//
13
14#include "SIISelLowering.h"
15#include "AMDGPU.h"
16#include "AMDGPUSubtarget.h"
17#include "AMDGPUTargetMachine.h"
18#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
19#include "SIDefines.h"
20#include "SIInstrInfo.h"
21#include "SIMachineFunctionInfo.h"
22#include "SIRegisterInfo.h"
23#include "Utils/AMDGPUBaseInfo.h"
24#include "llvm/ADT/APFloat.h"
25#include "llvm/ADT/APInt.h"
26#include "llvm/ADT/ArrayRef.h"
27#include "llvm/ADT/BitVector.h"
28#include "llvm/ADT/SmallVector.h"
29#include "llvm/ADT/Statistic.h"
30#include "llvm/ADT/StringRef.h"
31#include "llvm/ADT/StringSwitch.h"
32#include "llvm/ADT/Twine.h"
33#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
34#include "llvm/CodeGen/Analysis.h"
35#include "llvm/CodeGen/CallingConvLower.h"
36#include "llvm/CodeGen/DAGCombine.h"
37#include "llvm/CodeGen/ISDOpcodes.h"
38#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
39#include "llvm/CodeGen/MachineBasicBlock.h"
40#include "llvm/CodeGen/MachineFrameInfo.h"
41#include "llvm/CodeGen/MachineFunction.h"
42#include "llvm/CodeGen/MachineInstr.h"
43#include "llvm/CodeGen/MachineInstrBuilder.h"
44#include "llvm/CodeGen/MachineLoopInfo.h"
45#include "llvm/CodeGen/MachineMemOperand.h"
46#include "llvm/CodeGen/MachineModuleInfo.h"
47#include "llvm/CodeGen/MachineOperand.h"
48#include "llvm/CodeGen/MachineRegisterInfo.h"
49#include "llvm/CodeGen/SelectionDAG.h"
50#include "llvm/CodeGen/SelectionDAGNodes.h"
51#include "llvm/CodeGen/TargetCallingConv.h"
52#include "llvm/CodeGen/TargetRegisterInfo.h"
53#include "llvm/CodeGen/ValueTypes.h"
54#include "llvm/IR/Constants.h"
55#include "llvm/IR/DataLayout.h"
56#include "llvm/IR/DebugLoc.h"
57#include "llvm/IR/DerivedTypes.h"
58#include "llvm/IR/DiagnosticInfo.h"
59#include "llvm/IR/Function.h"
60#include "llvm/IR/GlobalValue.h"
61#include "llvm/IR/InstrTypes.h"
62#include "llvm/IR/Instruction.h"
63#include "llvm/IR/Instructions.h"
64#include "llvm/IR/IntrinsicInst.h"
65#include "llvm/IR/Type.h"
66#include "llvm/Support/Casting.h"
67#include "llvm/Support/CodeGen.h"
68#include "llvm/Support/CommandLine.h"
69#include "llvm/Support/Compiler.h"
70#include "llvm/Support/ErrorHandling.h"
71#include "llvm/Support/KnownBits.h"
72#include "llvm/Support/MachineValueType.h"
73#include "llvm/Support/MathExtras.h"
74#include "llvm/Target/TargetOptions.h"
75#include <cassert>
76#include <cmath>
77#include <cstdint>
78#include <iterator>
79#include <tuple>
80#include <utility>
81#include <vector>
82
83using namespace llvm;
84
85#define DEBUG_TYPE "si-lower"
86
87STATISTIC(NumTailCalls, "Number of tail calls");
88
89static cl::opt<bool> DisableLoopAlignment(
90  "amdgpu-disable-loop-alignment",
91  cl::desc("Do not align and prefetch loops"),
92  cl::init(false));
93
94static cl::opt<bool> VGPRReserveforSGPRSpill(
95    "amdgpu-reserve-vgpr-for-sgpr-spill",
96    cl::desc("Allocates one VGPR for future SGPR Spill"), cl::init(true));
97
98static cl::opt<bool> UseDivergentRegisterIndexing(
99  "amdgpu-use-divergent-register-indexing",
100  cl::Hidden,
101  cl::desc("Use indirect register addressing for divergent indexes"),
102  cl::init(false));
103
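// These query the per-function floating-point mode (derived from the
// function's denormal-mode attributes and cached in SIMachineFunctionInfo)
// rather than a global subtarget setting.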
104static bool hasFP32Denormals(const MachineFunction &MF) {
105  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
106  return Info->getMode().allFP32Denormals();
107}
108
109static bool hasFP64FP16Denormals(const MachineFunction &MF) {
110  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
111  return Info->getMode().allFP64FP16Denormals();
112}
113
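// Return the first SGPR that has not already been allocated by the calling
// convention; used when allocating SGPRs for special inputs.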
114static unsigned findFirstFreeSGPR(CCState &CCInfo) {
115  unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
116  for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
117    if (!CCInfo.isAllocated(AMDGPU::SGPR0 + Reg)) {
118      return AMDGPU::SGPR0 + Reg;
119    }
120  }
121  llvm_unreachable("Cannot allocate sgpr");
122}
123
124SITargetLowering::SITargetLowering(const TargetMachine &TM,
125                                   const GCNSubtarget &STI)
126    : AMDGPUTargetLowering(TM, STI),
127      Subtarget(&STI) {
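  // Note that the class registered for a type only picks its default register
  // bank; whether a given value ultimately lives in SGPRs or VGPRs is decided
  // by divergence during selection. i1 deliberately uses the VReg_1
  // placeholder class, which SILowerI1Copies later rewrites into real
  // lane-mask registers.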
128  addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass);
129  addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);
130
131  addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass);
132  addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass);
133
134  addRegisterClass(MVT::f64, &AMDGPU::VReg_64RegClass);
135  addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass);
136  addRegisterClass(MVT::v2f32, &AMDGPU::VReg_64RegClass);
137
138  addRegisterClass(MVT::v3i32, &AMDGPU::SGPR_96RegClass);
139  addRegisterClass(MVT::v3f32, &AMDGPU::VReg_96RegClass);
140
141  addRegisterClass(MVT::v2i64, &AMDGPU::SGPR_128RegClass);
142  addRegisterClass(MVT::v2f64, &AMDGPU::SGPR_128RegClass);
143
144  addRegisterClass(MVT::v4i32, &AMDGPU::SGPR_128RegClass);
145  addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass);
146
147  addRegisterClass(MVT::v5i32, &AMDGPU::SGPR_160RegClass);
148  addRegisterClass(MVT::v5f32, &AMDGPU::VReg_160RegClass);
149
150  addRegisterClass(MVT::v8i32, &AMDGPU::SGPR_256RegClass);
151  addRegisterClass(MVT::v8f32, &AMDGPU::VReg_256RegClass);
152
153  addRegisterClass(MVT::v4i64, &AMDGPU::SGPR_256RegClass);
154  addRegisterClass(MVT::v4f64, &AMDGPU::VReg_256RegClass);
155
156  addRegisterClass(MVT::v16i32, &AMDGPU::SGPR_512RegClass);
157  addRegisterClass(MVT::v16f32, &AMDGPU::VReg_512RegClass);
158
159  addRegisterClass(MVT::v8i64, &AMDGPU::SGPR_512RegClass);
160  addRegisterClass(MVT::v8f64, &AMDGPU::VReg_512RegClass);
161
162  addRegisterClass(MVT::v16i64, &AMDGPU::SGPR_1024RegClass);
163  addRegisterClass(MVT::v16f64, &AMDGPU::VReg_1024RegClass);
164
165  if (Subtarget->has16BitInsts()) {
166    addRegisterClass(MVT::i16, &AMDGPU::SReg_32RegClass);
167    addRegisterClass(MVT::f16, &AMDGPU::SReg_32RegClass);
168
169    // Unless there are also VOP3P operations, no operations are really legal.
170    addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32RegClass);
171    addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32RegClass);
172    addRegisterClass(MVT::v4i16, &AMDGPU::SReg_64RegClass);
173    addRegisterClass(MVT::v4f16, &AMDGPU::SReg_64RegClass);
174  }
175
176  addRegisterClass(MVT::v32i32, &AMDGPU::VReg_1024RegClass);
177  addRegisterClass(MVT::v32f32, &AMDGPU::VReg_1024RegClass);
178
179  computeRegisterProperties(Subtarget->getRegisterInfo());
180
181  // The boolean content concept here is too inflexible. Compares only ever
182  // really produce a 1-bit result. Any copy/extend from these will turn into a
183  // select, and zext/1 or sext/-1 are equally cheap. Arbitrarily choose 0/1, as
184  // it's what most targets use.
185  setBooleanContents(ZeroOrOneBooleanContent);
186  setBooleanVectorContents(ZeroOrOneBooleanContent);
187
188  // We need to custom lower vector stores from local memory
189  setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
190  setOperationAction(ISD::LOAD, MVT::v3i32, Custom);
191  setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
192  setOperationAction(ISD::LOAD, MVT::v5i32, Custom);
193  setOperationAction(ISD::LOAD, MVT::v8i32, Custom);
194  setOperationAction(ISD::LOAD, MVT::v16i32, Custom);
195  setOperationAction(ISD::LOAD, MVT::i1, Custom);
196  setOperationAction(ISD::LOAD, MVT::v32i32, Custom);
197
198  setOperationAction(ISD::STORE, MVT::v2i32, Custom);
199  setOperationAction(ISD::STORE, MVT::v3i32, Custom);
200  setOperationAction(ISD::STORE, MVT::v4i32, Custom);
201  setOperationAction(ISD::STORE, MVT::v5i32, Custom);
202  setOperationAction(ISD::STORE, MVT::v8i32, Custom);
203  setOperationAction(ISD::STORE, MVT::v16i32, Custom);
204  setOperationAction(ISD::STORE, MVT::i1, Custom);
205  setOperationAction(ISD::STORE, MVT::v32i32, Custom);
206
207  setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
208  setTruncStoreAction(MVT::v3i32, MVT::v3i16, Expand);
209  setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand);
210  setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand);
211  setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand);
212  setTruncStoreAction(MVT::v32i32, MVT::v32i16, Expand);
213  setTruncStoreAction(MVT::v2i32, MVT::v2i8, Expand);
214  setTruncStoreAction(MVT::v4i32, MVT::v4i8, Expand);
215  setTruncStoreAction(MVT::v8i32, MVT::v8i8, Expand);
216  setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand);
217  setTruncStoreAction(MVT::v32i32, MVT::v32i8, Expand);
218  setTruncStoreAction(MVT::v2i16, MVT::v2i8, Expand);
219  setTruncStoreAction(MVT::v4i16, MVT::v4i8, Expand);
220  setTruncStoreAction(MVT::v8i16, MVT::v8i8, Expand);
221  setTruncStoreAction(MVT::v16i16, MVT::v16i8, Expand);
222  setTruncStoreAction(MVT::v32i16, MVT::v32i8, Expand);
223
224  setTruncStoreAction(MVT::v4i64, MVT::v4i8, Expand);
225  setTruncStoreAction(MVT::v8i64, MVT::v8i8, Expand);
226  setTruncStoreAction(MVT::v8i64, MVT::v8i16, Expand);
227  setTruncStoreAction(MVT::v8i64, MVT::v8i32, Expand);
228  setTruncStoreAction(MVT::v16i64, MVT::v16i32, Expand);
229
230  setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
231  setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
232
233  setOperationAction(ISD::SELECT, MVT::i1, Promote);
234  setOperationAction(ISD::SELECT, MVT::i64, Custom);
235  setOperationAction(ISD::SELECT, MVT::f64, Promote);
236  AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64);
237
238  setOperationAction(ISD::SELECT_CC, MVT::f32, Expand);
239  setOperationAction(ISD::SELECT_CC, MVT::i32, Expand);
240  setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
241  setOperationAction(ISD::SELECT_CC, MVT::f64, Expand);
242  setOperationAction(ISD::SELECT_CC, MVT::i1, Expand);
243
244  setOperationAction(ISD::SETCC, MVT::i1, Promote);
245  setOperationAction(ISD::SETCC, MVT::v2i1, Expand);
246  setOperationAction(ISD::SETCC, MVT::v4i1, Expand);
247  AddPromotedToType(ISD::SETCC, MVT::i1, MVT::i32);
248
249  setOperationAction(ISD::TRUNCATE, MVT::v2i32, Expand);
250  setOperationAction(ISD::FP_ROUND, MVT::v2f32, Expand);
251  setOperationAction(ISD::TRUNCATE, MVT::v4i32, Expand);
252  setOperationAction(ISD::FP_ROUND, MVT::v4f32, Expand);
253  setOperationAction(ISD::TRUNCATE, MVT::v8i32, Expand);
254  setOperationAction(ISD::FP_ROUND, MVT::v8f32, Expand);
255  setOperationAction(ISD::TRUNCATE, MVT::v16i32, Expand);
256  setOperationAction(ISD::FP_ROUND, MVT::v16f32, Expand);
257
258  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Custom);
259  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Custom);
260  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Custom);
261  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Custom);
262  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Custom);
263  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v3i16, Custom);
264  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Custom);
265  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Custom);
266
267  setOperationAction(ISD::BRCOND, MVT::Other, Custom);
268  setOperationAction(ISD::BR_CC, MVT::i1, Expand);
269  setOperationAction(ISD::BR_CC, MVT::i32, Expand);
270  setOperationAction(ISD::BR_CC, MVT::i64, Expand);
271  setOperationAction(ISD::BR_CC, MVT::f32, Expand);
272  setOperationAction(ISD::BR_CC, MVT::f64, Expand);
273
274  setOperationAction(ISD::UADDO, MVT::i32, Legal);
275  setOperationAction(ISD::USUBO, MVT::i32, Legal);
276
277  setOperationAction(ISD::ADDCARRY, MVT::i32, Legal);
278  setOperationAction(ISD::SUBCARRY, MVT::i32, Legal);
279
280  setOperationAction(ISD::SHL_PARTS, MVT::i64, Expand);
281  setOperationAction(ISD::SRA_PARTS, MVT::i64, Expand);
282  setOperationAction(ISD::SRL_PARTS, MVT::i64, Expand);
283
284#if 0
285  setOperationAction(ISD::ADDCARRY, MVT::i64, Legal);
286  setOperationAction(ISD::SUBCARRY, MVT::i64, Legal);
287#endif
288
289  // We only support LOAD/STORE and vector manipulation ops for vectors
290  // with > 4 elements.
291  for (MVT VT : { MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32,
292                  MVT::v2i64, MVT::v2f64, MVT::v4i16, MVT::v4f16,
293                  MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64,
294                  MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32 }) {
295    for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
296      switch (Op) {
297      case ISD::LOAD:
298      case ISD::STORE:
299      case ISD::BUILD_VECTOR:
300      case ISD::BITCAST:
301      case ISD::EXTRACT_VECTOR_ELT:
302      case ISD::INSERT_VECTOR_ELT:
303      case ISD::INSERT_SUBVECTOR:
304      case ISD::EXTRACT_SUBVECTOR:
305      case ISD::SCALAR_TO_VECTOR:
306        break;
307      case ISD::CONCAT_VECTORS:
308        setOperationAction(Op, VT, Custom);
309        break;
310      default:
311        setOperationAction(Op, VT, Expand);
312        break;
313      }
314    }
315  }
316
317  setOperationAction(ISD::FP_EXTEND, MVT::v4f32, Expand);
318
319  // TODO: For dynamic 64-bit vector inserts/extracts, should emit a pseudo that
320  // is expanded to avoid having two separate loops in case the index is a VGPR.
321
322  // Most operations are naturally 32-bit vector operations. We only support
323  // load and store of i64 vectors, so promote v2i64 vector operations to v4i32.
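  // Promote together with AddPromotedToType makes the legalizer perform these
  // operations in the corresponding 32-bit vector type and bitcast between
  // e.g. v2i64 and v4i32, so only 32-bit element vectors need real handling.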
324  for (MVT Vec64 : { MVT::v2i64, MVT::v2f64 }) {
325    setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
326    AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v4i32);
327
328    setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote);
329    AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v4i32);
330
331    setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote);
332    AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v4i32);
333
334    setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote);
335    AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v4i32);
336  }
337
338  for (MVT Vec64 : { MVT::v4i64, MVT::v4f64 }) {
339    setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
340    AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v8i32);
341
342    setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote);
343    AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v8i32);
344
345    setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote);
346    AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v8i32);
347
348    setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote);
349    AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v8i32);
350  }
351
352  for (MVT Vec64 : { MVT::v8i64, MVT::v8f64 }) {
353    setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
354    AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v16i32);
355
356    setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote);
357    AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v16i32);
358
359    setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote);
360    AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v16i32);
361
362    setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote);
363    AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v16i32);
364  }
365
366  for (MVT Vec64 : { MVT::v16i64, MVT::v16f64 }) {
367    setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
368    AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v32i32);
369
370    setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote);
371    AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v32i32);
372
373    setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote);
374    AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v32i32);
375
376    setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote);
377    AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v32i32);
378  }
379
380  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i32, Expand);
381  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8f32, Expand);
382  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i32, Expand);
383  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16f32, Expand);
384
385  setOperationAction(ISD::BUILD_VECTOR, MVT::v4f16, Custom);
386  setOperationAction(ISD::BUILD_VECTOR, MVT::v4i16, Custom);
387
388  // Avoid stack access for these.
389  // TODO: Generalize to more vector types.
390  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i16, Custom);
391  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f16, Custom);
392  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i16, Custom);
393  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f16, Custom);
394
395  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom);
396  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom);
397  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i8, Custom);
398  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i8, Custom);
399  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i8, Custom);
400
401  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i8, Custom);
402  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i8, Custom);
403  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i8, Custom);
404
405  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i16, Custom);
406  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f16, Custom);
407  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i16, Custom);
408  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f16, Custom);
409
410  // Deal with vec3 vector operations when widened to vec4.
411  setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v3i32, Custom);
412  setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v3f32, Custom);
413  setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v4i32, Custom);
414  setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v4f32, Custom);
415
416  // Deal with vec5 vector operations when widened to vec8.
417  setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v5i32, Custom);
418  setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v5f32, Custom);
419  setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8i32, Custom);
420  setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8f32, Custom);
421
422  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling,
423  // and output demarshalling
424  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Custom);
425  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Custom);
426
427  // We can't return success/failure, only the old value,
428  // let LLVM add the comparison
429  setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i32, Expand);
430  setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i64, Expand);
431
432  if (Subtarget->hasFlatAddressSpace()) {
433    setOperationAction(ISD::ADDRSPACECAST, MVT::i32, Custom);
434    setOperationAction(ISD::ADDRSPACECAST, MVT::i64, Custom);
435  }
436
437  setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);
438
439  // FIXME: This should be narrowed to i32, but that only happens if i64 is
440  // illegal.
441  // FIXME: Should lower sub-i32 bswaps to bit-ops without v_perm_b32.
442  setOperationAction(ISD::BSWAP, MVT::i64, Legal);
443  setOperationAction(ISD::BSWAP, MVT::i32, Legal);
444
445  // This is s_memtime on SI, and s_memrealtime on VI.
446  setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);
447  setOperationAction(ISD::TRAP, MVT::Other, Custom);
448  setOperationAction(ISD::DEBUGTRAP, MVT::Other, Custom);
449
450  if (Subtarget->has16BitInsts()) {
451    setOperationAction(ISD::FPOW, MVT::f16, Promote);
452    setOperationAction(ISD::FLOG, MVT::f16, Custom);
453    setOperationAction(ISD::FEXP, MVT::f16, Custom);
454    setOperationAction(ISD::FLOG10, MVT::f16, Custom);
455  }
456
457  if (Subtarget->hasMadMacF32Insts())
458    setOperationAction(ISD::FMAD, MVT::f32, Legal);
459
460  if (!Subtarget->hasBFI()) {
461    // fcopysign can be done in a single instruction with BFI.
462    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
463    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
464  }
465
466  if (!Subtarget->hasBCNT(32))
467    setOperationAction(ISD::CTPOP, MVT::i32, Expand);
468
469  if (!Subtarget->hasBCNT(64))
470    setOperationAction(ISD::CTPOP, MVT::i64, Expand);
471
472  if (Subtarget->hasFFBH())
473    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Custom);
474
475  if (Subtarget->hasFFBL())
476    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Custom);
477
478  // We only really have 32-bit BFE instructions (and 16-bit on VI).
479  //
480  // On SI+ there are 64-bit BFEs, but they are scalar only and there isn't any
481  // effort to match them now. We want this to be false for i64 cases when the
482  // extraction isn't restricted to the upper or lower half. Ideally we would
483  // have some pass reduce 64-bit extracts to 32-bit if possible. Extracts that
484  // span the midpoint are probably relatively rare, so don't worry about them
485  // for now.
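  // (A BFE extracts Width bits starting at bit Offset; for the unsigned form
  // the result is (Src >> Offset) & ((1 << Width) - 1).)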
486  if (Subtarget->hasBFE())
487    setHasExtractBitsInsn(true);
488
489  setOperationAction(ISD::FMINNUM, MVT::f32, Custom);
490  setOperationAction(ISD::FMAXNUM, MVT::f32, Custom);
491  setOperationAction(ISD::FMINNUM, MVT::f64, Custom);
492  setOperationAction(ISD::FMAXNUM, MVT::f64, Custom);
493
494
495  // These are really only legal for ieee_mode functions. We should be avoiding
496  // them for functions that don't have ieee_mode enabled, so just say they are
497  // legal.
498  setOperationAction(ISD::FMINNUM_IEEE, MVT::f32, Legal);
499  setOperationAction(ISD::FMAXNUM_IEEE, MVT::f32, Legal);
500  setOperationAction(ISD::FMINNUM_IEEE, MVT::f64, Legal);
501  setOperationAction(ISD::FMAXNUM_IEEE, MVT::f64, Legal);
502
503
504  if (Subtarget->haveRoundOpsF64()) {
505    setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
506    setOperationAction(ISD::FCEIL, MVT::f64, Legal);
507    setOperationAction(ISD::FRINT, MVT::f64, Legal);
508  } else {
509    setOperationAction(ISD::FCEIL, MVT::f64, Custom);
510    setOperationAction(ISD::FTRUNC, MVT::f64, Custom);
511    setOperationAction(ISD::FRINT, MVT::f64, Custom);
512    setOperationAction(ISD::FFLOOR, MVT::f64, Custom);
513  }
514
515  setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
516
517  setOperationAction(ISD::FSIN, MVT::f32, Custom);
518  setOperationAction(ISD::FCOS, MVT::f32, Custom);
519  setOperationAction(ISD::FDIV, MVT::f32, Custom);
520  setOperationAction(ISD::FDIV, MVT::f64, Custom);
521
522  if (Subtarget->has16BitInsts()) {
523    setOperationAction(ISD::Constant, MVT::i16, Legal);
524
525    setOperationAction(ISD::SMIN, MVT::i16, Legal);
526    setOperationAction(ISD::SMAX, MVT::i16, Legal);
527
528    setOperationAction(ISD::UMIN, MVT::i16, Legal);
529    setOperationAction(ISD::UMAX, MVT::i16, Legal);
530
531    setOperationAction(ISD::SIGN_EXTEND, MVT::i16, Promote);
532    AddPromotedToType(ISD::SIGN_EXTEND, MVT::i16, MVT::i32);
533
534    setOperationAction(ISD::ROTR, MVT::i16, Promote);
535    setOperationAction(ISD::ROTL, MVT::i16, Promote);
536
537    setOperationAction(ISD::SDIV, MVT::i16, Promote);
538    setOperationAction(ISD::UDIV, MVT::i16, Promote);
539    setOperationAction(ISD::SREM, MVT::i16, Promote);
540    setOperationAction(ISD::UREM, MVT::i16, Promote);
541
542    setOperationAction(ISD::BITREVERSE, MVT::i16, Promote);
543
544    setOperationAction(ISD::CTTZ, MVT::i16, Promote);
545    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16, Promote);
546    setOperationAction(ISD::CTLZ, MVT::i16, Promote);
547    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16, Promote);
548    setOperationAction(ISD::CTPOP, MVT::i16, Promote);
549
550    setOperationAction(ISD::SELECT_CC, MVT::i16, Expand);
551
552    setOperationAction(ISD::BR_CC, MVT::i16, Expand);
553
554    setOperationAction(ISD::LOAD, MVT::i16, Custom);
555
556    setTruncStoreAction(MVT::i64, MVT::i16, Expand);
557
558    setOperationAction(ISD::FP16_TO_FP, MVT::i16, Promote);
559    AddPromotedToType(ISD::FP16_TO_FP, MVT::i16, MVT::i32);
560    setOperationAction(ISD::FP_TO_FP16, MVT::i16, Promote);
561    AddPromotedToType(ISD::FP_TO_FP16, MVT::i16, MVT::i32);
562
563    setOperationAction(ISD::FP_TO_SINT, MVT::i16, Promote);
564    setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote);
565
566    // F16 - Constant Actions.
567    setOperationAction(ISD::ConstantFP, MVT::f16, Legal);
568
569    // F16 - Load/Store Actions.
570    setOperationAction(ISD::LOAD, MVT::f16, Promote);
571    AddPromotedToType(ISD::LOAD, MVT::f16, MVT::i16);
572    setOperationAction(ISD::STORE, MVT::f16, Promote);
573    AddPromotedToType(ISD::STORE, MVT::f16, MVT::i16);
574
575    // F16 - VOP1 Actions.
576    setOperationAction(ISD::FP_ROUND, MVT::f16, Custom);
577    setOperationAction(ISD::FCOS, MVT::f16, Custom);
578    setOperationAction(ISD::FSIN, MVT::f16, Custom);
579
580    setOperationAction(ISD::SINT_TO_FP, MVT::i16, Custom);
581    setOperationAction(ISD::UINT_TO_FP, MVT::i16, Custom);
582
583    setOperationAction(ISD::FP_TO_SINT, MVT::f16, Promote);
584    setOperationAction(ISD::FP_TO_UINT, MVT::f16, Promote);
585    setOperationAction(ISD::SINT_TO_FP, MVT::f16, Promote);
586    setOperationAction(ISD::UINT_TO_FP, MVT::f16, Promote);
587    setOperationAction(ISD::FROUND, MVT::f16, Custom);
588
589    // F16 - VOP2 Actions.
590    setOperationAction(ISD::BR_CC, MVT::f16, Expand);
591    setOperationAction(ISD::SELECT_CC, MVT::f16, Expand);
592
593    setOperationAction(ISD::FDIV, MVT::f16, Custom);
594
595    // F16 - VOP3 Actions.
596    setOperationAction(ISD::FMA, MVT::f16, Legal);
597    if (STI.hasMadF16())
598      setOperationAction(ISD::FMAD, MVT::f16, Legal);
599
600    for (MVT VT : {MVT::v2i16, MVT::v2f16, MVT::v4i16, MVT::v4f16}) {
601      for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
602        switch (Op) {
603        case ISD::LOAD:
604        case ISD::STORE:
605        case ISD::BUILD_VECTOR:
606        case ISD::BITCAST:
607        case ISD::EXTRACT_VECTOR_ELT:
608        case ISD::INSERT_VECTOR_ELT:
609        case ISD::INSERT_SUBVECTOR:
610        case ISD::EXTRACT_SUBVECTOR:
611        case ISD::SCALAR_TO_VECTOR:
612          break;
613        case ISD::CONCAT_VECTORS:
614          setOperationAction(Op, VT, Custom);
615          break;
616        default:
617          setOperationAction(Op, VT, Expand);
618          break;
619        }
620      }
621    }
622
623    // v_perm_b32 can handle either of these.
624    setOperationAction(ISD::BSWAP, MVT::i16, Legal);
625    setOperationAction(ISD::BSWAP, MVT::v2i16, Legal);
626    setOperationAction(ISD::BSWAP, MVT::v4i16, Custom);
627
628    // XXX - Do these do anything? Vector constants turn into build_vector.
629    setOperationAction(ISD::Constant, MVT::v2i16, Legal);
630    setOperationAction(ISD::ConstantFP, MVT::v2f16, Legal);
631
632    setOperationAction(ISD::UNDEF, MVT::v2i16, Legal);
633    setOperationAction(ISD::UNDEF, MVT::v2f16, Legal);
634
635    setOperationAction(ISD::STORE, MVT::v2i16, Promote);
636    AddPromotedToType(ISD::STORE, MVT::v2i16, MVT::i32);
637    setOperationAction(ISD::STORE, MVT::v2f16, Promote);
638    AddPromotedToType(ISD::STORE, MVT::v2f16, MVT::i32);
639
640    setOperationAction(ISD::LOAD, MVT::v2i16, Promote);
641    AddPromotedToType(ISD::LOAD, MVT::v2i16, MVT::i32);
642    setOperationAction(ISD::LOAD, MVT::v2f16, Promote);
643    AddPromotedToType(ISD::LOAD, MVT::v2f16, MVT::i32);
644
645    setOperationAction(ISD::AND, MVT::v2i16, Promote);
646    AddPromotedToType(ISD::AND, MVT::v2i16, MVT::i32);
647    setOperationAction(ISD::OR, MVT::v2i16, Promote);
648    AddPromotedToType(ISD::OR, MVT::v2i16, MVT::i32);
649    setOperationAction(ISD::XOR, MVT::v2i16, Promote);
650    AddPromotedToType(ISD::XOR, MVT::v2i16, MVT::i32);
651
652    setOperationAction(ISD::LOAD, MVT::v4i16, Promote);
653    AddPromotedToType(ISD::LOAD, MVT::v4i16, MVT::v2i32);
654    setOperationAction(ISD::LOAD, MVT::v4f16, Promote);
655    AddPromotedToType(ISD::LOAD, MVT::v4f16, MVT::v2i32);
656
657    setOperationAction(ISD::STORE, MVT::v4i16, Promote);
658    AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
659    setOperationAction(ISD::STORE, MVT::v4f16, Promote);
660    AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
661
662    setOperationAction(ISD::ANY_EXTEND, MVT::v2i32, Expand);
663    setOperationAction(ISD::ZERO_EXTEND, MVT::v2i32, Expand);
664    setOperationAction(ISD::SIGN_EXTEND, MVT::v2i32, Expand);
665    setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Expand);
666
667    setOperationAction(ISD::ANY_EXTEND, MVT::v4i32, Expand);
668    setOperationAction(ISD::ZERO_EXTEND, MVT::v4i32, Expand);
669    setOperationAction(ISD::SIGN_EXTEND, MVT::v4i32, Expand);
670
671    if (!Subtarget->hasVOP3PInsts()) {
672      setOperationAction(ISD::BUILD_VECTOR, MVT::v2i16, Custom);
673      setOperationAction(ISD::BUILD_VECTOR, MVT::v2f16, Custom);
674    }
675
676    setOperationAction(ISD::FNEG, MVT::v2f16, Legal);
677    // This isn't really legal, but this avoids the legalizer unrolling it (and
678    // allows matching fneg (fabs x) patterns)
679    setOperationAction(ISD::FABS, MVT::v2f16, Legal);
680
681    setOperationAction(ISD::FMAXNUM, MVT::f16, Custom);
682    setOperationAction(ISD::FMINNUM, MVT::f16, Custom);
683    setOperationAction(ISD::FMAXNUM_IEEE, MVT::f16, Legal);
684    setOperationAction(ISD::FMINNUM_IEEE, MVT::f16, Legal);
685
686    setOperationAction(ISD::FMINNUM_IEEE, MVT::v4f16, Custom);
687    setOperationAction(ISD::FMAXNUM_IEEE, MVT::v4f16, Custom);
688
689    setOperationAction(ISD::FMINNUM, MVT::v4f16, Expand);
690    setOperationAction(ISD::FMAXNUM, MVT::v4f16, Expand);
691  }
692
693  if (Subtarget->hasVOP3PInsts()) {
694    setOperationAction(ISD::ADD, MVT::v2i16, Legal);
695    setOperationAction(ISD::SUB, MVT::v2i16, Legal);
696    setOperationAction(ISD::MUL, MVT::v2i16, Legal);
697    setOperationAction(ISD::SHL, MVT::v2i16, Legal);
698    setOperationAction(ISD::SRL, MVT::v2i16, Legal);
699    setOperationAction(ISD::SRA, MVT::v2i16, Legal);
700    setOperationAction(ISD::SMIN, MVT::v2i16, Legal);
701    setOperationAction(ISD::UMIN, MVT::v2i16, Legal);
702    setOperationAction(ISD::SMAX, MVT::v2i16, Legal);
703    setOperationAction(ISD::UMAX, MVT::v2i16, Legal);
704
705    setOperationAction(ISD::FADD, MVT::v2f16, Legal);
706    setOperationAction(ISD::FMUL, MVT::v2f16, Legal);
707    setOperationAction(ISD::FMA, MVT::v2f16, Legal);
708
709    setOperationAction(ISD::FMINNUM_IEEE, MVT::v2f16, Legal);
710    setOperationAction(ISD::FMAXNUM_IEEE, MVT::v2f16, Legal);
711
712    setOperationAction(ISD::FCANONICALIZE, MVT::v2f16, Legal);
713
714    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom);
715    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom);
716
717    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f16, Custom);
718    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i16, Custom);
719
720    setOperationAction(ISD::SHL, MVT::v4i16, Custom);
721    setOperationAction(ISD::SRA, MVT::v4i16, Custom);
722    setOperationAction(ISD::SRL, MVT::v4i16, Custom);
723    setOperationAction(ISD::ADD, MVT::v4i16, Custom);
724    setOperationAction(ISD::SUB, MVT::v4i16, Custom);
725    setOperationAction(ISD::MUL, MVT::v4i16, Custom);
726
727    setOperationAction(ISD::SMIN, MVT::v4i16, Custom);
728    setOperationAction(ISD::SMAX, MVT::v4i16, Custom);
729    setOperationAction(ISD::UMIN, MVT::v4i16, Custom);
730    setOperationAction(ISD::UMAX, MVT::v4i16, Custom);
731
732    setOperationAction(ISD::FADD, MVT::v4f16, Custom);
733    setOperationAction(ISD::FMUL, MVT::v4f16, Custom);
734    setOperationAction(ISD::FMA, MVT::v4f16, Custom);
735
736    setOperationAction(ISD::FMAXNUM, MVT::v2f16, Custom);
737    setOperationAction(ISD::FMINNUM, MVT::v2f16, Custom);
738
739    setOperationAction(ISD::FMINNUM, MVT::v4f16, Custom);
740    setOperationAction(ISD::FMAXNUM, MVT::v4f16, Custom);
741    setOperationAction(ISD::FCANONICALIZE, MVT::v4f16, Custom);
742
743    setOperationAction(ISD::FEXP, MVT::v2f16, Custom);
744    setOperationAction(ISD::SELECT, MVT::v4i16, Custom);
745    setOperationAction(ISD::SELECT, MVT::v4f16, Custom);
746  }
747
748  setOperationAction(ISD::FNEG, MVT::v4f16, Custom);
749  setOperationAction(ISD::FABS, MVT::v4f16, Custom);
750
751  if (Subtarget->has16BitInsts()) {
752    setOperationAction(ISD::SELECT, MVT::v2i16, Promote);
753    AddPromotedToType(ISD::SELECT, MVT::v2i16, MVT::i32);
754    setOperationAction(ISD::SELECT, MVT::v2f16, Promote);
755    AddPromotedToType(ISD::SELECT, MVT::v2f16, MVT::i32);
756  } else {
757    // Legalization hack.
758    setOperationAction(ISD::SELECT, MVT::v2i16, Custom);
759    setOperationAction(ISD::SELECT, MVT::v2f16, Custom);
760
761    setOperationAction(ISD::FNEG, MVT::v2f16, Custom);
762    setOperationAction(ISD::FABS, MVT::v2f16, Custom);
763  }
764
765  for (MVT VT : { MVT::v4i16, MVT::v4f16, MVT::v2i8, MVT::v4i8, MVT::v8i8 }) {
766    setOperationAction(ISD::SELECT, VT, Custom);
767  }
768
769  setOperationAction(ISD::SMULO, MVT::i64, Custom);
770  setOperationAction(ISD::UMULO, MVT::i64, Custom);
771
772  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
773  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f32, Custom);
774  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom);
775  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i16, Custom);
776  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f16, Custom);
777  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2i16, Custom);
778  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2f16, Custom);
779
780  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v2f16, Custom);
781  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v2i16, Custom);
782  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v4f16, Custom);
783  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v4i16, Custom);
784  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v8f16, Custom);
785  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
786  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::f16, Custom);
787  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i16, Custom);
788  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i8, Custom);
789
790  setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
791  setOperationAction(ISD::INTRINSIC_VOID, MVT::v2i16, Custom);
792  setOperationAction(ISD::INTRINSIC_VOID, MVT::v2f16, Custom);
793  setOperationAction(ISD::INTRINSIC_VOID, MVT::v4f16, Custom);
794  setOperationAction(ISD::INTRINSIC_VOID, MVT::v4i16, Custom);
795  setOperationAction(ISD::INTRINSIC_VOID, MVT::f16, Custom);
796  setOperationAction(ISD::INTRINSIC_VOID, MVT::i16, Custom);
797  setOperationAction(ISD::INTRINSIC_VOID, MVT::i8, Custom);
798
799  setTargetDAGCombine(ISD::ADD);
800  setTargetDAGCombine(ISD::ADDCARRY);
801  setTargetDAGCombine(ISD::SUB);
802  setTargetDAGCombine(ISD::SUBCARRY);
803  setTargetDAGCombine(ISD::FADD);
804  setTargetDAGCombine(ISD::FSUB);
805  setTargetDAGCombine(ISD::FMINNUM);
806  setTargetDAGCombine(ISD::FMAXNUM);
807  setTargetDAGCombine(ISD::FMINNUM_IEEE);
808  setTargetDAGCombine(ISD::FMAXNUM_IEEE);
809  setTargetDAGCombine(ISD::FMA);
810  setTargetDAGCombine(ISD::SMIN);
811  setTargetDAGCombine(ISD::SMAX);
812  setTargetDAGCombine(ISD::UMIN);
813  setTargetDAGCombine(ISD::UMAX);
814  setTargetDAGCombine(ISD::SETCC);
815  setTargetDAGCombine(ISD::AND);
816  setTargetDAGCombine(ISD::OR);
817  setTargetDAGCombine(ISD::XOR);
818  setTargetDAGCombine(ISD::SINT_TO_FP);
819  setTargetDAGCombine(ISD::UINT_TO_FP);
820  setTargetDAGCombine(ISD::FCANONICALIZE);
821  setTargetDAGCombine(ISD::SCALAR_TO_VECTOR);
822  setTargetDAGCombine(ISD::ZERO_EXTEND);
823  setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
824  setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
825  setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
826
827  // All memory operations. Some folding on the pointer operand is done to help
828  // matching the constant offsets in the addressing modes.
829  setTargetDAGCombine(ISD::LOAD);
830  setTargetDAGCombine(ISD::STORE);
831  setTargetDAGCombine(ISD::ATOMIC_LOAD);
832  setTargetDAGCombine(ISD::ATOMIC_STORE);
833  setTargetDAGCombine(ISD::ATOMIC_CMP_SWAP);
834  setTargetDAGCombine(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS);
835  setTargetDAGCombine(ISD::ATOMIC_SWAP);
836  setTargetDAGCombine(ISD::ATOMIC_LOAD_ADD);
837  setTargetDAGCombine(ISD::ATOMIC_LOAD_SUB);
838  setTargetDAGCombine(ISD::ATOMIC_LOAD_AND);
839  setTargetDAGCombine(ISD::ATOMIC_LOAD_OR);
840  setTargetDAGCombine(ISD::ATOMIC_LOAD_XOR);
841  setTargetDAGCombine(ISD::ATOMIC_LOAD_NAND);
842  setTargetDAGCombine(ISD::ATOMIC_LOAD_MIN);
843  setTargetDAGCombine(ISD::ATOMIC_LOAD_MAX);
844  setTargetDAGCombine(ISD::ATOMIC_LOAD_UMIN);
845  setTargetDAGCombine(ISD::ATOMIC_LOAD_UMAX);
846  setTargetDAGCombine(ISD::ATOMIC_LOAD_FADD);
847
848  // FIXME: In other contexts we pretend this is a per-function property.
849  setStackPointerRegisterToSaveRestore(AMDGPU::SGPR32);
850
851  setSchedulingPreference(Sched::RegPressure);
852}
853
854const GCNSubtarget *SITargetLowering::getSubtarget() const {
855  return Subtarget;
856}
857
858//===----------------------------------------------------------------------===//
859// TargetLowering queries
860//===----------------------------------------------------------------------===//
861
862// v_mad_mix* support a conversion from f16 to f32.
863//
864// There is only one special case, when denormals are enabled, where this would
865// still be OK to use, but we don't currently handle it.
866bool SITargetLowering::isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode,
867                                       EVT DestVT, EVT SrcVT) const {
868  return ((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) ||
869          (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) &&
870    DestVT.getScalarType() == MVT::f32 &&
871    SrcVT.getScalarType() == MVT::f16 &&
872    // TODO: This probably only requires no input flushing?
873    !hasFP32Denormals(DAG.getMachineFunction());
874}
875
876bool SITargetLowering::isShuffleMaskLegal(ArrayRef<int>, EVT) const {
877  // SI has some legal vector types, but no legal vector operations. Say no
878  // shuffles are legal in order to prefer scalarizing some vector operations.
879  return false;
880}
881
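// For non-kernel calling conventions, vectors are passed piecewise: 32-bit
// elements use one register each, wider elements are split into i32 pieces,
// and 16-bit elements are packed two per register when 16-bit instructions
// are available. For example, v3f32 takes three f32 registers and v4f16 takes
// two v2f16 registers on subtargets with has16BitInsts().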
882MVT SITargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
883                                                    CallingConv::ID CC,
884                                                    EVT VT) const {
885  if (CC == CallingConv::AMDGPU_KERNEL)
886    return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
887
888  if (VT.isVector()) {
889    EVT ScalarVT = VT.getScalarType();
890    unsigned Size = ScalarVT.getSizeInBits();
891    if (Size == 32)
892      return ScalarVT.getSimpleVT();
893
894    if (Size > 32)
895      return MVT::i32;
896
897    if (Size == 16 && Subtarget->has16BitInsts())
898      return VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
899  } else if (VT.getSizeInBits() > 32)
900    return MVT::i32;
901
902  return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
903}
904
905unsigned SITargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
906                                                         CallingConv::ID CC,
907                                                         EVT VT) const {
908  if (CC == CallingConv::AMDGPU_KERNEL)
909    return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
910
911  if (VT.isVector()) {
912    unsigned NumElts = VT.getVectorNumElements();
913    EVT ScalarVT = VT.getScalarType();
914    unsigned Size = ScalarVT.getSizeInBits();
915
916    if (Size == 32)
917      return NumElts;
918
919    if (Size > 32)
920      return NumElts * ((Size + 31) / 32);
921
922    if (Size == 16 && Subtarget->has16BitInsts())
923      return (NumElts + 1) / 2;
924  } else if (VT.getSizeInBits() > 32)
925    return (VT.getSizeInBits() + 31) / 32;
926
927  return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
928}
929
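// Keep this consistent with getRegisterTypeForCallingConv and
// getNumRegistersForCallingConv above: vectors are broken into 32-bit pieces,
// except that 16-bit elements are packed into v2i16/v2f16 intermediates.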
930unsigned SITargetLowering::getVectorTypeBreakdownForCallingConv(
931  LLVMContext &Context, CallingConv::ID CC,
932  EVT VT, EVT &IntermediateVT,
933  unsigned &NumIntermediates, MVT &RegisterVT) const {
934  if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
935    unsigned NumElts = VT.getVectorNumElements();
936    EVT ScalarVT = VT.getScalarType();
937    unsigned Size = ScalarVT.getSizeInBits();
938    if (Size == 32) {
939      RegisterVT = ScalarVT.getSimpleVT();
940      IntermediateVT = RegisterVT;
941      NumIntermediates = NumElts;
942      return NumIntermediates;
943    }
944
945    if (Size > 32) {
946      RegisterVT = MVT::i32;
947      IntermediateVT = RegisterVT;
948      NumIntermediates = NumElts * ((Size + 31) / 32);
949      return NumIntermediates;
950    }
951
952    // FIXME: We should fix the ABI to be the same on targets without 16-bit
953    // support, but unless we can properly handle 3-vectors, it will still be
954    // inconsistent.
955    if (Size == 16 && Subtarget->has16BitInsts()) {
956      RegisterVT = VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
957      IntermediateVT = RegisterVT;
958      NumIntermediates = (NumElts + 1) / 2;
959      return NumIntermediates;
960    }
961  }
962
963  return TargetLowering::getVectorTypeBreakdownForCallingConv(
964    Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
965}
966
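// Compute the memory type actually accessed by an image intrinsic: the IR
// result or data type may have more elements than the dmask enables, so clamp
// the vector width to DMaskLanes.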
967static EVT memVTFromImageData(Type *Ty, unsigned DMaskLanes) {
968  assert(DMaskLanes != 0);
969
970  if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
971    unsigned NumElts = std::min(DMaskLanes, VT->getNumElements());
972    return EVT::getVectorVT(Ty->getContext(),
973                            EVT::getEVT(VT->getElementType()),
974                            NumElts);
975  }
976
977  return EVT::getEVT(Ty);
978}
979
980// Peek through TFE struct returns to only use the data size.
981static EVT memVTFromImageReturn(Type *Ty, unsigned DMaskLanes) {
982  auto *ST = dyn_cast<StructType>(Ty);
983  if (!ST)
984    return memVTFromImageData(Ty, DMaskLanes);
985
986  // Some intrinsics return an aggregate type - special case to work out the
987  // correct memVT.
988  //
989  // Only limited forms of aggregate type currently expected.
990  if (ST->getNumContainedTypes() != 2 ||
991      !ST->getContainedType(1)->isIntegerTy(32))
992    return EVT();
993  return memVTFromImageData(ST->getContainedType(0), DMaskLanes);
994}
995
996bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
997                                          const CallInst &CI,
998                                          MachineFunction &MF,
999                                          unsigned IntrID) const {
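  // Buffer and image intrinsics found in the rsrc table are classified by
  // their declared memory behavior: readnone means no memory access, readonly
  // becomes a load, writeonly becomes a store, and anything else is treated
  // as a read-modify-write (atomic) access.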
1000  if (const AMDGPU::RsrcIntrinsic *RsrcIntr =
1001          AMDGPU::lookupRsrcIntrinsic(IntrID)) {
1002    AttributeList Attr = Intrinsic::getAttributes(CI.getContext(),
1003                                                  (Intrinsic::ID)IntrID);
1004    if (Attr.hasFnAttribute(Attribute::ReadNone))
1005      return false;
1006
1007    SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1008
1009    if (RsrcIntr->IsImage) {
1010      Info.ptrVal = MFI->getImagePSV(
1011        *MF.getSubtarget<GCNSubtarget>().getInstrInfo(),
1012        CI.getArgOperand(RsrcIntr->RsrcArg));
1013      Info.align.reset();
1014    } else {
1015      Info.ptrVal = MFI->getBufferPSV(
1016        *MF.getSubtarget<GCNSubtarget>().getInstrInfo(),
1017        CI.getArgOperand(RsrcIntr->RsrcArg));
1018    }
1019
1020    Info.flags = MachineMemOperand::MODereferenceable;
1021    if (Attr.hasFnAttribute(Attribute::ReadOnly)) {
1022      unsigned DMaskLanes = 4;
1023
1024      if (RsrcIntr->IsImage) {
1025        const AMDGPU::ImageDimIntrinsicInfo *Intr
1026          = AMDGPU::getImageDimIntrinsicInfo(IntrID);
1027        const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
1028          AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
1029
1030        if (!BaseOpcode->Gather4) {
1031          // If this isn't a gather, we may have excess loaded elements in the
1032          // IR type. Check the dmask for the real number of elements loaded.
1033          unsigned DMask
1034            = cast<ConstantInt>(CI.getArgOperand(0))->getZExtValue();
1035          DMaskLanes = DMask == 0 ? 1 : countPopulation(DMask);
1036        }
1037
1038        Info.memVT = memVTFromImageReturn(CI.getType(), DMaskLanes);
1039      } else
1040        Info.memVT = EVT::getEVT(CI.getType());
1041
1042      // FIXME: What does alignment mean for an image?
1043      Info.opc = ISD::INTRINSIC_W_CHAIN;
1044      Info.flags |= MachineMemOperand::MOLoad;
1045    } else if (Attr.hasFnAttribute(Attribute::WriteOnly)) {
1046      Info.opc = ISD::INTRINSIC_VOID;
1047
1048      Type *DataTy = CI.getArgOperand(0)->getType();
1049      if (RsrcIntr->IsImage) {
1050        unsigned DMask = cast<ConstantInt>(CI.getArgOperand(1))->getZExtValue();
1051        unsigned DMaskLanes = DMask == 0 ? 1 : countPopulation(DMask);
1052        Info.memVT = memVTFromImageData(DataTy, DMaskLanes);
1053      } else
1054        Info.memVT = EVT::getEVT(DataTy);
1055
1056      Info.flags |= MachineMemOperand::MOStore;
1057    } else {
1058      // Atomic
1059      Info.opc = ISD::INTRINSIC_W_CHAIN;
1060      Info.memVT = MVT::getVT(CI.getType());
1061      Info.flags = MachineMemOperand::MOLoad |
1062                   MachineMemOperand::MOStore |
1063                   MachineMemOperand::MODereferenceable;
1064
1065      // XXX - Should this be volatile without known ordering?
1066      Info.flags |= MachineMemOperand::MOVolatile;
1067    }
1068    return true;
1069  }
1070
1071  switch (IntrID) {
1072  case Intrinsic::amdgcn_atomic_inc:
1073  case Intrinsic::amdgcn_atomic_dec:
1074  case Intrinsic::amdgcn_ds_ordered_add:
1075  case Intrinsic::amdgcn_ds_ordered_swap:
1076  case Intrinsic::amdgcn_ds_fadd:
1077  case Intrinsic::amdgcn_ds_fmin:
1078  case Intrinsic::amdgcn_ds_fmax: {
1079    Info.opc = ISD::INTRINSIC_W_CHAIN;
1080    Info.memVT = MVT::getVT(CI.getType());
1081    Info.ptrVal = CI.getOperand(0);
1082    Info.align.reset();
1083    Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
1084
1085    const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(4));
1086    if (!Vol->isZero())
1087      Info.flags |= MachineMemOperand::MOVolatile;
1088
1089    return true;
1090  }
1091  case Intrinsic::amdgcn_buffer_atomic_fadd: {
1092    SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1093
1094    Info.opc = ISD::INTRINSIC_VOID;
1095    Info.memVT = MVT::getVT(CI.getOperand(0)->getType());
1096    Info.ptrVal = MFI->getBufferPSV(
1097      *MF.getSubtarget<GCNSubtarget>().getInstrInfo(),
1098      CI.getArgOperand(1));
1099    Info.align.reset();
1100    Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
1101
1102    const ConstantInt *Vol = dyn_cast<ConstantInt>(CI.getOperand(4));
1103    if (!Vol || !Vol->isZero())
1104      Info.flags |= MachineMemOperand::MOVolatile;
1105
1106    return true;
1107  }
1108  case Intrinsic::amdgcn_global_atomic_fadd: {
1109    Info.opc = ISD::INTRINSIC_VOID;
1110    Info.memVT = MVT::getVT(CI.getOperand(0)->getType()
1111                            ->getPointerElementType());
1112    Info.ptrVal = CI.getOperand(0);
1113    Info.align.reset();
1114    Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
1115
1116    return true;
1117  }
1118  case Intrinsic::amdgcn_ds_append:
1119  case Intrinsic::amdgcn_ds_consume: {
1120    Info.opc = ISD::INTRINSIC_W_CHAIN;
1121    Info.memVT = MVT::getVT(CI.getType());
1122    Info.ptrVal = CI.getOperand(0);
1123    Info.align.reset();
1124    Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
1125
1126    const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(1));
1127    if (!Vol->isZero())
1128      Info.flags |= MachineMemOperand::MOVolatile;
1129
1130    return true;
1131  }
1132  case Intrinsic::amdgcn_global_atomic_csub: {
1133    Info.opc = ISD::INTRINSIC_W_CHAIN;
1134    Info.memVT = MVT::getVT(CI.getType());
1135    Info.ptrVal = CI.getOperand(0);
1136    Info.align.reset();
1137    Info.flags = MachineMemOperand::MOLoad |
1138                 MachineMemOperand::MOStore |
1139                 MachineMemOperand::MODereferenceable |
1140                 MachineMemOperand::MOVolatile;
1141    return true;
1142  }
1143  case Intrinsic::amdgcn_ds_gws_init:
1144  case Intrinsic::amdgcn_ds_gws_barrier:
1145  case Intrinsic::amdgcn_ds_gws_sema_v:
1146  case Intrinsic::amdgcn_ds_gws_sema_br:
1147  case Intrinsic::amdgcn_ds_gws_sema_p:
1148  case Intrinsic::amdgcn_ds_gws_sema_release_all: {
1149    Info.opc = ISD::INTRINSIC_VOID;
1150
1151    SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1152    Info.ptrVal =
1153        MFI->getGWSPSV(*MF.getSubtarget<GCNSubtarget>().getInstrInfo());
1154
1155    // This is an abstract access, but we need to specify a type and size.
1156    Info.memVT = MVT::i32;
1157    Info.size = 4;
1158    Info.align = Align(4);
1159
1160    Info.flags = MachineMemOperand::MOStore;
1161    if (IntrID == Intrinsic::amdgcn_ds_gws_barrier)
1162      Info.flags = MachineMemOperand::MOLoad;
1163    return true;
1164  }
1165  default:
1166    return false;
1167  }
1168}
1169
1170bool SITargetLowering::getAddrModeArguments(IntrinsicInst *II,
1171                                            SmallVectorImpl<Value*> &Ops,
1172                                            Type *&AccessTy) const {
1173  switch (II->getIntrinsicID()) {
1174  case Intrinsic::amdgcn_atomic_inc:
1175  case Intrinsic::amdgcn_atomic_dec:
1176  case Intrinsic::amdgcn_ds_ordered_add:
1177  case Intrinsic::amdgcn_ds_ordered_swap:
1178  case Intrinsic::amdgcn_ds_fadd:
1179  case Intrinsic::amdgcn_ds_fmin:
1180  case Intrinsic::amdgcn_ds_fmax: {
1181    Value *Ptr = II->getArgOperand(0);
1182    AccessTy = II->getType();
1183    Ops.push_back(Ptr);
1184    return true;
1185  }
1186  default:
1187    return false;
1188  }
1189}
1190
1191bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM) const {
1192  if (!Subtarget->hasFlatInstOffsets()) {
1193    // Flat instructions do not have offsets, and only have the register
1194    // address.
1195    return AM.BaseOffs == 0 && AM.Scale == 0;
1196  }
1197
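  // With flat instruction offsets, allow at most a base register plus an
  // immediate that fits in the (unsigned here) FLAT offset field; scaled
  // indices are never supported.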
1198  return AM.Scale == 0 &&
1199         (AM.BaseOffs == 0 || Subtarget->getInstrInfo()->isLegalFLATOffset(
1200                                  AM.BaseOffs, AMDGPUAS::FLAT_ADDRESS,
1201                                  /*Signed=*/false));
1202}
1203
1204bool SITargetLowering::isLegalGlobalAddressingMode(const AddrMode &AM) const {
1205  if (Subtarget->hasFlatGlobalInsts())
1206    return AM.Scale == 0 &&
1207           (AM.BaseOffs == 0 || Subtarget->getInstrInfo()->isLegalFLATOffset(
1208                                    AM.BaseOffs, AMDGPUAS::GLOBAL_ADDRESS,
1209                                    /*Signed=*/true));
1210
1211  if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) {
1212    // Assume that we will use FLAT for all global memory accesses
1213    // on VI.
1214    // FIXME: This assumption is currently wrong.  On VI we still use
1215    // MUBUF instructions for the r + i addressing mode.  As currently
1216    // implemented, the MUBUF instructions only work on buffers < 4GB.
1217    // It may be possible to support > 4GB buffers with MUBUF instructions,
1218    // by setting the stride value in the resource descriptor which would
1219    // increase the size limit to (stride * 4GB).  However, this is risky,
1220    // because it has never been validated.
1221    return isLegalFlatAddressingMode(AM);
1222  }
1223
1224  return isLegalMUBUFAddressingMode(AM);
1225}
1226
1227bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
1228  // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
1229  // additionally can do r + r + i with addr64. 32-bit has more addressing
1230  // mode options. Depending on the resource constant, it can also do
1231  // (i64 r0) + (i32 r1) * (i14 i).
1232  //
1233  // Private arrays end up using a scratch buffer most of the time, so also
1234  // assume those use MUBUF instructions. Scratch loads / stores are currently
1235  // implemented as mubuf instructions with offen bit set, so slightly
1236  // different than the normal addr64.
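  // For example, a byte offset in the range [0, 4095] is directly encodable
  // here; anything larger is rejected (but see the soffset FIXME below).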
1237  if (!isUInt<12>(AM.BaseOffs))
1238    return false;
1239
1240  // FIXME: Since we can split immediate into soffset and immediate offset,
1241  // would it make sense to allow any immediate?
1242
1243  switch (AM.Scale) {
1244  case 0: // r + i or just i, depending on HasBaseReg.
1245    return true;
1246  case 1:
1247    return true; // We have r + r or r + i.
1248  case 2:
1249    if (AM.HasBaseReg) {
1250      // Reject 2 * r + r.
1251      return false;
1252    }
1253
1254    // Allow 2 * r as r + r
1255    // Or  2 * r + i is allowed as r + r + i.
1256    return true;
1257  default: // Don't allow n * r
1258    return false;
1259  }
1260}
1261
1262bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
1263                                             const AddrMode &AM, Type *Ty,
1264                                             unsigned AS, Instruction *I) const {
1265  // No global is ever allowed as a base.
1266  if (AM.BaseGV)
1267    return false;
1268
1269  if (AS == AMDGPUAS::GLOBAL_ADDRESS)
1270    return isLegalGlobalAddressingMode(AM);
1271
1272  if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
1273      AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
1274      AS == AMDGPUAS::BUFFER_FAT_POINTER) {
1275    // If the offset isn't a multiple of 4, it probably isn't going to be
1276    // correctly aligned.
1277    // FIXME: Can we get the real alignment here?
1278    if (AM.BaseOffs % 4 != 0)
1279      return isLegalMUBUFAddressingMode(AM);
1280
1281    // There are no SMRD extloads, so if we have to do a small type access we
1282    // will use a MUBUF load.
1283    // FIXME?: We also need to do this if unaligned, but we don't know the
1284    // alignment here.
1285    if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4)
1286      return isLegalGlobalAddressingMode(AM);
1287
1288    if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) {
1289      // SMRD instructions have an 8-bit, dword offset on SI.
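      // That is, only byte offsets up to 255 * 4 = 1020 (already known to be
      // a multiple of 4 at this point) are encodable.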
1290      if (!isUInt<8>(AM.BaseOffs / 4))
1291        return false;
1292    } else if (Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) {
1293      // On CI+, this can also be a 32-bit literal constant offset. If it fits
1294      // in 8-bits, it can use a smaller encoding.
1295      if (!isUInt<32>(AM.BaseOffs / 4))
1296        return false;
1297    } else if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
1298      // On VI, these use the SMEM format and the offset is 20-bit in bytes.
1299      if (!isUInt<20>(AM.BaseOffs))
1300        return false;
1301    } else
1302      llvm_unreachable("unhandled generation");
1303
1304    if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1305      return true;
1306
1307    if (AM.Scale == 1 && AM.HasBaseReg)
1308      return true;
1309
1310    return false;
1311
1312  } else if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
1313    return isLegalMUBUFAddressingMode(AM);
1314  } else if (AS == AMDGPUAS::LOCAL_ADDRESS ||
1315             AS == AMDGPUAS::REGION_ADDRESS) {
1316    // Basic, single offset DS instructions allow a 16-bit unsigned immediate
1317    // field.
1318    // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have
1319    // an 8-bit dword offset but we don't know the alignment here.
1320    if (!isUInt<16>(AM.BaseOffs))
1321      return false;
1322
1323    if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1324      return true;
1325
1326    if (AM.Scale == 1 && AM.HasBaseReg)
1327      return true;
1328
1329    return false;
1330  } else if (AS == AMDGPUAS::FLAT_ADDRESS ||
1331             AS == AMDGPUAS::UNKNOWN_ADDRESS_SPACE) {
1332    // For an unknown address space, this usually means that this is for some
1333    // reason being used for pure arithmetic, and not based on some addressing
1334    // computation. We don't have instructions that compute pointers with any
1335    // addressing modes, so treat them as having no offset like flat
1336    // instructions.
1337    return isLegalFlatAddressingMode(AM);
1338  }
1339
1340  // Assume a user alias of global for unknown address spaces.
1341  return isLegalGlobalAddressingMode(AM);
1342}
1343
1344bool SITargetLowering::canMergeStoresTo(unsigned AS, EVT MemVT,
1345                                        const SelectionDAG &DAG) const {
1346  if (AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) {
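    // Global and flat accesses use at most dwordx4 (128-bit) load/store
    // instructions, so don't merge stores beyond that width.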
1347    return (MemVT.getSizeInBits() <= 4 * 32);
1348  } else if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
1349    unsigned MaxPrivateBits = 8 * getSubtarget()->getMaxPrivateElementSize();
1350    return (MemVT.getSizeInBits() <= MaxPrivateBits);
1351  } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
1352    return (MemVT.getSizeInBits() <= 2 * 32);
1353  }
1354  return true;
1355}
1356
1357bool SITargetLowering::allowsMisalignedMemoryAccessesImpl(
1358    unsigned Size, unsigned AddrSpace, unsigned Align,
1359    MachineMemOperand::Flags Flags, bool *IsFast) const {
1360  if (IsFast)
1361    *IsFast = false;
1362
1363  if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
1364      AddrSpace == AMDGPUAS::REGION_ADDRESS) {
1365    // ds_read/write_b64 require 8-byte alignment, but we can do a 4-byte
1366    // aligned, 8-byte access in a single operation using ds_read2/write2_b32
1367    // with adjacent offsets.
1368    bool AlignedBy4 = (Align % 4 == 0);
1369    if (IsFast)
1370      *IsFast = AlignedBy4;
1371
1372    return AlignedBy4;
1373  }
1374
1375  // FIXME: We have to be conservative here and assume that flat operations
1376  // will access scratch.  If we had access to the IR function, then we
1377  // could determine if any private memory was used in the function.
1378  if (!Subtarget->hasUnalignedScratchAccess() &&
1379      (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ||
1380       AddrSpace == AMDGPUAS::FLAT_ADDRESS)) {
1381    bool AlignedBy4 = Align >= 4;
1382    if (IsFast)
1383      *IsFast = AlignedBy4;
1384
1385    return AlignedBy4;
1386  }
1387
1388  if (Subtarget->hasUnalignedBufferAccess()) {
1389    // If we have a uniform constant load, it still requires using a slow
1390    // buffer instruction if unaligned.
1391    if (IsFast) {
1392      // Accesses can really be issued as 1-byte aligned or 4-byte aligned, so
1393      // 2-byte alignment is worse than 1 unless doing a 2-byte access.
1394      *IsFast = (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS ||
1395                 AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) ?
1396        Align >= 4 : Align != 2;
1397    }
1398
1399    return true;
1400  }
1401
1402  // Values smaller than a dword must be aligned.
1403  if (Size < 32)
1404    return false;
1405
1406  // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the
1407  // byte-address are ignored, thus forcing Dword alignment.
1408  // This applies to private, global, and constant memory.
1409  if (IsFast)
1410    *IsFast = true;
1411
1412  return Size >= 32 && Align >= 4;
1413}
1414
1415bool SITargetLowering::allowsMisalignedMemoryAccesses(
1416    EVT VT, unsigned AddrSpace, unsigned Align, MachineMemOperand::Flags Flags,
1417    bool *IsFast) const {
1418  if (IsFast)
1419    *IsFast = false;
1420
1421  // TODO: I think v3i32 should allow unaligned accesses on CI with DS_READ_B96,
1422  // which isn't a simple VT.
1423  // Until MVT is extended to handle this, simply check for the size and
1424  // rely on the condition below: allow accesses if the size is a multiple of 4.
1425  if (VT == MVT::Other || (VT != MVT::Other && VT.getSizeInBits() > 1024 &&
1426                           VT.getStoreSize() > 16)) {
1427    return false;
1428  }
1429
1430  return allowsMisalignedMemoryAccessesImpl(VT.getSizeInBits(), AddrSpace,
1431                                            Align, Flags, IsFast);
1432}
1433
1434EVT SITargetLowering::getOptimalMemOpType(
1435    const MemOp &Op, const AttributeList &FuncAttributes) const {
1436  // FIXME: Should account for address space here.
1437
1438  // The default fallback uses the private pointer size as a guess for a type to
1439  // use. Make sure we switch these to 64-bit accesses.
1440
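  // Prefer 128-bit (v4i32) and 64-bit (v2i32) chunks for sufficiently large,
  // dword-aligned operations.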
1441  if (Op.size() >= 16 &&
1442      Op.isDstAligned(Align(4))) // XXX: Should only do for global
1443    return MVT::v4i32;
1444
1445  if (Op.size() >= 8 && Op.isDstAligned(Align(4)))
1446    return MVT::v2i32;
1447
1448  // Use the default.
1449  return MVT::Other;
1450}
1451
1452bool SITargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
1453                                           unsigned DestAS) const {
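  // Flat and global pointers share the same underlying 64-bit address, so a
  // cast between these address spaces needs no code.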
1454  return isFlatGlobalAddrSpace(SrcAS) && isFlatGlobalAddrSpace(DestAS);
1455}
1456
1457bool SITargetLowering::isMemOpHasNoClobberedMemOperand(const SDNode *N) const {
1458  const MemSDNode *MemNode = cast<MemSDNode>(N);
1459  const Value *Ptr = MemNode->getMemOperand()->getValue();
1460  const Instruction *I = dyn_cast_or_null<Instruction>(Ptr);
1461  return I && I->getMetadata("amdgpu.noclobber");
1462}
1463
1464bool SITargetLowering::isFreeAddrSpaceCast(unsigned SrcAS,
1465                                           unsigned DestAS) const {
1466  // Flat -> private/local is a simple truncate.
1467  // Flat -> global is a no-op.
1468  if (SrcAS == AMDGPUAS::FLAT_ADDRESS)
1469    return true;
1470
1471  return isNoopAddrSpaceCast(SrcAS, DestAS);
1472}
1473
1474bool SITargetLowering::isMemOpUniform(const SDNode *N) const {
1475  const MemSDNode *MemNode = cast<MemSDNode>(N);
1476
1477  return AMDGPUInstrInfo::isUniformMMO(MemNode->getMemOperand());
1478}
1479
1480TargetLoweringBase::LegalizeTypeAction
1481SITargetLowering::getPreferredVectorAction(MVT VT) const {
1482  int NumElts = VT.getVectorNumElements();
1483  if (NumElts != 1 && VT.getScalarType().bitsLE(MVT::i16))
1484    return VT.isPow2VectorType() ? TypeSplitVector : TypeWidenVector;
1485  return TargetLoweringBase::getPreferredVectorAction(VT);
1486}
1487
1488bool SITargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
1489                                                         Type *Ty) const {
1490  // FIXME: Could be smarter if called for vector constants.
1491  return true;
1492}
1493
1494bool SITargetLowering::isTypeDesirableForOp(unsigned Op, EVT VT) const {
1495  if (Subtarget->has16BitInsts() && VT == MVT::i16) {
1496    switch (Op) {
1497    case ISD::LOAD:
1498    case ISD::STORE:
1499
1500    // These operations are done with 32-bit instructions anyway.
1501    case ISD::AND:
1502    case ISD::OR:
1503    case ISD::XOR:
1504    case ISD::SELECT:
1505      // TODO: Extensions?
1506      return true;
1507    default:
1508      return false;
1509    }
1510  }
1511
1512  // SimplifySetCC uses this function to determine whether or not it should
1513  // create setcc with i1 operands.  We don't have instructions for i1 setcc.
1514  if (VT == MVT::i1 && Op == ISD::SETCC)
1515    return false;
1516
1517  return TargetLowering::isTypeDesirableForOp(Op, VT);
1518}
1519
1520SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG,
1521                                                   const SDLoc &SL,
1522                                                   SDValue Chain,
1523                                                   uint64_t Offset) const {
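  // Form a pointer into the kernarg segment by offsetting the preloaded
  // KERNARG_SEGMENT_PTR value by the argument's byte offset.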
1524  const DataLayout &DL = DAG.getDataLayout();
1525  MachineFunction &MF = DAG.getMachineFunction();
1526  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1527
1528  const ArgDescriptor *InputPtrReg;
1529  const TargetRegisterClass *RC;
1530  LLT ArgTy;
1531
1532  std::tie(InputPtrReg, RC, ArgTy) =
1533      Info->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
1534
1535  MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
1536  MVT PtrVT = getPointerTy(DL, AMDGPUAS::CONSTANT_ADDRESS);
1537  SDValue BasePtr = DAG.getCopyFromReg(Chain, SL,
1538    MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);
1539
1540  return DAG.getObjectPtrOffset(SL, BasePtr, Offset);
1541}
1542
1543SDValue SITargetLowering::getImplicitArgPtr(SelectionDAG &DAG,
1544                                            const SDLoc &SL) const {
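  // Implicit arguments live in the kernarg segment immediately after the
  // explicit kernel arguments.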
1545  uint64_t Offset = getImplicitParameterOffset(DAG.getMachineFunction(),
1546                                               FIRST_IMPLICIT);
1547  return lowerKernArgParameterPtr(DAG, SL, DAG.getEntryNode(), Offset);
1548}
1549
1550SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT,
1551                                         const SDLoc &SL, SDValue Val,
1552                                         bool Signed,
1553                                         const ISD::InputArg *Arg) const {
1554  // First, if it is a widened vector, narrow it.
1555  if (VT.isVector() &&
1556      VT.getVectorNumElements() != MemVT.getVectorNumElements()) {
1557    EVT NarrowedVT =
1558        EVT::getVectorVT(*DAG.getContext(), MemVT.getVectorElementType(),
1559                         VT.getVectorNumElements());
1560    Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, NarrowedVT, Val,
1561                      DAG.getConstant(0, SL, MVT::i32));
1562  }
1563
1564  // Then convert the vector elements or scalar value.
1565  if (Arg && (Arg->Flags.isSExt() || Arg->Flags.isZExt()) &&
1566      VT.bitsLT(MemVT)) {
1567    unsigned Opc = Arg->Flags.isZExt() ? ISD::AssertZext : ISD::AssertSext;
1568    Val = DAG.getNode(Opc, SL, MemVT, Val, DAG.getValueType(VT));
1569  }
1570
1571  if (MemVT.isFloatingPoint())
1572    Val = getFPExtOrFPRound(DAG, Val, SL, VT);
1573  else if (Signed)
1574    Val = DAG.getSExtOrTrunc(Val, SL, VT);
1575  else
1576    Val = DAG.getZExtOrTrunc(Val, SL, VT);
1577
1578  return Val;
1579}
1580
1581SDValue SITargetLowering::lowerKernargMemParameter(
1582    SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL, SDValue Chain,
1583    uint64_t Offset, Align Alignment, bool Signed,
1584    const ISD::InputArg *Arg) const {
1585  MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1586
1587  // Try to avoid using an extload by loading earlier than the argument address,
1588  // and extracting the relevant bits. The load should hopefully be merged with
1589  // the previous argument.
1590  if (MemVT.getStoreSize() < 4 && Alignment < 4) {
1591    // TODO: Handle align < 4 and size >= 4 (can happen with packed structs).
1592    int64_t AlignDownOffset = alignDown(Offset, 4);
1593    int64_t OffsetDiff = Offset - AlignDownOffset;
1594
1595    EVT IntVT = MemVT.changeTypeToInteger();
1596
1597    // TODO: If we passed in the base kernel offset we could have a better
1598    // alignment than 4, but we don't really need it.
1599    SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset);
1600    SDValue Load = DAG.getLoad(MVT::i32, SL, Chain, Ptr, PtrInfo, 4,
1601                               MachineMemOperand::MODereferenceable |
1602                               MachineMemOperand::MOInvariant);
1603
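    // Shift the loaded dword right by the byte offset, in bits, so the
    // argument's bits end up at the bottom of the register.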
1604    SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, SL, MVT::i32);
1605    SDValue Extract = DAG.getNode(ISD::SRL, SL, MVT::i32, Load, ShiftAmt);
1606
1607    SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, SL, IntVT, Extract);
1608    ArgVal = DAG.getNode(ISD::BITCAST, SL, MemVT, ArgVal);
1609    ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal, Signed, Arg);
1610
1612    return DAG.getMergeValues({ ArgVal, Load.getValue(1) }, SL);
1613  }
1614
1615  SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset);
1616  SDValue Load = DAG.getLoad(MemVT, SL, Chain, Ptr, PtrInfo, Alignment,
1617                             MachineMemOperand::MODereferenceable |
1618                                 MachineMemOperand::MOInvariant);
1619
1620  SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg);
1621  return DAG.getMergeValues({ Val, Load.getValue(1) }, SL);
1622}
1623
1624SDValue SITargetLowering::lowerStackParameter(SelectionDAG &DAG, CCValAssign &VA,
1625                                              const SDLoc &SL, SDValue Chain,
1626                                              const ISD::InputArg &Arg) const {
1627  MachineFunction &MF = DAG.getMachineFunction();
1628  MachineFrameInfo &MFI = MF.getFrameInfo();
1629
1630  if (Arg.Flags.isByVal()) {
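    // For a byval argument the caller has already placed the data on the
    // stack; just hand back the address of the fixed stack object.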
1631    unsigned Size = Arg.Flags.getByValSize();
1632    int FrameIdx = MFI.CreateFixedObject(Size, VA.getLocMemOffset(), false);
1633    return DAG.getFrameIndex(FrameIdx, MVT::i32);
1634  }
1635
1636  unsigned ArgOffset = VA.getLocMemOffset();
1637  unsigned ArgSize = VA.getValVT().getStoreSize();
1638
1639  int FI = MFI.CreateFixedObject(ArgSize, ArgOffset, true);
1640
1641  // Create load nodes to retrieve arguments from the stack.
1642  SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
1643  SDValue ArgValue;
1644
1645  // For NON_EXTLOAD, the generic code in getLoad asserts that ValVT == MemVT.
1646  ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;
1647  MVT MemVT = VA.getValVT();
1648
1649  switch (VA.getLocInfo()) {
1650  default:
1651    break;
1652  case CCValAssign::BCvt:
1653    MemVT = VA.getLocVT();
1654    break;
1655  case CCValAssign::SExt:
1656    ExtType = ISD::SEXTLOAD;
1657    break;
1658  case CCValAssign::ZExt:
1659    ExtType = ISD::ZEXTLOAD;
1660    break;
1661  case CCValAssign::AExt:
1662    ExtType = ISD::EXTLOAD;
1663    break;
1664  }
1665
1666  ArgValue = DAG.getExtLoad(
1667    ExtType, SL, VA.getLocVT(), Chain, FIN,
1668    MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI),
1669    MemVT);
1670  return ArgValue;
1671}
1672
1673SDValue SITargetLowering::getPreloadedValue(SelectionDAG &DAG,
1674  const SIMachineFunctionInfo &MFI,
1675  EVT VT,
1676  AMDGPUFunctionArgInfo::PreloadedValue PVID) const {
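  // Look up the register assigned to this preloaded value and materialize it
  // as a live-in copy with the requested type.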
1677  const ArgDescriptor *Reg;
1678  const TargetRegisterClass *RC;
1679  LLT Ty;
1680
1681  std::tie(Reg, RC, Ty) = MFI.getPreloadedValue(PVID);
1682  return CreateLiveInRegister(DAG, RC, Reg->getRegister(), VT);
1683}
1684
1685static void processShaderInputArgs(SmallVectorImpl<ISD::InputArg> &Splits,
1686                                   CallingConv::ID CallConv,
1687                                   ArrayRef<ISD::InputArg> Ins,
1688                                   BitVector &Skipped,
1689                                   FunctionType *FType,
1690                                   SIMachineFunctionInfo *Info) {
1691  for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
1692    const ISD::InputArg *Arg = &Ins[I];
1693
1694    assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
1695           "vector type argument should have been split");
1696
1697    // First check if it's a PS input addr.
1698    if (CallConv == CallingConv::AMDGPU_PS &&
1699        !Arg->Flags.isInReg() && PSInputNum <= 15) {
1700      bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(PSInputNum);
1701
1702      // Inconveniently only the first part of the split is marked as isSplit,
1703      // so skip to the end. We only want to increment PSInputNum once for the
1704      // entire split argument.
1705      if (Arg->Flags.isSplit()) {
1706        while (!Arg->Flags.isSplitEnd()) {
1707          assert((!Arg->VT.isVector() ||
1708                  Arg->VT.getScalarSizeInBits() == 16) &&
1709                 "unexpected vector split in ps argument type");
1710          if (!SkipArg)
1711            Splits.push_back(*Arg);
1712          Arg = &Ins[++I];
1713        }
1714      }
1715
1716      if (SkipArg) {
1717        // We can safely skip PS inputs.
1718        Skipped.set(Arg->getOrigArgIndex());
1719        ++PSInputNum;
1720        continue;
1721      }
1722
1723      Info->markPSInputAllocated(PSInputNum);
1724      if (Arg->Used)
1725        Info->markPSInputEnabled(PSInputNum);
1726
1727      ++PSInputNum;
1728    }
1729
1730    Splits.push_back(*Arg);
1731  }
1732}
1733
1734// Allocate special inputs passed in VGPRs.
1735void SITargetLowering::allocateSpecialEntryInputVGPRs(CCState &CCInfo,
1736                                                      MachineFunction &MF,
1737                                                      const SIRegisterInfo &TRI,
1738                                                      SIMachineFunctionInfo &Info) const {
1739  const LLT S32 = LLT::scalar(32);
1740  MachineRegisterInfo &MRI = MF.getRegInfo();
1741
1742  if (Info.hasWorkItemIDX()) {
1743    Register Reg = AMDGPU::VGPR0;
1744    MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
1745
1746    CCInfo.AllocateReg(Reg);
1747    Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg));
1748  }
1749
1750  if (Info.hasWorkItemIDY()) {
1751    Register Reg = AMDGPU::VGPR1;
1752    MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
1753
1754    CCInfo.AllocateReg(Reg);
1755    Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg));
1756  }
1757
1758  if (Info.hasWorkItemIDZ()) {
1759    Register Reg = AMDGPU::VGPR2;
1760    MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
1761
1762    CCInfo.AllocateReg(Reg);
1763    Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg));
1764  }
1765}
1766
1767// Try to allocate a VGPR at the end of the argument list, or if no argument
1768// VGPRs are left, allocate a stack slot.
1769// If \p Mask is given, it indicates the bitfield position in the register.
1770// If \p Arg is given, use it with the new \p Mask instead of allocating.
1771static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask = ~0u,
1772                                         ArgDescriptor Arg = ArgDescriptor()) {
1773  if (Arg.isSet())
1774    return ArgDescriptor::createArg(Arg, Mask);
1775
1776  ArrayRef<MCPhysReg> ArgVGPRs
1777    = makeArrayRef(AMDGPU::VGPR_32RegClass.begin(), 32);
1778  unsigned RegIdx = CCInfo.getFirstUnallocated(ArgVGPRs);
1779  if (RegIdx == ArgVGPRs.size()) {
1780    // Spill to stack required.
1781    int64_t Offset = CCInfo.AllocateStack(4, Align(4));
1782
1783    return ArgDescriptor::createStack(Offset, Mask);
1784  }
1785
1786  unsigned Reg = ArgVGPRs[RegIdx];
1787  Reg = CCInfo.AllocateReg(Reg);
1788  assert(Reg != AMDGPU::NoRegister);
1789
1790  MachineFunction &MF = CCInfo.getMachineFunction();
1791  Register LiveInVReg = MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
1792  MF.getRegInfo().setType(LiveInVReg, LLT::scalar(32));
1793  return ArgDescriptor::createRegister(Reg, Mask);
1794}
1795
1796static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo,
1797                                             const TargetRegisterClass *RC,
1798                                             unsigned NumArgRegs) {
1799  ArrayRef<MCPhysReg> ArgSGPRs = makeArrayRef(RC->begin(), 32);
1800  unsigned RegIdx = CCInfo.getFirstUnallocated(ArgSGPRs);
1801  if (RegIdx == ArgSGPRs.size())
1802    report_fatal_error("ran out of SGPRs for arguments");
1803
1804  unsigned Reg = ArgSGPRs[RegIdx];
1805  Reg = CCInfo.AllocateReg(Reg);
1806  assert(Reg != AMDGPU::NoRegister);
1807
1808  MachineFunction &MF = CCInfo.getMachineFunction();
1809  MF.addLiveIn(Reg, RC);
1810  return ArgDescriptor::createRegister(Reg);
1811}
1812
1813static ArgDescriptor allocateSGPR32Input(CCState &CCInfo) {
1814  return allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_32RegClass, 32);
1815}
1816
1817static ArgDescriptor allocateSGPR64Input(CCState &CCInfo) {
1818  return allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_64RegClass, 16);
1819}
1820
1821/// Allocate implicit function VGPR arguments at the end of allocated user
1822/// arguments.
1823void SITargetLowering::allocateSpecialInputVGPRs(
1824  CCState &CCInfo, MachineFunction &MF,
1825  const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {
1826  const unsigned Mask = 0x3ff;
1827  ArgDescriptor Arg;
1828
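  // The three workitem IDs share one 32-bit VGPR when possible: X in bits
  // 0-9, Y in bits 10-19, and Z in bits 20-29. Y and Z reuse the descriptor
  // allocated for X by passing it back in with a shifted mask.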
1829  if (Info.hasWorkItemIDX()) {
1830    Arg = allocateVGPR32Input(CCInfo, Mask);
1831    Info.setWorkItemIDX(Arg);
1832  }
1833
1834  if (Info.hasWorkItemIDY()) {
1835    Arg = allocateVGPR32Input(CCInfo, Mask << 10, Arg);
1836    Info.setWorkItemIDY(Arg);
1837  }
1838
1839  if (Info.hasWorkItemIDZ())
1840    Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo, Mask << 20, Arg));
1841}
1842
1843/// Allocate implicit function VGPR arguments in fixed registers.
1844void SITargetLowering::allocateSpecialInputVGPRsFixed(
1845  CCState &CCInfo, MachineFunction &MF,
1846  const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {
1847  Register Reg = CCInfo.AllocateReg(AMDGPU::VGPR31);
1848  if (!Reg)
1849    report_fatal_error("failed to allocate VGPR for implicit arguments");
1850
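  // With the fixed ABI all three workitem IDs are packed into this VGPR:
  // X in bits 0-9, Y in bits 10-19, and Z in bits 20-29.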
1851  const unsigned Mask = 0x3ff;
1852  Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
1853  Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg, Mask << 10));
1854  Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg, Mask << 20));
1855}
1856
1857void SITargetLowering::allocateSpecialInputSGPRs(
1858  CCState &CCInfo,
1859  MachineFunction &MF,
1860  const SIRegisterInfo &TRI,
1861  SIMachineFunctionInfo &Info) const {
1862  auto &ArgInfo = Info.getArgInfo();
1863
1864  // TODO: Unify handling with private memory pointers.
1865
1866  if (Info.hasDispatchPtr())
1867    ArgInfo.DispatchPtr = allocateSGPR64Input(CCInfo);
1868
1869  if (Info.hasQueuePtr())
1870    ArgInfo.QueuePtr = allocateSGPR64Input(CCInfo);
1871
1872  // Implicit arg ptr takes the place of the kernarg segment pointer. This is a
1873  // constant offset from the kernarg segment.
1874  if (Info.hasImplicitArgPtr())
1875    ArgInfo.ImplicitArgPtr = allocateSGPR64Input(CCInfo);
1876
1877  if (Info.hasDispatchID())
1878    ArgInfo.DispatchID = allocateSGPR64Input(CCInfo);
1879
1880  // flat_scratch_init is not applicable for non-kernel functions.
1881
1882  if (Info.hasWorkGroupIDX())
1883    ArgInfo.WorkGroupIDX = allocateSGPR32Input(CCInfo);
1884
1885  if (Info.hasWorkGroupIDY())
1886    ArgInfo.WorkGroupIDY = allocateSGPR32Input(CCInfo);
1887
1888  if (Info.hasWorkGroupIDZ())
1889    ArgInfo.WorkGroupIDZ = allocateSGPR32Input(CCInfo);
1890}
1891
1892// Allocate special inputs passed in user SGPRs.
1893void SITargetLowering::allocateHSAUserSGPRs(CCState &CCInfo,
1894                                            MachineFunction &MF,
1895                                            const SIRegisterInfo &TRI,
1896                                            SIMachineFunctionInfo &Info) const {
1897  if (Info.hasImplicitBufferPtr()) {
1898    unsigned ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI);
1899    MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
1900    CCInfo.AllocateReg(ImplicitBufferPtrReg);
1901  }
1902
1903  // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
1904  if (Info.hasPrivateSegmentBuffer()) {
1905    unsigned PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
1906    MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
1907    CCInfo.AllocateReg(PrivateSegmentBufferReg);
1908  }
1909
1910  if (Info.hasDispatchPtr()) {
1911    unsigned DispatchPtrReg = Info.addDispatchPtr(TRI);
1912    MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
1913    CCInfo.AllocateReg(DispatchPtrReg);
1914  }
1915
1916  if (Info.hasQueuePtr()) {
1917    unsigned QueuePtrReg = Info.addQueuePtr(TRI);
1918    MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
1919    CCInfo.AllocateReg(QueuePtrReg);
1920  }
1921
1922  if (Info.hasKernargSegmentPtr()) {
1923    MachineRegisterInfo &MRI = MF.getRegInfo();
1924    Register InputPtrReg = Info.addKernargSegmentPtr(TRI);
1925    CCInfo.AllocateReg(InputPtrReg);
1926
1927    Register VReg = MF.addLiveIn(InputPtrReg, &AMDGPU::SGPR_64RegClass);
1928    MRI.setType(VReg, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1929  }
1930
1931  if (Info.hasDispatchID()) {
1932    unsigned DispatchIDReg = Info.addDispatchID(TRI);
1933    MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
1934    CCInfo.AllocateReg(DispatchIDReg);
1935  }
1936
1937  if (Info.hasFlatScratchInit()) {
1938    unsigned FlatScratchInitReg = Info.addFlatScratchInit(TRI);
1939    MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
1940    CCInfo.AllocateReg(FlatScratchInitReg);
1941  }
1942
1943  // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
1944  // these from the dispatch pointer.
1945}
1946
1947// Allocate special input registers that are initialized per-wave.
1948void SITargetLowering::allocateSystemSGPRs(CCState &CCInfo,
1949                                           MachineFunction &MF,
1950                                           SIMachineFunctionInfo &Info,
1951                                           CallingConv::ID CallConv,
1952                                           bool IsShader) const {
1953  if (Info.hasWorkGroupIDX()) {
1954    unsigned Reg = Info.addWorkGroupIDX();
1955    MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
1956    CCInfo.AllocateReg(Reg);
1957  }
1958
1959  if (Info.hasWorkGroupIDY()) {
1960    unsigned Reg = Info.addWorkGroupIDY();
1961    MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
1962    CCInfo.AllocateReg(Reg);
1963  }
1964
1965  if (Info.hasWorkGroupIDZ()) {
1966    unsigned Reg = Info.addWorkGroupIDZ();
1967    MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
1968    CCInfo.AllocateReg(Reg);
1969  }
1970
1971  if (Info.hasWorkGroupInfo()) {
1972    unsigned Reg = Info.addWorkGroupInfo();
1973    MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
1974    CCInfo.AllocateReg(Reg);
1975  }
1976
1977  if (Info.hasPrivateSegmentWaveByteOffset()) {
1978    // Scratch wave offset passed in system SGPR.
1979    unsigned PrivateSegmentWaveByteOffsetReg;
1980
1981    if (IsShader) {
1982      PrivateSegmentWaveByteOffsetReg =
1983        Info.getPrivateSegmentWaveByteOffsetSystemSGPR();
1984
1985      // This is true if the scratch wave byte offset doesn't have a fixed
1986      // location.
1987      if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
1988        PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo);
1989        Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
1990      }
1991    } else
1992      PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();
1993
1994    MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
1995    CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
1996  }
1997}
1998
1999static void reservePrivateMemoryRegs(const TargetMachine &TM,
2000                                     MachineFunction &MF,
2001                                     const SIRegisterInfo &TRI,
2002                                     SIMachineFunctionInfo &Info) {
2003  // Now that we've figured out where the scratch register inputs are, see if
2004  // we should reserve the arguments and use them directly.
2005  MachineFrameInfo &MFI = MF.getFrameInfo();
2006  bool HasStackObjects = MFI.hasStackObjects();
2007  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
2008
2009  // Record that we know we have non-spill stack objects so we don't need to
2010  // check all stack objects later.
2011  if (HasStackObjects)
2012    Info.setHasNonSpillStackObjects(true);
2013
2014  // Everything live out of a block is spilled with fast regalloc, so it's
2015  // almost certain that spilling will be required.
2016  if (TM.getOptLevel() == CodeGenOpt::None)
2017    HasStackObjects = true;
2018
2019  // For now assume stack access is needed in any callee function, so we need
2020  // the scratch registers to pass in.
2021  bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();
2022
2023  if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.getFunction())) {
2024    // If we have stack objects, we unquestionably need the private buffer
2025    // resource. For the Code Object V2 ABI, this will be the first 4 user
2026    // SGPR inputs. We can reserve those and use them directly.
2027
2028    Register PrivateSegmentBufferReg =
2029        Info.getPreloadedReg(AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
2030    Info.setScratchRSrcReg(PrivateSegmentBufferReg);
2031  } else {
2032    unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
2033    // We tentatively reserve the highest available registers (skipping the
2034    // very last ones, which may contain VCC, FLAT_SCR, and XNACK). After
2035    // register allocation, we'll replace these with the registers immediately
2036    // after those that were really allocated. In the prologue, copies will be
2037    // inserted from the argument to these reserved registers.
2038
2039    // Without HSA, relocations are used for the scratch pointer and the
2040    // buffer resource setup is always inserted in the prologue. Scratch wave
2041    // offset is still in an input SGPR.
2042    Info.setScratchRSrcReg(ReservedBufferReg);
2043  }
2044
2045  MachineRegisterInfo &MRI = MF.getRegInfo();
2046
2047  // For entry functions we have to set up the stack pointer if we use it,
2048  // whereas non-entry functions get this "for free". This means there is no
2049  // intrinsic advantage to using S32 over S34 in cases where we do not have
2050  // calls but do need a frame pointer (i.e. if we are requested to have one
2051  // because frame pointer elimination is disabled). To keep things simple we
2052  // only ever use S32 as the call ABI stack pointer, and so using it does not
2053  // imply we need a separate frame pointer.
2054  //
2055  // Try to use s32 as the SP, but move it if it would interfere with input
2056  // arguments. This won't work with calls though.
2057  //
2058  // FIXME: Move SP to avoid any possible inputs, or find a way to spill input
2059  // registers.
2060  if (!MRI.isLiveIn(AMDGPU::SGPR32)) {
2061    Info.setStackPtrOffsetReg(AMDGPU::SGPR32);
2062  } else {
2063    assert(AMDGPU::isShader(MF.getFunction().getCallingConv()));
2064
2065    if (MFI.hasCalls())
2066      report_fatal_error("call in graphics shader with too many input SGPRs");
2067
2068    for (unsigned Reg : AMDGPU::SGPR_32RegClass) {
2069      if (!MRI.isLiveIn(Reg)) {
2070        Info.setStackPtrOffsetReg(Reg);
2071        break;
2072      }
2073    }
2074
2075    if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG)
2076      report_fatal_error("failed to find register for SP");
2077  }
2078
2079  // hasFP should be accurate for entry functions even before the frame is
2080  // finalized, because it does not rely on the known stack size, only
2081  // properties like whether variable sized objects are present.
2082  if (ST.getFrameLowering()->hasFP(MF)) {
2083    Info.setFrameOffsetReg(AMDGPU::SGPR33);
2084  }
2085}
2086
2087bool SITargetLowering::supportSplitCSR(MachineFunction *MF) const {
2088  const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
2089  return !Info->isEntryFunction();
2090}
2091
2092void SITargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
2093
2094}
2095
2096void SITargetLowering::insertCopiesSplitCSR(
2097  MachineBasicBlock *Entry,
2098  const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
2099  const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
2100
2101  const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
2102  if (!IStart)
2103    return;
2104
2105  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2106  MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
2107  MachineBasicBlock::iterator MBBI = Entry->begin();
2108  for (const MCPhysReg *I = IStart; *I; ++I) {
2109    const TargetRegisterClass *RC = nullptr;
2110    if (AMDGPU::SReg_64RegClass.contains(*I))
2111      RC = &AMDGPU::SGPR_64RegClass;
2112    else if (AMDGPU::SReg_32RegClass.contains(*I))
2113      RC = &AMDGPU::SGPR_32RegClass;
2114    else
2115      llvm_unreachable("Unexpected register class in CSRsViaCopy!");
2116
2117    Register NewVR = MRI->createVirtualRegister(RC);
2118    // Create copy from CSR to a virtual register.
2119    Entry->addLiveIn(*I);
2120    BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
2121      .addReg(*I);
2122
2123    // Insert the copy-back instructions right before the terminator.
2124    for (auto *Exit : Exits)
2125      BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
2126              TII->get(TargetOpcode::COPY), *I)
2127        .addReg(NewVR);
2128  }
2129}
2130
2131SDValue SITargetLowering::LowerFormalArguments(
2132    SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
2133    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
2134    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
2135  const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
2136
2137  MachineFunction &MF = DAG.getMachineFunction();
2138  const Function &Fn = MF.getFunction();
2139  FunctionType *FType = MF.getFunction().getFunctionType();
2140  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
2141
2142  if (Subtarget->isAmdHsaOS() && AMDGPU::isShader(CallConv)) {
2143    DiagnosticInfoUnsupported NoGraphicsHSA(
2144        Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc());
2145    DAG.getContext()->diagnose(NoGraphicsHSA);
2146    return DAG.getEntryNode();
2147  }
2148
2149  SmallVector<ISD::InputArg, 16> Splits;
2150  SmallVector<CCValAssign, 16> ArgLocs;
2151  BitVector Skipped(Ins.size());
2152  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
2153                 *DAG.getContext());
2154
2155  bool IsShader = AMDGPU::isShader(CallConv);
2156  bool IsKernel = AMDGPU::isKernel(CallConv);
2157  bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CallConv);
2158
2159  if (IsShader) {
2160    processShaderInputArgs(Splits, CallConv, Ins, Skipped, FType, Info);
2161
2162    // At least one interpolation mode must be enabled or else the GPU will
2163    // hang.
2164    //
2165    // Check PSInputAddr instead of PSInputEnable. The idea is that if the user
2166    // set PSInputAddr, the user wants to enable some bits after the compilation
2167    // based on run-time states. We can't know what the final PSInputEna
2168    // will look like, so we shouldn't do anything here and the user should take
2169    // responsibility for the correct programming.
2170    //
2171    // Otherwise, the following restrictions apply:
2172    // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
2173    // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
2174    //   enabled too.
2175    if (CallConv == CallingConv::AMDGPU_PS) {
2176      if ((Info->getPSInputAddr() & 0x7F) == 0 ||
2177           ((Info->getPSInputAddr() & 0xF) == 0 &&
2178            Info->isPSInputAllocated(11))) {
2179        CCInfo.AllocateReg(AMDGPU::VGPR0);
2180        CCInfo.AllocateReg(AMDGPU::VGPR1);
2181        Info->markPSInputAllocated(0);
2182        Info->markPSInputEnabled(0);
2183      }
2184      if (Subtarget->isAmdPalOS()) {
2185        // For isAmdPalOS, the user does not enable some bits after compilation
2186        // based on run-time states; the register values being generated here are
2187        // the final ones set in hardware. Therefore we need to apply the
2188        // workaround to PSInputAddr and PSInputEnable together.  (The case where
2189        // a bit is set in PSInputAddr but not PSInputEnable is where the
2190        // frontend set up an input arg for a particular interpolation mode, but
2191        // nothing uses that input arg. Really we should have an earlier pass
2192        // that removes such an arg.)
2193        unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
2194        if ((PsInputBits & 0x7F) == 0 ||
2195            ((PsInputBits & 0xF) == 0 &&
2196             (PsInputBits >> 11 & 1)))
2197          Info->markPSInputEnabled(
2198              countTrailingZeros(Info->getPSInputAddr(), ZB_Undefined));
2199      }
2200    }
2201
2202    assert(!Info->hasDispatchPtr() &&
2203           !Info->hasKernargSegmentPtr() && !Info->hasFlatScratchInit() &&
2204           !Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
2205           !Info->hasWorkGroupIDZ() && !Info->hasWorkGroupInfo() &&
2206           !Info->hasWorkItemIDX() && !Info->hasWorkItemIDY() &&
2207           !Info->hasWorkItemIDZ());
2208  } else if (IsKernel) {
2209    assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
2210  } else {
2211    Splits.append(Ins.begin(), Ins.end());
2212  }
2213
2214  if (IsEntryFunc) {
2215    allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
2216    allocateHSAUserSGPRs(CCInfo, MF, *TRI, *Info);
2217  } else {
2218    // For the fixed ABI, pass workitem IDs in the last argument register.
2219    if (AMDGPUTargetMachine::EnableFixedFunctionABI)
2220      allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info);
2221  }
2222
2223  if (IsKernel) {
2224    analyzeFormalArgumentsCompute(CCInfo, Ins);
2225  } else {
2226    CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, isVarArg);
2227    CCInfo.AnalyzeFormalArguments(Splits, AssignFn);
2228  }
2229
2230  SmallVector<SDValue, 16> Chains;
2231
2232  // FIXME: This is the minimum kernel argument alignment. We should improve
2233  // this to the maximum alignment of the arguments.
2234  //
2235  // FIXME: Alignment of explicit arguments totally broken with non-0 explicit
2236  // kern arg offset.
2237  const Align KernelArgBaseAlign = Align(16);
2238
2239  for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
2240    const ISD::InputArg &Arg = Ins[i];
2241    if (Arg.isOrigArg() && Skipped[Arg.getOrigArgIndex()]) {
2242      InVals.push_back(DAG.getUNDEF(Arg.VT));
2243      continue;
2244    }
2245
2246    CCValAssign &VA = ArgLocs[ArgIdx++];
2247    MVT VT = VA.getLocVT();
2248
2249    if (IsEntryFunc && VA.isMemLoc()) {
2250      VT = Ins[i].VT;
2251      EVT MemVT = VA.getLocVT();
2252
2253      const uint64_t Offset = VA.getLocMemOffset();
2254      Align Alignment = commonAlignment(KernelArgBaseAlign, Offset);
2255
2256      SDValue Arg =
2257          lowerKernargMemParameter(DAG, VT, MemVT, DL, Chain, Offset, Alignment,
2258                                   Ins[i].Flags.isSExt(), &Ins[i]);
2259      Chains.push_back(Arg.getValue(1));
2260
2261      auto *ParamTy =
2262        dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
2263      if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
2264          ParamTy && (ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
2265                      ParamTy->getAddressSpace() == AMDGPUAS::REGION_ADDRESS)) {
2266        // On SI local pointers are just offsets into LDS, so they are always
2267        // less than 16-bits.  On CI and newer they could potentially be
2268        // real pointers, so we can't guarantee their size.
2269        Arg = DAG.getNode(ISD::AssertZext, DL, Arg.getValueType(), Arg,
2270                          DAG.getValueType(MVT::i16));
2271      }
2272
2273      InVals.push_back(Arg);
2274      continue;
2275    } else if (!IsEntryFunc && VA.isMemLoc()) {
2276      SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg);
2277      InVals.push_back(Val);
2278      if (!Arg.Flags.isByVal())
2279        Chains.push_back(Val.getValue(1));
2280      continue;
2281    }
2282
2283    assert(VA.isRegLoc() && "Parameter must be in a register!");
2284
2285    Register Reg = VA.getLocReg();
2286    const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT);
2287    EVT ValVT = VA.getValVT();
2288
2289    Reg = MF.addLiveIn(Reg, RC);
2290    SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);
2291
2292    if (Arg.Flags.isSRet()) {
2293      // The return object should be reasonably addressable.
2294
2295      // FIXME: This helps when the return is a real sret. If it is an
2296      // automatically inserted sret (i.e. CanLowerReturn returns false), an
2297      // extra copy is inserted in SelectionDAGBuilder which obscures this.
2298      unsigned NumBits
2299        = 32 - getSubtarget()->getKnownHighZeroBitsForFrameIndex();
2300      Val = DAG.getNode(ISD::AssertZext, DL, VT, Val,
2301        DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), NumBits)));
2302    }
2303
2304    // If this is an 8- or 16-bit value, it is really passed promoted
2305    // to 32 bits. Insert an assert[sz]ext to capture this, then
2306    // truncate to the right size.
2307    switch (VA.getLocInfo()) {
2308    case CCValAssign::Full:
2309      break;
2310    case CCValAssign::BCvt:
2311      Val = DAG.getNode(ISD::BITCAST, DL, ValVT, Val);
2312      break;
2313    case CCValAssign::SExt:
2314      Val = DAG.getNode(ISD::AssertSext, DL, VT, Val,
2315                        DAG.getValueType(ValVT));
2316      Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
2317      break;
2318    case CCValAssign::ZExt:
2319      Val = DAG.getNode(ISD::AssertZext, DL, VT, Val,
2320                        DAG.getValueType(ValVT));
2321      Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
2322      break;
2323    case CCValAssign::AExt:
2324      Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
2325      break;
2326    default:
2327      llvm_unreachable("Unknown loc info!");
2328    }
2329
2330    InVals.push_back(Val);
2331  }
2332
2333  if (!IsEntryFunc && !AMDGPUTargetMachine::EnableFixedFunctionABI) {
2334    // Special inputs come after user arguments.
2335    allocateSpecialInputVGPRs(CCInfo, MF, *TRI, *Info);
2336  }
2337
2338  // Start adding system SGPRs.
2339  if (IsEntryFunc) {
2340    allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsShader);
2341  } else {
2342    CCInfo.AllocateReg(Info->getScratchRSrcReg());
2343    allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
2344  }
2345
2346  auto &ArgUsageInfo =
2347    DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
2348  ArgUsageInfo.setFuncArgInfo(Fn, Info->getArgInfo());
2349
2350  unsigned StackArgSize = CCInfo.getNextStackOffset();
2351  Info->setBytesInStackArgArea(StackArgSize);
2352
2353  return Chains.empty() ? Chain :
2354    DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
2355}
2356
2357// TODO: If return values can't fit in registers, we should return as many as
2358// possible in registers before passing on stack.
2359bool SITargetLowering::CanLowerReturn(
2360  CallingConv::ID CallConv,
2361  MachineFunction &MF, bool IsVarArg,
2362  const SmallVectorImpl<ISD::OutputArg> &Outs,
2363  LLVMContext &Context) const {
2364  // Replacing returns with sret/stack usage doesn't make sense for shaders.
2365  // FIXME: Also sort of a workaround for custom vector splitting in LowerReturn
2366  // for shaders. Vector types should be explicitly handled by CC.
2367  if (AMDGPU::isEntryFunctionCC(CallConv))
2368    return true;
2369
2370  SmallVector<CCValAssign, 16> RVLocs;
2371  CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
2372  return CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, IsVarArg));
2373}
2374
2375SDValue
2376SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
2377                              bool isVarArg,
2378                              const SmallVectorImpl<ISD::OutputArg> &Outs,
2379                              const SmallVectorImpl<SDValue> &OutVals,
2380                              const SDLoc &DL, SelectionDAG &DAG) const {
2381  MachineFunction &MF = DAG.getMachineFunction();
2382  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
2383
2384  if (AMDGPU::isKernel(CallConv)) {
2385    return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs,
2386                                             OutVals, DL, DAG);
2387  }
2388
2389  bool IsShader = AMDGPU::isShader(CallConv);
2390
2391  Info->setIfReturnsVoid(Outs.empty());
2392  bool IsWaveEnd = Info->returnsVoid() && IsShader;
2393
2394  // CCValAssign - represents the assignment of the return value to a location.
2395  SmallVector<CCValAssign, 48> RVLocs;
2396  SmallVector<ISD::OutputArg, 48> Splits;
2397
2398  // CCState - Info about the registers and stack slots.
2399  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
2400                 *DAG.getContext());
2401
2402  // Analyze outgoing return values.
2403  CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
2404
2405  SDValue Flag;
2406  SmallVector<SDValue, 48> RetOps;
2407  RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
2408
2409  // Add return address for callable functions.
2410  if (!Info->isEntryFunction()) {
2411    const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
2412    SDValue ReturnAddrReg = CreateLiveInRegister(
2413      DAG, &AMDGPU::SReg_64RegClass, TRI->getReturnAddressReg(MF), MVT::i64);
2414
2415    SDValue ReturnAddrVirtualReg = DAG.getRegister(
2416        MF.getRegInfo().createVirtualRegister(&AMDGPU::CCR_SGPR_64RegClass),
2417        MVT::i64);
2418    Chain =
2419        DAG.getCopyToReg(Chain, DL, ReturnAddrVirtualReg, ReturnAddrReg, Flag);
2420    Flag = Chain.getValue(1);
2421    RetOps.push_back(ReturnAddrVirtualReg);
2422  }
2423
2424  // Copy the result values into the output registers.
2425  for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E;
2426       ++I, ++RealRVLocIdx) {
2427    CCValAssign &VA = RVLocs[I];
2428    assert(VA.isRegLoc() && "Can only return in registers!");
2429    // TODO: Partially return in registers if return values don't fit.
2430    SDValue Arg = OutVals[RealRVLocIdx];
2431
2432    // Copied from other backends.
2433    switch (VA.getLocInfo()) {
2434    case CCValAssign::Full:
2435      break;
2436    case CCValAssign::BCvt:
2437      Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
2438      break;
2439    case CCValAssign::SExt:
2440      Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
2441      break;
2442    case CCValAssign::ZExt:
2443      Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
2444      break;
2445    case CCValAssign::AExt:
2446      Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
2447      break;
2448    default:
2449      llvm_unreachable("Unknown loc info!");
2450    }
2451
2452    Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Flag);
2453    Flag = Chain.getValue(1);
2454    RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
2455  }
2456
2457  // FIXME: Does sret work properly?
2458  if (!Info->isEntryFunction()) {
2459    const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
2460    const MCPhysReg *I =
2461      TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
2462    if (I) {
2463      for (; *I; ++I) {
2464        if (AMDGPU::SReg_64RegClass.contains(*I))
2465          RetOps.push_back(DAG.getRegister(*I, MVT::i64));
2466        else if (AMDGPU::SReg_32RegClass.contains(*I))
2467          RetOps.push_back(DAG.getRegister(*I, MVT::i32));
2468        else
2469          llvm_unreachable("Unexpected register class in CSRsViaCopy!");
2470      }
2471    }
2472  }
2473
2474  // Update chain and glue.
2475  RetOps[0] = Chain;
2476  if (Flag.getNode())
2477    RetOps.push_back(Flag);
2478
2479  unsigned Opc = AMDGPUISD::ENDPGM;
2480  if (!IsWaveEnd)
2481    Opc = IsShader ? AMDGPUISD::RETURN_TO_EPILOG : AMDGPUISD::RET_FLAG;
2482  return DAG.getNode(Opc, DL, MVT::Other, RetOps);
2483}
2484
2485SDValue SITargetLowering::LowerCallResult(
2486    SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool IsVarArg,
2487    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
2488    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool IsThisReturn,
2489    SDValue ThisVal) const {
2490  CCAssignFn *RetCC = CCAssignFnForReturn(CallConv, IsVarArg);
2491
2492  // Assign locations to each value returned by this call.
2493  SmallVector<CCValAssign, 16> RVLocs;
2494  CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
2495                 *DAG.getContext());
2496  CCInfo.AnalyzeCallResult(Ins, RetCC);
2497
2498  // Copy all of the result registers out of their specified physreg.
2499  for (unsigned i = 0; i != RVLocs.size(); ++i) {
2500    CCValAssign VA = RVLocs[i];
2501    SDValue Val;
2502
2503    if (VA.isRegLoc()) {
2504      Val = DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InFlag);
2505      Chain = Val.getValue(1);
2506      InFlag = Val.getValue(2);
2507    } else if (VA.isMemLoc()) {
2508      report_fatal_error("TODO: return values in memory");
2509    } else
2510      llvm_unreachable("unknown argument location type");
2511
2512    switch (VA.getLocInfo()) {
2513    case CCValAssign::Full:
2514      break;
2515    case CCValAssign::BCvt:
2516      Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
2517      break;
2518    case CCValAssign::ZExt:
2519      Val = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), Val,
2520                        DAG.getValueType(VA.getValVT()));
2521      Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
2522      break;
2523    case CCValAssign::SExt:
2524      Val = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), Val,
2525                        DAG.getValueType(VA.getValVT()));
2526      Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
2527      break;
2528    case CCValAssign::AExt:
2529      Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
2530      break;
2531    default:
2532      llvm_unreachable("Unknown loc info!");
2533    }
2534
2535    InVals.push_back(Val);
2536  }
2537
2538  return Chain;
2539}
2540
2541// Add code to pass special inputs that are required depending on the features
2542// in use, separate from the explicit user arguments present in the IR.
2543void SITargetLowering::passSpecialInputs(
2544    CallLoweringInfo &CLI,
2545    CCState &CCInfo,
2546    const SIMachineFunctionInfo &Info,
2547    SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
2548    SmallVectorImpl<SDValue> &MemOpChains,
2549    SDValue Chain) const {
2550  // If we don't have a call site, this was a call inserted by
2551  // legalization. These can never use special inputs.
2552  if (!CLI.CB)
2553    return;
2554
2555  SelectionDAG &DAG = CLI.DAG;
2556  const SDLoc &DL = CLI.DL;
2557
2558  const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
2559  const AMDGPUFunctionArgInfo &CallerArgInfo = Info.getArgInfo();
2560
2561  const AMDGPUFunctionArgInfo *CalleeArgInfo
2562    = &AMDGPUArgumentUsageInfo::FixedABIFunctionInfo;
2563  if (const Function *CalleeFunc = CLI.CB->getCalledFunction()) {
2564    auto &ArgUsageInfo =
2565      DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
2566    CalleeArgInfo = &ArgUsageInfo.lookupFuncArgInfo(*CalleeFunc);
2567  }
2568
2569  // TODO: Unify with private memory register handling. This is complicated by
2570  // the fact that at least in kernels, the input argument is not necessarily
2571  // in the same location as the input.
2572  AMDGPUFunctionArgInfo::PreloadedValue InputRegs[] = {
2573    AMDGPUFunctionArgInfo::DISPATCH_PTR,
2574    AMDGPUFunctionArgInfo::QUEUE_PTR,
2575    AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR,
2576    AMDGPUFunctionArgInfo::DISPATCH_ID,
2577    AMDGPUFunctionArgInfo::WORKGROUP_ID_X,
2578    AMDGPUFunctionArgInfo::WORKGROUP_ID_Y,
2579    AMDGPUFunctionArgInfo::WORKGROUP_ID_Z
2580  };
2581
2582  for (auto InputID : InputRegs) {
2583    const ArgDescriptor *OutgoingArg;
2584    const TargetRegisterClass *ArgRC;
2585    LLT ArgTy;
2586
2587    std::tie(OutgoingArg, ArgRC, ArgTy) =
2588        CalleeArgInfo->getPreloadedValue(InputID);
2589    if (!OutgoingArg)
2590      continue;
2591
2592    const ArgDescriptor *IncomingArg;
2593    const TargetRegisterClass *IncomingArgRC;
2594    LLT Ty;
2595    std::tie(IncomingArg, IncomingArgRC, Ty) =
2596        CallerArgInfo.getPreloadedValue(InputID);
2597    assert(IncomingArgRC == ArgRC);
2598
2599    // All special arguments are ints for now.
2600    EVT ArgVT = TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;
2601    SDValue InputReg;
2602
2603    if (IncomingArg) {
2604      InputReg = loadInputValue(DAG, ArgRC, ArgVT, DL, *IncomingArg);
2605    } else {
2606      // The implicit arg ptr is special because it doesn't have a corresponding
2607      // input for kernels, and is computed from the kernarg segment pointer.
2608      assert(InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
2609      InputReg = getImplicitArgPtr(DAG, DL);
2610    }
2611
2612    if (OutgoingArg->isRegister()) {
2613      RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
2614      if (!CCInfo.AllocateReg(OutgoingArg->getRegister()))
2615        report_fatal_error("failed to allocate implicit input argument");
2616    } else {
2617      unsigned SpecialArgOffset =
2618          CCInfo.AllocateStack(ArgVT.getStoreSize(), Align(4));
2619      SDValue ArgStore = storeStackInputValue(DAG, DL, Chain, InputReg,
2620                                              SpecialArgOffset);
2621      MemOpChains.push_back(ArgStore);
2622    }
2623  }
2624
2625  // Pack workitem IDs into a single register, or pass them as-is if already
2626  // packed.
2627  const ArgDescriptor *OutgoingArg;
2628  const TargetRegisterClass *ArgRC;
2629  LLT Ty;
2630
2631  std::tie(OutgoingArg, ArgRC, Ty) =
2632      CalleeArgInfo->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_X);
2633  if (!OutgoingArg)
2634    std::tie(OutgoingArg, ArgRC, Ty) =
2635        CalleeArgInfo->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
2636  if (!OutgoingArg)
2637    std::tie(OutgoingArg, ArgRC, Ty) =
2638        CalleeArgInfo->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
2639  if (!OutgoingArg)
2640    return;
2641
2642  const ArgDescriptor *IncomingArgX = std::get<0>(
2643      CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_X));
2644  const ArgDescriptor *IncomingArgY = std::get<0>(
2645      CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Y));
2646  const ArgDescriptor *IncomingArgZ = std::get<0>(
2647      CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Z));
2648
2649  SDValue InputReg;
2650  SDLoc SL;
2651
2652  // If incoming ids are not packed we need to pack them.
2653  if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo->WorkItemIDX)
2654    InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgX);
2655
2656  if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo->WorkItemIDY) {
2657    SDValue Y = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgY);
2658    Y = DAG.getNode(ISD::SHL, SL, MVT::i32, Y,
2659                    DAG.getShiftAmountConstant(10, MVT::i32, SL));
2660    InputReg = InputReg.getNode() ?
2661                 DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Y) : Y;
2662  }
2663
2664  if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo->WorkItemIDZ) {
2665    SDValue Z = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgZ);
2666    Z = DAG.getNode(ISD::SHL, SL, MVT::i32, Z,
2667                    DAG.getShiftAmountConstant(20, MVT::i32, SL));
2668    InputReg = InputReg.getNode() ?
2669                 DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Z) : Z;
2670  }
2671
2672  if (!InputReg.getNode()) {
2673    // Workitem IDs are already packed; any of the present incoming arguments
2674    // will carry all required fields.
2675    ArgDescriptor IncomingArg = ArgDescriptor::createArg(
2676      IncomingArgX ? *IncomingArgX :
2677      IncomingArgY ? *IncomingArgY :
2678                     *IncomingArgZ, ~0u);
2679    InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, IncomingArg);
2680  }
2681
2682  if (OutgoingArg->isRegister()) {
2683    RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
2684    CCInfo.AllocateReg(OutgoingArg->getRegister());
2685  } else {
2686    unsigned SpecialArgOffset = CCInfo.AllocateStack(4, Align(4));
2687    SDValue ArgStore = storeStackInputValue(DAG, DL, Chain, InputReg,
2688                                            SpecialArgOffset);
2689    MemOpChains.push_back(ArgStore);
2690  }
2691}
2692
2693static bool canGuaranteeTCO(CallingConv::ID CC) {
2694  return CC == CallingConv::Fast;
2695}
2696
2697/// Return true if we might ever do TCO for calls with this calling convention.
2698static bool mayTailCallThisCC(CallingConv::ID CC) {
2699  switch (CC) {
2700  case CallingConv::C:
2701    return true;
2702  default:
2703    return canGuaranteeTCO(CC);
2704  }
2705}
2706
2707bool SITargetLowering::isEligibleForTailCallOptimization(
2708    SDValue Callee, CallingConv::ID CalleeCC, bool IsVarArg,
2709    const SmallVectorImpl<ISD::OutputArg> &Outs,
2710    const SmallVectorImpl<SDValue> &OutVals,
2711    const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
2712  if (!mayTailCallThisCC(CalleeCC))
2713    return false;
2714
2715  MachineFunction &MF = DAG.getMachineFunction();
2716  const Function &CallerF = MF.getFunction();
2717  CallingConv::ID CallerCC = CallerF.getCallingConv();
2718  const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
2719  const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
2720
2721  // Kernels aren't callable and don't have a live-in return address, so it
2722  // doesn't make sense to do a tail call with entry functions.
2723  if (!CallerPreserved)
2724    return false;
2725
2726  bool CCMatch = CallerCC == CalleeCC;
2727
2728  if (DAG.getTarget().Options.GuaranteedTailCallOpt) {
2729    if (canGuaranteeTCO(CalleeCC) && CCMatch)
2730      return true;
2731    return false;
2732  }
2733
2734  // TODO: Can we handle var args?
2735  if (IsVarArg)
2736    return false;
2737
2738  for (const Argument &Arg : CallerF.args()) {
2739    if (Arg.hasByValAttr())
2740      return false;
2741  }
2742
2743  LLVMContext &Ctx = *DAG.getContext();
2744
2745  // Check that the call results are passed in the same way.
2746  if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, Ctx, Ins,
2747                                  CCAssignFnForCall(CalleeCC, IsVarArg),
2748                                  CCAssignFnForCall(CallerCC, IsVarArg)))
2749    return false;
2750
2751  // The callee has to preserve all registers the caller needs to preserve.
2752  if (!CCMatch) {
2753    const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
2754    if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
2755      return false;
2756  }
2757
2758  // Nothing more to check if the callee is taking no arguments.
2759  if (Outs.empty())
2760    return true;
2761
2762  SmallVector<CCValAssign, 16> ArgLocs;
2763  CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);
2764
2765  CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, IsVarArg));
2766
2767  const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
2768  // If the stack arguments for this call do not fit into our own save area,
2769  // then the call cannot be made a tail call.
2770  // TODO: Is this really necessary?
2771  if (CCInfo.getNextStackOffset() > FuncInfo->getBytesInStackArgArea())
2772    return false;
2773
2774  const MachineRegisterInfo &MRI = MF.getRegInfo();
2775  return parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals);
2776}
2777
2778bool SITargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
2779  if (!CI->isTailCall())
2780    return false;
2781
2782  const Function *ParentFn = CI->getParent()->getParent();
2783  if (AMDGPU::isEntryFunctionCC(ParentFn->getCallingConv()))
2784    return false;
2785  return true;
2786}
2787
2788// The wave scratch offset register is used as the global base pointer.
2789SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
2790                                    SmallVectorImpl<SDValue> &InVals) const {
2791  SelectionDAG &DAG = CLI.DAG;
2792  const SDLoc &DL = CLI.DL;
2793  SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
2794  SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
2795  SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
2796  SDValue Chain = CLI.Chain;
2797  SDValue Callee = CLI.Callee;
2798  bool &IsTailCall = CLI.IsTailCall;
2799  CallingConv::ID CallConv = CLI.CallConv;
2800  bool IsVarArg = CLI.IsVarArg;
2801  bool IsSibCall = false;
2802  bool IsThisReturn = false;
2803  MachineFunction &MF = DAG.getMachineFunction();
2804
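      // A call to an undef or null callee is dead; drop the call and just
      // return undef values for any expected results.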
2805  if (Callee.isUndef() || isNullConstant(Callee)) {
2806    if (!CLI.IsTailCall) {
2807      for (unsigned I = 0, E = CLI.Ins.size(); I != E; ++I)
2808        InVals.push_back(DAG.getUNDEF(CLI.Ins[I].VT));
2809    }
2810
2811    return Chain;
2812  }
2813
2814  if (IsVarArg) {
2815    return lowerUnhandledCall(CLI, InVals,
2816                              "unsupported call to variadic function ");
2817  }
2818
2819  if (!CLI.CB)
2820    report_fatal_error("unsupported libcall legalization");
2821
2822  if (!AMDGPUTargetMachine::EnableFixedFunctionABI &&
2823      !CLI.CB->getCalledFunction()) {
2824    return lowerUnhandledCall(CLI, InVals,
2825                              "unsupported indirect call to function ");
2826  }
2827
2828  if (IsTailCall && MF.getTarget().Options.GuaranteedTailCallOpt) {
2829    return lowerUnhandledCall(CLI, InVals,
2830                              "unsupported required tail call to function ");
2831  }
2832
2833  if (AMDGPU::isShader(MF.getFunction().getCallingConv())) {
2834    // Note the issue is with the CC of the calling function, not of the call
2835    // itself.
2836    return lowerUnhandledCall(CLI, InVals,
2837                          "unsupported call from graphics shader of function ");
2838  }
2839
2840  if (IsTailCall) {
2841    IsTailCall = isEligibleForTailCallOptimization(
2842      Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG);
2843    if (!IsTailCall && CLI.CB && CLI.CB->isMustTailCall()) {
2844      report_fatal_error("failed to perform tail call elimination on a call "
2845                         "site marked musttail");
2846    }
2847
2848    bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
2849
2850    // A sibling call is one where we're under the usual C ABI and not planning
2851    // to change that, but we can still do a tail call.
2852    if (!TailCallOpt && IsTailCall)
2853      IsSibCall = true;
2854
2855    if (IsTailCall)
2856      ++NumTailCalls;
2857  }
2858
2859  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
2860  SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
2861  SmallVector<SDValue, 8> MemOpChains;
2862
2863  // Analyze operands of the call, assigning locations to each operand.
2864  SmallVector<CCValAssign, 16> ArgLocs;
2865  CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
2866  CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg);
2867
2868  if (AMDGPUTargetMachine::EnableFixedFunctionABI) {
2869    // With a fixed ABI, allocate fixed registers before user arguments.
2870    passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain);
2871  }
2872
2873  CCInfo.AnalyzeCallOperands(Outs, AssignFn);
2874
2875  // Get a count of how many bytes are to be pushed on the stack.
2876  unsigned NumBytes = CCInfo.getNextStackOffset();
2877
2878  if (IsSibCall) {
2879    // Since we're not changing the ABI to make this a tail call, the memory
2880    // operands are already available in the caller's incoming argument space.
2881    NumBytes = 0;
2882  }
2883
2884  // FPDiff is the byte offset of the call's argument area from the callee's.
2885  // Stores to callee stack arguments will be placed in FixedStackSlots offset
2886  // by this amount for a tail call. In a sibling call it must be 0 because the
2887  // caller will deallocate the entire stack and the callee still expects its
2888  // arguments to begin at SP+0. Completely unused for non-tail calls.
2889  int32_t FPDiff = 0;
2890  MachineFrameInfo &MFI = MF.getFrameInfo();
2891
2892  // Adjust the stack pointer for the new arguments...
2893  // These operations are automatically eliminated by the prolog/epilog pass
2894  if (!IsSibCall) {
2895    Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
2896
2897    SmallVector<SDValue, 4> CopyFromChains;
2898
2899    // In the HSA case, this should be an identity copy.
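        // The callee expects the scratch buffer resource descriptor in
        // SGPR0-SGPR3.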
2900    SDValue ScratchRSrcReg
2901      = DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32);
2902    RegsToPass.emplace_back(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, ScratchRSrcReg);
2903    CopyFromChains.push_back(ScratchRSrcReg.getValue(1));
2904    Chain = DAG.getTokenFactor(DL, CopyFromChains);
2905  }
2906
2907  MVT PtrVT = MVT::i32;
2908
2909  // Walk the register/memloc assignments, inserting copies/loads.
2910  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
2911    CCValAssign &VA = ArgLocs[i];
2912    SDValue Arg = OutVals[i];
2913
2914    // Promote the value if needed.
2915    switch (VA.getLocInfo()) {
2916    case CCValAssign::Full:
2917      break;
2918    case CCValAssign::BCvt:
2919      Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
2920      break;
2921    case CCValAssign::ZExt:
2922      Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
2923      break;
2924    case CCValAssign::SExt:
2925      Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
2926      break;
2927    case CCValAssign::AExt:
2928      Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
2929      break;
2930    case CCValAssign::FPExt:
2931      Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
2932      break;
2933    default:
2934      llvm_unreachable("Unknown loc info!");
2935    }
2936
2937    if (VA.isRegLoc()) {
2938      RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
2939    } else {
2940      assert(VA.isMemLoc());
2941
2942      SDValue DstAddr;
2943      MachinePointerInfo DstInfo;
2944
2945      unsigned LocMemOffset = VA.getLocMemOffset();
2946      int32_t Offset = LocMemOffset;
2947
2948      SDValue PtrOff = DAG.getConstant(Offset, DL, PtrVT);
2949      MaybeAlign Alignment;
2950
2951      if (IsTailCall) {
2952        ISD::ArgFlagsTy Flags = Outs[i].Flags;
2953        unsigned OpSize = Flags.isByVal() ?
2954          Flags.getByValSize() : VA.getValVT().getStoreSize();
2955
2956        // FIXME: We can have better than the minimum byval required alignment.
2957        Alignment =
2958            Flags.isByVal()
2959                ? Flags.getNonZeroByValAlign()
2960                : commonAlignment(Subtarget->getStackAlignment(), Offset);
2961
2962        Offset = Offset + FPDiff;
2963        int FI = MFI.CreateFixedObject(OpSize, Offset, true);
2964
2965        DstAddr = DAG.getFrameIndex(FI, PtrVT);
2966        DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
2967
2968        // Make sure any stack arguments overlapping with where we're storing
2969        // are loaded before this eventual operation. Otherwise they'll be
2970        // clobbered.
2971
2972        // FIXME: Why is this really necessary? This seems to just result in a
2973        // lot of code to copy the stack and write them back to the same
2974        // locations, which are supposed to be immutable?
2975        Chain = addTokenForArgument(Chain, DAG, MFI, FI);
2976      } else {
2977        DstAddr = PtrOff;
2978        DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
2979        Alignment =
2980            commonAlignment(Subtarget->getStackAlignment(), LocMemOffset);
2981      }
2982
2983      if (Outs[i].Flags.isByVal()) {
2984        SDValue SizeNode =
2985            DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i32);
2986        SDValue Cpy =
2987            DAG.getMemcpy(Chain, DL, DstAddr, Arg, SizeNode,
2988                          Outs[i].Flags.getNonZeroByValAlign(),
2989                          /*isVol = */ false, /*AlwaysInline = */ true,
2990                          /*isTailCall = */ false, DstInfo,
2991                          MachinePointerInfo(AMDGPUAS::PRIVATE_ADDRESS));
2992
2993        MemOpChains.push_back(Cpy);
2994      } else {
2995        SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo,
2996                                     Alignment ? Alignment->value() : 0);
2997        MemOpChains.push_back(Store);
2998      }
2999    }
3000  }
3001
3002  if (!AMDGPUTargetMachine::EnableFixedFunctionABI) {
3003    // Copy special input registers after user input arguments.
3004    passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain);
3005  }
3006
3007  if (!MemOpChains.empty())
3008    Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
3009
3010  // Build a sequence of copy-to-reg nodes chained together with token chain
3011  // and flag operands which copy the outgoing args into the appropriate regs.
3012  SDValue InFlag;
3013  for (auto &RegToPass : RegsToPass) {
3014    Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first,
3015                             RegToPass.second, InFlag);
3016    InFlag = Chain.getValue(1);
3017  }
3018
3019
3020  SDValue PhysReturnAddrReg;
3021  if (IsTailCall) {
3022    // Since the return is being combined with the call, we need to pass on the
3023    // return address.
3024
3025    const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
3026    SDValue ReturnAddrReg = CreateLiveInRegister(
3027      DAG, &AMDGPU::SReg_64RegClass, TRI->getReturnAddressReg(MF), MVT::i64);
3028
3029    PhysReturnAddrReg = DAG.getRegister(TRI->getReturnAddressReg(MF),
3030                                        MVT::i64);
3031    Chain = DAG.getCopyToReg(Chain, DL, PhysReturnAddrReg, ReturnAddrReg, InFlag);
3032    InFlag = Chain.getValue(1);
3033  }
3034
3035  // We don't usually want to end the call-sequence here because we would tidy
3036  // the frame up *after* the call. However, in the ABI-changing tail-call case
3037  // we've carefully laid out the parameters so that when SP is reset they'll be
3038  // in the correct location.
3039  if (IsTailCall && !IsSibCall) {
3040    Chain = DAG.getCALLSEQ_END(Chain,
3041                               DAG.getTargetConstant(NumBytes, DL, MVT::i32),
3042                               DAG.getTargetConstant(0, DL, MVT::i32),
3043                               InFlag, DL);
3044    InFlag = Chain.getValue(1);
3045  }
3046
3047  std::vector<SDValue> Ops;
3048  Ops.push_back(Chain);
3049  Ops.push_back(Callee);
3050  // Add a redundant copy of the callee global which will not be legalized, as
3051  // we need direct access to the callee later.
3052  if (GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(Callee)) {
3053    const GlobalValue *GV = GSD->getGlobal();
3054    Ops.push_back(DAG.getTargetGlobalAddress(GV, DL, MVT::i64));
3055  } else {
3056    Ops.push_back(DAG.getTargetConstant(0, DL, MVT::i64));
3057  }
3058
3059  if (IsTailCall) {
3060    // Each tail call may have to adjust the stack by a different amount, so
3061    // this information must travel along with the operation for eventual
3062    // consumption by emitEpilogue.
3063    Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
3064
3065    Ops.push_back(PhysReturnAddrReg);
3066  }
3067
3068  // Add argument registers to the end of the list so that they are known live
3069  // into the call.
3070  for (auto &RegToPass : RegsToPass) {
3071    Ops.push_back(DAG.getRegister(RegToPass.first,
3072                                  RegToPass.second.getValueType()));
3073  }
3074
3075  // Add a register mask operand representing the call-preserved registers.
3076
3077  auto *TRI = static_cast<const SIRegisterInfo*>(Subtarget->getRegisterInfo());
3078  const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
3079  assert(Mask && "Missing call preserved mask for calling convention");
3080  Ops.push_back(DAG.getRegisterMask(Mask));
3081
3082  if (InFlag.getNode())
3083    Ops.push_back(InFlag);
3084
3085  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
3086
3087  // If we're doing a tail call, use a TC_RETURN here rather than an
3088  // actual call instruction.
3089  if (IsTailCall) {
3090    MFI.setHasTailCall();
3091    return DAG.getNode(AMDGPUISD::TC_RETURN, DL, NodeTys, Ops);
3092  }
3093
3094  // Returns a chain and a flag for retval copy to use.
3095  SDValue Call = DAG.getNode(AMDGPUISD::CALL, DL, NodeTys, Ops);
3096  Chain = Call.getValue(0);
3097  InFlag = Call.getValue(1);
3098
3099  uint64_t CalleePopBytes = NumBytes;
3100  Chain = DAG.getCALLSEQ_END(Chain, DAG.getTargetConstant(0, DL, MVT::i32),
3101                             DAG.getTargetConstant(CalleePopBytes, DL, MVT::i32),
3102                             InFlag, DL);
3103  if (!Ins.empty())
3104    InFlag = Chain.getValue(1);
3105
3106  // Handle result values, copying them out of physregs into vregs that we
3107  // return.
3108  return LowerCallResult(Chain, InFlag, CallConv, IsVarArg, Ins, DL, DAG,
3109                         InVals, IsThisReturn,
3110                         IsThisReturn ? OutVals[0] : SDValue());
3111}
3112
3113// This is identical to the default implementation in ExpandDYNAMIC_STACKALLOC,
3114// except for applying the wave size scale to the increment amount.
3115SDValue SITargetLowering::lowerDYNAMIC_STACKALLOCImpl(
3116    SDValue Op, SelectionDAG &DAG) const {
3117  const MachineFunction &MF = DAG.getMachineFunction();
3118  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
3119
3120  SDLoc dl(Op);
3121  EVT VT = Op.getValueType();
3122  SDValue Tmp1 = Op;
3123  SDValue Tmp2 = Op.getValue(1);
3124  SDValue Tmp3 = Op.getOperand(2);
3125  SDValue Chain = Tmp1.getOperand(0);
3126
3127  Register SPReg = Info->getStackPtrOffsetReg();
3128
3129  // Chain the dynamic stack allocation so that it doesn't modify the stack
3130  // pointer when other instructions are using the stack.
3131  Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
3132
3133  SDValue Size  = Tmp2.getOperand(1);
3134  SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
3135  Chain = SP.getValue(1);
3136  MaybeAlign Alignment = cast<ConstantSDNode>(Tmp3)->getMaybeAlignValue();
3137  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
3138  const TargetFrameLowering *TFL = ST.getFrameLowering();
3139  unsigned Opc =
3140    TFL->getStackGrowthDirection() == TargetFrameLowering::StackGrowsUp ?
3141    ISD::ADD : ISD::SUB;
3142
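      // The stack pointer is a per-wave byte offset, so the per-lane allocation
      // size must be scaled by the wavefront size.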
3143  SDValue ScaledSize = DAG.getNode(
3144      ISD::SHL, dl, VT, Size,
3145      DAG.getConstant(ST.getWavefrontSizeLog2(), dl, MVT::i32));
3146
3147  Align StackAlign = TFL->getStackAlign();
3148  Tmp1 = DAG.getNode(Opc, dl, VT, SP, ScaledSize); // Value
3149  if (Alignment && *Alignment > StackAlign) {
3150    Tmp1 = DAG.getNode(ISD::AND, dl, VT, Tmp1,
3151                       DAG.getConstant(-(uint64_t)Alignment->value()
3152                                           << ST.getWavefrontSizeLog2(),
3153                                       dl, VT));
3154  }
3155
3156  Chain = DAG.getCopyToReg(Chain, dl, SPReg, Tmp1);    // Output chain
3157  Tmp2 = DAG.getCALLSEQ_END(
3158      Chain, DAG.getIntPtrConstant(0, dl, true),
3159      DAG.getIntPtrConstant(0, dl, true), SDValue(), dl);
3160
3161  return DAG.getMergeValues({Tmp1, Tmp2}, dl);
3162}
3163
3164SDValue SITargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
3165                                                  SelectionDAG &DAG) const {
3166  // We only handle constant sizes here to allow non-entry block, static sized
3167  // allocas. A truly dynamic value is more difficult to support because we
3168  // don't know if the size value is uniform or not. If the size isn't uniform,
3169  // we would need to do a wave reduction to get the maximum size to know how
3170  // much to increment the uniform stack pointer.
3171  SDValue Size = Op.getOperand(1);
3172  if (isa<ConstantSDNode>(Size))
3173      return lowerDYNAMIC_STACKALLOCImpl(Op, DAG); // Use "generic" expansion.
3174
3175  return AMDGPUTargetLowering::LowerDYNAMIC_STACKALLOC(Op, DAG);
3176}
3177
3178Register SITargetLowering::getRegisterByName(const char* RegName, LLT VT,
3179                                             const MachineFunction &MF) const {
3180  Register Reg = StringSwitch<Register>(RegName)
3181    .Case("m0", AMDGPU::M0)
3182    .Case("exec", AMDGPU::EXEC)
3183    .Case("exec_lo", AMDGPU::EXEC_LO)
3184    .Case("exec_hi", AMDGPU::EXEC_HI)
3185    .Case("flat_scratch", AMDGPU::FLAT_SCR)
3186    .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
3187    .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
3188    .Default(Register());
3189
3190  if (Reg == AMDGPU::NoRegister) {
3191    report_fatal_error(Twine("invalid register name \""
3192                             + StringRef(RegName)  + "\"."));
3193
3194  }
3195
3196  if (!Subtarget->hasFlatScrRegister() &&
3197       Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
3198    report_fatal_error(Twine("invalid register \""
3199                             + StringRef(RegName)  + "\" for subtarget."));
3200  }
3201
3202  switch (Reg) {
3203  case AMDGPU::M0:
3204  case AMDGPU::EXEC_LO:
3205  case AMDGPU::EXEC_HI:
3206  case AMDGPU::FLAT_SCR_LO:
3207  case AMDGPU::FLAT_SCR_HI:
3208    if (VT.getSizeInBits() == 32)
3209      return Reg;
3210    break;
3211  case AMDGPU::EXEC:
3212  case AMDGPU::FLAT_SCR:
3213    if (VT.getSizeInBits() == 64)
3214      return Reg;
3215    break;
3216  default:
3217    llvm_unreachable("missing register type checking");
3218  }
3219
3220  report_fatal_error(Twine("invalid type for register \""
3221                           + StringRef(RegName) + "\"."));
3222}
3223
3224// If kill is not the last instruction, split the block so kill is always a
3225// proper terminator.
3226MachineBasicBlock *SITargetLowering::splitKillBlock(MachineInstr &MI,
3227                                                    MachineBasicBlock *BB) const {
3228  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
3229
3230  MachineBasicBlock::iterator SplitPoint(&MI);
3231  ++SplitPoint;
3232
3233  if (SplitPoint == BB->end()) {
3234    // Don't bother with a new block.
3235    MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
3236    return BB;
3237  }
3238
3239  MachineFunction *MF = BB->getParent();
3240  MachineBasicBlock *SplitBB
3241    = MF->CreateMachineBasicBlock(BB->getBasicBlock());
3242
3243  MF->insert(++MachineFunction::iterator(BB), SplitBB);
3244  SplitBB->splice(SplitBB->begin(), BB, SplitPoint, BB->end());
3245
3246  SplitBB->transferSuccessorsAndUpdatePHIs(BB);
3247  BB->addSuccessor(SplitBB);
3248
3249  MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
3250  return SplitBB;
3251}
3252
3253// Split block \p MBB at \p MI, so as to insert a loop. If \p InstInLoop is true,
3254// \p MI will be the only instruction in the loop body block. Otherwise, it will
3255// be the first instruction in the remainder block.
3256//
3257/// \returns { LoopBody, Remainder }
3258static std::pair<MachineBasicBlock *, MachineBasicBlock *>
3259splitBlockForLoop(MachineInstr &MI, MachineBasicBlock &MBB, bool InstInLoop) {
3260  MachineFunction *MF = MBB.getParent();
3261  MachineBasicBlock::iterator I(&MI);
3262
3263  // To insert the loop we need to split the block. Move everything after this
3264  // point to a new block, and insert a new empty block between the two.
3265  MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
3266  MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
3267  MachineFunction::iterator MBBI(MBB);
3268  ++MBBI;
3269
3270  MF->insert(MBBI, LoopBB);
3271  MF->insert(MBBI, RemainderBB);
3272
3273  LoopBB->addSuccessor(LoopBB);
3274  LoopBB->addSuccessor(RemainderBB);
3275
3276  // Move the rest of the block into a new block.
3277  RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
3278
3279  if (InstInLoop) {
3280    auto Next = std::next(I);
3281
3282    // Move instruction to loop body.
3283    LoopBB->splice(LoopBB->begin(), &MBB, I, Next);
3284
3285    // Move the rest of the block.
3286    RemainderBB->splice(RemainderBB->begin(), &MBB, Next, MBB.end());
3287  } else {
3288    RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
3289  }
3290
3291  MBB.addSuccessor(LoopBB);
3292
3293  return std::make_pair(LoopBB, RemainderBB);
3294}
3295
3296/// Insert \p MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
3297void SITargetLowering::bundleInstWithWaitcnt(MachineInstr &MI) const {
3298  MachineBasicBlock *MBB = MI.getParent();
3299  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
3300  auto I = MI.getIterator();
3301  auto E = std::next(I);
3302
3303  BuildMI(*MBB, E, MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT))
3304    .addImm(0);
3305
3306  MIBundleBuilder Bundler(*MBB, I, E);
3307  finalizeBundle(*MBB, Bundler.begin());
3308}
3309
3310MachineBasicBlock *
3311SITargetLowering::emitGWSMemViolTestLoop(MachineInstr &MI,
3312                                         MachineBasicBlock *BB) const {
3313  const DebugLoc &DL = MI.getDebugLoc();
3314
3315  MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
3316
3317  MachineBasicBlock *LoopBB;
3318  MachineBasicBlock *RemainderBB;
3319  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
3320
3321  // Apparently kill flags are only valid if the def is in the same block?
3322  if (MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0))
3323    Src->setIsKill(false);
3324
3325  std::tie(LoopBB, RemainderBB) = splitBlockForLoop(MI, *BB, true);
3326
3327  MachineBasicBlock::iterator I = LoopBB->end();
3328
3329  const unsigned EncodedReg = AMDGPU::Hwreg::encodeHwreg(
3330    AMDGPU::Hwreg::ID_TRAPSTS, AMDGPU::Hwreg::OFFSET_MEM_VIOL, 1);
3331
3332  // Clear TRAP_STS.MEM_VIOL
3333  BuildMI(*LoopBB, LoopBB->begin(), DL, TII->get(AMDGPU::S_SETREG_IMM32_B32))
3334    .addImm(0)
3335    .addImm(EncodedReg);
3336
3337  bundleInstWithWaitcnt(MI);
3338
3339  Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
3340
3341  // Load and check TRAP_STS.MEM_VIOL
3342  BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_GETREG_B32), Reg)
3343    .addImm(EncodedReg);
3344
3345  // FIXME: Do we need to use an isel pseudo that may clobber scc?
3346  BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CMP_LG_U32))
3347    .addReg(Reg, RegState::Kill)
3348    .addImm(0);
3349  BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
3350    .addMBB(LoopBB);
3351
3352  return RemainderBB;
3353}
3354
3355// Do a v_movrels_b32 or v_movreld_b32 for each unique value of \p IdxReg in the
3356// wavefront. If the value is uniform and just happens to be in a VGPR, this
3357// will only do one iteration. In the worst case, this will loop 64 times.
3358//
3359// TODO: Just use v_readlane_b32 if we know the VGPR has a uniform value.
3360static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(
3361  const SIInstrInfo *TII,
3362  MachineRegisterInfo &MRI,
3363  MachineBasicBlock &OrigBB,
3364  MachineBasicBlock &LoopBB,
3365  const DebugLoc &DL,
3366  const MachineOperand &IdxReg,
3367  unsigned InitReg,
3368  unsigned ResultReg,
3369  unsigned PhiReg,
3370  unsigned InitSaveExecReg,
3371  int Offset,
3372  bool UseGPRIdxMode,
3373  bool IsIndirectSrc) {
3374  MachineFunction *MF = OrigBB.getParent();
3375  const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
3376  const SIRegisterInfo *TRI = ST.getRegisterInfo();
3377  MachineBasicBlock::iterator I = LoopBB.begin();
3378
3379  const TargetRegisterClass *BoolRC = TRI->getBoolRC();
3380  Register PhiExec = MRI.createVirtualRegister(BoolRC);
3381  Register NewExec = MRI.createVirtualRegister(BoolRC);
3382  Register CurrentIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
3383  Register CondReg = MRI.createVirtualRegister(BoolRC);
3384
3385  BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiReg)
3386    .addReg(InitReg)
3387    .addMBB(&OrigBB)
3388    .addReg(ResultReg)
3389    .addMBB(&LoopBB);
3390
3391  BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec)
3392    .addReg(InitSaveExecReg)
3393    .addMBB(&OrigBB)
3394    .addReg(NewExec)
3395    .addMBB(&LoopBB);
3396
3397  // Read the next variant <- also loop target.
3398  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)
3399    .addReg(IdxReg.getReg(), getUndefRegState(IdxReg.isUndef()));
3400
3401  // Compare the just read M0 value to all possible Idx values.
3402  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)
3403    .addReg(CurrentIdxReg)
3404    .addReg(IdxReg.getReg(), 0, IdxReg.getSubReg());
3405
3406  // Update EXEC, save the original EXEC value to VCC.
3407  BuildMI(LoopBB, I, DL, TII->get(ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32
3408                                                : AMDGPU::S_AND_SAVEEXEC_B64),
3409          NewExec)
3410    .addReg(CondReg, RegState::Kill);
3411
3412  MRI.setSimpleHint(NewExec, CondReg);
3413
3414  if (UseGPRIdxMode) {
3415    unsigned IdxReg;
3416    if (Offset == 0) {
3417      IdxReg = CurrentIdxReg;
3418    } else {
3419      IdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
3420      BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), IdxReg)
3421        .addReg(CurrentIdxReg, RegState::Kill)
3422        .addImm(Offset);
3423    }
3424    unsigned IdxMode = IsIndirectSrc ?
3425      AMDGPU::VGPRIndexMode::SRC0_ENABLE : AMDGPU::VGPRIndexMode::DST_ENABLE;
3426    MachineInstr *SetOn =
3427      BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
3428      .addReg(IdxReg, RegState::Kill)
3429      .addImm(IdxMode);
3430    SetOn->getOperand(3).setIsUndef();
3431  } else {
3432    // Move the index from the SGPR into M0.
3433    if (Offset == 0) {
3434      BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
3435        .addReg(CurrentIdxReg, RegState::Kill);
3436    } else {
3437      BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
3438        .addReg(CurrentIdxReg, RegState::Kill)
3439        .addImm(Offset);
3440    }
3441  }
3442
3443  // Update EXEC, switch all done bits to 0 and all todo bits to 1.
3444  unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
3445  MachineInstr *InsertPt =
3446    BuildMI(LoopBB, I, DL, TII->get(ST.isWave32() ? AMDGPU::S_XOR_B32_term
3447                                                  : AMDGPU::S_XOR_B64_term), Exec)
3448      .addReg(Exec)
3449      .addReg(NewExec);
3450
3451  // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
3452  // s_cbranch_scc0?
3453
3454  // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
3455  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
3456    .addMBB(&LoopBB);
3457
3458  return InsertPt->getIterator();
3459}
3460
3461// This has slightly sub-optimal regalloc when the source vector is killed by
3462// the read. The register allocator does not understand that the kill is
3463// per-workitem, so the vector is kept live for the whole loop and we end up not
3464// re-using a subregister from it, using one more VGPR than necessary. The extra
3465// VGPR was avoided when this was expanded after register allocation.
3466static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII,
3467                                                  MachineBasicBlock &MBB,
3468                                                  MachineInstr &MI,
3469                                                  unsigned InitResultReg,
3470                                                  unsigned PhiReg,
3471                                                  int Offset,
3472                                                  bool UseGPRIdxMode,
3473                                                  bool IsIndirectSrc) {
3474  MachineFunction *MF = MBB.getParent();
3475  const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
3476  const SIRegisterInfo *TRI = ST.getRegisterInfo();
3477  MachineRegisterInfo &MRI = MF->getRegInfo();
3478  const DebugLoc &DL = MI.getDebugLoc();
3479  MachineBasicBlock::iterator I(&MI);
3480
3481  const auto *BoolXExecRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
3482  Register DstReg = MI.getOperand(0).getReg();
3483  Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
3484  Register TmpExec = MRI.createVirtualRegister(BoolXExecRC);
3485  unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
3486  unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
3487
3488  BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), TmpExec);
3489
3490  // Save the EXEC mask
3491  BuildMI(MBB, I, DL, TII->get(MovExecOpc), SaveExec)
3492    .addReg(Exec);
3493
3494  MachineBasicBlock *LoopBB;
3495  MachineBasicBlock *RemainderBB;
3496  std::tie(LoopBB, RemainderBB) = splitBlockForLoop(MI, MBB, false);
3497
3498  const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
3499
3500  auto InsPt = emitLoadM0FromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, *Idx,
3501                                      InitResultReg, DstReg, PhiReg, TmpExec,
3502                                      Offset, UseGPRIdxMode, IsIndirectSrc);
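      // Insert a landing-pad block between the loop and the remainder block to
      // restore the saved EXEC mask before execution continues.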
3503  MachineBasicBlock *LandingPad = MF->CreateMachineBasicBlock();
3504  MachineFunction::iterator MBBI(LoopBB);
3505  ++MBBI;
3506  MF->insert(MBBI, LandingPad);
3507  LoopBB->removeSuccessor(RemainderBB);
3508  LandingPad->addSuccessor(RemainderBB);
3509  LoopBB->addSuccessor(LandingPad);
3510  MachineBasicBlock::iterator First = LandingPad->begin();
3511  BuildMI(*LandingPad, First, DL, TII->get(MovExecOpc), Exec)
3512    .addReg(SaveExec);
3513
3514  return InsPt;
3515}
3516
3517// Returns the subregister index and the remaining offset.
3518static std::pair<unsigned, int>
3519computeIndirectRegAndOffset(const SIRegisterInfo &TRI,
3520                            const TargetRegisterClass *SuperRC,
3521                            unsigned VecReg,
3522                            int Offset) {
3523  int NumElts = TRI.getRegSizeInBits(*SuperRC) / 32;
3524
3525  // Skip out of bounds offsets, or else we would end up using an undefined
3526  // register.
3527  if (Offset >= NumElts || Offset < 0)
3528    return std::make_pair(AMDGPU::sub0, Offset);
3529
3530  return std::make_pair(SIRegisterInfo::getSubRegFromChannel(Offset), 0);
3531}
3532
3533// Return true if the index is an SGPR and was set.
3534static bool setM0ToIndexFromSGPR(const SIInstrInfo *TII,
3535                                 MachineRegisterInfo &MRI,
3536                                 MachineInstr &MI,
3537                                 int Offset,
3538                                 bool UseGPRIdxMode,
3539                                 bool IsIndirectSrc) {
3540  MachineBasicBlock *MBB = MI.getParent();
3541  const DebugLoc &DL = MI.getDebugLoc();
3542  MachineBasicBlock::iterator I(&MI);
3543
3544  const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
3545  const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
3546
3547  assert(Idx->getReg() != AMDGPU::NoRegister);
3548
3549  if (!TII->getRegisterInfo().isSGPRClass(IdxRC))
3550    return false;
3551
3552  if (UseGPRIdxMode) {
3553    unsigned IdxMode = IsIndirectSrc ?
3554      AMDGPU::VGPRIndexMode::SRC0_ENABLE : AMDGPU::VGPRIndexMode::DST_ENABLE;
3555    if (Offset == 0) {
3556      MachineInstr *SetOn =
3557          BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
3558              .add(*Idx)
3559              .addImm(IdxMode);
3560
3561      SetOn->getOperand(3).setIsUndef();
3562    } else {
3563      Register Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
3564      BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), Tmp)
3565          .add(*Idx)
3566          .addImm(Offset);
3567      MachineInstr *SetOn =
3568        BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
3569        .addReg(Tmp, RegState::Kill)
3570        .addImm(IdxMode);
3571
3572      SetOn->getOperand(3).setIsUndef();
3573    }
3574
3575    return true;
3576  }
3577
3578  if (Offset == 0) {
3579    BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
3580      .add(*Idx);
3581  } else {
3582    BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
3583      .add(*Idx)
3584      .addImm(Offset);
3585  }
3586
3587  return true;
3588}
3589
3590// Control flow needs to be inserted if indexing with a VGPR.
3591static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI,
3592                                          MachineBasicBlock &MBB,
3593                                          const GCNSubtarget &ST) {
3594  const SIInstrInfo *TII = ST.getInstrInfo();
3595  const SIRegisterInfo &TRI = TII->getRegisterInfo();
3596  MachineFunction *MF = MBB.getParent();
3597  MachineRegisterInfo &MRI = MF->getRegInfo();
3598
3599  Register Dst = MI.getOperand(0).getReg();
3600  Register SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg();
3601  int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
3602
3603  const TargetRegisterClass *VecRC = MRI.getRegClass(SrcReg);
3604
3605  unsigned SubReg;
3606  std::tie(SubReg, Offset)
3607    = computeIndirectRegAndOffset(TRI, VecRC, SrcReg, Offset);
3608
3609  const bool UseGPRIdxMode = ST.useVGPRIndexMode();
3610
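      // If the index is uniform in an SGPR, M0 can be set directly and the
      // element read in this block; otherwise emit a loop over the unique
      // index values below.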
3611  if (setM0ToIndexFromSGPR(TII, MRI, MI, Offset, UseGPRIdxMode, true)) {
3612    MachineBasicBlock::iterator I(&MI);
3613    const DebugLoc &DL = MI.getDebugLoc();
3614
3615    if (UseGPRIdxMode) {
3616      // TODO: Look at the uses to avoid the copy. This may require rescheduling
3617      // to avoid interfering with other uses, so probably requires a new
3618      // optimization pass.
3619      BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOV_B32_e32), Dst)
3620        .addReg(SrcReg, RegState::Undef, SubReg)
3621        .addReg(SrcReg, RegState::Implicit)
3622        .addReg(AMDGPU::M0, RegState::Implicit);
3623      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
3624    } else {
3625      BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
3626        .addReg(SrcReg, RegState::Undef, SubReg)
3627        .addReg(SrcReg, RegState::Implicit);
3628    }
3629
3630    MI.eraseFromParent();
3631
3632    return &MBB;
3633  }
3634
3635  const DebugLoc &DL = MI.getDebugLoc();
3636  MachineBasicBlock::iterator I(&MI);
3637
3638  Register PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3639  Register InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3640
3641  BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), InitReg);
3642
3643  auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitReg, PhiReg,
3644                              Offset, UseGPRIdxMode, true);
3645  MachineBasicBlock *LoopBB = InsPt->getParent();
3646
3647  if (UseGPRIdxMode) {
3648    BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOV_B32_e32), Dst)
3649      .addReg(SrcReg, RegState::Undef, SubReg)
3650      .addReg(SrcReg, RegState::Implicit)
3651      .addReg(AMDGPU::M0, RegState::Implicit);
3652    BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
3653  } else {
3654    BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
3655      .addReg(SrcReg, RegState::Undef, SubReg)
3656      .addReg(SrcReg, RegState::Implicit);
3657  }
3658
3659  MI.eraseFromParent();
3660
3661  return LoopBB;
3662}
3663
3664static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
3665                                          MachineBasicBlock &MBB,
3666                                          const GCNSubtarget &ST) {
3667  const SIInstrInfo *TII = ST.getInstrInfo();
3668  const SIRegisterInfo &TRI = TII->getRegisterInfo();
3669  MachineFunction *MF = MBB.getParent();
3670  MachineRegisterInfo &MRI = MF->getRegInfo();
3671
3672  Register Dst = MI.getOperand(0).getReg();
3673  const MachineOperand *SrcVec = TII->getNamedOperand(MI, AMDGPU::OpName::src);
3674  const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
3675  const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val);
3676  int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
3677  const TargetRegisterClass *VecRC = MRI.getRegClass(SrcVec->getReg());
3678
3679  // This can be an immediate, but will be folded later.
3680  assert(Val->getReg());
3681
3682  unsigned SubReg;
3683  std::tie(SubReg, Offset) = computeIndirectRegAndOffset(TRI, VecRC,
3684                                                         SrcVec->getReg(),
3685                                                         Offset);
3686  const bool UseGPRIdxMode = ST.useVGPRIndexMode();
3687
3688  if (Idx->getReg() == AMDGPU::NoRegister) {
3689    MachineBasicBlock::iterator I(&MI);
3690    const DebugLoc &DL = MI.getDebugLoc();
3691
3692    assert(Offset == 0);
3693
3694    BuildMI(MBB, I, DL, TII->get(TargetOpcode::INSERT_SUBREG), Dst)
3695        .add(*SrcVec)
3696        .add(*Val)
3697        .addImm(SubReg);
3698
3699    MI.eraseFromParent();
3700    return &MBB;
3701  }
3702
3703  const MCInstrDesc &MovRelDesc
3704    = TII->getIndirectRegWritePseudo(TRI.getRegSizeInBits(*VecRC), 32, false);
3705
3706  if (setM0ToIndexFromSGPR(TII, MRI, MI, Offset, UseGPRIdxMode, false)) {
3707    MachineBasicBlock::iterator I(&MI);
3708    const DebugLoc &DL = MI.getDebugLoc();
3709    BuildMI(MBB, I, DL, MovRelDesc, Dst)
3710      .addReg(SrcVec->getReg())
3711      .add(*Val)
3712      .addImm(SubReg);
3713    if (UseGPRIdxMode)
3714      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
3715
3716    MI.eraseFromParent();
3717    return &MBB;
3718  }
3719
3720  if (Val->isReg())
3721    MRI.clearKillFlags(Val->getReg());
3722
3723  const DebugLoc &DL = MI.getDebugLoc();
3724
3725  Register PhiReg = MRI.createVirtualRegister(VecRC);
3726
3727  auto InsPt = loadM0FromVGPR(TII, MBB, MI, SrcVec->getReg(), PhiReg,
3728                              Offset, UseGPRIdxMode, false);
3729  MachineBasicBlock *LoopBB = InsPt->getParent();
3730
3731  BuildMI(*LoopBB, InsPt, DL, MovRelDesc, Dst)
3732    .addReg(PhiReg)
3733    .add(*Val)
3734    .addImm(AMDGPU::sub0);
3735  if (UseGPRIdxMode)
3736    BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
3737
3738  MI.eraseFromParent();
3739  return LoopBB;
3740}
3741
3742MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
3743  MachineInstr &MI, MachineBasicBlock *BB) const {
3744
3745  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
3746  MachineFunction *MF = BB->getParent();
3747  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
3748
3749  switch (MI.getOpcode()) {
3750  case AMDGPU::S_UADDO_PSEUDO:
3751  case AMDGPU::S_USUBO_PSEUDO: {
3752    const DebugLoc &DL = MI.getDebugLoc();
3753    MachineOperand &Dest0 = MI.getOperand(0);
3754    MachineOperand &Dest1 = MI.getOperand(1);
3755    MachineOperand &Src0 = MI.getOperand(2);
3756    MachineOperand &Src1 = MI.getOperand(3);
3757
3758    unsigned Opc = (MI.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
3759                       ? AMDGPU::S_ADD_I32
3760                       : AMDGPU::S_SUB_I32;
3761    BuildMI(*BB, MI, DL, TII->get(Opc), Dest0.getReg()).add(Src0).add(Src1);
3762
3763    BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CSELECT_B64), Dest1.getReg())
3764        .addImm(1)
3765        .addImm(0);
3766
3767    MI.eraseFromParent();
3768    return BB;
3769  }
3770  case AMDGPU::S_ADD_U64_PSEUDO:
3771  case AMDGPU::S_SUB_U64_PSEUDO: {
3772    MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
3773    const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
3774    const SIRegisterInfo *TRI = ST.getRegisterInfo();
3775    const TargetRegisterClass *BoolRC = TRI->getBoolRC();
3776    const DebugLoc &DL = MI.getDebugLoc();
3777
3778    MachineOperand &Dest = MI.getOperand(0);
3779    MachineOperand &Src0 = MI.getOperand(1);
3780    MachineOperand &Src1 = MI.getOperand(2);
3781
3782    Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
3783    Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
3784
3785    MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
3786        MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
3787    MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
3788        MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
3789
3790    MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
3791        MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
3792    MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
3793        MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
3794
3795    bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
3796
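        // Emit the low half with a 32-bit add/sub and the high half with the
        // carry-consuming variant, then recombine the halves with a
        // REG_SEQUENCE.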
3797    unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
3798    unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
3799    BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0).add(Src0Sub0).add(Src1Sub0);
3800    BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1).add(Src0Sub1).add(Src1Sub1);
3801    BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
3802        .addReg(DestSub0)
3803        .addImm(AMDGPU::sub0)
3804        .addReg(DestSub1)
3805        .addImm(AMDGPU::sub1);
3806    MI.eraseFromParent();
3807    return BB;
3808  }
3809  case AMDGPU::V_ADD_U64_PSEUDO:
3810  case AMDGPU::V_SUB_U64_PSEUDO: {
3811    MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
3812    const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
3813    const SIRegisterInfo *TRI = ST.getRegisterInfo();
3814    const DebugLoc &DL = MI.getDebugLoc();
3815
3816    bool IsAdd = (MI.getOpcode() == AMDGPU::V_ADD_U64_PSEUDO);
3817
3818    const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
3819
3820    Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3821    Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3822
3823    Register CarryReg = MRI.createVirtualRegister(CarryRC);
3824    Register DeadCarryReg = MRI.createVirtualRegister(CarryRC);
3825
3826    MachineOperand &Dest = MI.getOperand(0);
3827    MachineOperand &Src0 = MI.getOperand(1);
3828    MachineOperand &Src1 = MI.getOperand(2);
3829
3830    const TargetRegisterClass *Src0RC = Src0.isReg()
3831                                            ? MRI.getRegClass(Src0.getReg())
3832                                            : &AMDGPU::VReg_64RegClass;
3833    const TargetRegisterClass *Src1RC = Src1.isReg()
3834                                            ? MRI.getRegClass(Src1.getReg())
3835                                            : &AMDGPU::VReg_64RegClass;
3836
3837    const TargetRegisterClass *Src0SubRC =
3838        TRI->getSubRegClass(Src0RC, AMDGPU::sub0);
3839    const TargetRegisterClass *Src1SubRC =
3840        TRI->getSubRegClass(Src1RC, AMDGPU::sub1);
3841
3842    MachineOperand SrcReg0Sub0 = TII->buildExtractSubRegOrImm(
3843        MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
3844    MachineOperand SrcReg1Sub0 = TII->buildExtractSubRegOrImm(
3845        MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
3846
3847    MachineOperand SrcReg0Sub1 = TII->buildExtractSubRegOrImm(
3848        MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
3849    MachineOperand SrcReg1Sub1 = TII->buildExtractSubRegOrImm(
3850        MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
3851
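        // The low half defines the carry and the high half consumes it. The
        // trailing zero immediates are the clamp bit required by the e64
        // encodings.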
3852    unsigned LoOpc = IsAdd ? AMDGPU::V_ADD_I32_e64 : AMDGPU::V_SUB_I32_e64;
3853    MachineInstr *LoHalf = BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
3854                               .addReg(CarryReg, RegState::Define)
3855                               .add(SrcReg0Sub0)
3856                               .add(SrcReg1Sub0)
3857                               .addImm(0); // clamp bit
3858
3859    unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
3860    MachineInstr *HiHalf =
3861        BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1)
3862            .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
3863            .add(SrcReg0Sub1)
3864            .add(SrcReg1Sub1)
3865            .addReg(CarryReg, RegState::Kill)
3866            .addImm(0); // clamp bit
3867
3868    BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
3869        .addReg(DestSub0)
3870        .addImm(AMDGPU::sub0)
3871        .addReg(DestSub1)
3872        .addImm(AMDGPU::sub1);
3873    TII->legalizeOperands(*LoHalf);
3874    TII->legalizeOperands(*HiHalf);
3875    MI.eraseFromParent();
3876    return BB;
3877  }
3878  case AMDGPU::S_ADD_CO_PSEUDO:
3879  case AMDGPU::S_SUB_CO_PSEUDO: {
3880    // This pseudo can only be selected from a uniform add/subcarry
3881    // node, so all of the VGPR operands are assumed to be splat
3882    // vectors.
3883    MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
3884    const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
3885    const SIRegisterInfo *TRI = ST.getRegisterInfo();
3886    MachineBasicBlock::iterator MII = MI;
3887    const DebugLoc &DL = MI.getDebugLoc();
3888    MachineOperand &Dest = MI.getOperand(0);
3889    MachineOperand &CarryDest = MI.getOperand(1);
3890    MachineOperand &Src0 = MI.getOperand(2);
3891    MachineOperand &Src1 = MI.getOperand(3);
3892    MachineOperand &Src2 = MI.getOperand(4);
3893    unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
3894                       ? AMDGPU::S_ADDC_U32
3895                       : AMDGPU::S_SUBB_U32;
3896    if (Src0.isReg() && TRI->isVectorRegister(MRI, Src0.getReg())) {
3897      Register RegOp0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
3898      BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp0)
3899          .addReg(Src0.getReg());
3900      Src0.setReg(RegOp0);
3901    }
3902    if (Src1.isReg() && TRI->isVectorRegister(MRI, Src1.getReg())) {
3903      Register RegOp1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
3904      BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp1)
3905          .addReg(Src1.getReg());
3906      Src1.setReg(RegOp1);
3907    }
3908    Register RegOp2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
3909    if (TRI->isVectorRegister(MRI, Src2.getReg())) {
3910      BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp2)
3911          .addReg(Src2.getReg());
3912      Src2.setReg(RegOp2);
3913    }
3914
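        // Set SCC from the carry-in operand so that the scalar
        // add/sub-with-carry below consumes it.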
3915    if (TRI->getRegSizeInBits(*MRI.getRegClass(Src2.getReg())) == 64) {
3916      BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U64))
3917          .addReg(Src2.getReg())
3918          .addImm(0);
3919    } else {
3920      BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMPK_LG_U32))
3921          .addReg(Src2.getReg())
3922          .addImm(0);
3923    }
3924
3925    BuildMI(*BB, MII, DL, TII->get(Opc), Dest.getReg()).add(Src0).add(Src1);
3926
3927    BuildMI(*BB, MII, DL, TII->get(AMDGPU::COPY), CarryDest.getReg())
3928      .addReg(AMDGPU::SCC);
3929    MI.eraseFromParent();
3930    return BB;
3931  }
3932  case AMDGPU::SI_INIT_M0: {
3933    BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
3934            TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
3935        .add(MI.getOperand(0));
3936    MI.eraseFromParent();
3937    return BB;
3938  }
3939  case AMDGPU::SI_INIT_EXEC:
3940    // This should be before all vector instructions.
3941    BuildMI(*BB, &*BB->begin(), MI.getDebugLoc(), TII->get(AMDGPU::S_MOV_B64),
3942            AMDGPU::EXEC)
3943        .addImm(MI.getOperand(0).getImm());
3944    MI.eraseFromParent();
3945    return BB;
3946
3947  case AMDGPU::SI_INIT_EXEC_LO:
3948    // This should be before all vector instructions.
3949    BuildMI(*BB, &*BB->begin(), MI.getDebugLoc(), TII->get(AMDGPU::S_MOV_B32),
3950            AMDGPU::EXEC_LO)
3951        .addImm(MI.getOperand(0).getImm());
3952    MI.eraseFromParent();
3953    return BB;
3954
3955  case AMDGPU::SI_INIT_EXEC_FROM_INPUT: {
3956    // Extract the thread count from an SGPR input and set EXEC accordingly.
3957    // Since BFM can't shift by 64, handle that case with CMP + CMOV.
3958    //
3959    // S_BFE_U32 count, input, {shift, 7}
3960    // S_BFM_B64 exec, count, 0
3961    // S_CMP_EQ_U32 count, 64
3962    // S_CMOV_B64 exec, -1
3963    MachineInstr *FirstMI = &*BB->begin();
3964    MachineRegisterInfo &MRI = MF->getRegInfo();
3965    Register InputReg = MI.getOperand(0).getReg();
3966    Register CountReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
3967    bool Found = false;
3968
3969    // Move the COPY of the input reg to the beginning, so that we can use it.
3970    for (auto I = BB->begin(); I != &MI; I++) {
3971      if (I->getOpcode() != TargetOpcode::COPY ||
3972          I->getOperand(0).getReg() != InputReg)
3973        continue;
3974
3975      if (I == FirstMI) {
3976        FirstMI = &*++BB->begin();
3977      } else {
3978        I->removeFromParent();
3979        BB->insert(FirstMI, &*I);
3980      }
3981      Found = true;
3982      break;
3983    }
3984    assert(Found);
3985    (void)Found;
3986
3987    // This should be before all vector instructions.
3988    unsigned Mask = (getSubtarget()->getWavefrontSize() << 1) - 1;
3989    bool isWave32 = getSubtarget()->isWave32();
3990    unsigned Exec = isWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
3991    BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_BFE_U32), CountReg)
3992        .addReg(InputReg)
3993        .addImm((MI.getOperand(1).getImm() & Mask) | 0x70000);
3994    BuildMI(*BB, FirstMI, DebugLoc(),
3995            TII->get(isWave32 ? AMDGPU::S_BFM_B32 : AMDGPU::S_BFM_B64),
3996            Exec)
3997        .addReg(CountReg)
3998        .addImm(0);
3999    BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_CMP_EQ_U32))
4000        .addReg(CountReg, RegState::Kill)
4001        .addImm(getSubtarget()->getWavefrontSize());
4002    BuildMI(*BB, FirstMI, DebugLoc(),
4003            TII->get(isWave32 ? AMDGPU::S_CMOV_B32 : AMDGPU::S_CMOV_B64),
4004            Exec)
4005        .addImm(-1);
4006    MI.eraseFromParent();
4007    return BB;
4008  }
4009
4010  case AMDGPU::GET_GROUPSTATICSIZE: {
4011    assert(getTargetMachine().getTargetTriple().getOS() == Triple::AMDHSA ||
4012           getTargetMachine().getTargetTriple().getOS() == Triple::AMDPAL);
4013    DebugLoc DL = MI.getDebugLoc();
4014    BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOV_B32))
4015        .add(MI.getOperand(0))
4016        .addImm(MFI->getLDSSize());
4017    MI.eraseFromParent();
4018    return BB;
4019  }
4020  case AMDGPU::SI_INDIRECT_SRC_V1:
4021  case AMDGPU::SI_INDIRECT_SRC_V2:
4022  case AMDGPU::SI_INDIRECT_SRC_V4:
4023  case AMDGPU::SI_INDIRECT_SRC_V8:
4024  case AMDGPU::SI_INDIRECT_SRC_V16:
4025  case AMDGPU::SI_INDIRECT_SRC_V32:
4026    return emitIndirectSrc(MI, *BB, *getSubtarget());
4027  case AMDGPU::SI_INDIRECT_DST_V1:
4028  case AMDGPU::SI_INDIRECT_DST_V2:
4029  case AMDGPU::SI_INDIRECT_DST_V4:
4030  case AMDGPU::SI_INDIRECT_DST_V8:
4031  case AMDGPU::SI_INDIRECT_DST_V16:
4032  case AMDGPU::SI_INDIRECT_DST_V32:
4033    return emitIndirectDst(MI, *BB, *getSubtarget());
4034  case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
4035  case AMDGPU::SI_KILL_I1_PSEUDO:
4036    return splitKillBlock(MI, BB);
4037  case AMDGPU::V_CNDMASK_B64_PSEUDO: {
4038    MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
4039    const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
4040    const SIRegisterInfo *TRI = ST.getRegisterInfo();
4041
4042    Register Dst = MI.getOperand(0).getReg();
4043    Register Src0 = MI.getOperand(1).getReg();
4044    Register Src1 = MI.getOperand(2).getReg();
4045    const DebugLoc &DL = MI.getDebugLoc();
4046    Register SrcCond = MI.getOperand(3).getReg();
4047
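        // Split the 64-bit select into two 32-bit V_CNDMASK_B32 selects on the
        // sub0 and sub1 halves and recombine the results with a REG_SEQUENCE.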
4048    Register DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4049    Register DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4050    const auto *CondRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
4051    Register SrcCondCopy = MRI.createVirtualRegister(CondRC);
4052
4053    BuildMI(*BB, MI, DL, TII->get(AMDGPU::COPY), SrcCondCopy)
4054      .addReg(SrcCond);
4055    BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstLo)
4056      .addImm(0)
4057      .addReg(Src0, 0, AMDGPU::sub0)
4058      .addImm(0)
4059      .addReg(Src1, 0, AMDGPU::sub0)
4060      .addReg(SrcCondCopy);
4061    BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstHi)
4062      .addImm(0)
4063      .addReg(Src0, 0, AMDGPU::sub1)
4064      .addImm(0)
4065      .addReg(Src1, 0, AMDGPU::sub1)
4066      .addReg(SrcCondCopy);
4067
4068    BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE), Dst)
4069      .addReg(DstLo)
4070      .addImm(AMDGPU::sub0)
4071      .addReg(DstHi)
4072      .addImm(AMDGPU::sub1);
4073    MI.eraseFromParent();
4074    return BB;
4075  }
4076  case AMDGPU::SI_BR_UNDEF: {
4077    const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
4078    const DebugLoc &DL = MI.getDebugLoc();
4079    MachineInstr *Br = BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
4080                           .add(MI.getOperand(0));
4081    Br->getOperand(1).setIsUndef(true); // read undef SCC
4082    MI.eraseFromParent();
4083    return BB;
4084  }
4085  case AMDGPU::ADJCALLSTACKUP:
4086  case AMDGPU::ADJCALLSTACKDOWN: {
4087    const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
4088    MachineInstrBuilder MIB(*MF, &MI);
4089
4090    // Add an implicit use of the frame offset reg to prevent the restore copy
4091    // inserted after the call from being reordered after stack operations in
4092    // the caller's frame.
4093    MIB.addReg(Info->getStackPtrOffsetReg(), RegState::ImplicitDefine)
4094        .addReg(Info->getStackPtrOffsetReg(), RegState::Implicit)
4095        .addReg(Info->getFrameOffsetReg(), RegState::Implicit);
4096    return BB;
4097  }
4098  case AMDGPU::SI_CALL_ISEL: {
4099    const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
4100    const DebugLoc &DL = MI.getDebugLoc();
4101
4102    unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(*MF);
4103
4104    MachineInstrBuilder MIB;
4105    MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg);
4106
4107    for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I)
4108      MIB.add(MI.getOperand(I));
4109
4110    MIB.cloneMemRefs(MI);
4111    MI.eraseFromParent();
4112    return BB;
4113  }
4114  case AMDGPU::V_ADD_I32_e32:
4115  case AMDGPU::V_SUB_I32_e32:
4116  case AMDGPU::V_SUBREV_I32_e32: {
4117    // TODO: Define distinct V_*_I32_Pseudo instructions instead.
4118    const DebugLoc &DL = MI.getDebugLoc();
4119    unsigned Opc = MI.getOpcode();
4120
4121    bool NeedClampOperand = false;
4122    if (TII->pseudoToMCOpcode(Opc) == -1) {
4123      Opc = AMDGPU::getVOPe64(Opc);
4124      NeedClampOperand = true;
4125    }
4126
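    // If we had to switch to the VOP3 (e64) encoding, it takes an explicit VCC
    // definition and a clamp operand, which are added below.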
4127    auto I = BuildMI(*BB, MI, DL, TII->get(Opc), MI.getOperand(0).getReg());
4128    if (TII->isVOP3(*I)) {
4129      const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
4130      const SIRegisterInfo *TRI = ST.getRegisterInfo();
4131      I.addReg(TRI->getVCC(), RegState::Define);
4132    }
4133    I.add(MI.getOperand(1))
4134     .add(MI.getOperand(2));
4135    if (NeedClampOperand)
4136      I.addImm(0); // clamp bit for e64 encoding
4137
4138    TII->legalizeOperands(*I);
4139
4140    MI.eraseFromParent();
4141    return BB;
4142  }
4143  case AMDGPU::DS_GWS_INIT:
4144  case AMDGPU::DS_GWS_SEMA_V:
4145  case AMDGPU::DS_GWS_SEMA_BR:
4146  case AMDGPU::DS_GWS_SEMA_P:
4147  case AMDGPU::DS_GWS_SEMA_RELEASE_ALL:
4148  case AMDGPU::DS_GWS_BARRIER:
    // An s_waitcnt 0 is required to be the instruction immediately following.
4150    if (getSubtarget()->hasGWSAutoReplay()) {
4151      bundleInstWithWaitcnt(MI);
4152      return BB;
4153    }
4154
4155    return emitGWSMemViolTestLoop(MI, BB);
4156  case AMDGPU::S_SETREG_B32: {
4157    if (!getSubtarget()->hasDenormModeInst())
4158      return BB;
4159
4160    // Try to optimize cases that only set the denormal mode or rounding mode.
4161    //
4162    // If the s_setreg_b32 fully sets all of the bits in the rounding mode or
4163    // denormal mode to a constant, we can use s_round_mode or s_denorm_mode
4164    // instead.
4165    //
    // FIXME: This could be predicated on the immediate, but tablegen doesn't
    // allow a no-side-effect instruction in the output of a side-effecting
    // pattern.
4169
    // TODO: Should also emit a no-side-effect pseudo if only FP bits are
    // touched, even if not all of them are set or the value is variable.
4172    unsigned ID, Offset, Width;
4173    AMDGPU::Hwreg::decodeHwreg(MI.getOperand(1).getImm(), ID, Offset, Width);
4174    if (ID != AMDGPU::Hwreg::ID_MODE)
4175      return BB;
4176
4177    const unsigned WidthMask = maskTrailingOnes<unsigned>(Width);
4178    const unsigned SetMask = WidthMask << Offset;
4179    unsigned SetDenormOp = 0;
4180    unsigned SetRoundOp = 0;
4181
4182    // The dedicated instructions can only set the whole denorm or round mode at
4183    // once, not a subset of bits in either.
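    // E.g. a 4-bit write at offset 0 covers exactly the rounding mode and a
    // 4-bit write at offset 4 covers exactly the denormal mode, so an
    // s_setreg_b32 fed by a move-immediate can become s_round_mode or
    // s_denorm_mode with the corresponding 4 bits of the immediate.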
4184    if (Width == 8 && (SetMask & (AMDGPU::Hwreg::FP_ROUND_MASK |
4185                                  AMDGPU::Hwreg::FP_DENORM_MASK)) == SetMask) {
4186      // If this fully sets both the round and denorm mode, emit the two
4187      // dedicated instructions for these.
4188      assert(Offset == 0);
4189      SetRoundOp = AMDGPU::S_ROUND_MODE;
4190      SetDenormOp = AMDGPU::S_DENORM_MODE;
4191    } else if (Width == 4) {
4192      if ((SetMask & AMDGPU::Hwreg::FP_ROUND_MASK) == SetMask) {
4193        SetRoundOp = AMDGPU::S_ROUND_MODE;
4194        assert(Offset == 0);
4195      } else if ((SetMask & AMDGPU::Hwreg::FP_DENORM_MASK) == SetMask) {
4196        SetDenormOp = AMDGPU::S_DENORM_MODE;
4197        assert(Offset == 4);
4198      }
4199    }
4200
4201    if (SetRoundOp || SetDenormOp) {
4202      MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
4203      MachineInstr *Def = MRI.getVRegDef(MI.getOperand(0).getReg());
4204      if (Def && Def->isMoveImmediate() && Def->getOperand(1).isImm()) {
4205        unsigned ImmVal = Def->getOperand(1).getImm();
4206        if (SetRoundOp) {
4207          BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetRoundOp))
4208            .addImm(ImmVal & 0xf);
4209
4210          // If we also have the denorm mode, get just the denorm mode bits.
4211          ImmVal >>= 4;
4212        }
4213
4214        if (SetDenormOp) {
4215          BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetDenormOp))
4216            .addImm(ImmVal & 0xf);
4217        }
4218
4219        MI.eraseFromParent();
4220      }
4221    }
4222
4223    return BB;
4224  }
4225  default:
4226    return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
4227  }
4228}
4229
4230bool SITargetLowering::hasBitPreservingFPLogic(EVT VT) const {
4231  return isTypeLegal(VT.getScalarType());
4232}
4233
4234bool SITargetLowering::enableAggressiveFMAFusion(EVT VT) const {
4235  // This currently forces unfolding various combinations of fsub into fma with
4236  // free fneg'd operands. As long as we have fast FMA (controlled by
4237  // isFMAFasterThanFMulAndFAdd), we should perform these.
4238
4239  // When fma is quarter rate, for f64 where add / sub are at best half rate,
4240  // most of these combines appear to be cycle neutral but save on instruction
4241  // count / code size.
4242  return true;
4243}
4244
4245EVT SITargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &Ctx,
4246                                         EVT VT) const {
4247  if (!VT.isVector()) {
4248    return MVT::i1;
4249  }
4250  return EVT::getVectorVT(Ctx, MVT::i1, VT.getVectorNumElements());
4251}
4252
4253MVT SITargetLowering::getScalarShiftAmountTy(const DataLayout &, EVT VT) const {
4254  // TODO: Should i16 be used always if legal? For now it would force VALU
4255  // shifts.
4256  return (VT == MVT::i16) ? MVT::i16 : MVT::i32;
4257}
4258
// Answering this is somewhat tricky and depends on the specific device, since
// different devices have different rates for fma and for all f64 operations.
4261//
4262// v_fma_f64 and v_mul_f64 always take the same number of cycles as each other
4263// regardless of which device (although the number of cycles differs between
4264// devices), so it is always profitable for f64.
4265//
4266// v_fma_f32 takes 4 or 16 cycles depending on the device, so it is profitable
4267// only on full rate devices. Normally, we should prefer selecting v_mad_f32
4268// which we can always do even without fused FP ops since it returns the same
4269// result as the separate operations and since it is always full
4270// rate. Therefore, we lie and report that it is not faster for f32. v_mad_f32
4271// however does not support denormals, so we do report fma as faster if we have
4272// a fast fma device and require denormals.
4273//
4274bool SITargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
4275                                                  EVT VT) const {
4276  VT = VT.getScalarType();
4277
4278  switch (VT.getSimpleVT().SimpleTy) {
4279  case MVT::f32: {
4280    // If mad is not available this depends only on if f32 fma is full rate.
4281    if (!Subtarget->hasMadMacF32Insts())
4282      return Subtarget->hasFastFMAF32();
4283
    // Otherwise f32 mad is always full rate and returns the same result as
    // the separate operations, so it should be preferred over fma.
    // However, it does not support denormals.
4287    if (hasFP32Denormals(MF))
4288      return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts();
4289
4290    // If the subtarget has v_fmac_f32, that's just as good as v_mac_f32.
4291    return Subtarget->hasFastFMAF32() && Subtarget->hasDLInsts();
4292  }
4293  case MVT::f64:
4294    return true;
4295  case MVT::f16:
4296    return Subtarget->has16BitInsts() && hasFP64FP16Denormals(MF);
4297  default:
4298    break;
4299  }
4300
4301  return false;
4302}
4303
4304bool SITargetLowering::isFMADLegal(const SelectionDAG &DAG,
4305                                   const SDNode *N) const {
4306  // TODO: Check future ftz flag
4307  // v_mad_f32/v_mac_f32 do not support denormals.
4308  EVT VT = N->getValueType(0);
4309  if (VT == MVT::f32)
4310    return Subtarget->hasMadMacF32Insts() &&
4311           !hasFP32Denormals(DAG.getMachineFunction());
4312  if (VT == MVT::f16) {
4313    return Subtarget->hasMadF16() &&
4314           !hasFP64FP16Denormals(DAG.getMachineFunction());
4315  }
4316
4317  return false;
4318}
4319
4320//===----------------------------------------------------------------------===//
4321// Custom DAG Lowering Operations
4322//===----------------------------------------------------------------------===//
4323
4324// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
4325// wider vector type is legal.
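// E.g. (fneg v4f16:x) is split into an fneg of each v2f16 half, and the
// halves are recombined with concat_vectors.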
4326SDValue SITargetLowering::splitUnaryVectorOp(SDValue Op,
4327                                             SelectionDAG &DAG) const {
4328  unsigned Opc = Op.getOpcode();
4329  EVT VT = Op.getValueType();
4330  assert(VT == MVT::v4f16 || VT == MVT::v4i16);
4331
4332  SDValue Lo, Hi;
4333  std::tie(Lo, Hi) = DAG.SplitVectorOperand(Op.getNode(), 0);
4334
4335  SDLoc SL(Op);
4336  SDValue OpLo = DAG.getNode(Opc, SL, Lo.getValueType(), Lo,
4337                             Op->getFlags());
4338  SDValue OpHi = DAG.getNode(Opc, SL, Hi.getValueType(), Hi,
4339                             Op->getFlags());
4340
4341  return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
4342}
4343
4344// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
4345// wider vector type is legal.
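// E.g. (add v4i16:a, v4i16:b) becomes two v2i16 adds on the split halves,
// recombined with concat_vectors.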
4346SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op,
4347                                              SelectionDAG &DAG) const {
4348  unsigned Opc = Op.getOpcode();
4349  EVT VT = Op.getValueType();
4350  assert(VT == MVT::v4i16 || VT == MVT::v4f16);
4351
4352  SDValue Lo0, Hi0;
4353  std::tie(Lo0, Hi0) = DAG.SplitVectorOperand(Op.getNode(), 0);
4354  SDValue Lo1, Hi1;
4355  std::tie(Lo1, Hi1) = DAG.SplitVectorOperand(Op.getNode(), 1);
4356
4357  SDLoc SL(Op);
4358
4359  SDValue OpLo = DAG.getNode(Opc, SL, Lo0.getValueType(), Lo0, Lo1,
4360                             Op->getFlags());
4361  SDValue OpHi = DAG.getNode(Opc, SL, Hi0.getValueType(), Hi0, Hi1,
4362                             Op->getFlags());
4363
4364  return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
4365}
4366
SDValue SITargetLowering::splitTernaryVectorOp(SDValue Op,
                                               SelectionDAG &DAG) const {
4369  unsigned Opc = Op.getOpcode();
4370  EVT VT = Op.getValueType();
4371  assert(VT == MVT::v4i16 || VT == MVT::v4f16);
4372
4373  SDValue Lo0, Hi0;
4374  std::tie(Lo0, Hi0) = DAG.SplitVectorOperand(Op.getNode(), 0);
4375  SDValue Lo1, Hi1;
4376  std::tie(Lo1, Hi1) = DAG.SplitVectorOperand(Op.getNode(), 1);
4377  SDValue Lo2, Hi2;
4378  std::tie(Lo2, Hi2) = DAG.SplitVectorOperand(Op.getNode(), 2);
4379
4380  SDLoc SL(Op);
4381
4382  SDValue OpLo = DAG.getNode(Opc, SL, Lo0.getValueType(), Lo0, Lo1, Lo2,
4383                             Op->getFlags());
4384  SDValue OpHi = DAG.getNode(Opc, SL, Hi0.getValueType(), Hi0, Hi1, Hi2,
4385                             Op->getFlags());
4386
4387  return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
}

4391SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
4392  switch (Op.getOpcode()) {
4393  default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
4394  case ISD::BRCOND: return LowerBRCOND(Op, DAG);
4395  case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
4396  case ISD::LOAD: {
4397    SDValue Result = LowerLOAD(Op, DAG);
4398    assert((!Result.getNode() ||
4399            Result.getNode()->getNumValues() == 2) &&
4400           "Load should return a value and a chain");
4401    return Result;
4402  }
4403
4404  case ISD::FSIN:
4405  case ISD::FCOS:
4406    return LowerTrig(Op, DAG);
4407  case ISD::SELECT: return LowerSELECT(Op, DAG);
4408  case ISD::FDIV: return LowerFDIV(Op, DAG);
4409  case ISD::ATOMIC_CMP_SWAP: return LowerATOMIC_CMP_SWAP(Op, DAG);
4410  case ISD::STORE: return LowerSTORE(Op, DAG);
4411  case ISD::GlobalAddress: {
4412    MachineFunction &MF = DAG.getMachineFunction();
4413    SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
4414    return LowerGlobalAddress(MFI, Op, DAG);
4415  }
4416  case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
4417  case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, DAG);
4418  case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG);
4419  case ISD::ADDRSPACECAST: return lowerADDRSPACECAST(Op, DAG);
4420  case ISD::INSERT_SUBVECTOR:
4421    return lowerINSERT_SUBVECTOR(Op, DAG);
4422  case ISD::INSERT_VECTOR_ELT:
4423    return lowerINSERT_VECTOR_ELT(Op, DAG);
4424  case ISD::EXTRACT_VECTOR_ELT:
4425    return lowerEXTRACT_VECTOR_ELT(Op, DAG);
4426  case ISD::VECTOR_SHUFFLE:
4427    return lowerVECTOR_SHUFFLE(Op, DAG);
4428  case ISD::BUILD_VECTOR:
4429    return lowerBUILD_VECTOR(Op, DAG);
4430  case ISD::FP_ROUND:
4431    return lowerFP_ROUND(Op, DAG);
4432  case ISD::TRAP:
4433    return lowerTRAP(Op, DAG);
4434  case ISD::DEBUGTRAP:
4435    return lowerDEBUGTRAP(Op, DAG);
4436  case ISD::FABS:
4437  case ISD::FNEG:
4438  case ISD::FCANONICALIZE:
4439  case ISD::BSWAP:
4440    return splitUnaryVectorOp(Op, DAG);
4441  case ISD::FMINNUM:
4442  case ISD::FMAXNUM:
4443    return lowerFMINNUM_FMAXNUM(Op, DAG);
4444  case ISD::FMA:
4445    return splitTernaryVectorOp(Op, DAG);
4446  case ISD::SHL:
4447  case ISD::SRA:
4448  case ISD::SRL:
4449  case ISD::ADD:
4450  case ISD::SUB:
4451  case ISD::MUL:
4452  case ISD::SMIN:
4453  case ISD::SMAX:
4454  case ISD::UMIN:
4455  case ISD::UMAX:
4456  case ISD::FADD:
4457  case ISD::FMUL:
4458  case ISD::FMINNUM_IEEE:
4459  case ISD::FMAXNUM_IEEE:
4460    return splitBinaryVectorOp(Op, DAG);
4461  case ISD::SMULO:
4462  case ISD::UMULO:
4463    return lowerXMULO(Op, DAG);
4464  case ISD::DYNAMIC_STACKALLOC:
4465    return LowerDYNAMIC_STACKALLOC(Op, DAG);
4466  }
4467  return SDValue();
4468}
4469
4470static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT,
4471                                       const SDLoc &DL,
4472                                       SelectionDAG &DAG, bool Unpacked) {
4473  if (!LoadVT.isVector())
4474    return Result;
4475
4476  if (Unpacked) { // From v2i32/v4i32 back to v2f16/v4f16.
4477    // Truncate to v2i16/v4i16.
4478    EVT IntLoadVT = LoadVT.changeTypeToInteger();
4479
    // Work around the legalizer not scalarizing truncate after vector op
    // legalization by not creating an intermediate vector trunc.
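    // E.g. an unpacked v2i32 result is truncated element-wise to i16, rebuilt
    // as v2i16, and bitcast back to v2f16.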
4482    SmallVector<SDValue, 4> Elts;
4483    DAG.ExtractVectorElements(Result, Elts);
4484    for (SDValue &Elt : Elts)
4485      Elt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Elt);
4486
4487    Result = DAG.getBuildVector(IntLoadVT, DL, Elts);
4488
4489    // Bitcast to original type (v2f16/v4f16).
4490    return DAG.getNode(ISD::BITCAST, DL, LoadVT, Result);
4491  }
4492
4493  // Cast back to the original packed type.
4494  return DAG.getNode(ISD::BITCAST, DL, LoadVT, Result);
4495}
4496
4497SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode,
4498                                              MemSDNode *M,
4499                                              SelectionDAG &DAG,
4500                                              ArrayRef<SDValue> Ops,
4501                                              bool IsIntrinsic) const {
4502  SDLoc DL(M);
4503
4504  bool Unpacked = Subtarget->hasUnpackedD16VMem();
4505  EVT LoadVT = M->getValueType(0);
4506
  EVT EquivLoadVT = LoadVT;
  if (Unpacked && LoadVT.isVector()) {
    EquivLoadVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
                                   LoadVT.getVectorNumElements());
  }
4513
4514  // Change from v4f16/v2f16 to EquivLoadVT.
4515  SDVTList VTList = DAG.getVTList(EquivLoadVT, MVT::Other);
4516
4517  SDValue Load
4518    = DAG.getMemIntrinsicNode(
4519      IsIntrinsic ? (unsigned)ISD::INTRINSIC_W_CHAIN : Opcode, DL,
4520      VTList, Ops, M->getMemoryVT(),
4521      M->getMemOperand());
4522  if (!Unpacked) // Just adjusted the opcode.
4523    return Load;
4524
4525  SDValue Adjusted = adjustLoadValueTypeImpl(Load, LoadVT, DL, DAG, Unpacked);
4526
4527  return DAG.getMergeValues({ Adjusted, Load.getValue(1) }, DL);
4528}
4529
4530SDValue SITargetLowering::lowerIntrinsicLoad(MemSDNode *M, bool IsFormat,
4531                                             SelectionDAG &DAG,
4532                                             ArrayRef<SDValue> Ops) const {
4533  SDLoc DL(M);
4534  EVT LoadVT = M->getValueType(0);
4535  EVT EltType = LoadVT.getScalarType();
4536  EVT IntVT = LoadVT.changeTypeToInteger();
4537
4538  bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
4539
4540  unsigned Opc =
4541      IsFormat ? AMDGPUISD::BUFFER_LOAD_FORMAT : AMDGPUISD::BUFFER_LOAD;
4542
4543  if (IsD16) {
4544    return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16, M, DAG, Ops);
4545  }
4546
4547  // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics
4548  if (!IsD16 && !LoadVT.isVector() && EltType.getSizeInBits() < 32)
4549    return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M);
4550
4551  if (isTypeLegal(LoadVT)) {
4552    return getMemIntrinsicNode(Opc, DL, M->getVTList(), Ops, IntVT,
4553                               M->getMemOperand(), DAG);
4554  }
4555
4556  EVT CastVT = getEquivalentMemType(*DAG.getContext(), LoadVT);
4557  SDVTList VTList = DAG.getVTList(CastVT, MVT::Other);
4558  SDValue MemNode = getMemIntrinsicNode(Opc, DL, VTList, Ops, CastVT,
4559                                        M->getMemOperand(), DAG);
4560  return DAG.getMergeValues(
4561      {DAG.getNode(ISD::BITCAST, DL, LoadVT, MemNode), MemNode.getValue(1)},
4562      DL);
4563}
4564
4565static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI,
4566                                  SDNode *N, SelectionDAG &DAG) {
4567  EVT VT = N->getValueType(0);
4568  const auto *CD = cast<ConstantSDNode>(N->getOperand(3));
4569  unsigned CondCode = CD->getZExtValue();
4570  if (!ICmpInst::isIntPredicate(static_cast<ICmpInst::Predicate>(CondCode)))
4571    return DAG.getUNDEF(VT);
4572
4573  ICmpInst::Predicate IcInput = static_cast<ICmpInst::Predicate>(CondCode);
4574
4575  SDValue LHS = N->getOperand(1);
4576  SDValue RHS = N->getOperand(2);
4577
4578  SDLoc DL(N);
4579
4580  EVT CmpVT = LHS.getValueType();
4581  if (CmpVT == MVT::i16 && !TLI.isTypeLegal(MVT::i16)) {
4582    unsigned PromoteOp = ICmpInst::isSigned(IcInput) ?
4583      ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
4584    LHS = DAG.getNode(PromoteOp, DL, MVT::i32, LHS);
4585    RHS = DAG.getNode(PromoteOp, DL, MVT::i32, RHS);
4586  }
4587
4588  ISD::CondCode CCOpcode = getICmpCondCode(IcInput);
4589
4590  unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
4591  EVT CCVT = EVT::getIntegerVT(*DAG.getContext(), WavefrontSize);
4592
4593  SDValue SetCC = DAG.getNode(AMDGPUISD::SETCC, DL, CCVT, LHS, RHS,
4594                              DAG.getCondCode(CCOpcode));
4595  if (VT.bitsEq(CCVT))
4596    return SetCC;
4597  return DAG.getZExtOrTrunc(SetCC, DL, VT);
4598}
4599
4600static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI,
4601                                  SDNode *N, SelectionDAG &DAG) {
4602  EVT VT = N->getValueType(0);
4603  const auto *CD = cast<ConstantSDNode>(N->getOperand(3));
4604
4605  unsigned CondCode = CD->getZExtValue();
4606  if (!FCmpInst::isFPPredicate(static_cast<FCmpInst::Predicate>(CondCode)))
4607    return DAG.getUNDEF(VT);
4608
4609  SDValue Src0 = N->getOperand(1);
4610  SDValue Src1 = N->getOperand(2);
4611  EVT CmpVT = Src0.getValueType();
4612  SDLoc SL(N);
4613
4614  if (CmpVT == MVT::f16 && !TLI.isTypeLegal(CmpVT)) {
4615    Src0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0);
4616    Src1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1);
4617  }
4618
4619  FCmpInst::Predicate IcInput = static_cast<FCmpInst::Predicate>(CondCode);
4620  ISD::CondCode CCOpcode = getFCmpCondCode(IcInput);
4621  unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
4622  EVT CCVT = EVT::getIntegerVT(*DAG.getContext(), WavefrontSize);
4623  SDValue SetCC = DAG.getNode(AMDGPUISD::SETCC, SL, CCVT, Src0,
4624                              Src1, DAG.getCondCode(CCOpcode));
4625  if (VT.bitsEq(CCVT))
4626    return SetCC;
4627  return DAG.getZExtOrTrunc(SetCC, SL, VT);
4628}
4629
4630static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N,
4631                                    SelectionDAG &DAG) {
4632  EVT VT = N->getValueType(0);
4633  SDValue Src = N->getOperand(1);
4634  SDLoc SL(N);
4635
4636  if (Src.getOpcode() == ISD::SETCC) {
4637    // (ballot (ISD::SETCC ...)) -> (AMDGPUISD::SETCC ...)
4638    return DAG.getNode(AMDGPUISD::SETCC, SL, VT, Src.getOperand(0),
4639                       Src.getOperand(1), Src.getOperand(2));
4640  }
4641  if (const ConstantSDNode *Arg = dyn_cast<ConstantSDNode>(Src)) {
4642    // (ballot 0) -> 0
4643    if (Arg->isNullValue())
4644      return DAG.getConstant(0, SL, VT);
4645
4646    // (ballot 1) -> EXEC/EXEC_LO
4647    if (Arg->isOne()) {
4648      Register Exec;
4649      if (VT.getScalarSizeInBits() == 32)
4650        Exec = AMDGPU::EXEC_LO;
4651      else if (VT.getScalarSizeInBits() == 64)
4652        Exec = AMDGPU::EXEC;
4653      else
4654        return SDValue();
4655
4656      return DAG.getCopyFromReg(DAG.getEntryNode(), SL, Exec, VT);
4657    }
4658  }
4659
4660  // (ballot (i1 $src)) -> (AMDGPUISD::SETCC (i32 (zext $src)) (i32 0)
4661  // ISD::SETNE)
4662  return DAG.getNode(
4663      AMDGPUISD::SETCC, SL, VT, DAG.getZExtOrTrunc(Src, SL, MVT::i32),
4664      DAG.getConstant(0, SL, MVT::i32), DAG.getCondCode(ISD::SETNE));
4665}
4666
4667void SITargetLowering::ReplaceNodeResults(SDNode *N,
4668                                          SmallVectorImpl<SDValue> &Results,
4669                                          SelectionDAG &DAG) const {
4670  switch (N->getOpcode()) {
4671  case ISD::INSERT_VECTOR_ELT: {
4672    if (SDValue Res = lowerINSERT_VECTOR_ELT(SDValue(N, 0), DAG))
4673      Results.push_back(Res);
4674    return;
4675  }
4676  case ISD::EXTRACT_VECTOR_ELT: {
4677    if (SDValue Res = lowerEXTRACT_VECTOR_ELT(SDValue(N, 0), DAG))
4678      Results.push_back(Res);
4679    return;
4680  }
4681  case ISD::INTRINSIC_WO_CHAIN: {
4682    unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
4683    switch (IID) {
4684    case Intrinsic::amdgcn_cvt_pkrtz: {
4685      SDValue Src0 = N->getOperand(1);
4686      SDValue Src1 = N->getOperand(2);
4687      SDLoc SL(N);
4688      SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_PKRTZ_F16_F32, SL, MVT::i32,
4689                                Src0, Src1);
4690      Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Cvt));
4691      return;
4692    }
4693    case Intrinsic::amdgcn_cvt_pknorm_i16:
4694    case Intrinsic::amdgcn_cvt_pknorm_u16:
4695    case Intrinsic::amdgcn_cvt_pk_i16:
4696    case Intrinsic::amdgcn_cvt_pk_u16: {
4697      SDValue Src0 = N->getOperand(1);
4698      SDValue Src1 = N->getOperand(2);
4699      SDLoc SL(N);
4700      unsigned Opcode;
4701
4702      if (IID == Intrinsic::amdgcn_cvt_pknorm_i16)
4703        Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
4704      else if (IID == Intrinsic::amdgcn_cvt_pknorm_u16)
4705        Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
4706      else if (IID == Intrinsic::amdgcn_cvt_pk_i16)
4707        Opcode = AMDGPUISD::CVT_PK_I16_I32;
4708      else
4709        Opcode = AMDGPUISD::CVT_PK_U16_U32;
4710
4711      EVT VT = N->getValueType(0);
4712      if (isTypeLegal(VT))
4713        Results.push_back(DAG.getNode(Opcode, SL, VT, Src0, Src1));
4714      else {
4715        SDValue Cvt = DAG.getNode(Opcode, SL, MVT::i32, Src0, Src1);
4716        Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, Cvt));
4717      }
4718      return;
4719    }
4720    }
4721    break;
4722  }
4723  case ISD::INTRINSIC_W_CHAIN: {
4724    if (SDValue Res = LowerINTRINSIC_W_CHAIN(SDValue(N, 0), DAG)) {
4725      if (Res.getOpcode() == ISD::MERGE_VALUES) {
4726        // FIXME: Hacky
4727        Results.push_back(Res.getOperand(0));
4728        Results.push_back(Res.getOperand(1));
4729      } else {
4730        Results.push_back(Res);
4731        Results.push_back(Res.getValue(1));
4732      }
4733      return;
4734    }
4735
4736    break;
4737  }
4738  case ISD::SELECT: {
4739    SDLoc SL(N);
4740    EVT VT = N->getValueType(0);
4741    EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
4742    SDValue LHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(1));
4743    SDValue RHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(2));
4744
4745    EVT SelectVT = NewVT;
4746    if (NewVT.bitsLT(MVT::i32)) {
4747      LHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, LHS);
4748      RHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, RHS);
4749      SelectVT = MVT::i32;
4750    }
4751
4752    SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, SelectVT,
4753                                    N->getOperand(0), LHS, RHS);
4754
4755    if (NewVT != SelectVT)
4756      NewSelect = DAG.getNode(ISD::TRUNCATE, SL, NewVT, NewSelect);
4757    Results.push_back(DAG.getNode(ISD::BITCAST, SL, VT, NewSelect));
4758    return;
4759  }
4760  case ISD::FNEG: {
4761    if (N->getValueType(0) != MVT::v2f16)
4762      break;
4763
4764    SDLoc SL(N);
4765    SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
4766
4767    SDValue Op = DAG.getNode(ISD::XOR, SL, MVT::i32,
4768                             BC,
4769                             DAG.getConstant(0x80008000, SL, MVT::i32));
4770    Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
4771    return;
4772  }
4773  case ISD::FABS: {
4774    if (N->getValueType(0) != MVT::v2f16)
4775      break;
4776
4777    SDLoc SL(N);
4778    SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
4779
4780    SDValue Op = DAG.getNode(ISD::AND, SL, MVT::i32,
4781                             BC,
4782                             DAG.getConstant(0x7fff7fff, SL, MVT::i32));
4783    Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
4784    return;
4785  }
4786  default:
4787    break;
4788  }
4789}
4790
4791/// Helper function for LowerBRCOND
static SDNode *findUser(SDValue Value, unsigned Opcode) {
  SDNode *Parent = Value.getNode();
  for (SDNode::use_iterator I = Parent->use_begin(), E = Parent->use_end();
       I != E; ++I) {
    if (I.getUse().get() != Value)
      continue;
4800
4801    if (I->getOpcode() == Opcode)
4802      return *I;
4803  }
4804  return nullptr;
4805}
4806
4807unsigned SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
4808  if (Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
4809    switch (cast<ConstantSDNode>(Intr->getOperand(1))->getZExtValue()) {
4810    case Intrinsic::amdgcn_if:
4811      return AMDGPUISD::IF;
4812    case Intrinsic::amdgcn_else:
4813      return AMDGPUISD::ELSE;
4814    case Intrinsic::amdgcn_loop:
4815      return AMDGPUISD::LOOP;
4816    case Intrinsic::amdgcn_end_cf:
4817      llvm_unreachable("should not occur");
4818    default:
4819      return 0;
4820    }
4821  }
4822
4823  // break, if_break, else_break are all only used as inputs to loop, not
4824  // directly as branch conditions.
4825  return 0;
4826}
4827
4828bool SITargetLowering::shouldEmitFixup(const GlobalValue *GV) const {
4829  const Triple &TT = getTargetMachine().getTargetTriple();
4830  return (GV->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
4831          GV->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
4832         AMDGPU::shouldEmitConstantsToTextSection(TT);
4833}
4834
4835bool SITargetLowering::shouldEmitGOTReloc(const GlobalValue *GV) const {
4836  // FIXME: Either avoid relying on address space here or change the default
4837  // address space for functions to avoid the explicit check.
4838  return (GV->getValueType()->isFunctionTy() ||
4839          !isNonGlobalAddrSpace(GV->getAddressSpace())) &&
4840         !shouldEmitFixup(GV) &&
4841         !getTargetMachine().shouldAssumeDSOLocal(*GV->getParent(), GV);
4842}
4843
4844bool SITargetLowering::shouldEmitPCReloc(const GlobalValue *GV) const {
4845  return !shouldEmitFixup(GV) && !shouldEmitGOTReloc(GV);
4846}
4847
4848bool SITargetLowering::shouldUseLDSConstAddress(const GlobalValue *GV) const {
4849  if (!GV->hasExternalLinkage())
4850    return true;
4851
4852  const auto OS = getTargetMachine().getTargetTriple().getOS();
4853  return OS == Triple::AMDHSA || OS == Triple::AMDPAL;
4854}
4855
/// This transforms the control flow intrinsics to get the branch destination
/// as the last parameter, and also switches the branch target with BR if
/// needed.
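/// E.g. (brcond (setcc (llvm.amdgcn.if ...), 1, ne), BB) is roughly rewritten
/// into an AMDGPUISD::IF node whose last operand is the destination block BB.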
4858SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
4859                                      SelectionDAG &DAG) const {
4860  SDLoc DL(BRCOND);
4861
4862  SDNode *Intr = BRCOND.getOperand(1).getNode();
4863  SDValue Target = BRCOND.getOperand(2);
4864  SDNode *BR = nullptr;
4865  SDNode *SetCC = nullptr;
4866
4867  if (Intr->getOpcode() == ISD::SETCC) {
4868    // As long as we negate the condition everything is fine
4869    SetCC = Intr;
4870    Intr = SetCC->getOperand(0).getNode();
4871
4872  } else {
4873    // Get the target from BR if we don't negate the condition
4874    BR = findUser(BRCOND, ISD::BR);
4875    assert(BR && "brcond missing unconditional branch user");
4876    Target = BR->getOperand(1);
4877  }
4878
4879  unsigned CFNode = isCFIntrinsic(Intr);
4880  if (CFNode == 0) {
4881    // This is a uniform branch so we don't need to legalize.
4882    return BRCOND;
4883  }
4884
4885  bool HaveChain = Intr->getOpcode() == ISD::INTRINSIC_VOID ||
4886                   Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN;
4887
4888  assert(!SetCC ||
4889        (SetCC->getConstantOperandVal(1) == 1 &&
4890         cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() ==
4891                                                             ISD::SETNE));
4892
4893  // operands of the new intrinsic call
4894  SmallVector<SDValue, 4> Ops;
4895  if (HaveChain)
4896    Ops.push_back(BRCOND.getOperand(0));
4897
  Ops.append(Intr->op_begin() + (HaveChain ? 2 : 1), Intr->op_end());
4899  Ops.push_back(Target);
4900
4901  ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end());
4902
4903  // build the new intrinsic call
4904  SDNode *Result = DAG.getNode(CFNode, DL, DAG.getVTList(Res), Ops).getNode();
4905
4906  if (!HaveChain) {
4907    SDValue Ops[] =  {
4908      SDValue(Result, 0),
4909      BRCOND.getOperand(0)
4910    };
4911
4912    Result = DAG.getMergeValues(Ops, DL).getNode();
4913  }
4914
4915  if (BR) {
4916    // Give the branch instruction our target
4917    SDValue Ops[] = {
4918      BR->getOperand(0),
4919      BRCOND.getOperand(2)
4920    };
4921    SDValue NewBR = DAG.getNode(ISD::BR, DL, BR->getVTList(), Ops);
4922    DAG.ReplaceAllUsesWith(BR, NewBR.getNode());
4923  }
4924
4925  SDValue Chain = SDValue(Result, Result->getNumValues() - 1);
4926
4927  // Copy the intrinsic results to registers
4928  for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) {
4929    SDNode *CopyToReg = findUser(SDValue(Intr, i), ISD::CopyToReg);
4930    if (!CopyToReg)
4931      continue;
4932
4933    Chain = DAG.getCopyToReg(
4934      Chain, DL,
4935      CopyToReg->getOperand(1),
4936      SDValue(Result, i - 1),
4937      SDValue());
4938
4939    DAG.ReplaceAllUsesWith(SDValue(CopyToReg, 0), CopyToReg->getOperand(0));
4940  }
4941
4942  // Remove the old intrinsic from the chain
4943  DAG.ReplaceAllUsesOfValueWith(
4944    SDValue(Intr, Intr->getNumValues() - 1),
4945    Intr->getOperand(0));
4946
4947  return Chain;
4948}
4949
4950SDValue SITargetLowering::LowerRETURNADDR(SDValue Op,
4951                                          SelectionDAG &DAG) const {
4952  MVT VT = Op.getSimpleValueType();
4953  SDLoc DL(Op);
4954  // Checking the depth
4955  if (cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue() != 0)
4956    return DAG.getConstant(0, DL, VT);
4957
4958  MachineFunction &MF = DAG.getMachineFunction();
4959  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
4960  // Check for kernel and shader functions
4961  if (Info->isEntryFunction())
4962    return DAG.getConstant(0, DL, VT);
4963
4964  MachineFrameInfo &MFI = MF.getFrameInfo();
4965  // There is a call to @llvm.returnaddress in this function
4966  MFI.setReturnAddressIsTaken(true);
4967
4968  const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
4969  // Get the return address reg and mark it as an implicit live-in
  unsigned Reg = MF.addLiveIn(TRI->getReturnAddressReg(MF),
                              getRegClassFor(VT, Op.getNode()->isDivergent()));
4971
4972  return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT);
4973}
4974
4975SDValue SITargetLowering::getFPExtOrFPRound(SelectionDAG &DAG,
4976                                            SDValue Op,
4977                                            const SDLoc &DL,
4978                                            EVT VT) const {
  return Op.getValueType().bitsLE(VT) ?
      DAG.getNode(ISD::FP_EXTEND, DL, VT, Op) :
      DAG.getNode(ISD::FP_ROUND, DL, VT, Op,
                  DAG.getTargetConstant(0, DL, MVT::i32));
4983}
4984
4985SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
4986  assert(Op.getValueType() == MVT::f16 &&
4987         "Do not know how to custom lower FP_ROUND for non-f16 type");
4988
4989  SDValue Src = Op.getOperand(0);
4990  EVT SrcVT = Src.getValueType();
4991  if (SrcVT != MVT::f64)
4992    return Op;
4993
4994  SDLoc DL(Op);
4995
4996  SDValue FpToFp16 = DAG.getNode(ISD::FP_TO_FP16, DL, MVT::i32, Src);
4997  SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToFp16);
4998  return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);
4999}
5000
5001SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op,
5002                                               SelectionDAG &DAG) const {
5003  EVT VT = Op.getValueType();
5004  const MachineFunction &MF = DAG.getMachineFunction();
5005  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
5006  bool IsIEEEMode = Info->getMode().IEEE;
5007
5008  // FIXME: Assert during selection that this is only selected for
5009  // ieee_mode. Currently a combine can produce the ieee version for non-ieee
5010  // mode functions, but this happens to be OK since it's only done in cases
5011  // where there is known no sNaN.
5012  if (IsIEEEMode)
5013    return expandFMINNUM_FMAXNUM(Op.getNode(), DAG);
5014
5015  if (VT == MVT::v4f16)
5016    return splitBinaryVectorOp(Op, DAG);
5017  return Op;
5018}
5019
5020SDValue SITargetLowering::lowerXMULO(SDValue Op, SelectionDAG &DAG) const {
5021  EVT VT = Op.getValueType();
5022  SDLoc SL(Op);
5023  SDValue LHS = Op.getOperand(0);
5024  SDValue RHS = Op.getOperand(1);
5025  bool isSigned = Op.getOpcode() == ISD::SMULO;
5026
5027  if (ConstantSDNode *RHSC = isConstOrConstSplat(RHS)) {
5028    const APInt &C = RHSC->getAPIntValue();
5029    // mulo(X, 1 << S) -> { X << S, (X << S) >> S != X }
5030    if (C.isPowerOf2()) {
      // smulo(x, signed_min) is the same as umulo(x, signed_min).
5032      bool UseArithShift = isSigned && !C.isMinSignedValue();
5033      SDValue ShiftAmt = DAG.getConstant(C.logBase2(), SL, MVT::i32);
5034      SDValue Result = DAG.getNode(ISD::SHL, SL, VT, LHS, ShiftAmt);
5035      SDValue Overflow = DAG.getSetCC(SL, MVT::i1,
5036          DAG.getNode(UseArithShift ? ISD::SRA : ISD::SRL,
5037                      SL, VT, Result, ShiftAmt),
5038          LHS, ISD::SETNE);
5039      return DAG.getMergeValues({ Result, Overflow }, SL);
5040    }
5041  }
5042
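  // General case: compare the high half of the full product against the
  // expected sign bits of the low half; any mismatch means overflow.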
5043  SDValue Result = DAG.getNode(ISD::MUL, SL, VT, LHS, RHS);
5044  SDValue Top = DAG.getNode(isSigned ? ISD::MULHS : ISD::MULHU,
5045                            SL, VT, LHS, RHS);
5046
5047  SDValue Sign = isSigned
5048    ? DAG.getNode(ISD::SRA, SL, VT, Result,
5049                  DAG.getConstant(VT.getScalarSizeInBits() - 1, SL, MVT::i32))
5050    : DAG.getConstant(0, SL, VT);
5051  SDValue Overflow = DAG.getSetCC(SL, MVT::i1, Top, Sign, ISD::SETNE);
5052
5053  return DAG.getMergeValues({ Result, Overflow }, SL);
5054}
5055
5056SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const {
5057  SDLoc SL(Op);
5058  SDValue Chain = Op.getOperand(0);
5059
5060  if (Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
5061      !Subtarget->isTrapHandlerEnabled())
5062    return DAG.getNode(AMDGPUISD::ENDPGM, SL, MVT::Other, Chain);
5063
5064  MachineFunction &MF = DAG.getMachineFunction();
5065  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
5066  unsigned UserSGPR = Info->getQueuePtrUserSGPR();
5067  assert(UserSGPR != AMDGPU::NoRegister);
5068  SDValue QueuePtr = CreateLiveInRegister(
5069    DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);
5070  SDValue SGPR01 = DAG.getRegister(AMDGPU::SGPR0_SGPR1, MVT::i64);
5071  SDValue ToReg = DAG.getCopyToReg(Chain, SL, SGPR01,
5072                                   QueuePtr, SDValue());
5073  SDValue Ops[] = {
5074    ToReg,
5075    DAG.getTargetConstant(GCNSubtarget::TrapIDLLVMTrap, SL, MVT::i16),
5076    SGPR01,
5077    ToReg.getValue(1)
5078  };
5079  return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
5080}
5081
5082SDValue SITargetLowering::lowerDEBUGTRAP(SDValue Op, SelectionDAG &DAG) const {
5083  SDLoc SL(Op);
5084  SDValue Chain = Op.getOperand(0);
5085  MachineFunction &MF = DAG.getMachineFunction();
5086
5087  if (Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
5088      !Subtarget->isTrapHandlerEnabled()) {
5089    DiagnosticInfoUnsupported NoTrap(MF.getFunction(),
5090                                     "debugtrap handler not supported",
5091                                     Op.getDebugLoc(),
5092                                     DS_Warning);
5093    LLVMContext &Ctx = MF.getFunction().getContext();
5094    Ctx.diagnose(NoTrap);
5095    return Chain;
5096  }
5097
5098  SDValue Ops[] = {
5099    Chain,
5100    DAG.getTargetConstant(GCNSubtarget::TrapIDLLVMDebugTrap, SL, MVT::i16)
5101  };
5102  return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
5103}
5104
5105SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
5106                                             SelectionDAG &DAG) const {
5107  // FIXME: Use inline constants (src_{shared, private}_base) instead.
5108  if (Subtarget->hasApertureRegs()) {
5109    unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
5110        AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
5111        AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
5112    unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
5113        AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
5114        AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
5115    unsigned Encoding =
5116        AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
5117        Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
5118        WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
5119
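    // The s_getreg result holds the upper bits of the aperture base; shift it
    // left by the field width to form the 32-bit aperture value that is used
    // as the high half of a flat pointer.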
5120    SDValue EncodingImm = DAG.getTargetConstant(Encoding, DL, MVT::i16);
5121    SDValue ApertureReg = SDValue(
5122        DAG.getMachineNode(AMDGPU::S_GETREG_B32, DL, MVT::i32, EncodingImm), 0);
5123    SDValue ShiftAmount = DAG.getTargetConstant(WidthM1 + 1, DL, MVT::i32);
5124    return DAG.getNode(ISD::SHL, DL, MVT::i32, ApertureReg, ShiftAmount);
5125  }
5126
5127  MachineFunction &MF = DAG.getMachineFunction();
5128  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
5129  Register UserSGPR = Info->getQueuePtrUserSGPR();
5130  assert(UserSGPR != AMDGPU::NoRegister);
5131
5132  SDValue QueuePtr = CreateLiveInRegister(
5133    DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);
5134
5135  // Offset into amd_queue_t for group_segment_aperture_base_hi /
5136  // private_segment_aperture_base_hi.
5137  uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
5138
5139  SDValue Ptr = DAG.getObjectPtrOffset(DL, QueuePtr, StructOffset);
5140
5141  // TODO: Use custom target PseudoSourceValue.
5142  // TODO: We should use the value from the IR intrinsic call, but it might not
5143  // be available and how do we get it?
5144  MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
5145  return DAG.getLoad(MVT::i32, DL, QueuePtr.getValue(1), Ptr, PtrInfo,
5146                     MinAlign(64, StructOffset),
5147                     MachineMemOperand::MODereferenceable |
5148                         MachineMemOperand::MOInvariant);
5149}
5150
5151SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
5152                                             SelectionDAG &DAG) const {
5153  SDLoc SL(Op);
5154  const AddrSpaceCastSDNode *ASC = cast<AddrSpaceCastSDNode>(Op);
5155
5156  SDValue Src = ASC->getOperand(0);
5157  SDValue FlatNullPtr = DAG.getConstant(0, SL, MVT::i64);
5158
5159  const AMDGPUTargetMachine &TM =
5160    static_cast<const AMDGPUTargetMachine &>(getTargetMachine());
5161
5162  // flat -> local/private
5163  if (ASC->getSrcAddressSpace() == AMDGPUAS::FLAT_ADDRESS) {
5164    unsigned DestAS = ASC->getDestAddressSpace();
5165
5166    if (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
5167        DestAS == AMDGPUAS::PRIVATE_ADDRESS) {
5168      unsigned NullVal = TM.getNullPointerValue(DestAS);
5169      SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
5170      SDValue NonNull = DAG.getSetCC(SL, MVT::i1, Src, FlatNullPtr, ISD::SETNE);
5171      SDValue Ptr = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
5172
5173      return DAG.getNode(ISD::SELECT, SL, MVT::i32,
5174                         NonNull, Ptr, SegmentNullPtr);
5175    }
5176  }
5177
5178  // local/private -> flat
5179  if (ASC->getDestAddressSpace() == AMDGPUAS::FLAT_ADDRESS) {
5180    unsigned SrcAS = ASC->getSrcAddressSpace();
5181
5182    if (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
5183        SrcAS == AMDGPUAS::PRIVATE_ADDRESS) {
5184      unsigned NullVal = TM.getNullPointerValue(SrcAS);
5185      SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
5186
5187      SDValue NonNull
5188        = DAG.getSetCC(SL, MVT::i1, Src, SegmentNullPtr, ISD::SETNE);
5189
5190      SDValue Aperture = getSegmentAperture(ASC->getSrcAddressSpace(), SL, DAG);
5191      SDValue CvtPtr
5192        = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture);
5193
5194      return DAG.getNode(ISD::SELECT, SL, MVT::i64, NonNull,
5195                         DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr),
5196                         FlatNullPtr);
5197    }
5198  }
5199
5200  if (ASC->getDestAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
5201      Src.getValueType() == MVT::i64)
5202    return DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
5203
5204  // global <-> flat are no-ops and never emitted.
5205
5206  const MachineFunction &MF = DAG.getMachineFunction();
5207  DiagnosticInfoUnsupported InvalidAddrSpaceCast(
5208    MF.getFunction(), "invalid addrspacecast", SL.getDebugLoc());
5209  DAG.getContext()->diagnose(InvalidAddrSpaceCast);
5210
5211  return DAG.getUNDEF(ASC->getValueType(0));
5212}
5213
5214// This lowers an INSERT_SUBVECTOR by extracting the individual elements from
5215// the small vector and inserting them into the big vector. That is better than
5216// the default expansion of doing it via a stack slot. Even though the use of
5217// the stack slot would be optimized away afterwards, the stack slot itself
5218// remains.
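// E.g. inserting a v2i16 subvector into a v4i16 vector at index 2 becomes two
// extract_vector_elt / insert_vector_elt pairs at indices 2 and 3.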
5219SDValue SITargetLowering::lowerINSERT_SUBVECTOR(SDValue Op,
5220                                                SelectionDAG &DAG) const {
5221  SDValue Vec = Op.getOperand(0);
5222  SDValue Ins = Op.getOperand(1);
5223  SDValue Idx = Op.getOperand(2);
5224  EVT VecVT = Vec.getValueType();
5225  EVT InsVT = Ins.getValueType();
5226  EVT EltVT = VecVT.getVectorElementType();
5227  unsigned InsNumElts = InsVT.getVectorNumElements();
5228  unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
5229  SDLoc SL(Op);
5230
5231  for (unsigned I = 0; I != InsNumElts; ++I) {
5232    SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Ins,
5233                              DAG.getConstant(I, SL, MVT::i32));
5234    Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, VecVT, Vec, Elt,
5235                      DAG.getConstant(IdxVal + I, SL, MVT::i32));
5236  }
5237  return Vec;
5238}
5239
5240SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
5241                                                 SelectionDAG &DAG) const {
5242  SDValue Vec = Op.getOperand(0);
5243  SDValue InsVal = Op.getOperand(1);
5244  SDValue Idx = Op.getOperand(2);
5245  EVT VecVT = Vec.getValueType();
5246  EVT EltVT = VecVT.getVectorElementType();
5247  unsigned VecSize = VecVT.getSizeInBits();
  unsigned EltSize = EltVT.getSizeInBits();

5251  assert(VecSize <= 64);
5252
5253  unsigned NumElts = VecVT.getVectorNumElements();
5254  SDLoc SL(Op);
5255  auto KIdx = dyn_cast<ConstantSDNode>(Idx);
5256
5257  if (NumElts == 4 && EltSize == 16 && KIdx) {
5258    SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Vec);
5259
5260    SDValue LoHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
5261                                 DAG.getConstant(0, SL, MVT::i32));
5262    SDValue HiHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
5263                                 DAG.getConstant(1, SL, MVT::i32));
5264
5265    SDValue LoVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, LoHalf);
5266    SDValue HiVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, HiHalf);
5267
5268    unsigned Idx = KIdx->getZExtValue();
5269    bool InsertLo = Idx < 2;
5270    SDValue InsHalf = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, MVT::v2i16,
5271      InsertLo ? LoVec : HiVec,
5272      DAG.getNode(ISD::BITCAST, SL, MVT::i16, InsVal),
5273      DAG.getConstant(InsertLo ? Idx : (Idx - 2), SL, MVT::i32));
5274
5275    InsHalf = DAG.getNode(ISD::BITCAST, SL, MVT::i32, InsHalf);
5276
5277    SDValue Concat = InsertLo ?
5278      DAG.getBuildVector(MVT::v2i32, SL, { InsHalf, HiHalf }) :
5279      DAG.getBuildVector(MVT::v2i32, SL, { LoHalf, InsHalf });
5280
5281    return DAG.getNode(ISD::BITCAST, SL, VecVT, Concat);
5282  }
5283
5284  if (isa<ConstantSDNode>(Idx))
5285    return SDValue();
5286
5287  MVT IntVT = MVT::getIntegerVT(VecSize);
5288
5289  // Avoid stack access for dynamic indexing.
5290  // v_bfi_b32 (v_bfm_b32 16, (shl idx, 16)), val, vec
5291
5292  // Create a congruent vector with the target value in each element so that
5293  // the required element can be masked and ORed into the target vector.
5294  SDValue ExtVal = DAG.getNode(ISD::BITCAST, SL, IntVT,
5295                               DAG.getSplatBuildVector(VecVT, SL, InsVal));
5296
5297  assert(isPowerOf2_32(EltSize));
5298  SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
5299
5300  // Convert vector index to bit-index.
5301  SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
5302
5303  SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
5304  SDValue BFM = DAG.getNode(ISD::SHL, SL, IntVT,
5305                            DAG.getConstant(0xffff, SL, IntVT),
5306                            ScaledIdx);
5307
5308  SDValue LHS = DAG.getNode(ISD::AND, SL, IntVT, BFM, ExtVal);
5309  SDValue RHS = DAG.getNode(ISD::AND, SL, IntVT,
5310                            DAG.getNOT(SL, BFM, IntVT), BCVec);
5311
5312  SDValue BFI = DAG.getNode(ISD::OR, SL, IntVT, LHS, RHS);
5313  return DAG.getNode(ISD::BITCAST, SL, VecVT, BFI);
5314}
5315
5316SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
5317                                                  SelectionDAG &DAG) const {
5318  SDLoc SL(Op);
5319
5320  EVT ResultVT = Op.getValueType();
5321  SDValue Vec = Op.getOperand(0);
5322  SDValue Idx = Op.getOperand(1);
5323  EVT VecVT = Vec.getValueType();
5324  unsigned VecSize = VecVT.getSizeInBits();
5325  EVT EltVT = VecVT.getVectorElementType();
5326  assert(VecSize <= 64);
5327
5328  DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);
5329
5330  // Make sure we do any optimizations that will make it easier to fold
5331  // source modifiers before obscuring it with bit operations.
5332
5333  // XXX - Why doesn't this get called when vector_shuffle is expanded?
5334  if (SDValue Combined = performExtractVectorEltCombine(Op.getNode(), DCI))
5335    return Combined;
5336
5337  unsigned EltSize = EltVT.getSizeInBits();
5338  assert(isPowerOf2_32(EltSize));
5339
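  // Extract by bitcasting the whole vector to an integer, shifting the
  // selected element down to bit 0, and truncating to the result type.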
5340  MVT IntVT = MVT::getIntegerVT(VecSize);
5341  SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
5342
5343  // Convert vector index to bit-index (* EltSize)
5344  SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
5345
5346  SDValue BC = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
5347  SDValue Elt = DAG.getNode(ISD::SRL, SL, IntVT, BC, ScaledIdx);
5348
5349  if (ResultVT == MVT::f16) {
5350    SDValue Result = DAG.getNode(ISD::TRUNCATE, SL, MVT::i16, Elt);
5351    return DAG.getNode(ISD::BITCAST, SL, ResultVT, Result);
5352  }
5353
5354  return DAG.getAnyExtOrTrunc(Elt, SL, ResultVT);
5355}
5356
5357static bool elementPairIsContiguous(ArrayRef<int> Mask, int Elt) {
5358  assert(Elt % 2 == 0);
5359  return Mask[Elt + 1] == Mask[Elt] + 1 && (Mask[Elt] % 2 == 0);
5360}
5361
5362SDValue SITargetLowering::lowerVECTOR_SHUFFLE(SDValue Op,
5363                                              SelectionDAG &DAG) const {
5364  SDLoc SL(Op);
5365  EVT ResultVT = Op.getValueType();
5366  ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op);
5367
5368  EVT PackVT = ResultVT.isInteger() ? MVT::v2i16 : MVT::v2f16;
5369  EVT EltVT = PackVT.getVectorElementType();
5370  int SrcNumElts = Op.getOperand(0).getValueType().getVectorNumElements();
5371
5372  // vector_shuffle <0,1,6,7> lhs, rhs
5373  // -> concat_vectors (extract_subvector lhs, 0), (extract_subvector rhs, 2)
5374  //
5375  // vector_shuffle <6,7,2,3> lhs, rhs
5376  // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 2)
5377  //
5378  // vector_shuffle <6,7,0,1> lhs, rhs
5379  // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 0)
5380
5381  // Avoid scalarizing when both halves are reading from consecutive elements.
5382  SmallVector<SDValue, 4> Pieces;
5383  for (int I = 0, N = ResultVT.getVectorNumElements(); I != N; I += 2) {
5384    if (elementPairIsContiguous(SVN->getMask(), I)) {
5385      const int Idx = SVN->getMaskElt(I);
5386      int VecIdx = Idx < SrcNumElts ? 0 : 1;
5387      int EltIdx = Idx < SrcNumElts ? Idx : Idx - SrcNumElts;
5388      SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL,
5389                                    PackVT, SVN->getOperand(VecIdx),
5390                                    DAG.getConstant(EltIdx, SL, MVT::i32));
5391      Pieces.push_back(SubVec);
5392    } else {
5393      const int Idx0 = SVN->getMaskElt(I);
5394      const int Idx1 = SVN->getMaskElt(I + 1);
5395      int VecIdx0 = Idx0 < SrcNumElts ? 0 : 1;
5396      int VecIdx1 = Idx1 < SrcNumElts ? 0 : 1;
5397      int EltIdx0 = Idx0 < SrcNumElts ? Idx0 : Idx0 - SrcNumElts;
5398      int EltIdx1 = Idx1 < SrcNumElts ? Idx1 : Idx1 - SrcNumElts;
5399
5400      SDValue Vec0 = SVN->getOperand(VecIdx0);
5401      SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
5402                                 Vec0, DAG.getConstant(EltIdx0, SL, MVT::i32));
5403
5404      SDValue Vec1 = SVN->getOperand(VecIdx1);
5405      SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
5406                                 Vec1, DAG.getConstant(EltIdx1, SL, MVT::i32));
5407      Pieces.push_back(DAG.getBuildVector(PackVT, SL, { Elt0, Elt1 }));
5408    }
5409  }
5410
5411  return DAG.getNode(ISD::CONCAT_VECTORS, SL, ResultVT, Pieces);
5412}
5413
5414SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op,
5415                                            SelectionDAG &DAG) const {
5416  SDLoc SL(Op);
5417  EVT VT = Op.getValueType();
5418
5419  if (VT == MVT::v4i16 || VT == MVT::v4f16) {
5420    EVT HalfVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(), 2);
5421
5422    // Turn into pair of packed build_vectors.
5423    // TODO: Special case for constants that can be materialized with s_mov_b64.
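    // E.g. (v4f16 a, b, c, d) becomes two v2f16 build_vectors that are bitcast
    // to i32, combined into a v2i32, and bitcast back to the original type.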
5424    SDValue Lo = DAG.getBuildVector(HalfVT, SL,
5425                                    { Op.getOperand(0), Op.getOperand(1) });
5426    SDValue Hi = DAG.getBuildVector(HalfVT, SL,
5427                                    { Op.getOperand(2), Op.getOperand(3) });
5428
5429    SDValue CastLo = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Lo);
5430    SDValue CastHi = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Hi);
5431
5432    SDValue Blend = DAG.getBuildVector(MVT::v2i32, SL, { CastLo, CastHi });
5433    return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
5434  }
5435
5436  assert(VT == MVT::v2f16 || VT == MVT::v2i16);
5437  assert(!Subtarget->hasVOP3PInsts() && "this should be legal");
5438
5439  SDValue Lo = Op.getOperand(0);
5440  SDValue Hi = Op.getOperand(1);
5441
5442  // Avoid adding defined bits with the zero_extend.
5443  if (Hi.isUndef()) {
5444    Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
5445    SDValue ExtLo = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Lo);
5446    return DAG.getNode(ISD::BITCAST, SL, VT, ExtLo);
5447  }
5448
5449  Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Hi);
5450  Hi = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Hi);
5451
5452  SDValue ShlHi = DAG.getNode(ISD::SHL, SL, MVT::i32, Hi,
5453                              DAG.getConstant(16, SL, MVT::i32));
5454  if (Lo.isUndef())
5455    return DAG.getNode(ISD::BITCAST, SL, VT, ShlHi);
5456
5457  Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
5458  Lo = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Lo);
5459
5460  SDValue Or = DAG.getNode(ISD::OR, SL, MVT::i32, Lo, ShlHi);
5461  return DAG.getNode(ISD::BITCAST, SL, VT, Or);
5462}
5463
5464bool
5465SITargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
5466  // We can fold offsets for anything that doesn't require a GOT relocation.
5467  return (GA->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
5468          GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
5469          GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
5470         !shouldEmitGOTReloc(GA->getGlobal());
5471}
5472
5473static SDValue
5474buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV,
5475                        const SDLoc &DL, int64_t Offset, EVT PtrVT,
5476                        unsigned GAFlags = SIInstrInfo::MO_NONE) {
5477  assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
5478  // In order to support pc-relative addressing, the PC_ADD_REL_OFFSET SDNode is
5479  // lowered to the following code sequence:
5480  //
5481  // For constant address space:
5482  //   s_getpc_b64 s[0:1]
5483  //   s_add_u32 s0, s0, $symbol
5484  //   s_addc_u32 s1, s1, 0
5485  //
5486  //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
5487  //   a fixup or relocation is emitted to replace $symbol with a literal
5488  //   constant, which is a pc-relative offset from the encoding of the $symbol
5489  //   operand to the global variable.
5490  //
5491  // For global address space:
5492  //   s_getpc_b64 s[0:1]
5493  //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
5494  //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
5495  //
5496  //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
5497  //   fixups or relocations are emitted to replace $symbol@*@lo and
5498  //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
5499  //   which is a 64-bit pc-relative offset from the encoding of the $symbol
5500  //   operand to the global variable.
5501  //
5502  // What we want here is an offset from the value returned by s_getpc
5503  // (which is the address of the s_add_u32 instruction) to the global
5504  // variable, but since the encoding of $symbol starts 4 bytes after the start
5505  // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
5506  // small. This requires us to add 4 to the global variable offset in order to
5507  // compute the correct address.
5508  SDValue PtrLo =
5509      DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset + 4, GAFlags);
5510  SDValue PtrHi;
5511  if (GAFlags == SIInstrInfo::MO_NONE) {
5512    PtrHi = DAG.getTargetConstant(0, DL, MVT::i32);
5513  } else {
5514    PtrHi =
5515        DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset + 4, GAFlags + 1);
5516  }
5517  return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET, DL, PtrVT, PtrLo, PtrHi);
5518}
5519
5520SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
5521                                             SDValue Op,
5522                                             SelectionDAG &DAG) const {
5523  GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
5524  const GlobalValue *GV = GSD->getGlobal();
5525  if ((GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
5526       shouldUseLDSConstAddress(GV)) ||
5527      GSD->getAddressSpace() == AMDGPUAS::REGION_ADDRESS ||
5528      GSD->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS)
5529    return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);
5530
5531  SDLoc DL(GSD);
5532  EVT PtrVT = Op.getValueType();
5533
5534  if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
5535    SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, GSD->getOffset(),
5536                                            SIInstrInfo::MO_ABS32_LO);
5537    return DAG.getNode(AMDGPUISD::LDS, DL, MVT::i32, GA);
5538  }
5539
5540  if (shouldEmitFixup(GV))
5541    return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT);
5542  else if (shouldEmitPCReloc(GV))
5543    return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT,
5544                                   SIInstrInfo::MO_REL32);
5545
5546  SDValue GOTAddr = buildPCRelGlobalAddress(DAG, GV, DL, 0, PtrVT,
5547                                            SIInstrInfo::MO_GOTPCREL32);
5548
5549  Type *Ty = PtrVT.getTypeForEVT(*DAG.getContext());
5550  PointerType *PtrTy = PointerType::get(Ty, AMDGPUAS::CONSTANT_ADDRESS);
5551  const DataLayout &DataLayout = DAG.getDataLayout();
5552  Align Alignment = DataLayout.getABITypeAlign(PtrTy);
5553  MachinePointerInfo PtrInfo
5554    = MachinePointerInfo::getGOT(DAG.getMachineFunction());
5555
5556  return DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), GOTAddr, PtrInfo, Alignment,
5557                     MachineMemOperand::MODereferenceable |
5558                         MachineMemOperand::MOInvariant);
5559}
5560
5561SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain,
5562                                   const SDLoc &DL, SDValue V) const {
5563  // We can't use S_MOV_B32 directly, because there is no way to specify m0 as
5564  // the destination register.
5565  //
5566  // We can't use CopyToReg, because MachineCSE won't combine COPY instructions,
5567  // so we will end up with redundant moves to m0.
5568  //
5569  // We use a pseudo to ensure we emit s_mov_b32 with m0 as the direct result.
5570
5571  // A Null SDValue creates a glue result.
5572  SDNode *M0 = DAG.getMachineNode(AMDGPU::SI_INIT_M0, DL, MVT::Other, MVT::Glue,
5573                                  V, Chain);
5574  return SDValue(M0, 0);
5575}
5576
5577SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG,
5578                                                 SDValue Op,
5579                                                 MVT VT,
5580                                                 unsigned Offset) const {
5581  SDLoc SL(Op);
5582  SDValue Param = lowerKernargMemParameter(
5583      DAG, MVT::i32, MVT::i32, SL, DAG.getEntryNode(), Offset, Align(4), false);
5584  // The local size values will have the hi 16-bits as zero.
5585  return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Param,
5586                     DAG.getValueType(VT));
5587}
5588
5589static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
5590                                        EVT VT) {
5591  DiagnosticInfoUnsupported BadIntrin(DAG.getMachineFunction().getFunction(),
5592                                      "non-hsa intrinsic with hsa target",
5593                                      DL.getDebugLoc());
5594  DAG.getContext()->diagnose(BadIntrin);
5595  return DAG.getUNDEF(VT);
5596}
5597
5598static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
5599                                         EVT VT) {
5600  DiagnosticInfoUnsupported BadIntrin(DAG.getMachineFunction().getFunction(),
5601                                      "intrinsic not supported on subtarget",
5602                                      DL.getDebugLoc());
5603  DAG.getContext()->diagnose(BadIntrin);
5604  return DAG.getUNDEF(VT);
5605}
5606
5607static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL,
5608                                    ArrayRef<SDValue> Elts) {
5609  assert(!Elts.empty());
5610  MVT Type;
5611  unsigned NumElts;
5612
5613  if (Elts.size() == 1) {
5614    Type = MVT::f32;
5615    NumElts = 1;
5616  } else if (Elts.size() == 2) {
5617    Type = MVT::v2f32;
5618    NumElts = 2;
5619  } else if (Elts.size() == 3) {
5620    Type = MVT::v3f32;
5621    NumElts = 3;
5622  } else if (Elts.size() <= 4) {
5623    Type = MVT::v4f32;
5624    NumElts = 4;
5625  } else if (Elts.size() <= 8) {
5626    Type = MVT::v8f32;
5627    NumElts = 8;
5628  } else {
5629    assert(Elts.size() <= 16);
5630    Type = MVT::v16f32;
5631    NumElts = 16;
5632  }
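  // For instance, five input elements select v8f32 above; the second loop
  // below fills the remaining lanes 5..7 with undef.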
5633
5634  SmallVector<SDValue, 16> VecElts(NumElts);
5635  for (unsigned i = 0; i < Elts.size(); ++i) {
5636    SDValue Elt = Elts[i];
5637    if (Elt.getValueType() != MVT::f32)
5638      Elt = DAG.getBitcast(MVT::f32, Elt);
5639    VecElts[i] = Elt;
5640  }
5641  for (unsigned i = Elts.size(); i < NumElts; ++i)
5642    VecElts[i] = DAG.getUNDEF(MVT::f32);
5643
5644  if (NumElts == 1)
5645    return VecElts[0];
5646  return DAG.getBuildVector(Type, DL, VecElts);
5647}
5648
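// Decode an immediate cachepolicy operand into glc/slc/dlc target constants.
// Bit 0 selects glc, bit 1 slc and bit 2 dlc. The function returns true only
// if every set bit was consumed by a requested output, so stray bits make the
// caller bail out of the custom lowering.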
5649static bool parseCachePolicy(SDValue CachePolicy, SelectionDAG &DAG,
5650                             SDValue *GLC, SDValue *SLC, SDValue *DLC) {
5651  auto CachePolicyConst = cast<ConstantSDNode>(CachePolicy.getNode());
5652
5653  uint64_t Value = CachePolicyConst->getZExtValue();
5654  SDLoc DL(CachePolicy);
5655  if (GLC) {
5656    *GLC = DAG.getTargetConstant((Value & 0x1) ? 1 : 0, DL, MVT::i32);
5657    Value &= ~(uint64_t)0x1;
5658  }
5659  if (SLC) {
5660    *SLC = DAG.getTargetConstant((Value & 0x2) ? 1 : 0, DL, MVT::i32);
5661    Value &= ~(uint64_t)0x2;
5662  }
5663  if (DLC) {
5664    *DLC = DAG.getTargetConstant((Value & 0x4) ? 1 : 0, DL, MVT::i32);
5665    Value &= ~(uint64_t)0x4;
5666  }
5667
5668  return Value == 0;
5669}
5670
5671static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT,
5672                              SDValue Src, int ExtraElts) {
5673  EVT SrcVT = Src.getValueType();
5674
5675  SmallVector<SDValue, 8> Elts;
5676
5677  if (SrcVT.isVector())
5678    DAG.ExtractVectorElements(Src, Elts);
5679  else
5680    Elts.push_back(Src);
5681
5682  SDValue Undef = DAG.getUNDEF(SrcVT.getScalarType());
5683  while (ExtraElts--)
5684    Elts.push_back(Undef);
5685
5686  return DAG.getBuildVector(CastVT, DL, Elts);
5687}
5688
5689// Reconstruct the required return value for an image load intrinsic. This is
5690// more complicated when the optional TexFailCtrl argument is used, since the
5691// required return type is then an aggregate.
5692static SDValue constructRetValue(SelectionDAG &DAG,
5693                                 MachineSDNode *Result,
5694                                 ArrayRef<EVT> ResultTypes,
5695                                 bool IsTexFail, bool Unpacked, bool IsD16,
5696                                 int DMaskPop, int NumVDataDwords,
5697                                 const SDLoc &DL, LLVMContext &Context) {
5698  // Determine the required return type. It is the same regardless of IsTexFail.
5699  EVT ReqRetVT = ResultTypes[0];
5700  int ReqRetNumElts = ReqRetVT.isVector() ? ReqRetVT.getVectorNumElements() : 1;
5701  int NumDataDwords = (!IsD16 || (IsD16 && Unpacked)) ?
5702    ReqRetNumElts : (ReqRetNumElts + 1) / 2;
5703
5704  int MaskPopDwords = (!IsD16 || (IsD16 && Unpacked)) ?
5705    DMaskPop : (DMaskPop + 1) / 2;
5706
5707  MVT DataDwordVT = NumDataDwords == 1 ?
5708    MVT::i32 : MVT::getVectorVT(MVT::i32, NumDataDwords);
5709
5710  MVT MaskPopVT = MaskPopDwords == 1 ?
5711    MVT::i32 : MVT::getVectorVT(MVT::i32, MaskPopDwords);
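  // Example: a packed-d16 load returning v4f16 with a dmask popcount of 3
  // gives ReqRetNumElts = 4 -> NumDataDwords = 2 and DMaskPop = 3 ->
  // MaskPopDwords = 2.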
5712
5713  SDValue Data(Result, 0);
5714  SDValue TexFail;
5715
5716  if (IsTexFail) {
5717    SDValue ZeroIdx = DAG.getConstant(0, DL, MVT::i32);
5718    if (MaskPopVT.isVector()) {
5719      Data = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MaskPopVT,
5720                         SDValue(Result, 0), ZeroIdx);
5721    } else {
5722      Data = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MaskPopVT,
5723                         SDValue(Result, 0), ZeroIdx);
5724    }
5725
5726    TexFail = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32,
5727                          SDValue(Result, 0),
5728                          DAG.getConstant(MaskPopDwords, DL, MVT::i32));
5729  }
5730
5731  if (DataDwordVT.isVector())
5732    Data = padEltsToUndef(DAG, DL, DataDwordVT, Data,
5733                          NumDataDwords - MaskPopDwords);
5734
5735  if (IsD16)
5736    Data = adjustLoadValueTypeImpl(Data, ReqRetVT, DL, DAG, Unpacked);
5737
5738  if (!ReqRetVT.isVector())
5739    Data = DAG.getNode(ISD::TRUNCATE, DL, ReqRetVT.changeTypeToInteger(), Data);
5740
5741  Data = DAG.getNode(ISD::BITCAST, DL, ReqRetVT, Data);
5742
5743  if (TexFail)
5744    return DAG.getMergeValues({Data, TexFail, SDValue(Result, 1)}, DL);
5745
5746  if (Result->getNumValues() == 1)
5747    return Data;
5748
5749  return DAG.getMergeValues({Data, SDValue(Result, 1)}, DL);
5750}
5751
5752static bool parseTexFail(SDValue TexFailCtrl, SelectionDAG &DAG, SDValue *TFE,
5753                         SDValue *LWE, bool &IsTexFail) {
5754  auto TexFailCtrlConst = cast<ConstantSDNode>(TexFailCtrl.getNode());
5755
5756  uint64_t Value = TexFailCtrlConst->getZExtValue();
5757  if (Value) {
5758    IsTexFail = true;
5759  }
5760
5761  SDLoc DL(TexFailCtrlConst);
5762  *TFE = DAG.getTargetConstant((Value & 0x1) ? 1 : 0, DL, MVT::i32);
5763  Value &= ~(uint64_t)0x1;
5764  *LWE = DAG.getTargetConstant((Value & 0x2) ? 1 : 0, DL, MVT::i32);
5765  Value &= ~(uint64_t)0x2;
5766
5767  return Value == 0;
5768}
5769
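// Pack 16-bit image address and derivative components two at a time into
// dwords for the VADDR list, any-extending a component that has no partner
// (the final operand, or the odd gradient components described below), and
// bitcast each resulting dword to f32.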
5770static void packImageA16AddressToDwords(SelectionDAG &DAG, SDValue Op,
5771                                        MVT PackVectorVT,
5772                                        SmallVectorImpl<SDValue> &PackedAddrs,
5773                                        unsigned DimIdx, unsigned EndIdx,
5774                                        unsigned NumGradients) {
5775  SDLoc DL(Op);
5776  for (unsigned I = DimIdx; I < EndIdx; I++) {
5777    SDValue Addr = Op.getOperand(I);
5778
5779    // Gradients are packed with undef for each coordinate.
5780    // In <hi 16 bit>,<lo 16 bit> notation, the registers look like this:
5781    // 1D: undef,dx/dh; undef,dx/dv
5782    // 2D: dy/dh,dx/dh; dy/dv,dx/dv
5783    // 3D: dy/dh,dx/dh; undef,dz/dh; dy/dv,dx/dv; undef,dz/dv
5784    if (((I + 1) >= EndIdx) ||
5785        ((NumGradients / 2) % 2 == 1 && (I == DimIdx + (NumGradients / 2) - 1 ||
5786                                         I == DimIdx + NumGradients - 1))) {
5787      if (Addr.getValueType() != MVT::i16)
5788        Addr = DAG.getBitcast(MVT::i16, Addr);
5789      Addr = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Addr);
5790    } else {
5791      Addr = DAG.getBuildVector(PackVectorVT, DL, {Addr, Op.getOperand(I + 1)});
5792      I++;
5793    }
5794    Addr = DAG.getBitcast(MVT::f32, Addr);
5795    PackedAddrs.push_back(Addr);
5796  }
5797}
5798
5799SDValue SITargetLowering::lowerImage(SDValue Op,
5800                                     const AMDGPU::ImageDimIntrinsicInfo *Intr,
5801                                     SelectionDAG &DAG) const {
5802  SDLoc DL(Op);
5803  MachineFunction &MF = DAG.getMachineFunction();
5804  const GCNSubtarget* ST = &MF.getSubtarget<GCNSubtarget>();
5805  const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
5806      AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
5807  const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
5808  const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
5809      AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode);
5810  const AMDGPU::MIMGMIPMappingInfo *MIPMappingInfo =
5811      AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode);
5812  unsigned IntrOpcode = Intr->BaseOpcode;
5813  bool IsGFX10 = Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10;
5814
5815  SmallVector<EVT, 3> ResultTypes(Op->value_begin(), Op->value_end());
5816  SmallVector<EVT, 3> OrigResultTypes(Op->value_begin(), Op->value_end());
5817  bool IsD16 = false;
5818  bool IsG16 = false;
5819  bool IsA16 = false;
5820  SDValue VData;
5821  int NumVDataDwords;
5822  bool AdjustRetType = false;
5823
5824  unsigned AddrIdx; // Index of first address argument
5825  unsigned DMask;
5826  unsigned DMaskLanes = 0;
5827
5828  if (BaseOpcode->Atomic) {
5829    VData = Op.getOperand(2);
5830
5831    bool Is64Bit = VData.getValueType() == MVT::i64;
5832    if (BaseOpcode->AtomicX2) {
5833      SDValue VData2 = Op.getOperand(3);
5834      VData = DAG.getBuildVector(Is64Bit ? MVT::v2i64 : MVT::v2i32, DL,
5835                                 {VData, VData2});
5836      if (Is64Bit)
5837        VData = DAG.getBitcast(MVT::v4i32, VData);
5838
5839      ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32;
5840      DMask = Is64Bit ? 0xf : 0x3;
5841      NumVDataDwords = Is64Bit ? 4 : 2;
5842      AddrIdx = 4;
5843    } else {
5844      DMask = Is64Bit ? 0x3 : 0x1;
5845      NumVDataDwords = Is64Bit ? 2 : 1;
5846      AddrIdx = 3;
5847    }
5848  } else {
5849    unsigned DMaskIdx = BaseOpcode->Store ? 3 : isa<MemSDNode>(Op) ? 2 : 1;
5850    auto DMaskConst = cast<ConstantSDNode>(Op.getOperand(DMaskIdx));
5851    DMask = DMaskConst->getZExtValue();
5852    DMaskLanes = BaseOpcode->Gather4 ? 4 : countPopulation(DMask);
5853
5854    if (BaseOpcode->Store) {
5855      VData = Op.getOperand(2);
5856
5857      MVT StoreVT = VData.getSimpleValueType();
5858      if (StoreVT.getScalarType() == MVT::f16) {
5859        if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
5860          return Op; // D16 is unsupported for this instruction
5861
5862        IsD16 = true;
5863        VData = handleD16VData(VData, DAG);
5864      }
5865
5866      NumVDataDwords = (VData.getValueType().getSizeInBits() + 31) / 32;
5867    } else {
5868      // Work out the number of dwords based on the dmask popcount, the
5869      // underlying type, and whether packing is supported.
5870      MVT LoadVT = ResultTypes[0].getSimpleVT();
5871      if (LoadVT.getScalarType() == MVT::f16) {
5872        if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
5873          return Op; // D16 is unsupported for this instruction
5874
5875        IsD16 = true;
5876      }
5877
5878      // Confirm that the return type is large enough for the dmask specified
5879      if ((LoadVT.isVector() && LoadVT.getVectorNumElements() < DMaskLanes) ||
5880          (!LoadVT.isVector() && DMaskLanes > 1))
5881        return Op;
5882
5883      if (IsD16 && !Subtarget->hasUnpackedD16VMem())
5884        NumVDataDwords = (DMaskLanes + 1) / 2;
5885      else
5886        NumVDataDwords = DMaskLanes;
5887
5888      AdjustRetType = true;
5889    }
5890
5891    AddrIdx = DMaskIdx + 1;
5892  }
5893
5894  unsigned NumGradients = BaseOpcode->Gradients ? DimInfo->NumGradients : 0;
5895  unsigned NumCoords = BaseOpcode->Coordinates ? DimInfo->NumCoords : 0;
5896  unsigned NumLCM = BaseOpcode->LodOrClampOrMip ? 1 : 0;
5897  unsigned NumVAddrs = BaseOpcode->NumExtraArgs + NumGradients +
5898                       NumCoords + NumLCM;
5899  unsigned NumMIVAddrs = NumVAddrs;
5900
5901  SmallVector<SDValue, 4> VAddrs;
5902
5903  // Optimize _L to _LZ when _L is zero
5904  if (LZMappingInfo) {
5905    if (auto ConstantLod =
5906         dyn_cast<ConstantFPSDNode>(Op.getOperand(AddrIdx+NumVAddrs-1))) {
5907      if (ConstantLod->isZero() || ConstantLod->isNegative()) {
5908        IntrOpcode = LZMappingInfo->LZ;  // set new opcode to _lz variant of _l
5909        NumMIVAddrs--;               // remove 'lod'
5910      }
5911    }
5912  }
5913
5914  // Optimize _mip away when 'lod' is zero.
5915  if (MIPMappingInfo) {
5916    if (auto ConstantLod =
5917         dyn_cast<ConstantSDNode>(Op.getOperand(AddrIdx+NumVAddrs-1))) {
5918      if (ConstantLod->isNullValue()) {
5919        IntrOpcode = MIPMappingInfo->NONMIP;  // set new opcode to variant without _mip
5920        NumMIVAddrs--;               // remove 'lod'
5921      }
5922    }
5923  }
5924
5925  // Push back extra arguments.
5926  for (unsigned I = 0; I < BaseOpcode->NumExtraArgs; I++)
5927    VAddrs.push_back(Op.getOperand(AddrIdx + I));
5928
5929  // Check for 16 bit addresses or derivatives and pack if true.
5930  unsigned DimIdx = AddrIdx + BaseOpcode->NumExtraArgs;
5931  unsigned CoordIdx = DimIdx + NumGradients;
5932  unsigned CoordsEnd = AddrIdx + NumMIVAddrs;
5933
5934  MVT VAddrVT = Op.getOperand(DimIdx).getSimpleValueType();
5935  MVT VAddrScalarVT = VAddrVT.getScalarType();
5936  MVT PackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
5937  IsG16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
5938
5939  VAddrVT = Op.getOperand(CoordIdx).getSimpleValueType();
5940  VAddrScalarVT = VAddrVT.getScalarType();
5941  IsA16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
5942  if (IsA16 || IsG16) {
5943    if (IsA16) {
5944      if (!ST->hasA16()) {
5945        LLVM_DEBUG(dbgs() << "Failed to lower image intrinsic: Target does not "
5946                             "support 16 bit addresses\n");
5947        return Op;
5948      }
5949      if (!IsG16) {
5950        LLVM_DEBUG(
5951            dbgs() << "Failed to lower image intrinsic: 16 bit addresses "
5952                      "need 16 bit derivatives but got 32 bit derivatives\n");
5953        return Op;
5954      }
5955    } else if (!ST->hasG16()) {
5956      LLVM_DEBUG(dbgs() << "Failed to lower image intrinsic: Target does not "
5957                           "support 16 bit derivatives\n");
5958      return Op;
5959    }
5960
5961    if (BaseOpcode->Gradients && !IsA16) {
5962      if (!ST->hasG16()) {
5963        LLVM_DEBUG(dbgs() << "Failed to lower image intrinsic: Target does not "
5964                             "support 16 bit derivatives\n");
5965        return Op;
5966      }
5967      // Activate g16
5968      const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
5969          AMDGPU::getMIMGG16MappingInfo(Intr->BaseOpcode);
5970      IntrOpcode = G16MappingInfo->G16; // set new opcode to variant with _g16
5971    }
5972
5973    // Don't compress addresses for G16
5974    const int PackEndIdx = IsA16 ? CoordsEnd : CoordIdx;
5975    packImageA16AddressToDwords(DAG, Op, PackVectorVT, VAddrs, DimIdx,
5976                                PackEndIdx, NumGradients);
5977
5978    if (!IsA16) {
5979      // Add uncompressed address
5980      for (unsigned I = CoordIdx; I < CoordsEnd; I++)
5981        VAddrs.push_back(Op.getOperand(I));
5982    }
5983  } else {
5984    for (unsigned I = DimIdx; I < CoordsEnd; I++)
5985      VAddrs.push_back(Op.getOperand(I));
5986  }
5987
5988  // If the register allocator cannot place the address registers contiguously
5989  // without introducing moves, then using the non-sequential address encoding
5990  // is always preferable, since it saves VALU instructions and is usually a
5991  // wash in terms of code size or even better.
5992  //
5993  // However, we currently have no way of hinting to the register allocator that
5994  // MIMG addresses should be placed contiguously when it is possible to do so,
5995  // so force non-NSA for the common 2-address case as a heuristic.
5996  //
5997  // SIShrinkInstructions will convert NSA encodings to non-NSA after register
5998  // allocation when possible.
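  //
  // For instance (illustrative), with three address operands living in v0, v4
  // and v7, the NSA form can encode them in place, whereas the non-NSA form
  // would first need copies into a contiguous tuple such as v[0:2].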
5999  bool UseNSA =
6000      ST->hasFeature(AMDGPU::FeatureNSAEncoding) && VAddrs.size() >= 3;
6001  SDValue VAddr;
6002  if (!UseNSA)
6003    VAddr = getBuildDwordsVector(DAG, DL, VAddrs);
6004
6005  SDValue True = DAG.getTargetConstant(1, DL, MVT::i1);
6006  SDValue False = DAG.getTargetConstant(0, DL, MVT::i1);
6007  unsigned CtrlIdx; // Index of texfailctrl argument
6008  SDValue Unorm;
6009  if (!BaseOpcode->Sampler) {
6010    Unorm = True;
6011    CtrlIdx = AddrIdx + NumVAddrs + 1;
6012  } else {
6013    auto UnormConst =
6014        cast<ConstantSDNode>(Op.getOperand(AddrIdx + NumVAddrs + 2));
6015
6016    Unorm = UnormConst->getZExtValue() ? True : False;
6017    CtrlIdx = AddrIdx + NumVAddrs + 3;
6018  }
6019
6020  SDValue TFE;
6021  SDValue LWE;
6022  SDValue TexFail = Op.getOperand(CtrlIdx);
6023  bool IsTexFail = false;
6024  if (!parseTexFail(TexFail, DAG, &TFE, &LWE, IsTexFail))
6025    return Op;
6026
6027  if (IsTexFail) {
6028    if (!DMaskLanes) {
6029      // Expecting to get an error flag since TFC is on and dmask is 0.
6030      // Force dmask to be at least 1, otherwise the instruction will fail.
6031      DMask = 0x1;
6032      DMaskLanes = 1;
6033      NumVDataDwords = 1;
6034    }
6035    NumVDataDwords += 1;
6036    AdjustRetType = true;
6037  }
6038
6039  // Something earlier may have tagged the return type as needing adjustment.
6040  // This happens if the instruction is a load or has TexFailCtrl flags set.
6041  if (AdjustRetType) {
6042    // NumVDataDwords reflects the true number of dwords required in the return type
6043    if (DMaskLanes == 0 && !BaseOpcode->Store) {
6044      // This is a no-op load. This can be eliminated
6045      SDValue Undef = DAG.getUNDEF(Op.getValueType());
6046      if (isa<MemSDNode>(Op))
6047        return DAG.getMergeValues({Undef, Op.getOperand(0)}, DL);
6048      return Undef;
6049    }
6050
6051    EVT NewVT = NumVDataDwords > 1 ?
6052                  EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumVDataDwords)
6053                : MVT::i32;
6054
6055    ResultTypes[0] = NewVT;
6056    if (ResultTypes.size() == 3) {
6057      // The original result was an aggregate type used for TexFailCtrl
6058      // results. The actual instruction returns the vector type created
6059      // above, so remove the aggregate result type.
6060      ResultTypes.erase(&ResultTypes[1]);
6061    }
6062  }
6063
6064  SDValue GLC;
6065  SDValue SLC;
6066  SDValue DLC;
6067  if (BaseOpcode->Atomic) {
6068    GLC = True; // TODO no-return optimization
6069    if (!parseCachePolicy(Op.getOperand(CtrlIdx + 1), DAG, nullptr, &SLC,
6070                          IsGFX10 ? &DLC : nullptr))
6071      return Op;
6072  } else {
6073    if (!parseCachePolicy(Op.getOperand(CtrlIdx + 1), DAG, &GLC, &SLC,
6074                          IsGFX10 ? &DLC : nullptr))
6075      return Op;
6076  }
6077
6078  SmallVector<SDValue, 26> Ops;
6079  if (BaseOpcode->Store || BaseOpcode->Atomic)
6080    Ops.push_back(VData); // vdata
6081  if (UseNSA) {
6082    for (const SDValue &Addr : VAddrs)
6083      Ops.push_back(Addr);
6084  } else {
6085    Ops.push_back(VAddr);
6086  }
6087  Ops.push_back(Op.getOperand(AddrIdx + NumVAddrs)); // rsrc
6088  if (BaseOpcode->Sampler)
6089    Ops.push_back(Op.getOperand(AddrIdx + NumVAddrs + 1)); // sampler
6090  Ops.push_back(DAG.getTargetConstant(DMask, DL, MVT::i32));
6091  if (IsGFX10)
6092    Ops.push_back(DAG.getTargetConstant(DimInfo->Encoding, DL, MVT::i32));
6093  Ops.push_back(Unorm);
6094  if (IsGFX10)
6095    Ops.push_back(DLC);
6096  Ops.push_back(GLC);
6097  Ops.push_back(SLC);
6098  Ops.push_back(IsA16 &&  // r128, a16 for gfx9
6099                ST->hasFeature(AMDGPU::FeatureR128A16) ? True : False);
6100  if (IsGFX10)
6101    Ops.push_back(IsA16 ? True : False);
6102  Ops.push_back(TFE);
6103  Ops.push_back(LWE);
6104  if (!IsGFX10)
6105    Ops.push_back(DimInfo->DA ? True : False);
6106  if (BaseOpcode->HasD16)
6107    Ops.push_back(IsD16 ? True : False);
6108  if (isa<MemSDNode>(Op))
6109    Ops.push_back(Op.getOperand(0)); // chain
6110
6111  int NumVAddrDwords =
6112      UseNSA ? VAddrs.size() : VAddr.getValueType().getSizeInBits() / 32;
6113  int Opcode = -1;
6114
6115  if (IsGFX10) {
6116    Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
6117                                   UseNSA ? AMDGPU::MIMGEncGfx10NSA
6118                                          : AMDGPU::MIMGEncGfx10Default,
6119                                   NumVDataDwords, NumVAddrDwords);
6120  } else {
6121    if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
6122      Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
6123                                     NumVDataDwords, NumVAddrDwords);
6124    if (Opcode == -1)
6125      Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
6126                                     NumVDataDwords, NumVAddrDwords);
6127  }
6128  assert(Opcode != -1);
6129
6130  MachineSDNode *NewNode = DAG.getMachineNode(Opcode, DL, ResultTypes, Ops);
6131  if (auto MemOp = dyn_cast<MemSDNode>(Op)) {
6132    MachineMemOperand *MemRef = MemOp->getMemOperand();
6133    DAG.setNodeMemRefs(NewNode, {MemRef});
6134  }
6135
6136  if (BaseOpcode->AtomicX2) {
6137    SmallVector<SDValue, 1> Elt;
6138    DAG.ExtractVectorElements(SDValue(NewNode, 0), Elt, 0, 1);
6139    return DAG.getMergeValues({Elt[0], SDValue(NewNode, 1)}, DL);
6140  } else if (!BaseOpcode->Store) {
6141    return constructRetValue(DAG, NewNode,
6142                             OrigResultTypes, IsTexFail,
6143                             Subtarget->hasUnpackedD16VMem(), IsD16,
6144                             DMaskLanes, NumVDataDwords, DL,
6145                             *DAG.getContext());
6146  }
6147
6148  return SDValue(NewNode, 0);
6149}
6150
6151SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
6152                                       SDValue Offset, SDValue CachePolicy,
6153                                       SelectionDAG &DAG) const {
6154  MachineFunction &MF = DAG.getMachineFunction();
6155
6156  const DataLayout &DataLayout = DAG.getDataLayout();
6157  Align Alignment =
6158      DataLayout.getABITypeAlign(VT.getTypeForEVT(*DAG.getContext()));
6159
6160  MachineMemOperand *MMO = MF.getMachineMemOperand(
6161      MachinePointerInfo(),
6162      MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
6163          MachineMemOperand::MOInvariant,
6164      VT.getStoreSize(), Alignment);
6165
6166  if (!Offset->isDivergent()) {
6167    SDValue Ops[] = {
6168        Rsrc,
6169        Offset, // Offset
6170        CachePolicy
6171    };
6172
6173    // Widen vec3 load to vec4.
6174    if (VT.isVector() && VT.getVectorNumElements() == 3) {
6175      EVT WidenedVT =
6176          EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), 4);
6177      auto WidenedOp = DAG.getMemIntrinsicNode(
6178          AMDGPUISD::SBUFFER_LOAD, DL, DAG.getVTList(WidenedVT), Ops, WidenedVT,
6179          MF.getMachineMemOperand(MMO, 0, WidenedVT.getStoreSize()));
6180      auto Subvector = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, WidenedOp,
6181                                   DAG.getVectorIdxConstant(0, DL));
6182      return Subvector;
6183    }
6184
6185    return DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD, DL,
6186                                   DAG.getVTList(VT), Ops, VT, MMO);
6187  }
6188
6189  // We have a divergent offset. Emit a MUBUF buffer load instead. We can
6190  // assume that the buffer is unswizzled.
6191  SmallVector<SDValue, 4> Loads;
6192  unsigned NumLoads = 1;
6193  MVT LoadVT = VT.getSimpleVT();
6194  unsigned NumElts = LoadVT.isVector() ? LoadVT.getVectorNumElements() : 1;
6195  assert((LoadVT.getScalarType() == MVT::i32 ||
6196          LoadVT.getScalarType() == MVT::f32));
6197
6198  if (NumElts == 8 || NumElts == 16) {
6199    NumLoads = NumElts / 4;
6200    LoadVT = MVT::getVectorVT(LoadVT.getScalarType(), 4);
6201  }
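  // For example, an <8 x float> result with a divergent offset is split into
  // two <4 x float> buffer loads whose immediate offsets differ by 16 bytes,
  // and the pieces are concatenated back together below.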
6202
6203  SDVTList VTList = DAG.getVTList({LoadVT, MVT::Glue});
6204  SDValue Ops[] = {
6205      DAG.getEntryNode(),                               // Chain
6206      Rsrc,                                             // rsrc
6207      DAG.getConstant(0, DL, MVT::i32),                 // vindex
6208      {},                                               // voffset
6209      {},                                               // soffset
6210      {},                                               // offset
6211      CachePolicy,                                      // cachepolicy
6212      DAG.getTargetConstant(0, DL, MVT::i1),            // idxen
6213  };
6214
6215  // Use the alignment to ensure that the required offsets will fit into the
6216  // instructions' immediate offset fields.
6217  setBufferOffsets(Offset, DAG, &Ops[3],
6218                   NumLoads > 1 ? Align(16 * NumLoads) : Align(4));
6219
6220  uint64_t InstOffset = cast<ConstantSDNode>(Ops[5])->getZExtValue();
6221  for (unsigned i = 0; i < NumLoads; ++i) {
6222    Ops[5] = DAG.getTargetConstant(InstOffset + 16 * i, DL, MVT::i32);
6223    Loads.push_back(getMemIntrinsicNode(AMDGPUISD::BUFFER_LOAD, DL, VTList, Ops,
6224                                        LoadVT, MMO, DAG));
6225  }
6226
6227  if (NumElts == 8 || NumElts == 16)
6228    return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Loads);
6229
6230  return Loads[0];
6231}
6232
6233SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
6234                                                  SelectionDAG &DAG) const {
6235  MachineFunction &MF = DAG.getMachineFunction();
6236  auto MFI = MF.getInfo<SIMachineFunctionInfo>();
6237
6238  EVT VT = Op.getValueType();
6239  SDLoc DL(Op);
6240  unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
6241
6242  // TODO: Should this propagate fast-math-flags?
6243
6244  switch (IntrinsicID) {
6245  case Intrinsic::amdgcn_implicit_buffer_ptr: {
6246    if (getSubtarget()->isAmdHsaOrMesa(MF.getFunction()))
6247      return emitNonHSAIntrinsicError(DAG, DL, VT);
6248    return getPreloadedValue(DAG, *MFI, VT,
6249                             AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
6250  }
6251  case Intrinsic::amdgcn_dispatch_ptr:
6252  case Intrinsic::amdgcn_queue_ptr: {
6253    if (!Subtarget->isAmdHsaOrMesa(MF.getFunction())) {
6254      DiagnosticInfoUnsupported BadIntrin(
6255          MF.getFunction(), "unsupported hsa intrinsic without hsa target",
6256          DL.getDebugLoc());
6257      DAG.getContext()->diagnose(BadIntrin);
6258      return DAG.getUNDEF(VT);
6259    }
6260
6261    auto RegID = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr ?
6262      AMDGPUFunctionArgInfo::DISPATCH_PTR : AMDGPUFunctionArgInfo::QUEUE_PTR;
6263    return getPreloadedValue(DAG, *MFI, VT, RegID);
6264  }
6265  case Intrinsic::amdgcn_implicitarg_ptr: {
6266    if (MFI->isEntryFunction())
6267      return getImplicitArgPtr(DAG, DL);
6268    return getPreloadedValue(DAG, *MFI, VT,
6269                             AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
6270  }
6271  case Intrinsic::amdgcn_kernarg_segment_ptr: {
6272    if (!AMDGPU::isKernel(MF.getFunction().getCallingConv())) {
6273      // This only makes sense to call in a kernel, so just lower to null.
6274      return DAG.getConstant(0, DL, VT);
6275    }
6276
6277    return getPreloadedValue(DAG, *MFI, VT,
6278                             AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
6279  }
6280  case Intrinsic::amdgcn_dispatch_id: {
6281    return getPreloadedValue(DAG, *MFI, VT, AMDGPUFunctionArgInfo::DISPATCH_ID);
6282  }
6283  case Intrinsic::amdgcn_rcp:
6284    return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1));
6285  case Intrinsic::amdgcn_rsq:
6286    return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
6287  case Intrinsic::amdgcn_rsq_legacy:
6288    if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
6289      return emitRemovedIntrinsicError(DAG, DL, VT);
6290    return SDValue();
6291  case Intrinsic::amdgcn_rcp_legacy:
6292    if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
6293      return emitRemovedIntrinsicError(DAG, DL, VT);
6294    return DAG.getNode(AMDGPUISD::RCP_LEGACY, DL, VT, Op.getOperand(1));
6295  case Intrinsic::amdgcn_rsq_clamp: {
6296    if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
6297      return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1));
6298
6299    Type *Type = VT.getTypeForEVT(*DAG.getContext());
6300    APFloat Max = APFloat::getLargest(Type->getFltSemantics());
6301    APFloat Min = APFloat::getLargest(Type->getFltSemantics(), true);
6302
6303    SDValue Rsq = DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
6304    SDValue Tmp = DAG.getNode(ISD::FMINNUM, DL, VT, Rsq,
6305                              DAG.getConstantFP(Max, DL, VT));
6306    return DAG.getNode(ISD::FMAXNUM, DL, VT, Tmp,
6307                       DAG.getConstantFP(Min, DL, VT));
6308  }
6309  case Intrinsic::r600_read_ngroups_x:
6310    if (Subtarget->isAmdHsaOS())
6311      return emitNonHSAIntrinsicError(DAG, DL, VT);
6312
6313    return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
6314                                    SI::KernelInputOffsets::NGROUPS_X, Align(4),
6315                                    false);
6316  case Intrinsic::r600_read_ngroups_y:
6317    if (Subtarget->isAmdHsaOS())
6318      return emitNonHSAIntrinsicError(DAG, DL, VT);
6319
6320    return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
6321                                    SI::KernelInputOffsets::NGROUPS_Y, Align(4),
6322                                    false);
6323  case Intrinsic::r600_read_ngroups_z:
6324    if (Subtarget->isAmdHsaOS())
6325      return emitNonHSAIntrinsicError(DAG, DL, VT);
6326
6327    return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
6328                                    SI::KernelInputOffsets::NGROUPS_Z, Align(4),
6329                                    false);
6330  case Intrinsic::r600_read_global_size_x:
6331    if (Subtarget->isAmdHsaOS())
6332      return emitNonHSAIntrinsicError(DAG, DL, VT);
6333
6334    return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
6335                                    SI::KernelInputOffsets::GLOBAL_SIZE_X,
6336                                    Align(4), false);
6337  case Intrinsic::r600_read_global_size_y:
6338    if (Subtarget->isAmdHsaOS())
6339      return emitNonHSAIntrinsicError(DAG, DL, VT);
6340
6341    return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
6342                                    SI::KernelInputOffsets::GLOBAL_SIZE_Y,
6343                                    Align(4), false);
6344  case Intrinsic::r600_read_global_size_z:
6345    if (Subtarget->isAmdHsaOS())
6346      return emitNonHSAIntrinsicError(DAG, DL, VT);
6347
6348    return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
6349                                    SI::KernelInputOffsets::GLOBAL_SIZE_Z,
6350                                    Align(4), false);
6351  case Intrinsic::r600_read_local_size_x:
6352    if (Subtarget->isAmdHsaOS())
6353      return emitNonHSAIntrinsicError(DAG, DL, VT);
6354
6355    return lowerImplicitZextParam(DAG, Op, MVT::i16,
6356                                  SI::KernelInputOffsets::LOCAL_SIZE_X);
6357  case Intrinsic::r600_read_local_size_y:
6358    if (Subtarget->isAmdHsaOS())
6359      return emitNonHSAIntrinsicError(DAG, DL, VT);
6360
6361    return lowerImplicitZextParam(DAG, Op, MVT::i16,
6362                                  SI::KernelInputOffsets::LOCAL_SIZE_Y);
6363  case Intrinsic::r600_read_local_size_z:
6364    if (Subtarget->isAmdHsaOS())
6365      return emitNonHSAIntrinsicError(DAG, DL, VT);
6366
6367    return lowerImplicitZextParam(DAG, Op, MVT::i16,
6368                                  SI::KernelInputOffsets::LOCAL_SIZE_Z);
6369  case Intrinsic::amdgcn_workgroup_id_x:
6370    return getPreloadedValue(DAG, *MFI, VT,
6371                             AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
6372  case Intrinsic::amdgcn_workgroup_id_y:
6373    return getPreloadedValue(DAG, *MFI, VT,
6374                             AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
6375  case Intrinsic::amdgcn_workgroup_id_z:
6376    return getPreloadedValue(DAG, *MFI, VT,
6377                             AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
6378  case Intrinsic::amdgcn_workitem_id_x:
6379    return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
6380                          SDLoc(DAG.getEntryNode()),
6381                          MFI->getArgInfo().WorkItemIDX);
6382  case Intrinsic::amdgcn_workitem_id_y:
6383    return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
6384                          SDLoc(DAG.getEntryNode()),
6385                          MFI->getArgInfo().WorkItemIDY);
6386  case Intrinsic::amdgcn_workitem_id_z:
6387    return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
6388                          SDLoc(DAG.getEntryNode()),
6389                          MFI->getArgInfo().WorkItemIDZ);
6390  case Intrinsic::amdgcn_wavefrontsize:
6391    return DAG.getConstant(MF.getSubtarget<GCNSubtarget>().getWavefrontSize(),
6392                           SDLoc(Op), MVT::i32);
6393  case Intrinsic::amdgcn_s_buffer_load: {
6394    bool IsGFX10 = Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10;
6395    SDValue GLC;
6396    SDValue DLC = DAG.getTargetConstant(0, DL, MVT::i1);
6397    if (!parseCachePolicy(Op.getOperand(3), DAG, &GLC, nullptr,
6398                          IsGFX10 ? &DLC : nullptr))
6399      return Op;
6400    return lowerSBuffer(VT, DL, Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
6401                        DAG);
6402  }
6403  case Intrinsic::amdgcn_fdiv_fast:
6404    return lowerFDIV_FAST(Op, DAG);
6405  case Intrinsic::amdgcn_sin:
6406    return DAG.getNode(AMDGPUISD::SIN_HW, DL, VT, Op.getOperand(1));
6407
6408  case Intrinsic::amdgcn_cos:
6409    return DAG.getNode(AMDGPUISD::COS_HW, DL, VT, Op.getOperand(1));
6410
6411  case Intrinsic::amdgcn_mul_u24:
6412    return DAG.getNode(AMDGPUISD::MUL_U24, DL, VT, Op.getOperand(1), Op.getOperand(2));
6413  case Intrinsic::amdgcn_mul_i24:
6414    return DAG.getNode(AMDGPUISD::MUL_I24, DL, VT, Op.getOperand(1), Op.getOperand(2));
6415
6416  case Intrinsic::amdgcn_log_clamp: {
6417    if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
6418      return SDValue();
6419
6420    DiagnosticInfoUnsupported BadIntrin(
6421      MF.getFunction(), "intrinsic not supported on subtarget",
6422      DL.getDebugLoc());
6423    DAG.getContext()->diagnose(BadIntrin);
6424    return DAG.getUNDEF(VT);
6425  }
6426  case Intrinsic::amdgcn_ldexp:
6427    return DAG.getNode(AMDGPUISD::LDEXP, DL, VT,
6428                       Op.getOperand(1), Op.getOperand(2));
6429
6430  case Intrinsic::amdgcn_fract:
6431    return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1));
6432
6433  case Intrinsic::amdgcn_class:
6434    return DAG.getNode(AMDGPUISD::FP_CLASS, DL, VT,
6435                       Op.getOperand(1), Op.getOperand(2));
6436  case Intrinsic::amdgcn_div_fmas:
6437    return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT,
6438                       Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
6439                       Op.getOperand(4));
6440
6441  case Intrinsic::amdgcn_div_fixup:
6442    return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT,
6443                       Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
6444
6445  case Intrinsic::amdgcn_div_scale: {
6446    const ConstantSDNode *Param = cast<ConstantSDNode>(Op.getOperand(3));
6447
6448    // Translate to the operands expected by the machine instruction: the
6449    // first operand must match the numerator or denominator, per the flag.
6450    SDValue Numerator = Op.getOperand(1);
6451    SDValue Denominator = Op.getOperand(2);
6452
6453    // Note this order is opposite of the machine instruction's operations,
6454    // which is s0.f = Quotient, s1.f = Denominator, s2.f = Numerator. The
6455    // intrinsic has the numerator as the first operand to match a normal
6456    // division operation.
6457
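    // For example (illustrative), expanding x / y typically uses two calls:
    //   llvm.amdgcn.div.scale(x, y, true)  -> numerator scaled
    //   llvm.amdgcn.div.scale(x, y, false) -> denominator scaled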
6458    SDValue Src0 = Param->isAllOnesValue() ? Numerator : Denominator;
6459
6460    return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, Op->getVTList(), Src0,
6461                       Denominator, Numerator);
6462  }
6463  case Intrinsic::amdgcn_icmp: {
6464    // There is a Pat that handles this variant, so return it as-is.
6465    if (Op.getOperand(1).getValueType() == MVT::i1 &&
6466        Op.getConstantOperandVal(2) == 0 &&
6467        Op.getConstantOperandVal(3) == ICmpInst::Predicate::ICMP_NE)
6468      return Op;
6469    return lowerICMPIntrinsic(*this, Op.getNode(), DAG);
6470  }
6471  case Intrinsic::amdgcn_fcmp: {
6472    return lowerFCMPIntrinsic(*this, Op.getNode(), DAG);
6473  }
6474  case Intrinsic::amdgcn_ballot:
6475    return lowerBALLOTIntrinsic(*this, Op.getNode(), DAG);
6476  case Intrinsic::amdgcn_fmed3:
6477    return DAG.getNode(AMDGPUISD::FMED3, DL, VT,
6478                       Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
6479  case Intrinsic::amdgcn_fdot2:
6480    return DAG.getNode(AMDGPUISD::FDOT2, DL, VT,
6481                       Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
6482                       Op.getOperand(4));
6483  case Intrinsic::amdgcn_fmul_legacy:
6484    return DAG.getNode(AMDGPUISD::FMUL_LEGACY, DL, VT,
6485                       Op.getOperand(1), Op.getOperand(2));
6486  case Intrinsic::amdgcn_sffbh:
6487    return DAG.getNode(AMDGPUISD::FFBH_I32, DL, VT, Op.getOperand(1));
6488  case Intrinsic::amdgcn_sbfe:
6489    return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT,
6490                       Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
6491  case Intrinsic::amdgcn_ubfe:
6492    return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT,
6493                       Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
6494  case Intrinsic::amdgcn_cvt_pkrtz:
6495  case Intrinsic::amdgcn_cvt_pknorm_i16:
6496  case Intrinsic::amdgcn_cvt_pknorm_u16:
6497  case Intrinsic::amdgcn_cvt_pk_i16:
6498  case Intrinsic::amdgcn_cvt_pk_u16: {
6499    // FIXME: Stop adding cast if v2f16/v2i16 are legal.
6500    EVT VT = Op.getValueType();
6501    unsigned Opcode;
6502
6503    if (IntrinsicID == Intrinsic::amdgcn_cvt_pkrtz)
6504      Opcode = AMDGPUISD::CVT_PKRTZ_F16_F32;
6505    else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_i16)
6506      Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
6507    else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_u16)
6508      Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
6509    else if (IntrinsicID == Intrinsic::amdgcn_cvt_pk_i16)
6510      Opcode = AMDGPUISD::CVT_PK_I16_I32;
6511    else
6512      Opcode = AMDGPUISD::CVT_PK_U16_U32;
6513
6514    if (isTypeLegal(VT))
6515      return DAG.getNode(Opcode, DL, VT, Op.getOperand(1), Op.getOperand(2));
6516
6517    SDValue Node = DAG.getNode(Opcode, DL, MVT::i32,
6518                               Op.getOperand(1), Op.getOperand(2));
6519    return DAG.getNode(ISD::BITCAST, DL, VT, Node);
6520  }
6521  case Intrinsic::amdgcn_fmad_ftz:
6522    return DAG.getNode(AMDGPUISD::FMAD_FTZ, DL, VT, Op.getOperand(1),
6523                       Op.getOperand(2), Op.getOperand(3));
6524
6525  case Intrinsic::amdgcn_if_break:
6526    return SDValue(DAG.getMachineNode(AMDGPU::SI_IF_BREAK, DL, VT,
6527                                      Op->getOperand(1), Op->getOperand(2)), 0);
6528
6529  case Intrinsic::amdgcn_groupstaticsize: {
6530    Triple::OSType OS = getTargetMachine().getTargetTriple().getOS();
6531    if (OS == Triple::AMDHSA || OS == Triple::AMDPAL)
6532      return Op;
6533
6534    const Module *M = MF.getFunction().getParent();
6535    const GlobalValue *GV =
6536        M->getNamedValue(Intrinsic::getName(Intrinsic::amdgcn_groupstaticsize));
6537    SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, 0,
6538                                            SIInstrInfo::MO_ABS32_LO);
6539    return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0};
6540  }
6541  case Intrinsic::amdgcn_is_shared:
6542  case Intrinsic::amdgcn_is_private: {
6543    SDLoc SL(Op);
6544    unsigned AS = (IntrinsicID == Intrinsic::amdgcn_is_shared) ?
6545      AMDGPUAS::LOCAL_ADDRESS : AMDGPUAS::PRIVATE_ADDRESS;
6546    SDValue Aperture = getSegmentAperture(AS, SL, DAG);
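    // Whether a flat pointer is in the LDS or scratch aperture is decided by
    // its high 32 bits alone: the SETCC below compares that dword against the
    // aperture base returned by getSegmentAperture().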
6547    SDValue SrcVec = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32,
6548                                 Op.getOperand(1));
6549
6550    SDValue SrcHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, SrcVec,
6551                                DAG.getConstant(1, SL, MVT::i32));
6552    return DAG.getSetCC(SL, MVT::i1, SrcHi, Aperture, ISD::SETEQ);
6553  }
6554  case Intrinsic::amdgcn_alignbit:
6555    return DAG.getNode(ISD::FSHR, DL, VT,
6556                       Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
6557  case Intrinsic::amdgcn_reloc_constant: {
6558    Module *M = const_cast<Module *>(MF.getFunction().getParent());
6559    const MDNode *Metadata = cast<MDNodeSDNode>(Op.getOperand(1))->getMD();
6560    auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString();
6561    auto RelocSymbol = cast<GlobalVariable>(
6562        M->getOrInsertGlobal(SymbolName, Type::getInt32Ty(M->getContext())));
6563    SDValue GA = DAG.getTargetGlobalAddress(RelocSymbol, DL, MVT::i32, 0,
6564                                            SIInstrInfo::MO_ABS32_LO);
6565    return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0};
6566  }
6567  default:
6568    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
6569            AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
6570      return lowerImage(Op, ImageDimIntr, DAG);
6571
6572    return Op;
6573  }
6574}
6575
6576// This function computes an appropriate offset to pass to
6577// MachineMemOperand::setOffset() based on the offset inputs to
6578// an intrinsic. If any of the offsets are non-constant, or if
6579// VIndex is non-zero, then this function returns 0. Otherwise,
6580// it returns the sum of VOffset, SOffset, and Offset.
6581static unsigned getBufferOffsetForMMO(SDValue VOffset,
6582                                      SDValue SOffset,
6583                                      SDValue Offset,
6584                                      SDValue VIndex = SDValue()) {
6585
6586  if (!isa<ConstantSDNode>(VOffset) || !isa<ConstantSDNode>(SOffset) ||
6587      !isa<ConstantSDNode>(Offset))
6588    return 0;
6589
6590  if (VIndex) {
6591    if (!isa<ConstantSDNode>(VIndex) || !cast<ConstantSDNode>(VIndex)->isNullValue())
6592      return 0;
6593  }
6594
6595  return cast<ConstantSDNode>(VOffset)->getSExtValue() +
6596         cast<ConstantSDNode>(SOffset)->getSExtValue() +
6597         cast<ConstantSDNode>(Offset)->getSExtValue();
6598}
6599
6600static unsigned getDSShaderTypeValue(const MachineFunction &MF) {
6601  switch (MF.getFunction().getCallingConv()) {
6602  case CallingConv::AMDGPU_PS:
6603    return 1;
6604  case CallingConv::AMDGPU_VS:
6605    return 2;
6606  case CallingConv::AMDGPU_GS:
6607    return 3;
6608  case CallingConv::AMDGPU_HS:
6609  case CallingConv::AMDGPU_LS:
6610  case CallingConv::AMDGPU_ES:
6611    report_fatal_error("ds_ordered_count unsupported for this calling conv");
6612  case CallingConv::AMDGPU_CS:
6613  case CallingConv::AMDGPU_KERNEL:
6614  case CallingConv::C:
6615  case CallingConv::Fast:
6616  default:
6617    // Assume other calling conventions are various compute callable functions
6618    return 0;
6619  }
6620}
6621
6622SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
6623                                                 SelectionDAG &DAG) const {
6624  unsigned IntrID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
6625  SDLoc DL(Op);
6626
6627  switch (IntrID) {
6628  case Intrinsic::amdgcn_ds_ordered_add:
6629  case Intrinsic::amdgcn_ds_ordered_swap: {
6630    MemSDNode *M = cast<MemSDNode>(Op);
6631    SDValue Chain = M->getOperand(0);
6632    SDValue M0 = M->getOperand(2);
6633    SDValue Value = M->getOperand(3);
6634    unsigned IndexOperand = M->getConstantOperandVal(7);
6635    unsigned WaveRelease = M->getConstantOperandVal(8);
6636    unsigned WaveDone = M->getConstantOperandVal(9);
6637
6638    unsigned OrderedCountIndex = IndexOperand & 0x3f;
6639    IndexOperand &= ~0x3f;
6640    unsigned CountDw = 0;
6641
6642    if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10) {
6643      CountDw = (IndexOperand >> 24) & 0xf;
6644      IndexOperand &= ~(0xf << 24);
6645
6646      if (CountDw < 1 || CountDw > 4) {
6647        report_fatal_error(
6648            "ds_ordered_count: dword count must be between 1 and 4");
6649      }
6650    }
6651
6652    if (IndexOperand)
6653      report_fatal_error("ds_ordered_count: bad index operand");
6654
6655    if (WaveDone && !WaveRelease)
6656      report_fatal_error("ds_ordered_count: wave_done requires wave_release");
6657
6658    unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
6659    unsigned ShaderType = getDSShaderTypeValue(DAG.getMachineFunction());
6660    unsigned Offset0 = OrderedCountIndex << 2;
6661    unsigned Offset1 = WaveRelease | (WaveDone << 1) | (ShaderType << 2) |
6662                       (Instruction << 4);
6663
6664    if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10)
6665      Offset1 |= (CountDw - 1) << 6;
6666
6667    unsigned Offset = Offset0 | (Offset1 << 8);
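    // Layout of the resulting 16-bit ds_ordered_count offset field:
    //   [7:2]   ordered count index
    //   [8]     wave_release
    //   [9]     wave_done
    //   [11:10] shader type
    //   [12]    instruction (0 = add, 1 = swap)
    //   [15:14] dword count - 1 (gfx10+)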
6668
6669    SDValue Ops[] = {
6670      Chain,
6671      Value,
6672      DAG.getTargetConstant(Offset, DL, MVT::i16),
6673      copyToM0(DAG, Chain, DL, M0).getValue(1), // Glue
6674    };
6675    return DAG.getMemIntrinsicNode(AMDGPUISD::DS_ORDERED_COUNT, DL,
6676                                   M->getVTList(), Ops, M->getMemoryVT(),
6677                                   M->getMemOperand());
6678  }
6679  case Intrinsic::amdgcn_ds_fadd: {
6680    MemSDNode *M = cast<MemSDNode>(Op);
6681    unsigned Opc = ISD::ATOMIC_LOAD_FADD;
6687
6688    return DAG.getAtomic(Opc, SDLoc(Op), M->getMemoryVT(),
6689                         M->getOperand(0), M->getOperand(2), M->getOperand(3),
6690                         M->getMemOperand());
6691  }
6692  case Intrinsic::amdgcn_atomic_inc:
6693  case Intrinsic::amdgcn_atomic_dec:
6694  case Intrinsic::amdgcn_ds_fmin:
6695  case Intrinsic::amdgcn_ds_fmax: {
6696    MemSDNode *M = cast<MemSDNode>(Op);
6697    unsigned Opc;
6698    switch (IntrID) {
6699    case Intrinsic::amdgcn_atomic_inc:
6700      Opc = AMDGPUISD::ATOMIC_INC;
6701      break;
6702    case Intrinsic::amdgcn_atomic_dec:
6703      Opc = AMDGPUISD::ATOMIC_DEC;
6704      break;
6705    case Intrinsic::amdgcn_ds_fmin:
6706      Opc = AMDGPUISD::ATOMIC_LOAD_FMIN;
6707      break;
6708    case Intrinsic::amdgcn_ds_fmax:
6709      Opc = AMDGPUISD::ATOMIC_LOAD_FMAX;
6710      break;
6711    default:
6712      llvm_unreachable("Unknown intrinsic!");
6713    }
6714    SDValue Ops[] = {
6715      M->getOperand(0), // Chain
6716      M->getOperand(2), // Ptr
6717      M->getOperand(3)  // Value
6718    };
6719
6720    return DAG.getMemIntrinsicNode(Opc, SDLoc(Op), M->getVTList(), Ops,
6721                                   M->getMemoryVT(), M->getMemOperand());
6722  }
6723  case Intrinsic::amdgcn_buffer_load:
6724  case Intrinsic::amdgcn_buffer_load_format: {
6725    unsigned Glc = cast<ConstantSDNode>(Op.getOperand(5))->getZExtValue();
6726    unsigned Slc = cast<ConstantSDNode>(Op.getOperand(6))->getZExtValue();
6727    unsigned IdxEn = 1;
6728    if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(3)))
6729      IdxEn = Idx->getZExtValue() != 0;
6730    SDValue Ops[] = {
6731      Op.getOperand(0), // Chain
6732      Op.getOperand(2), // rsrc
6733      Op.getOperand(3), // vindex
6734      SDValue(),        // voffset -- will be set by setBufferOffsets
6735      SDValue(),        // soffset -- will be set by setBufferOffsets
6736      SDValue(),        // offset -- will be set by setBufferOffsets
6737      DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
6738      DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen
6739    };
6740
6741    unsigned Offset = setBufferOffsets(Op.getOperand(4), DAG, &Ops[3]);
6742    // We don't know the offset if vindex is non-zero, so clear it.
6743    if (IdxEn)
6744      Offset = 0;
6745
6746    unsigned Opc = (IntrID == Intrinsic::amdgcn_buffer_load) ?
6747        AMDGPUISD::BUFFER_LOAD : AMDGPUISD::BUFFER_LOAD_FORMAT;
6748
6749    EVT VT = Op.getValueType();
6750    EVT IntVT = VT.changeTypeToInteger();
6751    auto *M = cast<MemSDNode>(Op);
6752    M->getMemOperand()->setOffset(Offset);
6753    EVT LoadVT = Op.getValueType();
6754
6755    if (LoadVT.getScalarType() == MVT::f16)
6756      return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16,
6757                                 M, DAG, Ops);
6758
6759    // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics
6760    if (LoadVT.getScalarType() == MVT::i8 ||
6761        LoadVT.getScalarType() == MVT::i16)
6762      return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M);
6763
6764    return getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT,
6765                               M->getMemOperand(), DAG);
6766  }
6767  case Intrinsic::amdgcn_raw_buffer_load:
6768  case Intrinsic::amdgcn_raw_buffer_load_format: {
6769    const bool IsFormat = IntrID == Intrinsic::amdgcn_raw_buffer_load_format;
6770
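    // For reference, the raw form of the intrinsic is
    //   llvm.amdgcn.raw.buffer.load(rsrc, offset, soffset, cachepolicy)
    // The offset operand is split below into a voffset and an immediate, and
    // the lowering forces vindex to 0 with idxen disabled.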
6771    auto Offsets = splitBufferOffsets(Op.getOperand(3), DAG);
6772    SDValue Ops[] = {
6773      Op.getOperand(0), // Chain
6774      Op.getOperand(2), // rsrc
6775      DAG.getConstant(0, DL, MVT::i32), // vindex
6776      Offsets.first,    // voffset
6777      Op.getOperand(4), // soffset
6778      Offsets.second,   // offset
6779      Op.getOperand(5), // cachepolicy, swizzled buffer
6780      DAG.getTargetConstant(0, DL, MVT::i1), // idxen
6781    };
6782
6783    auto *M = cast<MemSDNode>(Op);
6784    M->getMemOperand()->setOffset(getBufferOffsetForMMO(Ops[3], Ops[4], Ops[5]));
6785    return lowerIntrinsicLoad(M, IsFormat, DAG, Ops);
6786  }
6787  case Intrinsic::amdgcn_struct_buffer_load:
6788  case Intrinsic::amdgcn_struct_buffer_load_format: {
6789    const bool IsFormat = IntrID == Intrinsic::amdgcn_struct_buffer_load_format;
6790
6791    auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
6792    SDValue Ops[] = {
6793      Op.getOperand(0), // Chain
6794      Op.getOperand(2), // rsrc
6795      Op.getOperand(3), // vindex
6796      Offsets.first,    // voffset
6797      Op.getOperand(5), // soffset
6798      Offsets.second,   // offset
6799      Op.getOperand(6), // cachepolicy, swizzled buffer
6800      DAG.getTargetConstant(1, DL, MVT::i1), // idxen
6801    };
6802
6803    auto *M = cast<MemSDNode>(Op);
6804    M->getMemOperand()->setOffset(getBufferOffsetForMMO(Ops[3], Ops[4], Ops[5],
6805                                                        Ops[2]));
6806    return lowerIntrinsicLoad(cast<MemSDNode>(Op), IsFormat, DAG, Ops);
6807  }
6808  case Intrinsic::amdgcn_tbuffer_load: {
6809    MemSDNode *M = cast<MemSDNode>(Op);
6810    EVT LoadVT = Op.getValueType();
6811
6812    unsigned Dfmt = cast<ConstantSDNode>(Op.getOperand(7))->getZExtValue();
6813    unsigned Nfmt = cast<ConstantSDNode>(Op.getOperand(8))->getZExtValue();
6814    unsigned Glc = cast<ConstantSDNode>(Op.getOperand(9))->getZExtValue();
6815    unsigned Slc = cast<ConstantSDNode>(Op.getOperand(10))->getZExtValue();
6816    unsigned IdxEn = 1;
6817    if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(3)))
6818      IdxEn = Idx->getZExtValue() != 0;
6819    SDValue Ops[] = {
6820      Op.getOperand(0),  // Chain
6821      Op.getOperand(2),  // rsrc
6822      Op.getOperand(3),  // vindex
6823      Op.getOperand(4),  // voffset
6824      Op.getOperand(5),  // soffset
6825      Op.getOperand(6),  // offset
6826      DAG.getTargetConstant(Dfmt | (Nfmt << 4), DL, MVT::i32), // format
6827      DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
6828      DAG.getTargetConstant(IdxEn, DL, MVT::i1) // idxen
6829    };
6830
6831    if (LoadVT.getScalarType() == MVT::f16)
6832      return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16,
6833                                 M, DAG, Ops);
6834    return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
6835                               Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
6836                               DAG);
6837  }
6838  case Intrinsic::amdgcn_raw_tbuffer_load: {
6839    MemSDNode *M = cast<MemSDNode>(Op);
6840    EVT LoadVT = Op.getValueType();
6841    auto Offsets = splitBufferOffsets(Op.getOperand(3), DAG);
6842
6843    SDValue Ops[] = {
6844      Op.getOperand(0),  // Chain
6845      Op.getOperand(2),  // rsrc
6846      DAG.getConstant(0, DL, MVT::i32), // vindex
6847      Offsets.first,     // voffset
6848      Op.getOperand(4),  // soffset
6849      Offsets.second,    // offset
6850      Op.getOperand(5),  // format
6851      Op.getOperand(6),  // cachepolicy, swizzled buffer
6852      DAG.getTargetConstant(0, DL, MVT::i1), // idxen
6853    };
6854
6855    if (LoadVT.getScalarType() == MVT::f16)
6856      return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16,
6857                                 M, DAG, Ops);
6858    return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
6859                               Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
6860                               DAG);
6861  }
6862  case Intrinsic::amdgcn_struct_tbuffer_load: {
6863    MemSDNode *M = cast<MemSDNode>(Op);
6864    EVT LoadVT = Op.getValueType();
6865    auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
6866
6867    SDValue Ops[] = {
6868      Op.getOperand(0),  // Chain
6869      Op.getOperand(2),  // rsrc
6870      Op.getOperand(3),  // vindex
6871      Offsets.first,     // voffset
6872      Op.getOperand(5),  // soffset
6873      Offsets.second,    // offset
6874      Op.getOperand(6),  // format
6875      Op.getOperand(7),  // cachepolicy, swizzled buffer
6876      DAG.getTargetConstant(1, DL, MVT::i1), // idxen
6877    };
6878
6879    if (LoadVT.getScalarType() == MVT::f16)
6880      return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16,
6881                                 M, DAG, Ops);
6882    return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
6883                               Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
6884                               DAG);
6885  }
6886  case Intrinsic::amdgcn_buffer_atomic_swap:
6887  case Intrinsic::amdgcn_buffer_atomic_add:
6888  case Intrinsic::amdgcn_buffer_atomic_sub:
6889  case Intrinsic::amdgcn_buffer_atomic_csub:
6890  case Intrinsic::amdgcn_buffer_atomic_smin:
6891  case Intrinsic::amdgcn_buffer_atomic_umin:
6892  case Intrinsic::amdgcn_buffer_atomic_smax:
6893  case Intrinsic::amdgcn_buffer_atomic_umax:
6894  case Intrinsic::amdgcn_buffer_atomic_and:
6895  case Intrinsic::amdgcn_buffer_atomic_or:
6896  case Intrinsic::amdgcn_buffer_atomic_xor: {
6897    unsigned Slc = cast<ConstantSDNode>(Op.getOperand(6))->getZExtValue();
6898    unsigned IdxEn = 1;
6899    if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(4)))
6900      IdxEn = Idx->getZExtValue() != 0;
6901    SDValue Ops[] = {
6902      Op.getOperand(0), // Chain
6903      Op.getOperand(2), // vdata
6904      Op.getOperand(3), // rsrc
6905      Op.getOperand(4), // vindex
6906      SDValue(),        // voffset -- will be set by setBufferOffsets
6907      SDValue(),        // soffset -- will be set by setBufferOffsets
6908      SDValue(),        // offset -- will be set by setBufferOffsets
6909      DAG.getTargetConstant(Slc << 1, DL, MVT::i32), // cachepolicy
6910      DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen
6911    };
6912    unsigned Offset = setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]);
6913    // We don't know the offset if vindex is non-zero, so clear it.
6914    if (IdxEn)
6915      Offset = 0;
6916    EVT VT = Op.getValueType();
6917
6918    auto *M = cast<MemSDNode>(Op);
6919    M->getMemOperand()->setOffset(Offset);
6920    unsigned Opcode = 0;
6921
6922    switch (IntrID) {
6923    case Intrinsic::amdgcn_buffer_atomic_swap:
6924      Opcode = AMDGPUISD::BUFFER_ATOMIC_SWAP;
6925      break;
6926    case Intrinsic::amdgcn_buffer_atomic_add:
6927      Opcode = AMDGPUISD::BUFFER_ATOMIC_ADD;
6928      break;
6929    case Intrinsic::amdgcn_buffer_atomic_sub:
6930      Opcode = AMDGPUISD::BUFFER_ATOMIC_SUB;
6931      break;
6932    case Intrinsic::amdgcn_buffer_atomic_csub:
6933      Opcode = AMDGPUISD::BUFFER_ATOMIC_CSUB;
6934      break;
6935    case Intrinsic::amdgcn_buffer_atomic_smin:
6936      Opcode = AMDGPUISD::BUFFER_ATOMIC_SMIN;
6937      break;
6938    case Intrinsic::amdgcn_buffer_atomic_umin:
6939      Opcode = AMDGPUISD::BUFFER_ATOMIC_UMIN;
6940      break;
6941    case Intrinsic::amdgcn_buffer_atomic_smax:
6942      Opcode = AMDGPUISD::BUFFER_ATOMIC_SMAX;
6943      break;
6944    case Intrinsic::amdgcn_buffer_atomic_umax:
6945      Opcode = AMDGPUISD::BUFFER_ATOMIC_UMAX;
6946      break;
6947    case Intrinsic::amdgcn_buffer_atomic_and:
6948      Opcode = AMDGPUISD::BUFFER_ATOMIC_AND;
6949      break;
6950    case Intrinsic::amdgcn_buffer_atomic_or:
6951      Opcode = AMDGPUISD::BUFFER_ATOMIC_OR;
6952      break;
6953    case Intrinsic::amdgcn_buffer_atomic_xor:
6954      Opcode = AMDGPUISD::BUFFER_ATOMIC_XOR;
6955      break;
6956    default:
6957      llvm_unreachable("unhandled atomic opcode");
6958    }
6959
6960    return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT,
6961                                   M->getMemOperand());
6962  }
6963  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
6964  case Intrinsic::amdgcn_raw_buffer_atomic_add:
6965  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
6966  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
6967  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
6968  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
6969  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
6970  case Intrinsic::amdgcn_raw_buffer_atomic_and:
6971  case Intrinsic::amdgcn_raw_buffer_atomic_or:
6972  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
6973  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
6974  case Intrinsic::amdgcn_raw_buffer_atomic_dec: {
6975    auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
6976    SDValue Ops[] = {
6977      Op.getOperand(0), // Chain
6978      Op.getOperand(2), // vdata
6979      Op.getOperand(3), // rsrc
6980      DAG.getConstant(0, DL, MVT::i32), // vindex
6981      Offsets.first,    // voffset
6982      Op.getOperand(5), // soffset
6983      Offsets.second,   // offset
6984      Op.getOperand(6), // cachepolicy
6985      DAG.getTargetConstant(0, DL, MVT::i1), // idxen
6986    };
6987    EVT VT = Op.getValueType();
6988
6989    auto *M = cast<MemSDNode>(Op);
6990    M->getMemOperand()->setOffset(getBufferOffsetForMMO(Ops[4], Ops[5], Ops[6]));
6991    unsigned Opcode = 0;
6992
6993    switch (IntrID) {
6994    case Intrinsic::amdgcn_raw_buffer_atomic_swap:
6995      Opcode = AMDGPUISD::BUFFER_ATOMIC_SWAP;
6996      break;
6997    case Intrinsic::amdgcn_raw_buffer_atomic_add:
6998      Opcode = AMDGPUISD::BUFFER_ATOMIC_ADD;
6999      break;
7000    case Intrinsic::amdgcn_raw_buffer_atomic_sub:
7001      Opcode = AMDGPUISD::BUFFER_ATOMIC_SUB;
7002      break;
7003    case Intrinsic::amdgcn_raw_buffer_atomic_smin:
7004      Opcode = AMDGPUISD::BUFFER_ATOMIC_SMIN;
7005      break;
7006    case Intrinsic::amdgcn_raw_buffer_atomic_umin:
7007      Opcode = AMDGPUISD::BUFFER_ATOMIC_UMIN;
7008      break;
7009    case Intrinsic::amdgcn_raw_buffer_atomic_smax:
7010      Opcode = AMDGPUISD::BUFFER_ATOMIC_SMAX;
7011      break;
7012    case Intrinsic::amdgcn_raw_buffer_atomic_umax:
7013      Opcode = AMDGPUISD::BUFFER_ATOMIC_UMAX;
7014      break;
7015    case Intrinsic::amdgcn_raw_buffer_atomic_and:
7016      Opcode = AMDGPUISD::BUFFER_ATOMIC_AND;
7017      break;
7018    case Intrinsic::amdgcn_raw_buffer_atomic_or:
7019      Opcode = AMDGPUISD::BUFFER_ATOMIC_OR;
7020      break;
7021    case Intrinsic::amdgcn_raw_buffer_atomic_xor:
7022      Opcode = AMDGPUISD::BUFFER_ATOMIC_XOR;
7023      break;
7024    case Intrinsic::amdgcn_raw_buffer_atomic_inc:
7025      Opcode = AMDGPUISD::BUFFER_ATOMIC_INC;
7026      break;
7027    case Intrinsic::amdgcn_raw_buffer_atomic_dec:
7028      Opcode = AMDGPUISD::BUFFER_ATOMIC_DEC;
7029      break;
7030    default:
7031      llvm_unreachable("unhandled atomic opcode");
7032    }
7033
7034    return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT,
7035                                   M->getMemOperand());
7036  }
7037  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
7038  case Intrinsic::amdgcn_struct_buffer_atomic_add:
7039  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
7040  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
7041  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
7042  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
7043  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
7044  case Intrinsic::amdgcn_struct_buffer_atomic_and:
7045  case Intrinsic::amdgcn_struct_buffer_atomic_or:
7046  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
7047  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
7048  case Intrinsic::amdgcn_struct_buffer_atomic_dec: {
7049    auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
7050    SDValue Ops[] = {
7051      Op.getOperand(0), // Chain
7052      Op.getOperand(2), // vdata
7053      Op.getOperand(3), // rsrc
7054      Op.getOperand(4), // vindex
7055      Offsets.first,    // voffset
7056      Op.getOperand(6), // soffset
7057      Offsets.second,   // offset
7058      Op.getOperand(7), // cachepolicy
7059      DAG.getTargetConstant(1, DL, MVT::i1), // idxen
7060    };
7061    EVT VT = Op.getValueType();
7062
7063    auto *M = cast<MemSDNode>(Op);
7064    M->getMemOperand()->setOffset(getBufferOffsetForMMO(Ops[4], Ops[5], Ops[6],
7065                                                        Ops[3]));
7066    unsigned Opcode = 0;
7067
7068    switch (IntrID) {
7069    case Intrinsic::amdgcn_struct_buffer_atomic_swap:
7070      Opcode = AMDGPUISD::BUFFER_ATOMIC_SWAP;
7071      break;
7072    case Intrinsic::amdgcn_struct_buffer_atomic_add:
7073      Opcode = AMDGPUISD::BUFFER_ATOMIC_ADD;
7074      break;
7075    case Intrinsic::amdgcn_struct_buffer_atomic_sub:
7076      Opcode = AMDGPUISD::BUFFER_ATOMIC_SUB;
7077      break;
7078    case Intrinsic::amdgcn_struct_buffer_atomic_smin:
7079      Opcode = AMDGPUISD::BUFFER_ATOMIC_SMIN;
7080      break;
7081    case Intrinsic::amdgcn_struct_buffer_atomic_umin:
7082      Opcode = AMDGPUISD::BUFFER_ATOMIC_UMIN;
7083      break;
7084    case Intrinsic::amdgcn_struct_buffer_atomic_smax:
7085      Opcode = AMDGPUISD::BUFFER_ATOMIC_SMAX;
7086      break;
7087    case Intrinsic::amdgcn_struct_buffer_atomic_umax:
7088      Opcode = AMDGPUISD::BUFFER_ATOMIC_UMAX;
7089      break;
7090    case Intrinsic::amdgcn_struct_buffer_atomic_and:
7091      Opcode = AMDGPUISD::BUFFER_ATOMIC_AND;
7092      break;
7093    case Intrinsic::amdgcn_struct_buffer_atomic_or:
7094      Opcode = AMDGPUISD::BUFFER_ATOMIC_OR;
7095      break;
7096    case Intrinsic::amdgcn_struct_buffer_atomic_xor:
7097      Opcode = AMDGPUISD::BUFFER_ATOMIC_XOR;
7098      break;
7099    case Intrinsic::amdgcn_struct_buffer_atomic_inc:
7100      Opcode = AMDGPUISD::BUFFER_ATOMIC_INC;
7101      break;
7102    case Intrinsic::amdgcn_struct_buffer_atomic_dec:
7103      Opcode = AMDGPUISD::BUFFER_ATOMIC_DEC;
7104      break;
7105    default:
7106      llvm_unreachable("unhandled atomic opcode");
7107    }
7108
7109    return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT,
7110                                   M->getMemOperand());
7111  }
7112  case Intrinsic::amdgcn_buffer_atomic_cmpswap: {
7113    unsigned Slc = cast<ConstantSDNode>(Op.getOperand(7))->getZExtValue();
7114    unsigned IdxEn = 1;
7115    if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(5)))
7116      IdxEn = Idx->getZExtValue() != 0;
7117    SDValue Ops[] = {
7118      Op.getOperand(0), // Chain
7119      Op.getOperand(2), // src
7120      Op.getOperand(3), // cmp
7121      Op.getOperand(4), // rsrc
7122      Op.getOperand(5), // vindex
7123      SDValue(),        // voffset -- will be set by setBufferOffsets
7124      SDValue(),        // soffset -- will be set by setBufferOffsets
7125      SDValue(),        // offset -- will be set by setBufferOffsets
7126      DAG.getTargetConstant(Slc << 1, DL, MVT::i32), // cachepolicy
7127      DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen
7128    };
7129    unsigned Offset = setBufferOffsets(Op.getOperand(6), DAG, &Ops[5]);
7130    // We don't know the offset if vindex is non-zero, so clear it.
7131    if (IdxEn)
7132      Offset = 0;
7133    EVT VT = Op.getValueType();
7134    auto *M = cast<MemSDNode>(Op);
7135    M->getMemOperand()->setOffset(Offset);
7136
7137    return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
7138                                   Op->getVTList(), Ops, VT, M->getMemOperand());
7139  }
7140  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: {
7141    auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
7142    SDValue Ops[] = {
7143      Op.getOperand(0), // Chain
7144      Op.getOperand(2), // src
7145      Op.getOperand(3), // cmp
7146      Op.getOperand(4), // rsrc
7147      DAG.getConstant(0, DL, MVT::i32), // vindex
7148      Offsets.first,    // voffset
7149      Op.getOperand(6), // soffset
7150      Offsets.second,   // offset
7151      Op.getOperand(7), // cachepolicy
7152      DAG.getTargetConstant(0, DL, MVT::i1), // idxen
7153    };
7154    EVT VT = Op.getValueType();
7155    auto *M = cast<MemSDNode>(Op);
7156    M->getMemOperand()->setOffset(getBufferOffsetForMMO(Ops[5], Ops[6], Ops[7]));
7157
7158    return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
7159                                   Op->getVTList(), Ops, VT, M->getMemOperand());
7160  }
7161  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: {
7162    auto Offsets = splitBufferOffsets(Op.getOperand(6), DAG);
7163    SDValue Ops[] = {
7164      Op.getOperand(0), // Chain
7165      Op.getOperand(2), // src
7166      Op.getOperand(3), // cmp
7167      Op.getOperand(4), // rsrc
7168      Op.getOperand(5), // vindex
7169      Offsets.first,    // voffset
7170      Op.getOperand(7), // soffset
7171      Offsets.second,   // offset
7172      Op.getOperand(8), // cachepolicy
7173      DAG.getTargetConstant(1, DL, MVT::i1), // idxen
7174    };
7175    EVT VT = Op.getValueType();
7176    auto *M = cast<MemSDNode>(Op);
7177    M->getMemOperand()->setOffset(getBufferOffsetForMMO(Ops[5], Ops[6], Ops[7],
7178                                                        Ops[4]));
7179
7180    return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
7181                                   Op->getVTList(), Ops, VT, M->getMemOperand());
7182  }
7183  case Intrinsic::amdgcn_global_atomic_csub: {
7184    MemSDNode *M = cast<MemSDNode>(Op);
7185    SDValue Ops[] = {
7186      M->getOperand(0), // Chain
7187      M->getOperand(2), // Ptr
7188      M->getOperand(3)  // Value
7189    };
7190
7191    return DAG.getMemIntrinsicNode(AMDGPUISD::ATOMIC_LOAD_CSUB, SDLoc(Op),
7192                                   M->getVTList(), Ops, M->getMemoryVT(),
7193                                   M->getMemOperand());
7194  }
7195
7196  default:
7197    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
7198            AMDGPU::getImageDimIntrinsicInfo(IntrID))
7199      return lowerImage(Op, ImageDimIntr, DAG);
7200
7201    return SDValue();
7202  }
7203}
7204
7205// Call DAG.getMemIntrinsicNode for a load, but first widen a dwordx3 type to
7206// dwordx4 if on SI.
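// The extra element of the widened result is dropped again with the
// EXTRACT_SUBVECTOR below, and the memory operand is widened to 16 bytes so
// it covers the full dwordx4 access.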
7207SDValue SITargetLowering::getMemIntrinsicNode(unsigned Opcode, const SDLoc &DL,
7208                                              SDVTList VTList,
7209                                              ArrayRef<SDValue> Ops, EVT MemVT,
7210                                              MachineMemOperand *MMO,
7211                                              SelectionDAG &DAG) const {
7212  EVT VT = VTList.VTs[0];
7213  EVT WidenedVT = VT;
7214  EVT WidenedMemVT = MemVT;
7215  if (!Subtarget->hasDwordx3LoadStores() &&
7216      (WidenedVT == MVT::v3i32 || WidenedVT == MVT::v3f32)) {
7217    WidenedVT = EVT::getVectorVT(*DAG.getContext(),
7218                                 WidenedVT.getVectorElementType(), 4);
7219    WidenedMemVT = EVT::getVectorVT(*DAG.getContext(),
7220                                    WidenedMemVT.getVectorElementType(), 4);
7221    MMO = DAG.getMachineFunction().getMachineMemOperand(MMO, 0, 16);
7222  }
7223
7224  assert(VTList.NumVTs == 2);
7225  SDVTList WidenedVTList = DAG.getVTList(WidenedVT, VTList.VTs[1]);
7226
7227  auto NewOp = DAG.getMemIntrinsicNode(Opcode, DL, WidenedVTList, Ops,
7228                                       WidenedMemVT, MMO);
7229  if (WidenedVT != VT) {
7230    auto Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, NewOp,
7231                               DAG.getVectorIdxConstant(0, DL));
7232    NewOp = DAG.getMergeValues({ Extract, SDValue(NewOp.getNode(), 1) }, DL);
7233  }
7234  return NewOp;
7235}
7236
7237SDValue SITargetLowering::handleD16VData(SDValue VData,
7238                                         SelectionDAG &DAG) const {
7239  EVT StoreVT = VData.getValueType();
7240
7241  // No change for f16 and legal vector D16 types.
7242  if (!StoreVT.isVector())
7243    return VData;
7244
7245  SDLoc DL(VData);
7246  assert((StoreVT.getVectorNumElements() != 3) && "Handle v3f16");
7247
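  // With unpacked D16 VMEM each 16-bit element is placed in its own dword:
  // e.g. a v2f16 value is bitcast to v2i16 and zero-extended to v2i32 (one
  // element per 32-bit lane) before being unrolled for the store.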
7248  if (Subtarget->hasUnpackedD16VMem()) {
7249    // We need to unpack the packed data to store.
7250    EVT IntStoreVT = StoreVT.changeTypeToInteger();
7251    SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
7252
7253    EVT EquivStoreVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
7254                                        StoreVT.getVectorNumElements());
7255    SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, EquivStoreVT, IntVData);
7256    return DAG.UnrollVectorOp(ZExt.getNode());
7257  }
7258
7259  assert(isTypeLegal(StoreVT));
7260  return VData;
7261}
7262
7263SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
7264                                              SelectionDAG &DAG) const {
7265  SDLoc DL(Op);
7266  SDValue Chain = Op.getOperand(0);
7267  unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
7268  MachineFunction &MF = DAG.getMachineFunction();
7269
7270  switch (IntrinsicID) {
7271  case Intrinsic::amdgcn_exp_compr: {
7272    SDValue Src0 = Op.getOperand(4);
7273    SDValue Src1 = Op.getOperand(5);
7274    // Hack around illegal type on SI by directly selecting it.
7275    if (isTypeLegal(Src0.getValueType()))
7276      return SDValue();
7277
7278    const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(6));
7279    SDValue Undef = DAG.getUNDEF(MVT::f32);
7280    const SDValue Ops[] = {
7281      Op.getOperand(2), // tgt
7282      DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src0), // src0
7283      DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src1), // src1
7284      Undef, // src2
7285      Undef, // src3
7286      Op.getOperand(7), // vm
7287      DAG.getTargetConstant(1, DL, MVT::i1), // compr
7288      Op.getOperand(3), // en
7289      Op.getOperand(0) // Chain
7290    };
7291
7292    unsigned Opc = Done->isNullValue() ? AMDGPU::EXP : AMDGPU::EXP_DONE;
7293    return SDValue(DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops), 0);
7294  }
7295  case Intrinsic::amdgcn_s_barrier: {
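    // If the whole workgroup fits in a single wave, the barrier can be relaxed
    // to a WAVE_BARRIER, which presumably only constrains code motion rather
    // than emitting an actual s_barrier.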
7296    if (getTargetMachine().getOptLevel() > CodeGenOpt::None) {
7297      const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
7298      unsigned WGSize = ST.getFlatWorkGroupSizes(MF.getFunction()).second;
7299      if (WGSize <= ST.getWavefrontSize())
7300        return SDValue(DAG.getMachineNode(AMDGPU::WAVE_BARRIER, DL, MVT::Other,
7301                                          Op.getOperand(0)), 0);
7302    }
7303    return SDValue();
7304  }
7305  case Intrinsic::amdgcn_tbuffer_store: {
7306    SDValue VData = Op.getOperand(2);
7307    bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
7308    if (IsD16)
7309      VData = handleD16VData(VData, DAG);
7310    unsigned Dfmt = cast<ConstantSDNode>(Op.getOperand(8))->getZExtValue();
7311    unsigned Nfmt = cast<ConstantSDNode>(Op.getOperand(9))->getZExtValue();
7312    unsigned Glc = cast<ConstantSDNode>(Op.getOperand(10))->getZExtValue();
7313    unsigned Slc = cast<ConstantSDNode>(Op.getOperand(11))->getZExtValue();
7314    unsigned IdxEn = 1;
7315    if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(4)))
7316      IdxEn = Idx->getZExtValue() != 0;
7317    SDValue Ops[] = {
7318      Chain,
7319      VData,             // vdata
7320      Op.getOperand(3),  // rsrc
7321      Op.getOperand(4),  // vindex
7322      Op.getOperand(5),  // voffset
7323      Op.getOperand(6),  // soffset
7324      Op.getOperand(7),  // offset
7325      DAG.getTargetConstant(Dfmt | (Nfmt << 4), DL, MVT::i32), // format
7326      DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
7327      DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen
7328    };
7329    unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 :
7330                           AMDGPUISD::TBUFFER_STORE_FORMAT;
7331    MemSDNode *M = cast<MemSDNode>(Op);
7332    return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
7333                                   M->getMemoryVT(), M->getMemOperand());
7334  }
7335
7336  case Intrinsic::amdgcn_struct_tbuffer_store: {
7337    SDValue VData = Op.getOperand(2);
7338    bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
7339    if (IsD16)
7340      VData = handleD16VData(VData, DAG);
7341    auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
7342    SDValue Ops[] = {
7343      Chain,
7344      VData,             // vdata
7345      Op.getOperand(3),  // rsrc
7346      Op.getOperand(4),  // vindex
7347      Offsets.first,     // voffset
7348      Op.getOperand(6),  // soffset
7349      Offsets.second,    // offset
7350      Op.getOperand(7),  // format
7351      Op.getOperand(8),  // cachepolicy, swizzled buffer
7352      DAG.getTargetConstant(1, DL, MVT::i1), // idxen
7353    };
7354    unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 :
7355                           AMDGPUISD::TBUFFER_STORE_FORMAT;
7356    MemSDNode *M = cast<MemSDNode>(Op);
7357    return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
7358                                   M->getMemoryVT(), M->getMemOperand());
7359  }
7360
7361  case Intrinsic::amdgcn_raw_tbuffer_store: {
7362    SDValue VData = Op.getOperand(2);
7363    bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
7364    if (IsD16)
7365      VData = handleD16VData(VData, DAG);
7366    auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
7367    SDValue Ops[] = {
7368      Chain,
7369      VData,             // vdata
7370      Op.getOperand(3),  // rsrc
7371      DAG.getConstant(0, DL, MVT::i32), // vindex
7372      Offsets.first,     // voffset
7373      Op.getOperand(5),  // soffset
7374      Offsets.second,    // offset
7375      Op.getOperand(6),  // format
7376      Op.getOperand(7),  // cachepolicy, swizzled buffer
7377      DAG.getTargetConstant(0, DL, MVT::i1), // idxen
7378    };
7379    unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 :
7380                           AMDGPUISD::TBUFFER_STORE_FORMAT;
7381    MemSDNode *M = cast<MemSDNode>(Op);
7382    return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
7383                                   M->getMemoryVT(), M->getMemOperand());
7384  }
7385
7386  case Intrinsic::amdgcn_buffer_store:
7387  case Intrinsic::amdgcn_buffer_store_format: {
7388    SDValue VData = Op.getOperand(2);
7389    bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
7390    if (IsD16)
7391      VData = handleD16VData(VData, DAG);
7392    unsigned Glc = cast<ConstantSDNode>(Op.getOperand(6))->getZExtValue();
7393    unsigned Slc = cast<ConstantSDNode>(Op.getOperand(7))->getZExtValue();
7394    unsigned IdxEn = 1;
7395    if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(4)))
7396      IdxEn = Idx->getZExtValue() != 0;
7397    SDValue Ops[] = {
7398      Chain,
7399      VData,
7400      Op.getOperand(3), // rsrc
7401      Op.getOperand(4), // vindex
7402      SDValue(), // voffset -- will be set by setBufferOffsets
7403      SDValue(), // soffset -- will be set by setBufferOffsets
7404      SDValue(), // offset -- will be set by setBufferOffsets
7405      DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
7406      DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen
7407    };
7408    unsigned Offset = setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]);
7409    // We don't know the offset if vindex is non-zero, so clear it.
7410    if (IdxEn)
7411      Offset = 0;
7412    unsigned Opc = IntrinsicID == Intrinsic::amdgcn_buffer_store ?
7413                   AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
7414    Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
7415    MemSDNode *M = cast<MemSDNode>(Op);
7416    M->getMemOperand()->setOffset(Offset);
7417
7418    // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
7419    EVT VDataType = VData.getValueType().getScalarType();
7420    if (VDataType == MVT::i8 || VDataType == MVT::i16)
7421      return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);
7422
7423    return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
7424                                   M->getMemoryVT(), M->getMemOperand());
7425  }
7426
7427  case Intrinsic::amdgcn_raw_buffer_store:
7428  case Intrinsic::amdgcn_raw_buffer_store_format: {
7429    const bool IsFormat =
7430        IntrinsicID == Intrinsic::amdgcn_raw_buffer_store_format;
7431
7432    SDValue VData = Op.getOperand(2);
7433    EVT VDataVT = VData.getValueType();
7434    EVT EltType = VDataVT.getScalarType();
7435    bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
7436    if (IsD16)
7437      VData = handleD16VData(VData, DAG);
7438
7439    if (!isTypeLegal(VDataVT)) {
7440      VData =
7441          DAG.getNode(ISD::BITCAST, DL,
7442                      getEquivalentMemType(*DAG.getContext(), VDataVT), VData);
7443    }
7444
7445    auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
7446    SDValue Ops[] = {
7447      Chain,
7448      VData,
7449      Op.getOperand(3), // rsrc
7450      DAG.getConstant(0, DL, MVT::i32), // vindex
7451      Offsets.first,    // voffset
7452      Op.getOperand(5), // soffset
7453      Offsets.second,   // offset
7454      Op.getOperand(6), // cachepolicy, swizzled buffer
7455      DAG.getTargetConstant(0, DL, MVT::i1), // idxen
7456    };
7457    unsigned Opc =
7458        IsFormat ? AMDGPUISD::BUFFER_STORE_FORMAT : AMDGPUISD::BUFFER_STORE;
7459    Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
7460    MemSDNode *M = cast<MemSDNode>(Op);
7461    M->getMemOperand()->setOffset(getBufferOffsetForMMO(Ops[4], Ops[5], Ops[6]));
7462
7463    // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
7464    if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
7465      return handleByteShortBufferStores(DAG, VDataVT, DL, Ops, M);
7466
7467    return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
7468                                   M->getMemoryVT(), M->getMemOperand());
7469  }
7470
7471  case Intrinsic::amdgcn_struct_buffer_store:
7472  case Intrinsic::amdgcn_struct_buffer_store_format: {
7473    const bool IsFormat =
7474        IntrinsicID == Intrinsic::amdgcn_struct_buffer_store_format;
7475
7476    SDValue VData = Op.getOperand(2);
7477    EVT VDataVT = VData.getValueType();
7478    EVT EltType = VDataVT.getScalarType();
7479    bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
7480
7481    if (IsD16)
7482      VData = handleD16VData(VData, DAG);
7483
7484    if (!isTypeLegal(VDataVT)) {
7485      VData =
7486          DAG.getNode(ISD::BITCAST, DL,
7487                      getEquivalentMemType(*DAG.getContext(), VDataVT), VData);
7488    }
7489
7490    auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
7491    SDValue Ops[] = {
7492      Chain,
7493      VData,
7494      Op.getOperand(3), // rsrc
7495      Op.getOperand(4), // vindex
7496      Offsets.first,    // voffset
7497      Op.getOperand(6), // soffset
7498      Offsets.second,   // offset
7499      Op.getOperand(7), // cachepolicy, swizzled buffer
7500      DAG.getTargetConstant(1, DL, MVT::i1), // idxen
7501    };
7502    unsigned Opc = IntrinsicID == Intrinsic::amdgcn_struct_buffer_store ?
7503                   AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
7504    Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
7505    MemSDNode *M = cast<MemSDNode>(Op);
7506    M->getMemOperand()->setOffset(getBufferOffsetForMMO(Ops[4], Ops[5], Ops[6],
7507                                                        Ops[3]));
7508
7509    // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
7510    EVT VDataType = VData.getValueType().getScalarType();
7511    if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
7512      return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);
7513
7514    return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
7515                                   M->getMemoryVT(), M->getMemOperand());
7516  }
7517
7518  case Intrinsic::amdgcn_buffer_atomic_fadd: {
7519    unsigned Slc = cast<ConstantSDNode>(Op.getOperand(6))->getZExtValue();
7520    unsigned IdxEn = 1;
7521    if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(4)))
7522      IdxEn = Idx->getZExtValue() != 0;
7523    SDValue Ops[] = {
7524      Chain,
7525      Op.getOperand(2), // vdata
7526      Op.getOperand(3), // rsrc
7527      Op.getOperand(4), // vindex
7528      SDValue(),        // voffset -- will be set by setBufferOffsets
7529      SDValue(),        // soffset -- will be set by setBufferOffsets
7530      SDValue(),        // offset -- will be set by setBufferOffsets
7531      DAG.getTargetConstant(Slc << 1, DL, MVT::i32), // cachepolicy
7532      DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen
7533    };
7534    unsigned Offset = setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]);
7535    // We don't know the offset if vindex is non-zero, so clear it.
7536    if (IdxEn)
7537      Offset = 0;
7538    EVT VT = Op.getOperand(2).getValueType();
7539
7540    auto *M = cast<MemSDNode>(Op);
7541    M->getMemOperand()->setOffset(Offset);
7542    unsigned Opcode = VT.isVector() ? AMDGPUISD::BUFFER_ATOMIC_PK_FADD
7543                                    : AMDGPUISD::BUFFER_ATOMIC_FADD;
7544
7545    return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT,
7546                                   M->getMemOperand());
7547  }
7548
7549  case Intrinsic::amdgcn_global_atomic_fadd: {
7550    SDValue Ops[] = {
7551      Chain,
7552      Op.getOperand(2), // ptr
7553      Op.getOperand(3)  // vdata
7554    };
7555    EVT VT = Op.getOperand(3).getValueType();
7556
7557    auto *M = cast<MemSDNode>(Op);
7558    if (VT.isVector()) {
7559      return DAG.getMemIntrinsicNode(
7560        AMDGPUISD::ATOMIC_PK_FADD, DL, Op->getVTList(), Ops, VT,
7561        M->getMemOperand());
7562    }
7563
7564    return DAG.getAtomic(ISD::ATOMIC_LOAD_FADD, DL, VT,
7565                         DAG.getVTList(VT, MVT::Other), Ops,
7566                         M->getMemOperand()).getValue(1);
7567  }
7568  case Intrinsic::amdgcn_end_cf:
7569    return SDValue(DAG.getMachineNode(AMDGPU::SI_END_CF, DL, MVT::Other,
7570                                      Op->getOperand(2), Chain), 0);
7571
7572  default: {
7573    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
7574            AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
7575      return lowerImage(Op, ImageDimIntr, DAG);
7576
7577    return Op;
7578  }
7579  }
7580}
7581
7582// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
7583// offset (the offset that is included in bounds checking and swizzling, to be
7584// split between the instruction's voffset and immoffset fields) and soffset
7585// (the offset that is excluded from bounds checking and swizzling, to go in
7586// the instruction's soffset field).  This function takes the first kind of
7587// offset and figures out how to split it between voffset and immoffset.
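// A minimal worked example of the split below: a combined constant offset of
// 8200 becomes a voffset contribution of 8192 (a multiple of 4096) and an
// immoffset of 8, while an offset of 100 fits entirely in the 12-bit
// immoffset field.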
7588std::pair<SDValue, SDValue> SITargetLowering::splitBufferOffsets(
7589    SDValue Offset, SelectionDAG &DAG) const {
7590  SDLoc DL(Offset);
7591  const unsigned MaxImm = 4095;
7592  SDValue N0 = Offset;
7593  ConstantSDNode *C1 = nullptr;
7594
7595  if ((C1 = dyn_cast<ConstantSDNode>(N0)))
7596    N0 = SDValue();
7597  else if (DAG.isBaseWithConstantOffset(N0)) {
7598    C1 = cast<ConstantSDNode>(N0.getOperand(1));
7599    N0 = N0.getOperand(0);
7600  }
7601
7602  if (C1) {
7603    unsigned ImmOffset = C1->getZExtValue();
7604    // If the immediate value is too big for the immoffset field, keep only its
7605    // low 12 bits (value modulo 4096) there, so that the value copied/added for
7606    // the voffset field is a multiple of 4096 and stands more chance of being
7607    // CSEd with the copy/add for another similar load/store.
7608    // However, do not do that rounding down to a multiple of 4096 if that is a
7609    // negative number, as it appears to be illegal to have a negative offset
7610    // in the vgpr, even if adding the immediate offset makes it positive.
7611    unsigned Overflow = ImmOffset & ~MaxImm;
7612    ImmOffset -= Overflow;
7613    if ((int32_t)Overflow < 0) {
7614      Overflow += ImmOffset;
7615      ImmOffset = 0;
7616    }
7617    C1 = cast<ConstantSDNode>(DAG.getTargetConstant(ImmOffset, DL, MVT::i32));
7618    if (Overflow) {
7619      auto OverflowVal = DAG.getConstant(Overflow, DL, MVT::i32);
7620      if (!N0)
7621        N0 = OverflowVal;
7622      else {
7623        SDValue Ops[] = { N0, OverflowVal };
7624        N0 = DAG.getNode(ISD::ADD, DL, MVT::i32, Ops);
7625      }
7626    }
7627  }
7628  if (!N0)
7629    N0 = DAG.getConstant(0, DL, MVT::i32);
7630  if (!C1)
7631    C1 = cast<ConstantSDNode>(DAG.getTargetConstant(0, DL, MVT::i32));
7632  return {N0, SDValue(C1, 0)};
7633}
7634
7635// Analyze a combined offset from an amdgcn_buffer_ intrinsic and store the
7636// three offsets (voffset, soffset and instoffset) into the SDValue[3] array
7637// pointed to by Offsets.
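// The returned value is the known constant byte offset of the access, which
// callers use to tag the MachineMemOperand; it is non-zero only when the
// combined offset is a compile-time constant that can be split between
// soffset and instoffset.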
7638unsigned SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
7639                                            SelectionDAG &DAG, SDValue *Offsets,
7640                                            Align Alignment) const {
7641  SDLoc DL(CombinedOffset);
7642  if (auto C = dyn_cast<ConstantSDNode>(CombinedOffset)) {
7643    uint32_t Imm = C->getZExtValue();
7644    uint32_t SOffset, ImmOffset;
7645    if (AMDGPU::splitMUBUFOffset(Imm, SOffset, ImmOffset, Subtarget,
7646                                 Alignment)) {
7647      Offsets[0] = DAG.getConstant(0, DL, MVT::i32);
7648      Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
7649      Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
7650      return SOffset + ImmOffset;
7651    }
7652  }
7653  if (DAG.isBaseWithConstantOffset(CombinedOffset)) {
7654    SDValue N0 = CombinedOffset.getOperand(0);
7655    SDValue N1 = CombinedOffset.getOperand(1);
7656    uint32_t SOffset, ImmOffset;
7657    int Offset = cast<ConstantSDNode>(N1)->getSExtValue();
7658    if (Offset >= 0 && AMDGPU::splitMUBUFOffset(Offset, SOffset, ImmOffset,
7659                                                Subtarget, Alignment)) {
7660      Offsets[0] = N0;
7661      Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
7662      Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
7663      return 0;
7664    }
7665  }
7666  Offsets[0] = CombinedOffset;
7667  Offsets[1] = DAG.getConstant(0, DL, MVT::i32);
7668  Offsets[2] = DAG.getTargetConstant(0, DL, MVT::i32);
7669  return 0;
7670}
7671
7672// Handle 8 bit and 16 bit buffer loads
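// These are lowered to a 32-bit BUFFER_LOAD_UBYTE/BUFFER_LOAD_USHORT, and the
// result is truncated (and bitcast for f16) back to the requested type.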
7673SDValue SITargetLowering::handleByteShortBufferLoads(SelectionDAG &DAG,
7674                                                     EVT LoadVT, SDLoc DL,
7675                                                     ArrayRef<SDValue> Ops,
7676                                                     MemSDNode *M) const {
7677  EVT IntVT = LoadVT.changeTypeToInteger();
7678  unsigned Opc = (LoadVT.getScalarType() == MVT::i8) ?
7679         AMDGPUISD::BUFFER_LOAD_UBYTE : AMDGPUISD::BUFFER_LOAD_USHORT;
7680
7681  SDVTList ResList = DAG.getVTList(MVT::i32, MVT::Other);
7682  SDValue BufferLoad = DAG.getMemIntrinsicNode(Opc, DL, ResList,
7683                                               Ops, IntVT,
7684                                               M->getMemOperand());
7685  SDValue LoadVal = DAG.getNode(ISD::TRUNCATE, DL, IntVT, BufferLoad);
7686  LoadVal = DAG.getNode(ISD::BITCAST, DL, LoadVT, LoadVal);
7687
7688  return DAG.getMergeValues({LoadVal, BufferLoad.getValue(1)}, DL);
7689}
7690
7691// Handle 8 bit and 16 bit buffer stores
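// The data operand (Ops[1]) is bitcast to i16 if it is f16, any-extended to
// i32, and then stored with BUFFER_STORE_BYTE or BUFFER_STORE_SHORT.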
7692SDValue SITargetLowering::handleByteShortBufferStores(SelectionDAG &DAG,
7693                                                      EVT VDataType, SDLoc DL,
7694                                                      SDValue Ops[],
7695                                                      MemSDNode *M) const {
7696  if (VDataType == MVT::f16)
7697    Ops[1] = DAG.getNode(ISD::BITCAST, DL, MVT::i16, Ops[1]);
7698
7699  SDValue BufferStoreExt = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Ops[1]);
7700  Ops[1] = BufferStoreExt;
7701  unsigned Opc = (VDataType == MVT::i8) ? AMDGPUISD::BUFFER_STORE_BYTE :
7702                                 AMDGPUISD::BUFFER_STORE_SHORT;
7703  ArrayRef<SDValue> OpsRef = makeArrayRef(&Ops[0], 9);
7704  return DAG.getMemIntrinsicNode(Opc, DL, M->getVTList(), OpsRef, VDataType,
7705                                     M->getMemOperand());
7706}
7707
7708static SDValue getLoadExtOrTrunc(SelectionDAG &DAG,
7709                                 ISD::LoadExtType ExtType, SDValue Op,
7710                                 const SDLoc &SL, EVT VT) {
7711  if (VT.bitsLT(Op.getValueType()))
7712    return DAG.getNode(ISD::TRUNCATE, SL, VT, Op);
7713
7714  switch (ExtType) {
7715  case ISD::SEXTLOAD:
7716    return DAG.getNode(ISD::SIGN_EXTEND, SL, VT, Op);
7717  case ISD::ZEXTLOAD:
7718    return DAG.getNode(ISD::ZERO_EXTEND, SL, VT, Op);
7719  case ISD::EXTLOAD:
7720    return DAG.getNode(ISD::ANY_EXTEND, SL, VT, Op);
7721  case ISD::NON_EXTLOAD:
7722    return Op;
7723  }
7724
7725  llvm_unreachable("invalid ext type");
7726}
7727
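// Widen a uniform, sufficiently aligned sub-dword load from a constant (or
// invariant global) address into a full 32-bit load, then mask/extend the
// result back to the original type. Presumably this lets such loads be
// selected as dword-granularity scalar loads.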
7728SDValue SITargetLowering::widenLoad(LoadSDNode *Ld, DAGCombinerInfo &DCI) const {
7729  SelectionDAG &DAG = DCI.DAG;
7730  if (Ld->getAlignment() < 4 || Ld->isDivergent())
7731    return SDValue();
7732
7733  // FIXME: Constant loads should all be marked invariant.
7734  unsigned AS = Ld->getAddressSpace();
7735  if (AS != AMDGPUAS::CONSTANT_ADDRESS &&
7736      AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
7737      (AS != AMDGPUAS::GLOBAL_ADDRESS || !Ld->isInvariant()))
7738    return SDValue();
7739
7740  // Don't do this early, since it may interfere with adjacent load merging for
7741  // illegal types. We can avoid losing alignment information for exotic types
7742  // pre-legalize.
7743  EVT MemVT = Ld->getMemoryVT();
7744  if ((MemVT.isSimple() && !DCI.isAfterLegalizeDAG()) ||
7745      MemVT.getSizeInBits() >= 32)
7746    return SDValue();
7747
7748  SDLoc SL(Ld);
7749
7750  assert((!MemVT.isVector() || Ld->getExtensionType() == ISD::NON_EXTLOAD) &&
7751         "unexpected vector extload");
7752
7753  // TODO: Drop only high part of range.
7754  SDValue Ptr = Ld->getBasePtr();
7755  SDValue NewLoad = DAG.getLoad(ISD::UNINDEXED, ISD::NON_EXTLOAD,
7756                                MVT::i32, SL, Ld->getChain(), Ptr,
7757                                Ld->getOffset(),
7758                                Ld->getPointerInfo(), MVT::i32,
7759                                Ld->getAlignment(),
7760                                Ld->getMemOperand()->getFlags(),
7761                                Ld->getAAInfo(),
7762                                nullptr); // Drop ranges
7763
7764  EVT TruncVT = EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits());
7765  if (MemVT.isFloatingPoint()) {
7766    assert(Ld->getExtensionType() == ISD::NON_EXTLOAD &&
7767           "unexpected fp extload");
7768    TruncVT = MemVT.changeTypeToInteger();
7769  }
7770
7771  SDValue Cvt = NewLoad;
7772  if (Ld->getExtensionType() == ISD::SEXTLOAD) {
7773    Cvt = DAG.getNode(ISD::SIGN_EXTEND_INREG, SL, MVT::i32, NewLoad,
7774                      DAG.getValueType(TruncVT));
7775  } else if (Ld->getExtensionType() == ISD::ZEXTLOAD ||
7776             Ld->getExtensionType() == ISD::NON_EXTLOAD) {
7777    Cvt = DAG.getZeroExtendInReg(NewLoad, SL, TruncVT);
7778  } else {
7779    assert(Ld->getExtensionType() == ISD::EXTLOAD);
7780  }
7781
7782  EVT VT = Ld->getValueType(0);
7783  EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
7784
7785  DCI.AddToWorklist(Cvt.getNode());
7786
7787  // We may need to handle exotic cases, such as i16->i64 extloads, so insert
7788  // the appropriate extension from the 32-bit load.
7789  Cvt = getLoadExtOrTrunc(DAG, Ld->getExtensionType(), Cvt, SL, IntVT);
7790  DCI.AddToWorklist(Cvt.getNode());
7791
7792  // Handle conversion back to floating point if necessary.
7793  Cvt = DAG.getNode(ISD::BITCAST, SL, VT, Cvt);
7794
7795  return DAG.getMergeValues({ Cvt, NewLoad.getValue(1) }, SL);
7796}
7797
7798SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
7799  SDLoc DL(Op);
7800  LoadSDNode *Load = cast<LoadSDNode>(Op);
7801  ISD::LoadExtType ExtType = Load->getExtensionType();
7802  EVT MemVT = Load->getMemoryVT();
7803
7804  if (ExtType == ISD::NON_EXTLOAD && MemVT.getSizeInBits() < 32) {
7805    if (MemVT == MVT::i16 && isTypeLegal(MVT::i16))
7806      return SDValue();
7807
7808    // FIXME: Copied from PPC
7809    // First, load into 32 bits, then truncate to 1 bit.
7810
7811    SDValue Chain = Load->getChain();
7812    SDValue BasePtr = Load->getBasePtr();
7813    MachineMemOperand *MMO = Load->getMemOperand();
7814
7815    EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16;
7816
7817    SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain,
7818                                   BasePtr, RealMemVT, MMO);
7819
7820    if (!MemVT.isVector()) {
7821      SDValue Ops[] = {
7822        DAG.getNode(ISD::TRUNCATE, DL, MemVT, NewLD),
7823        NewLD.getValue(1)
7824      };
7825
7826      return DAG.getMergeValues(Ops, DL);
7827    }
7828
7829    SmallVector<SDValue, 3> Elts;
7830    for (unsigned I = 0, N = MemVT.getVectorNumElements(); I != N; ++I) {
7831      SDValue Elt = DAG.getNode(ISD::SRL, DL, MVT::i32, NewLD,
7832                                DAG.getConstant(I, DL, MVT::i32));
7833
7834      Elts.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Elt));
7835    }
7836
7837    SDValue Ops[] = {
7838      DAG.getBuildVector(MemVT, DL, Elts),
7839      NewLD.getValue(1)
7840    };
7841
7842    return DAG.getMergeValues(Ops, DL);
7843  }
7844
7845  if (!MemVT.isVector())
7846    return SDValue();
7847
7848  assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
7849         "Custom lowering for non-i32 vectors hasn't been implemented.");
7850
7851  if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
7852                                      MemVT, *Load->getMemOperand())) {
7853    SDValue Ops[2];
7854    std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG);
7855    return DAG.getMergeValues(Ops, DL);
7856  }
7857
7858  unsigned Alignment = Load->getAlignment();
7859  unsigned AS = Load->getAddressSpace();
7860  if (Subtarget->hasLDSMisalignedBug() &&
7861      AS == AMDGPUAS::FLAT_ADDRESS &&
7862      Alignment < MemVT.getStoreSize() && MemVT.getSizeInBits() > 32) {
7863    return SplitVectorLoad(Op, DAG);
7864  }
7865
7866  MachineFunction &MF = DAG.getMachineFunction();
7867  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
7868  // If there is a possibility that flat instructions access scratch memory
7869  // then we need to use the same legalization rules we use for private.
7870  if (AS == AMDGPUAS::FLAT_ADDRESS &&
7871      !Subtarget->hasMultiDwordFlatScratchAddressing())
7872    AS = MFI->hasFlatScratchInit() ?
7873         AMDGPUAS::PRIVATE_ADDRESS : AMDGPUAS::GLOBAL_ADDRESS;
7874
7875  unsigned NumElements = MemVT.getVectorNumElements();
7876
7877  if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
7878      AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
7879    if (!Op->isDivergent() && Alignment >= 4 && NumElements < 32) {
7880      if (MemVT.isPow2VectorType())
7881        return SDValue();
7882      if (NumElements == 3)
7883        return WidenVectorLoad(Op, DAG);
7884      return SplitVectorLoad(Op, DAG);
7885    }
7886    // Non-uniform loads will be selected to MUBUF instructions, so they
7887    // have the same legalization requirements as global and private
7888    // loads.
7889    //
7890  }
7891
7892  if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
7893      AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
7894      AS == AMDGPUAS::GLOBAL_ADDRESS) {
7895    if (Subtarget->getScalarizeGlobalBehavior() && !Op->isDivergent() &&
7896        Load->isSimple() && isMemOpHasNoClobberedMemOperand(Load) &&
7897        Alignment >= 4 && NumElements < 32) {
7898      if (MemVT.isPow2VectorType())
7899        return SDValue();
7900      if (NumElements == 3)
7901        return WidenVectorLoad(Op, DAG);
7902      return SplitVectorLoad(Op, DAG);
7903    }
7904    // Non-uniform loads will be selected to MUBUF instructions, so they
7905    // have the same legalization requirements as global and private
7906    // loads.
7907    //
7908  }
7909  if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
7910      AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
7911      AS == AMDGPUAS::GLOBAL_ADDRESS ||
7912      AS == AMDGPUAS::FLAT_ADDRESS) {
7913    if (NumElements > 4)
7914      return SplitVectorLoad(Op, DAG);
7915    // v3 loads not supported on SI.
7916    if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
7917      return WidenVectorLoad(Op, DAG);
7918    // v3 and v4 loads are supported for private and global memory.
7919    return SDValue();
7920  }
7921  if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
7922    // Depending on the setting of the private_element_size field in the
7923    // resource descriptor, we can only make private accesses up to a certain
7924    // size.
7925    switch (Subtarget->getMaxPrivateElementSize()) {
7926    case 4: {
7927      SDValue Ops[2];
7928      std::tie(Ops[0], Ops[1]) = scalarizeVectorLoad(Load, DAG);
7929      return DAG.getMergeValues(Ops, DL);
7930    }
7931    case 8:
7932      if (NumElements > 2)
7933        return SplitVectorLoad(Op, DAG);
7934      return SDValue();
7935    case 16:
7936      // Same as global/flat
7937      if (NumElements > 4)
7938        return SplitVectorLoad(Op, DAG);
7939      // v3 loads not supported on SI.
7940      if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
7941        return WidenVectorLoad(Op, DAG);
7942      return SDValue();
7943    default:
7944      llvm_unreachable("unsupported private_element_size");
7945    }
7946  } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
7947    // Use ds_read_b128 if possible.
7948    if (Subtarget->useDS128() && Load->getAlignment() >= 16 &&
7949        MemVT.getStoreSize() == 16)
7950      return SDValue();
7951
7952    if (NumElements > 2)
7953      return SplitVectorLoad(Op, DAG);
7954
7955    // SI has a hardware bug in the LDS / GDS bounds checking: if the base
7956    // address is negative, then the instruction is incorrectly treated as
7957    // out-of-bounds even if base + offsets is in bounds. Split vectorized
7958    // loads here to avoid emitting ds_read2_b32. We may re-combine the
7959    // load later in the SILoadStoreOptimizer.
7960    if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
7961        NumElements == 2 && MemVT.getStoreSize() == 8 &&
7962        Load->getAlignment() < 8) {
7963      return SplitVectorLoad(Op, DAG);
7964    }
7965  }
7966  return SDValue();
7967}
7968
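// Lower a 64-bit select by bitcasting the operands to v2i32 and selecting the
// low and high 32-bit halves individually, then rebuilding the 64-bit result.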
7969SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
7970  EVT VT = Op.getValueType();
7971  assert(VT.getSizeInBits() == 64);
7972
7973  SDLoc DL(Op);
7974  SDValue Cond = Op.getOperand(0);
7975
7976  SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
7977  SDValue One = DAG.getConstant(1, DL, MVT::i32);
7978
7979  SDValue LHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
7980  SDValue RHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(2));
7981
7982  SDValue Lo0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, Zero);
7983  SDValue Lo1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, Zero);
7984
7985  SDValue Lo = DAG.getSelect(DL, MVT::i32, Cond, Lo0, Lo1);
7986
7987  SDValue Hi0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, One);
7988  SDValue Hi1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, One);
7989
7990  SDValue Hi = DAG.getSelect(DL, MVT::i32, Cond, Hi0, Hi1);
7991
7992  SDValue Res = DAG.getBuildVector(MVT::v2i32, DL, {Lo, Hi});
7993  return DAG.getNode(ISD::BITCAST, DL, VT, Res);
7994}
7995
7996// Catch division cases where we can use shortcuts with rcp and rsq
7997// instructions.
7998SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
7999                                              SelectionDAG &DAG) const {
8000  SDLoc SL(Op);
8001  SDValue LHS = Op.getOperand(0);
8002  SDValue RHS = Op.getOperand(1);
8003  EVT VT = Op.getValueType();
8004  const SDNodeFlags Flags = Op->getFlags();
8005
8006  bool AllowInaccurateRcp = DAG.getTarget().Options.UnsafeFPMath ||
8007                            Flags.hasApproximateFuncs();
8008
8009  // Without !fpmath accuracy information, we can't do more because we don't
8010  // know exactly whether rcp is accurate enough to meet !fpmath requirement.
8011  if (!AllowInaccurateRcp)
8012    return SDValue();
8013
8014  if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
8015    if (CLHS->isExactlyValue(1.0)) {
8016      // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
8017      // the CI documentation they have a worst case error of 1 ulp.
8018      // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
8019      // use it as long as we aren't trying to use denormals.
8020      //
8021      // v_rcp_f16 and v_rsq_f16 DO support denormals.
8022
8023      // 1.0 / sqrt(x) -> rsq(x)
8024
8025      // XXX - Is UnsafeFPMath sufficient to do this for f64? The maximum ULP
8026      // error seems really high at 2^29 ULP.
8027      if (RHS.getOpcode() == ISD::FSQRT)
8028        return DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0));
8029
8030      // 1.0 / x -> rcp(x)
8031      return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
8032    }
8033
8034    // Same as for 1.0, but expand the sign out of the constant.
8035    if (CLHS->isExactlyValue(-1.0)) {
8036      // -1.0 / x -> rcp (fneg x)
8037      SDValue FNegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
8038      return DAG.getNode(AMDGPUISD::RCP, SL, VT, FNegRHS);
8039    }
8040  }
8041
8042  // Turn into multiply by the reciprocal.
8043  // x / y -> x * (1.0 / y)
8044  SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
8045  return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, Flags);
8046}
8047
8048static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
8049                          EVT VT, SDValue A, SDValue B, SDValue GlueChain,
8050                          SDNodeFlags Flags) {
8051  if (GlueChain->getNumValues() <= 1) {
8052    return DAG.getNode(Opcode, SL, VT, A, B, Flags);
8053  }
8054
8055  assert(GlueChain->getNumValues() == 3);
8056
8057  SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
8058  switch (Opcode) {
8059  default: llvm_unreachable("no chain equivalent for opcode");
8060  case ISD::FMUL:
8061    Opcode = AMDGPUISD::FMUL_W_CHAIN;
8062    break;
8063  }
8064
8065  return DAG.getNode(Opcode, SL, VTList,
8066                     {GlueChain.getValue(1), A, B, GlueChain.getValue(2)},
8067                     Flags);
8068}
8069
8070static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
8071                           EVT VT, SDValue A, SDValue B, SDValue C,
8072                           SDValue GlueChain, SDNodeFlags Flags) {
8073  if (GlueChain->getNumValues() <= 1) {
8074    return DAG.getNode(Opcode, SL, VT, {A, B, C}, Flags);
8075  }
8076
8077  assert(GlueChain->getNumValues() == 3);
8078
8079  SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
8080  switch (Opcode) {
8081  default: llvm_unreachable("no chain equivalent for opcode");
8082  case ISD::FMA:
8083    Opcode = AMDGPUISD::FMA_W_CHAIN;
8084    break;
8085  }
8086
8087  return DAG.getNode(Opcode, SL, VTList,
8088                     {GlueChain.getValue(1), A, B, C, GlueChain.getValue(2)},
8089                     Flags);
8090}
8091
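// f16 division, when the fast path above does not apply, is done in f32: both
// operands are extended, an rcp-based quotient is computed and rounded back to
// f16, and DIV_FIXUP presumably patches up the remaining special cases.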
8092SDValue SITargetLowering::LowerFDIV16(SDValue Op, SelectionDAG &DAG) const {
8093  if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
8094    return FastLowered;
8095
8096  SDLoc SL(Op);
8097  SDValue Src0 = Op.getOperand(0);
8098  SDValue Src1 = Op.getOperand(1);
8099
8100  SDValue CvtSrc0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0);
8101  SDValue CvtSrc1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1);
8102
8103  SDValue RcpSrc1 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, CvtSrc1);
8104  SDValue Quot = DAG.getNode(ISD::FMUL, SL, MVT::f32, CvtSrc0, RcpSrc1);
8105
8106  SDValue FPRoundFlag = DAG.getTargetConstant(0, SL, MVT::i32);
8107  SDValue BestQuot = DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Quot, FPRoundFlag);
8108
8109  return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f16, BestQuot, Src1, Src0);
8110}
8111
8112// Faster 2.5 ULP division that does not support denormals.
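// If |RHS| is larger than 2^+96 (0x6f800000 below), the denominator is scaled
// down by 2^-32 (0x2f800000) before taking the reciprocal, and the quotient is
// multiplied by the same factor afterwards to compensate; presumably this
// keeps the intermediate reciprocal and product away from the denormal range
// this expansion cannot handle.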
8113SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const {
8114  SDLoc SL(Op);
8115  SDValue LHS = Op.getOperand(1);
8116  SDValue RHS = Op.getOperand(2);
8117
8118  SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS);
8119
8120  const APFloat K0Val(BitsToFloat(0x6f800000));
8121  const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32);
8122
8123  const APFloat K1Val(BitsToFloat(0x2f800000));
8124  const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32);
8125
8126  const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
8127
8128  EVT SetCCVT =
8129    getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32);
8130
8131  SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT);
8132
8133  SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One);
8134
8135  // TODO: Should this propagate fast-math-flags?
8136  r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3);
8137
8138  // rcp does not support denormals.
8139  SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1);
8140
8141  SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0);
8142
8143  return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul);
8144}
8145
8146// Returns immediate value for setting the F32 denorm mode when using the
8147// S_DENORM_MODE instruction.
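// The low two bits of the immediate hold the requested f32 denormal mode;
// bits 3:2 keep the f64/f16 mode at the function's existing default.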
8148static const SDValue getSPDenormModeValue(int SPDenormMode, SelectionDAG &DAG,
8149                                          const SDLoc &SL, const GCNSubtarget *ST) {
8150  assert(ST->hasDenormModeInst() && "Requires S_DENORM_MODE");
8151  int DPDenormModeDefault = hasFP64FP16Denormals(DAG.getMachineFunction())
8152                                ? FP_DENORM_FLUSH_NONE
8153                                : FP_DENORM_FLUSH_IN_FLUSH_OUT;
8154
8155  int Mode = SPDenormMode | (DPDenormModeDefault << 2);
8156  return DAG.getTargetConstant(Mode, SL, MVT::i32);
8157}
8158
8159SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
8160  if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
8161    return FastLowered;
8162
8163  // The selection matcher assumes anything with a chain selects to a
8164  // mayRaiseFPException machine instruction. Since we're introducing a chain
8165  // here, we need to explicitly report nofpexcept for the regular fdiv
8166  // lowering.
8167  SDNodeFlags Flags = Op->getFlags();
8168  Flags.setNoFPExcept(true);
8169
8170  SDLoc SL(Op);
8171  SDValue LHS = Op.getOperand(0);
8172  SDValue RHS = Op.getOperand(1);
8173
8174  const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
8175
8176  SDVTList ScaleVT = DAG.getVTList(MVT::f32, MVT::i1);
8177
8178  SDValue DenominatorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT,
8179                                          {RHS, RHS, LHS}, Flags);
8180  SDValue NumeratorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT,
8181                                        {LHS, RHS, LHS}, Flags);
8182
8183  // Denominator is scaled to not be denormal, so using rcp is ok.
8184  SDValue ApproxRcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32,
8185                                  DenominatorScaled, Flags);
8186  SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f32,
8187                                     DenominatorScaled, Flags);
8188
8189  const unsigned Denorm32Reg = AMDGPU::Hwreg::ID_MODE |
8190                               (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
8191                               (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
8192  const SDValue BitField = DAG.getTargetConstant(Denorm32Reg, SL, MVT::i32);
8193
8194  const bool HasFP32Denormals = hasFP32Denormals(DAG.getMachineFunction());
8195
8196  if (!HasFP32Denormals) {
8197    // Note we can't use the STRICT_FMA/STRICT_FMUL for the non-strict FDIV
8198    // lowering. The chain dependence is insufficient, and we need glue. We do
8199    // not need the glue variants in a strictfp function.
8200
8201    SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
8202
8203    SDNode *EnableDenorm;
8204    if (Subtarget->hasDenormModeInst()) {
8205      const SDValue EnableDenormValue =
8206          getSPDenormModeValue(FP_DENORM_FLUSH_NONE, DAG, SL, Subtarget);
8207
8208      EnableDenorm = DAG.getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs,
8209                                 DAG.getEntryNode(), EnableDenormValue).getNode();
8210    } else {
8211      const SDValue EnableDenormValue = DAG.getConstant(FP_DENORM_FLUSH_NONE,
8212                                                        SL, MVT::i32);
8213      EnableDenorm =
8214          DAG.getMachineNode(AMDGPU::S_SETREG_B32, SL, BindParamVTs,
8215                             {EnableDenormValue, BitField, DAG.getEntryNode()});
8216    }
8217
8218    SDValue Ops[3] = {
8219      NegDivScale0,
8220      SDValue(EnableDenorm, 0),
8221      SDValue(EnableDenorm, 1)
8222    };
8223
8224    NegDivScale0 = DAG.getMergeValues(Ops, SL);
8225  }
8226
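  // Newton-Raphson refinement on the scaled operands, with
  // n = NumeratorScaled, d = DenominatorScaled and r = ApproxRcp:
  //   Fma0 = 1 - d * r         (error of the initial estimate)
  //   Fma1 = r + r * Fma0      (refined reciprocal)
  //   Mul  = n * Fma1          (initial quotient)
  //   Fma2 = n - d * Mul       (residual)
  //   Fma3 = Mul + Fma1 * Fma2 (refined quotient)
  //   Fma4 = n - d * Fma3      (final residual, consumed by DIV_FMAS)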
8227  SDValue Fma0 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0,
8228                             ApproxRcp, One, NegDivScale0, Flags);
8229
8230  SDValue Fma1 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma0, ApproxRcp,
8231                             ApproxRcp, Fma0, Flags);
8232
8233  SDValue Mul = getFPBinOp(DAG, ISD::FMUL, SL, MVT::f32, NumeratorScaled,
8234                           Fma1, Fma1, Flags);
8235
8236  SDValue Fma2 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Mul,
8237                             NumeratorScaled, Mul, Flags);
8238
8239  SDValue Fma3 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32,
8240                             Fma2, Fma1, Mul, Fma2, Flags);
8241
8242  SDValue Fma4 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3,
8243                             NumeratorScaled, Fma3, Flags);
8244
8245  if (!HasFP32Denormals) {
8246    SDNode *DisableDenorm;
8247    if (Subtarget->hasDenormModeInst()) {
8248      const SDValue DisableDenormValue =
8249          getSPDenormModeValue(FP_DENORM_FLUSH_IN_FLUSH_OUT, DAG, SL, Subtarget);
8250
8251      DisableDenorm = DAG.getNode(AMDGPUISD::DENORM_MODE, SL, MVT::Other,
8252                                  Fma4.getValue(1), DisableDenormValue,
8253                                  Fma4.getValue(2)).getNode();
8254    } else {
8255      const SDValue DisableDenormValue =
8256          DAG.getConstant(FP_DENORM_FLUSH_IN_FLUSH_OUT, SL, MVT::i32);
8257
8258      DisableDenorm = DAG.getMachineNode(
8259          AMDGPU::S_SETREG_B32, SL, MVT::Other,
8260          {DisableDenormValue, BitField, Fma4.getValue(1), Fma4.getValue(2)});
8261    }
8262
8263    SDValue OutputChain = DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
8264                                      SDValue(DisableDenorm, 0), DAG.getRoot());
8265    DAG.setRoot(OutputChain);
8266  }
8267
8268  SDValue Scale = NumeratorScaled.getValue(1);
8269  SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f32,
8270                             {Fma4, Fma1, Fma3, Scale}, Flags);
8271
8272  return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f32, Fmas, RHS, LHS, Flags);
8273}
8274
8275SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {
8276  if (DAG.getTarget().Options.UnsafeFPMath)
8277    return lowerFastUnsafeFDIV(Op, DAG);
8278
8279  SDLoc SL(Op);
8280  SDValue X = Op.getOperand(0);
8281  SDValue Y = Op.getOperand(1);
8282
8283  const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
8284
8285  SDVTList ScaleVT = DAG.getVTList(MVT::f64, MVT::i1);
8286
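  // Same Newton-Raphson scheme as the f32 path, but in f64 and with an extra
  // refinement of the reciprocal, where d = DivScale0 (scaled Y) and
  // n = DivScale1 (scaled X):
  //   Fma0 = 1 - d * Rcp
  //   Fma1 = Rcp + Rcp * Fma0
  //   Fma2 = 1 - d * Fma1
  //   Fma3 = Fma1 + Fma1 * Fma2
  //   Mul  = n * Fma3
  //   Fma4 = n - d * Mul       (consumed by DIV_FMAS)
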
8287  SDValue DivScale0 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, Y, Y, X);
8288
8289  SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f64, DivScale0);
8290
8291  SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f64, DivScale0);
8292
8293  SDValue Fma0 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Rcp, One);
8294
8295  SDValue Fma1 = DAG.getNode(ISD::FMA, SL, MVT::f64, Rcp, Fma0, Rcp);
8296
8297  SDValue Fma2 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Fma1, One);
8298
8299  SDValue DivScale1 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, X, Y, X);
8300
8301  SDValue Fma3 = DAG.getNode(ISD::FMA, SL, MVT::f64, Fma1, Fma2, Fma1);
8302  SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, DivScale1, Fma3);
8303
8304  SDValue Fma4 = DAG.getNode(ISD::FMA, SL, MVT::f64,
8305                             NegDivScale0, Mul, DivScale1);
8306
8307  SDValue Scale;
8308
8309  if (!Subtarget->hasUsableDivScaleConditionOutput()) {
    // Work around a hardware bug on SI where the condition output from
    // div_scale is not usable.
8312
8313    const SDValue Hi = DAG.getConstant(1, SL, MVT::i32);
8314
    // Figure out which scale to use for div_fmas.
8316    SDValue NumBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X);
8317    SDValue DenBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Y);
8318    SDValue Scale0BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale0);
8319    SDValue Scale1BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale1);
8320
8321    SDValue NumHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, NumBC, Hi);
8322    SDValue DenHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, DenBC, Hi);
8323
8324    SDValue Scale0Hi
8325      = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale0BC, Hi);
8326    SDValue Scale1Hi
8327      = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale1BC, Hi);
8328
8329    SDValue CmpDen = DAG.getSetCC(SL, MVT::i1, DenHi, Scale0Hi, ISD::SETEQ);
8330    SDValue CmpNum = DAG.getSetCC(SL, MVT::i1, NumHi, Scale1Hi, ISD::SETEQ);
8331    Scale = DAG.getNode(ISD::XOR, SL, MVT::i1, CmpNum, CmpDen);
8332  } else {
8333    Scale = DivScale1.getValue(1);
8334  }
8335
8336  SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f64,
8337                             Fma4, Fma3, Mul, Scale);
8338
8339  return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f64, Fmas, Y, X);
8340}
8341
8342SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const {
8343  EVT VT = Op.getValueType();
8344
8345  if (VT == MVT::f32)
8346    return LowerFDIV32(Op, DAG);
8347
8348  if (VT == MVT::f64)
8349    return LowerFDIV64(Op, DAG);
8350
8351  if (VT == MVT::f16)
8352    return LowerFDIV16(Op, DAG);
8353
8354  llvm_unreachable("Unexpected type for fdiv");
8355}
8356
8357SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
8358  SDLoc DL(Op);
8359  StoreSDNode *Store = cast<StoreSDNode>(Op);
8360  EVT VT = Store->getMemoryVT();
8361
8362  if (VT == MVT::i1) {
8363    return DAG.getTruncStore(Store->getChain(), DL,
8364       DAG.getSExtOrTrunc(Store->getValue(), DL, MVT::i32),
8365       Store->getBasePtr(), MVT::i1, Store->getMemOperand());
8366  }
8367
8368  assert(VT.isVector() &&
8369         Store->getValue().getValueType().getScalarType() == MVT::i32);
8370
8371  if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
8372                                      VT, *Store->getMemOperand())) {
8373    return expandUnalignedStore(Store, DAG);
8374  }
8375
8376  unsigned AS = Store->getAddressSpace();
8377  if (Subtarget->hasLDSMisalignedBug() &&
8378      AS == AMDGPUAS::FLAT_ADDRESS &&
8379      Store->getAlignment() < VT.getStoreSize() && VT.getSizeInBits() > 32) {
8380    return SplitVectorStore(Op, DAG);
8381  }
8382
8383  MachineFunction &MF = DAG.getMachineFunction();
8384  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  // If there is a possibility that flat instructions access scratch memory
8386  // then we need to use the same legalization rules we use for private.
8387  if (AS == AMDGPUAS::FLAT_ADDRESS &&
8388      !Subtarget->hasMultiDwordFlatScratchAddressing())
8389    AS = MFI->hasFlatScratchInit() ?
8390         AMDGPUAS::PRIVATE_ADDRESS : AMDGPUAS::GLOBAL_ADDRESS;
8391
8392  unsigned NumElements = VT.getVectorNumElements();
8393  if (AS == AMDGPUAS::GLOBAL_ADDRESS ||
8394      AS == AMDGPUAS::FLAT_ADDRESS) {
8395    if (NumElements > 4)
8396      return SplitVectorStore(Op, DAG);
8397    // v3 stores not supported on SI.
8398    if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
8399      return SplitVectorStore(Op, DAG);
8400    return SDValue();
8401  } else if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
8402    switch (Subtarget->getMaxPrivateElementSize()) {
8403    case 4:
8404      return scalarizeVectorStore(Store, DAG);
8405    case 8:
8406      if (NumElements > 2)
8407        return SplitVectorStore(Op, DAG);
8408      return SDValue();
8409    case 16:
8410      if (NumElements > 4 || NumElements == 3)
8411        return SplitVectorStore(Op, DAG);
8412      return SDValue();
8413    default:
8414      llvm_unreachable("unsupported private_element_size");
8415    }
8416  } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
8417    // Use ds_write_b128 if possible.
8418    if (Subtarget->useDS128() && Store->getAlignment() >= 16 &&
8419        VT.getStoreSize() == 16 && NumElements != 3)
8420      return SDValue();
8421
8422    if (NumElements > 2)
8423      return SplitVectorStore(Op, DAG);
8424
    // SI has a hardware bug in the LDS / GDS bounds checking: if the base
    // address is negative, then the instruction is incorrectly treated as
    // out-of-bounds even if base + offset is in bounds. Split vectorized
8428    // stores here to avoid emitting ds_write2_b32. We may re-combine the
8429    // store later in the SILoadStoreOptimizer.
8430    if (!Subtarget->hasUsableDSOffset() &&
8431        NumElements == 2 && VT.getStoreSize() == 8 &&
8432        Store->getAlignment() < 8) {
8433      return SplitVectorStore(Op, DAG);
8434    }
8435
8436    return SDValue();
8437  } else {
8438    llvm_unreachable("unhandled address space");
8439  }
8440}
8441
8442SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
8443  SDLoc DL(Op);
8444  EVT VT = Op.getValueType();
8445  SDValue Arg = Op.getOperand(0);
8446  SDValue TrigVal;
8447
8448  // Propagate fast-math flags so that the multiply we introduce can be folded
8449  // if Arg is already the result of a multiply by constant.
8450  auto Flags = Op->getFlags();
8451
8452  SDValue OneOver2Pi = DAG.getConstantFP(0.5 * numbers::inv_pi, DL, VT);
8453
8454  if (Subtarget->hasTrigReducedRange()) {
8455    SDValue MulVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi, Flags);
8456    TrigVal = DAG.getNode(AMDGPUISD::FRACT, DL, VT, MulVal, Flags);
8457  } else {
8458    TrigVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi, Flags);
8459  }
8460
8461  switch (Op.getOpcode()) {
8462  case ISD::FCOS:
8463    return DAG.getNode(AMDGPUISD::COS_HW, SDLoc(Op), VT, TrigVal, Flags);
8464  case ISD::FSIN:
8465    return DAG.getNode(AMDGPUISD::SIN_HW, SDLoc(Op), VT, TrigVal, Flags);
8466  default:
8467    llvm_unreachable("Wrong trig opcode");
8468  }
8469}
8470
8471SDValue SITargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const {
8472  AtomicSDNode *AtomicNode = cast<AtomicSDNode>(Op);
8473  assert(AtomicNode->isCompareAndSwap());
8474  unsigned AS = AtomicNode->getAddressSpace();
8475
8476  // No custom lowering required for local address space
8477  if (!isFlatGlobalAddrSpace(AS))
8478    return Op;
8479
  // Non-local address spaces require custom lowering for atomic compare and
  // swap; the compare and swap values are packed into a v2i32 (or v2i64 for
  // the _X2 variants).
8482  SDLoc DL(Op);
8483  SDValue ChainIn = Op.getOperand(0);
8484  SDValue Addr = Op.getOperand(1);
8485  SDValue Old = Op.getOperand(2);
8486  SDValue New = Op.getOperand(3);
8487  EVT VT = Op.getValueType();
8488  MVT SimpleVT = VT.getSimpleVT();
8489  MVT VecType = MVT::getVectorVT(SimpleVT, 2);
8490
8491  SDValue NewOld = DAG.getBuildVector(VecType, DL, {New, Old});
8492  SDValue Ops[] = { ChainIn, Addr, NewOld };
8493
8494  return DAG.getMemIntrinsicNode(AMDGPUISD::ATOMIC_CMP_SWAP, DL, Op->getVTList(),
8495                                 Ops, VT, AtomicNode->getMemOperand());
8496}
8497
8498//===----------------------------------------------------------------------===//
8499// Custom DAG optimizations
8500//===----------------------------------------------------------------------===//
8501
8502SDValue SITargetLowering::performUCharToFloatCombine(SDNode *N,
8503                                                     DAGCombinerInfo &DCI) const {
8504  EVT VT = N->getValueType(0);
8505  EVT ScalarVT = VT.getScalarType();
8506  if (ScalarVT != MVT::f32 && ScalarVT != MVT::f16)
8507    return SDValue();
8508
8509  SelectionDAG &DAG = DCI.DAG;
8510  SDLoc DL(N);
8511
8512  SDValue Src = N->getOperand(0);
8513  EVT SrcVT = Src.getValueType();
8514
8515  // TODO: We could try to match extracting the higher bytes, which would be
8516  // easier if i8 vectors weren't promoted to i32 vectors, particularly after
8517  // types are legalized. v4i8 -> v4f32 is probably the only case to worry
8518  // about in practice.
8519  if (DCI.isAfterLegalizeDAG() && SrcVT == MVT::i32) {
8520    if (DAG.MaskedValueIsZero(Src, APInt::getHighBitsSet(32, 24))) {
8521      SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, MVT::f32, Src);
8522      DCI.AddToWorklist(Cvt.getNode());
8523
8524      // For the f16 case, fold to a cast to f32 and then cast back to f16.
8525      if (ScalarVT != MVT::f32) {
8526        Cvt = DAG.getNode(ISD::FP_ROUND, DL, VT, Cvt,
8527                          DAG.getTargetConstant(0, DL, MVT::i32));
8528      }
8529      return Cvt;
8530    }
8531  }
8532
8533  return SDValue();
8534}
8535
8536// (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2)
8537
8538// This is a variant of
8539// (mul (add x, c1), c2) -> add (mul x, c2), (mul c1, c2),
8540//
// The normal DAG combiner will do this, but only if the add has one use,
// since doing it with multiple uses would increase the number of instructions.
8543//
8544// This prevents us from seeing a constant offset that can be folded into a
8545// memory instruction's addressing mode. If we know the resulting add offset of
8546// a pointer can be folded into an addressing offset, we can replace the pointer
// operand with the add of the new constant offset. This eliminates one of
// the uses, and may allow the remaining use to also be simplified.
8549//
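// For example, with (load (shl (add x, 16), 2)) where the add has other uses,
// this produces (load (add (shl x, 2), 64)), and the constant 64 can then be
// folded into the load's addressing-mode offset.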
8550SDValue SITargetLowering::performSHLPtrCombine(SDNode *N,
8551                                               unsigned AddrSpace,
8552                                               EVT MemVT,
8553                                               DAGCombinerInfo &DCI) const {
8554  SDValue N0 = N->getOperand(0);
8555  SDValue N1 = N->getOperand(1);
8556
8557  // We only do this to handle cases where it's profitable when there are
8558  // multiple uses of the add, so defer to the standard combine.
8559  if ((N0.getOpcode() != ISD::ADD && N0.getOpcode() != ISD::OR) ||
8560      N0->hasOneUse())
8561    return SDValue();
8562
8563  const ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(N1);
8564  if (!CN1)
8565    return SDValue();
8566
8567  const ConstantSDNode *CAdd = dyn_cast<ConstantSDNode>(N0.getOperand(1));
8568  if (!CAdd)
8569    return SDValue();
8570
8571  // If the resulting offset is too large, we can't fold it into the addressing
8572  // mode offset.
8573  APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue();
8574  Type *Ty = MemVT.getTypeForEVT(*DCI.DAG.getContext());
8575
8576  AddrMode AM;
8577  AM.HasBaseReg = true;
8578  AM.BaseOffs = Offset.getSExtValue();
8579  if (!isLegalAddressingMode(DCI.DAG.getDataLayout(), AM, Ty, AddrSpace))
8580    return SDValue();
8581
8582  SelectionDAG &DAG = DCI.DAG;
8583  SDLoc SL(N);
8584  EVT VT = N->getValueType(0);
8585
8586  SDValue ShlX = DAG.getNode(ISD::SHL, SL, VT, N0.getOperand(0), N1);
8587  SDValue COffset = DAG.getConstant(Offset, SL, MVT::i32);
8588
8589  SDNodeFlags Flags;
8590  Flags.setNoUnsignedWrap(N->getFlags().hasNoUnsignedWrap() &&
8591                          (N0.getOpcode() == ISD::OR ||
8592                           N0->getFlags().hasNoUnsignedWrap()));
8593
8594  return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset, Flags);
8595}
8596
8597SDValue SITargetLowering::performMemSDNodeCombine(MemSDNode *N,
8598                                                  DAGCombinerInfo &DCI) const {
8599  SDValue Ptr = N->getBasePtr();
8600  SelectionDAG &DAG = DCI.DAG;
8601  SDLoc SL(N);
8602
8603  // TODO: We could also do this for multiplies.
8604  if (Ptr.getOpcode() == ISD::SHL) {
    SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), N->getAddressSpace(),
8606                                          N->getMemoryVT(), DCI);
8607    if (NewPtr) {
8608      SmallVector<SDValue, 8> NewOps(N->op_begin(), N->op_end());
8609
8610      NewOps[N->getOpcode() == ISD::STORE ? 2 : 1] = NewPtr;
8611      return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
8612    }
8613  }
8614
8615  return SDValue();
8616}
8617
8618static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val) {
8619  return (Opc == ISD::AND && (Val == 0 || Val == 0xffffffff)) ||
8620         (Opc == ISD::OR && (Val == 0xffffffff || Val == 0)) ||
8621         (Opc == ISD::XOR && Val == 0);
8622}
8623
// Break up a 64-bit bitwise operation with a constant into two 32-bit
// and/or/xor operations. This will typically happen anyway for a VALU 64-bit
// op. This exposes other 32-bit integer combine opportunities since most
// 64-bit operations are decomposed this way.
// TODO: We won't want this for SALU especially if it is an inline immediate.
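// For example, splitting (xor i64:x, 0xffffffff00000000) needs only a 32-bit
// xor of the high half, since xor of the low half with 0 folds away.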
8629SDValue SITargetLowering::splitBinaryBitConstantOp(
8630  DAGCombinerInfo &DCI,
8631  const SDLoc &SL,
8632  unsigned Opc, SDValue LHS,
8633  const ConstantSDNode *CRHS) const {
8634  uint64_t Val = CRHS->getZExtValue();
8635  uint32_t ValLo = Lo_32(Val);
8636  uint32_t ValHi = Hi_32(Val);
8637  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
8638
  if ((bitOpWithConstantIsReducible(Opc, ValLo) ||
       bitOpWithConstantIsReducible(Opc, ValHi)) ||
      (CRHS->hasOneUse() && !TII->isInlineConstant(CRHS->getAPIntValue()))) {
8642    // If we need to materialize a 64-bit immediate, it will be split up later
8643    // anyway. Avoid creating the harder to understand 64-bit immediate
8644    // materialization.
8645    return splitBinaryBitConstantOpImpl(DCI, SL, Opc, LHS, ValLo, ValHi);
8646  }
8647
8648  return SDValue();
8649}
8650
// Returns true if the argument is a boolean value which is not serialized into
// memory or an argument and does not require v_cndmask_b32 to be deserialized.
8653static bool isBoolSGPR(SDValue V) {
8654  if (V.getValueType() != MVT::i1)
8655    return false;
8656  switch (V.getOpcode()) {
8657  default: break;
8658  case ISD::SETCC:
8659  case ISD::AND:
8660  case ISD::OR:
8661  case ISD::XOR:
8662  case AMDGPUISD::FP_CLASS:
8663    return true;
8664  }
8665  return false;
8666}
8667
8668// If a constant has all zeroes or all ones within each byte return it.
8669// Otherwise return 0.
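// For example, 0x00ff00ff is returned unchanged, while 0x00ff00f0 returns 0
// because its low byte is only partially selected.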
8670static uint32_t getConstantPermuteMask(uint32_t C) {
8671  // 0xff for any zero byte in the mask
8672  uint32_t ZeroByteMask = 0;
8673  if (!(C & 0x000000ff)) ZeroByteMask |= 0x000000ff;
8674  if (!(C & 0x0000ff00)) ZeroByteMask |= 0x0000ff00;
8675  if (!(C & 0x00ff0000)) ZeroByteMask |= 0x00ff0000;
8676  if (!(C & 0xff000000)) ZeroByteMask |= 0xff000000;
8677  uint32_t NonZeroByteMask = ~ZeroByteMask; // 0xff for any non-zero byte
8678  if ((NonZeroByteMask & C) != NonZeroByteMask)
8679    return 0; // Partial bytes selected.
8680  return C;
8681}
8682
8683// Check if a node selects whole bytes from its operand 0 starting at a byte
// boundary while masking the rest. Returns the select mask as used in
// v_perm_b32, or ~0 if it did not succeed.
8686// Note byte select encoding:
8687// value 0-3 selects corresponding source byte;
8688// value 0xc selects zero;
8689// value 0xff selects 0xff.
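// Examples (mask values follow from the cases below):
//   (and x, 0x0000ffff) -> 0x0c0c0100 (low two bytes of x, upper bytes zero)
//   (shl x, 16)         -> 0x01000c0c (bytes 0-1 of x moved into bytes 2-3)
//   (srl x, 16)         -> 0x0c0c0302 (bytes 2-3 of x moved into bytes 0-1)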
8690static uint32_t getPermuteMask(SelectionDAG &DAG, SDValue V) {
8691  assert(V.getValueSizeInBits() == 32);
8692
8693  if (V.getNumOperands() != 2)
8694    return ~0;
8695
8696  ConstantSDNode *N1 = dyn_cast<ConstantSDNode>(V.getOperand(1));
8697  if (!N1)
8698    return ~0;
8699
8700  uint32_t C = N1->getZExtValue();
8701
8702  switch (V.getOpcode()) {
8703  default:
8704    break;
8705  case ISD::AND:
8706    if (uint32_t ConstMask = getConstantPermuteMask(C)) {
8707      return (0x03020100 & ConstMask) | (0x0c0c0c0c & ~ConstMask);
8708    }
8709    break;
8710
8711  case ISD::OR:
8712    if (uint32_t ConstMask = getConstantPermuteMask(C)) {
8713      return (0x03020100 & ~ConstMask) | ConstMask;
8714    }
8715    break;
8716
8717  case ISD::SHL:
8718    if (C % 8)
8719      return ~0;
8720
8721    return uint32_t((0x030201000c0c0c0cull << C) >> 32);
8722
8723  case ISD::SRL:
8724    if (C % 8)
8725      return ~0;
8726
8727    return uint32_t(0x0c0c0c0c03020100ull >> C);
8728  }
8729
8730  return ~0;
8731}
8732
8733SDValue SITargetLowering::performAndCombine(SDNode *N,
8734                                            DAGCombinerInfo &DCI) const {
8735  if (DCI.isBeforeLegalize())
8736    return SDValue();
8737
8738  SelectionDAG &DAG = DCI.DAG;
8739  EVT VT = N->getValueType(0);
8740  SDValue LHS = N->getOperand(0);
8741  SDValue RHS = N->getOperand(1);
8742
8744  const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
8745  if (VT == MVT::i64 && CRHS) {
8746    if (SDValue Split
8747        = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::AND, LHS, CRHS))
8748      return Split;
8749  }
8750
8751  if (CRHS && VT == MVT::i32) {
8752    // and (srl x, c), mask => shl (bfe x, nb + c, mask >> nb), nb
8753    // nb = number of trailing zeroes in mask
8754    // It can be optimized out using SDWA for GFX8+ in the SDWA peephole pass,
8755    // given that we are selecting 8 or 16 bit fields starting at byte boundary.
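    // For example, (and (srl x, 8), 0xff00) has Bits = 8, Shift = 8 and
    // NB = 8, so Offset = 16 and the result is (shl (bfe x, 16, 8), 8).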
8756    uint64_t Mask = CRHS->getZExtValue();
8757    unsigned Bits = countPopulation(Mask);
8758    if (getSubtarget()->hasSDWA() && LHS->getOpcode() == ISD::SRL &&
8759        (Bits == 8 || Bits == 16) && isShiftedMask_64(Mask) && !(Mask & 1)) {
8760      if (auto *CShift = dyn_cast<ConstantSDNode>(LHS->getOperand(1))) {
8761        unsigned Shift = CShift->getZExtValue();
8762        unsigned NB = CRHS->getAPIntValue().countTrailingZeros();
8763        unsigned Offset = NB + Shift;
8764        if ((Offset & (Bits - 1)) == 0) { // Starts at a byte or word boundary.
8765          SDLoc SL(N);
8766          SDValue BFE = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
8767                                    LHS->getOperand(0),
8768                                    DAG.getConstant(Offset, SL, MVT::i32),
8769                                    DAG.getConstant(Bits, SL, MVT::i32));
8770          EVT NarrowVT = EVT::getIntegerVT(*DAG.getContext(), Bits);
8771          SDValue Ext = DAG.getNode(ISD::AssertZext, SL, VT, BFE,
8772                                    DAG.getValueType(NarrowVT));
8773          SDValue Shl = DAG.getNode(ISD::SHL, SDLoc(LHS), VT, Ext,
8774                                    DAG.getConstant(NB, SDLoc(CRHS), MVT::i32));
8775          return Shl;
8776        }
8777      }
8778    }
8779
8780    // and (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
8781    if (LHS.hasOneUse() && LHS.getOpcode() == AMDGPUISD::PERM &&
8782        isa<ConstantSDNode>(LHS.getOperand(2))) {
8783      uint32_t Sel = getConstantPermuteMask(Mask);
8784      if (!Sel)
8785        return SDValue();
8786
8787      // Select 0xc for all zero bytes
8788      Sel = (LHS.getConstantOperandVal(2) & Sel) | (~Sel & 0x0c0c0c0c);
8789      SDLoc DL(N);
8790      return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
8791                         LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
8792    }
8793  }
8794
8795  // (and (fcmp ord x, x), (fcmp une (fabs x), inf)) ->
8796  // fp_class x, ~(s_nan | q_nan | n_infinity | p_infinity)
8797  if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == ISD::SETCC) {
8798    ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
8799    ISD::CondCode RCC = cast<CondCodeSDNode>(RHS.getOperand(2))->get();
8800
8801    SDValue X = LHS.getOperand(0);
8802    SDValue Y = RHS.getOperand(0);
8803    if (Y.getOpcode() != ISD::FABS || Y.getOperand(0) != X)
8804      return SDValue();
8805
8806    if (LCC == ISD::SETO) {
8807      if (X != LHS.getOperand(1))
8808        return SDValue();
8809
8810      if (RCC == ISD::SETUNE) {
8811        const ConstantFPSDNode *C1 = dyn_cast<ConstantFPSDNode>(RHS.getOperand(1));
8812        if (!C1 || !C1->isInfinity() || C1->isNegative())
8813          return SDValue();
8814
8815        const uint32_t Mask = SIInstrFlags::N_NORMAL |
8816                              SIInstrFlags::N_SUBNORMAL |
8817                              SIInstrFlags::N_ZERO |
8818                              SIInstrFlags::P_ZERO |
8819                              SIInstrFlags::P_SUBNORMAL |
8820                              SIInstrFlags::P_NORMAL;
8821
8822        static_assert(((~(SIInstrFlags::S_NAN |
8823                          SIInstrFlags::Q_NAN |
8824                          SIInstrFlags::N_INFINITY |
8825                          SIInstrFlags::P_INFINITY)) & 0x3ff) == Mask,
8826                      "mask not equal");
8827
8828        SDLoc DL(N);
8829        return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1,
8830                           X, DAG.getConstant(Mask, DL, MVT::i32));
8831      }
8832    }
8833  }
8834
8835  if (RHS.getOpcode() == ISD::SETCC && LHS.getOpcode() == AMDGPUISD::FP_CLASS)
8836    std::swap(LHS, RHS);
8837
8838  if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == AMDGPUISD::FP_CLASS &&
8839      RHS.hasOneUse()) {
8840    ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
8841    // and (fcmp seto), (fp_class x, mask) -> fp_class x, mask & ~(p_nan | n_nan)
8842    // and (fcmp setuo), (fp_class x, mask) -> fp_class x, mask & (p_nan | n_nan)
8843    const ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
8844    if ((LCC == ISD::SETO || LCC == ISD::SETUO) && Mask &&
8845        (RHS.getOperand(0) == LHS.getOperand(0) &&
8846         LHS.getOperand(0) == LHS.getOperand(1))) {
8847      const unsigned OrdMask = SIInstrFlags::S_NAN | SIInstrFlags::Q_NAN;
8848      unsigned NewMask = LCC == ISD::SETO ?
8849        Mask->getZExtValue() & ~OrdMask :
8850        Mask->getZExtValue() & OrdMask;
8851
8852      SDLoc DL(N);
8853      return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, RHS.getOperand(0),
8854                         DAG.getConstant(NewMask, DL, MVT::i32));
8855    }
8856  }
8857
8858  if (VT == MVT::i32 &&
8859      (RHS.getOpcode() == ISD::SIGN_EXTEND || LHS.getOpcode() == ISD::SIGN_EXTEND)) {
8860    // and x, (sext cc from i1) => select cc, x, 0
8861    if (RHS.getOpcode() != ISD::SIGN_EXTEND)
8862      std::swap(LHS, RHS);
8863    if (isBoolSGPR(RHS.getOperand(0)))
8864      return DAG.getSelect(SDLoc(N), MVT::i32, RHS.getOperand(0),
8865                           LHS, DAG.getConstant(0, SDLoc(N), MVT::i32));
8866  }
8867
8868  // and (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
8869  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
8870  if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
8871      N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32) != -1) {
8872    uint32_t LHSMask = getPermuteMask(DAG, LHS);
8873    uint32_t RHSMask = getPermuteMask(DAG, RHS);
8874    if (LHSMask != ~0u && RHSMask != ~0u) {
8875      // Canonicalize the expression in an attempt to have fewer unique masks
8876      // and therefore fewer registers used to hold the masks.
8877      if (LHSMask > RHSMask) {
8878        std::swap(LHSMask, RHSMask);
8879        std::swap(LHS, RHS);
8880      }
8881
      // Select 0xc for each lane used from a source operand. Zero has the 0xc
      // mask set, 0xff has 0xff in the mask, and actual lanes are in the 0-3
      // range.
8884      uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
8885      uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
8886
      // Check if we need to combine values from two sources within a byte.
8888      if (!(LHSUsedLanes & RHSUsedLanes) &&
          // If we select the high and low words, keep it for SDWA.
          // TODO: teach SDWA to work with v_perm_b32 and remove the check.
8891          !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
        // Each byte in each mask is either a selector in the 0-3 range, or has
        // higher bits set: 0xff for a 0xff byte and 0x0c for a zero byte. If
        // 0x0c appears in either mask it must win for that byte; otherwise the
        // mask that is not 0xff wins. ANDing both masks gives the correct
        // result, except that bytes which should be 0x0c must be corrected to
        // exactly 0x0c.
8897        uint32_t Mask = LHSMask & RHSMask;
8898        for (unsigned I = 0; I < 32; I += 8) {
8899          uint32_t ByteSel = 0xff << I;
8900          if ((LHSMask & ByteSel) == 0x0c || (RHSMask & ByteSel) == 0x0c)
8901            Mask &= (0x0c << I) & 0xffffffff;
8902        }
8903
8904        // Add 4 to each active LHS lane. It will not affect any existing 0xff
8905        // or 0x0c.
8906        uint32_t Sel = Mask | (LHSUsedLanes & 0x04040404);
8907        SDLoc DL(N);
8908
8909        return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32,
8910                           LHS.getOperand(0), RHS.getOperand(0),
8911                           DAG.getConstant(Sel, DL, MVT::i32));
8912      }
8913    }
8914  }
8915
8916  return SDValue();
8917}
8918
8919SDValue SITargetLowering::performOrCombine(SDNode *N,
8920                                           DAGCombinerInfo &DCI) const {
8921  SelectionDAG &DAG = DCI.DAG;
8922  SDValue LHS = N->getOperand(0);
8923  SDValue RHS = N->getOperand(1);
8924
8925  EVT VT = N->getValueType(0);
8926  if (VT == MVT::i1) {
8927    // or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2)
8928    if (LHS.getOpcode() == AMDGPUISD::FP_CLASS &&
8929        RHS.getOpcode() == AMDGPUISD::FP_CLASS) {
8930      SDValue Src = LHS.getOperand(0);
8931      if (Src != RHS.getOperand(0))
8932        return SDValue();
8933
8934      const ConstantSDNode *CLHS = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
8935      const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
8936      if (!CLHS || !CRHS)
8937        return SDValue();
8938
8939      // Only 10 bits are used.
8940      static const uint32_t MaxMask = 0x3ff;
8941
8942      uint32_t NewMask = (CLHS->getZExtValue() | CRHS->getZExtValue()) & MaxMask;
8943      SDLoc DL(N);
8944      return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1,
8945                         Src, DAG.getConstant(NewMask, DL, MVT::i32));
8946    }
8947
8948    return SDValue();
8949  }
8950
8951  // or (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
8952  if (isa<ConstantSDNode>(RHS) && LHS.hasOneUse() &&
8953      LHS.getOpcode() == AMDGPUISD::PERM &&
8954      isa<ConstantSDNode>(LHS.getOperand(2))) {
8955    uint32_t Sel = getConstantPermuteMask(N->getConstantOperandVal(1));
8956    if (!Sel)
8957      return SDValue();
8958
8959    Sel |= LHS.getConstantOperandVal(2);
8960    SDLoc DL(N);
8961    return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
8962                       LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
8963  }
8964
8965  // or (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
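  // For example, (or (and x, 0xff), (shl y, 8)) becomes
  // (perm y, x, 0x06050400): byte 0 of the result is byte 0 of x and bytes 1-3
  // are bytes 0-2 of y (selectors 0-3 pick bytes of the second operand, 4-7
  // bytes of the first).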
8966  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
8967  if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
8968      N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32) != -1) {
8969    uint32_t LHSMask = getPermuteMask(DAG, LHS);
8970    uint32_t RHSMask = getPermuteMask(DAG, RHS);
8971    if (LHSMask != ~0u && RHSMask != ~0u) {
8972      // Canonicalize the expression in an attempt to have fewer unique masks
8973      // and therefore fewer registers used to hold the masks.
8974      if (LHSMask > RHSMask) {
8975        std::swap(LHSMask, RHSMask);
8976        std::swap(LHS, RHS);
8977      }
8978
      // Select 0xc for each lane used from a source operand. Zero has the 0xc
      // mask set, 0xff has 0xff in the mask, and actual lanes are in the 0-3
      // range.
8981      uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
8982      uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
8983
      // Check if we need to combine values from two sources within a byte.
8985      if (!(LHSUsedLanes & RHSUsedLanes) &&
          // If we select the high and low words, keep it for SDWA.
          // TODO: teach SDWA to work with v_perm_b32 and remove the check.
8988          !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
        // Kill zero bytes selected by the other mask. The zero value is 0xc.
8990        LHSMask &= ~RHSUsedLanes;
8991        RHSMask &= ~LHSUsedLanes;
8992        // Add 4 to each active LHS lane
8993        LHSMask |= LHSUsedLanes & 0x04040404;
8994        // Combine masks
8995        uint32_t Sel = LHSMask | RHSMask;
8996        SDLoc DL(N);
8997
8998        return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32,
8999                           LHS.getOperand(0), RHS.getOperand(0),
9000                           DAG.getConstant(Sel, DL, MVT::i32));
9001      }
9002    }
9003  }
9004
9005  if (VT != MVT::i64 || DCI.isBeforeLegalizeOps())
9006    return SDValue();
9007
9008  // TODO: This could be a generic combine with a predicate for extracting the
9009  // high half of an integer being free.
9010
9011  // (or i64:x, (zero_extend i32:y)) ->
9012  //   i64 (bitcast (v2i32 build_vector (or i32:y, lo_32(x)), hi_32(x)))
9013  if (LHS.getOpcode() == ISD::ZERO_EXTEND &&
9014      RHS.getOpcode() != ISD::ZERO_EXTEND)
9015    std::swap(LHS, RHS);
9016
9017  if (RHS.getOpcode() == ISD::ZERO_EXTEND) {
9018    SDValue ExtSrc = RHS.getOperand(0);
9019    EVT SrcVT = ExtSrc.getValueType();
9020    if (SrcVT == MVT::i32) {
9021      SDLoc SL(N);
9022      SDValue LowLHS, HiBits;
9023      std::tie(LowLHS, HiBits) = split64BitValue(LHS, DAG);
9024      SDValue LowOr = DAG.getNode(ISD::OR, SL, MVT::i32, LowLHS, ExtSrc);
9025
9026      DCI.AddToWorklist(LowOr.getNode());
9027      DCI.AddToWorklist(HiBits.getNode());
9028
9029      SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
9030                                LowOr, HiBits);
9031      return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
9032    }
9033  }
9034
9035  const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
9036  if (CRHS) {
9037    if (SDValue Split
9038          = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::OR, LHS, CRHS))
9039      return Split;
9040  }
9041
9042  return SDValue();
9043}
9044
9045SDValue SITargetLowering::performXorCombine(SDNode *N,
9046                                            DAGCombinerInfo &DCI) const {
9047  EVT VT = N->getValueType(0);
9048  if (VT != MVT::i64)
9049    return SDValue();
9050
9051  SDValue LHS = N->getOperand(0);
9052  SDValue RHS = N->getOperand(1);
9053
9054  const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
9055  if (CRHS) {
9056    if (SDValue Split
9057          = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::XOR, LHS, CRHS))
9058      return Split;
9059  }
9060
9061  return SDValue();
9062}
9063
9064// Instructions that will be lowered with a final instruction that zeros the
9065// high result bits.
9066// XXX - probably only need to list legal operations.
9067static bool fp16SrcZerosHighBits(unsigned Opc) {
9068  switch (Opc) {
9069  case ISD::FADD:
9070  case ISD::FSUB:
9071  case ISD::FMUL:
9072  case ISD::FDIV:
9073  case ISD::FREM:
9074  case ISD::FMA:
9075  case ISD::FMAD:
9076  case ISD::FCANONICALIZE:
9077  case ISD::FP_ROUND:
9078  case ISD::UINT_TO_FP:
9079  case ISD::SINT_TO_FP:
9080  case ISD::FABS:
9081    // Fabs is lowered to a bit operation, but it's an and which will clear the
9082    // high bits anyway.
9083  case ISD::FSQRT:
9084  case ISD::FSIN:
9085  case ISD::FCOS:
9086  case ISD::FPOWI:
9087  case ISD::FPOW:
9088  case ISD::FLOG:
9089  case ISD::FLOG2:
9090  case ISD::FLOG10:
9091  case ISD::FEXP:
9092  case ISD::FEXP2:
9093  case ISD::FCEIL:
9094  case ISD::FTRUNC:
9095  case ISD::FRINT:
9096  case ISD::FNEARBYINT:
9097  case ISD::FROUND:
9098  case ISD::FFLOOR:
9099  case ISD::FMINNUM:
9100  case ISD::FMAXNUM:
9101  case AMDGPUISD::FRACT:
9102  case AMDGPUISD::CLAMP:
9103  case AMDGPUISD::COS_HW:
9104  case AMDGPUISD::SIN_HW:
9105  case AMDGPUISD::FMIN3:
9106  case AMDGPUISD::FMAX3:
9107  case AMDGPUISD::FMED3:
9108  case AMDGPUISD::FMAD_FTZ:
9109  case AMDGPUISD::RCP:
9110  case AMDGPUISD::RSQ:
9111  case AMDGPUISD::RCP_IFLAG:
9112  case AMDGPUISD::LDEXP:
9113    return true;
9114  default:
9115    // fcopysign, select and others may be lowered to 32-bit bit operations
9116    // which don't zero the high bits.
9117    return false;
9118  }
9119}
9120
9121SDValue SITargetLowering::performZeroExtendCombine(SDNode *N,
9122                                                   DAGCombinerInfo &DCI) const {
9123  if (!Subtarget->has16BitInsts() ||
9124      DCI.getDAGCombineLevel() < AfterLegalizeDAG)
9125    return SDValue();
9126
9127  EVT VT = N->getValueType(0);
9128  if (VT != MVT::i32)
9129    return SDValue();
9130
9131  SDValue Src = N->getOperand(0);
9132  if (Src.getValueType() != MVT::i16)
9133    return SDValue();
9134
9135  // (i32 zext (i16 (bitcast f16:$src))) -> fp16_zext $src
9136  // FIXME: It is not universally true that the high bits are zeroed on gfx9.
9137  if (Src.getOpcode() == ISD::BITCAST) {
9138    SDValue BCSrc = Src.getOperand(0);
9139    if (BCSrc.getValueType() == MVT::f16 &&
9140        fp16SrcZerosHighBits(BCSrc.getOpcode()))
9141      return DCI.DAG.getNode(AMDGPUISD::FP16_ZEXT, SDLoc(N), VT, BCSrc);
9142  }
9143
9144  return SDValue();
9145}
9146
9147SDValue SITargetLowering::performSignExtendInRegCombine(SDNode *N,
9148                                                        DAGCombinerInfo &DCI)
9149                                                        const {
9150  SDValue Src = N->getOperand(0);
9151  auto *VTSign = cast<VTSDNode>(N->getOperand(1));
9152
9153  if (((Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE &&
9154      VTSign->getVT() == MVT::i8) ||
9155      (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_USHORT &&
9156      VTSign->getVT() == MVT::i16)) &&
9157      Src.hasOneUse()) {
9158    auto *M = cast<MemSDNode>(Src);
9159    SDValue Ops[] = {
9160      Src.getOperand(0), // Chain
9161      Src.getOperand(1), // rsrc
9162      Src.getOperand(2), // vindex
9163      Src.getOperand(3), // voffset
9164      Src.getOperand(4), // soffset
9165      Src.getOperand(5), // offset
9166      Src.getOperand(6),
9167      Src.getOperand(7)
9168    };
9169    // replace with BUFFER_LOAD_BYTE/SHORT
9170    SDVTList ResList = DCI.DAG.getVTList(MVT::i32,
9171                                         Src.getOperand(0).getValueType());
9172    unsigned Opc = (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE) ?
9173                   AMDGPUISD::BUFFER_LOAD_BYTE : AMDGPUISD::BUFFER_LOAD_SHORT;
9174    SDValue BufferLoadSignExt = DCI.DAG.getMemIntrinsicNode(Opc, SDLoc(N),
9175                                                          ResList,
9176                                                          Ops, M->getMemoryVT(),
9177                                                          M->getMemOperand());
9178    return DCI.DAG.getMergeValues({BufferLoadSignExt,
9179                                  BufferLoadSignExt.getValue(1)}, SDLoc(N));
9180  }
9181  return SDValue();
9182}
9183
9184SDValue SITargetLowering::performClassCombine(SDNode *N,
9185                                              DAGCombinerInfo &DCI) const {
9186  SelectionDAG &DAG = DCI.DAG;
9187  SDValue Mask = N->getOperand(1);
9188
9189  // fp_class x, 0 -> false
9190  if (const ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(Mask)) {
9191    if (CMask->isNullValue())
9192      return DAG.getConstant(0, SDLoc(N), MVT::i1);
9193  }
9194
9195  if (N->getOperand(0).isUndef())
9196    return DAG.getUNDEF(MVT::i1);
9197
9198  return SDValue();
9199}
9200
9201SDValue SITargetLowering::performRcpCombine(SDNode *N,
9202                                            DAGCombinerInfo &DCI) const {
9203  EVT VT = N->getValueType(0);
9204  SDValue N0 = N->getOperand(0);
9205
9206  if (N0.isUndef())
9207    return N0;
9208
9209  if (VT == MVT::f32 && (N0.getOpcode() == ISD::UINT_TO_FP ||
9210                         N0.getOpcode() == ISD::SINT_TO_FP)) {
9211    return DCI.DAG.getNode(AMDGPUISD::RCP_IFLAG, SDLoc(N), VT, N0,
9212                           N->getFlags());
9213  }
9214
9215  if ((VT == MVT::f32 || VT == MVT::f16) && N0.getOpcode() == ISD::FSQRT) {
9216    return DCI.DAG.getNode(AMDGPUISD::RSQ, SDLoc(N), VT,
9217                           N0.getOperand(0), N->getFlags());
9218  }
9219
9220  return AMDGPUTargetLowering::performRcpCombine(N, DCI);
9221}
9222
9223bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
9224                                       unsigned MaxDepth) const {
9225  unsigned Opcode = Op.getOpcode();
9226  if (Opcode == ISD::FCANONICALIZE)
9227    return true;
9228
9229  if (auto *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
9230    auto F = CFP->getValueAPF();
9231    if (F.isNaN() && F.isSignaling())
9232      return false;
9233    return !F.isDenormal() || denormalsEnabledForType(DAG, Op.getValueType());
9234  }
9235
9236  // If source is a result of another standard FP operation it is already in
9237  // canonical form.
9238  if (MaxDepth == 0)
9239    return false;
9240
9241  switch (Opcode) {
9242  // These will flush denorms if required.
9243  case ISD::FADD:
9244  case ISD::FSUB:
9245  case ISD::FMUL:
9246  case ISD::FCEIL:
9247  case ISD::FFLOOR:
9248  case ISD::FMA:
9249  case ISD::FMAD:
9250  case ISD::FSQRT:
9251  case ISD::FDIV:
9252  case ISD::FREM:
9253  case ISD::FP_ROUND:
9254  case ISD::FP_EXTEND:
9255  case AMDGPUISD::FMUL_LEGACY:
9256  case AMDGPUISD::FMAD_FTZ:
9257  case AMDGPUISD::RCP:
9258  case AMDGPUISD::RSQ:
9259  case AMDGPUISD::RSQ_CLAMP:
9260  case AMDGPUISD::RCP_LEGACY:
9261  case AMDGPUISD::RCP_IFLAG:
9262  case AMDGPUISD::DIV_SCALE:
9263  case AMDGPUISD::DIV_FMAS:
9264  case AMDGPUISD::DIV_FIXUP:
9265  case AMDGPUISD::FRACT:
9266  case AMDGPUISD::LDEXP:
9267  case AMDGPUISD::CVT_PKRTZ_F16_F32:
9268  case AMDGPUISD::CVT_F32_UBYTE0:
9269  case AMDGPUISD::CVT_F32_UBYTE1:
9270  case AMDGPUISD::CVT_F32_UBYTE2:
9271  case AMDGPUISD::CVT_F32_UBYTE3:
9272    return true;
9273
9274  // It can/will be lowered or combined as a bit operation.
9275  // Need to check their input recursively to handle.
9276  case ISD::FNEG:
9277  case ISD::FABS:
9278  case ISD::FCOPYSIGN:
9279    return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
9280
9281  case ISD::FSIN:
9282  case ISD::FCOS:
9283  case ISD::FSINCOS:
9284    return Op.getValueType().getScalarType() != MVT::f16;
9285
9286  case ISD::FMINNUM:
9287  case ISD::FMAXNUM:
9288  case ISD::FMINNUM_IEEE:
9289  case ISD::FMAXNUM_IEEE:
9290  case AMDGPUISD::CLAMP:
9291  case AMDGPUISD::FMED3:
9292  case AMDGPUISD::FMAX3:
9293  case AMDGPUISD::FMIN3: {
    // FIXME: Shouldn't treat the generic operations differently based on
    // these. However, we aren't really required to flush the result from
    // minnum/maxnum.
9297
9298    // snans will be quieted, so we only need to worry about denormals.
9299    if (Subtarget->supportsMinMaxDenormModes() ||
9300        denormalsEnabledForType(DAG, Op.getValueType()))
9301      return true;
9302
9303    // Flushing may be required.
    // On pre-GFX9 targets V_MIN_F32 and others do not flush denorms. For such
    // targets we need to check their inputs recursively.
9306
9307    // FIXME: Does this apply with clamp? It's implemented with max.
9308    for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I) {
9309      if (!isCanonicalized(DAG, Op.getOperand(I), MaxDepth - 1))
9310        return false;
9311    }
9312
9313    return true;
9314  }
9315  case ISD::SELECT: {
9316    return isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1) &&
9317           isCanonicalized(DAG, Op.getOperand(2), MaxDepth - 1);
9318  }
9319  case ISD::BUILD_VECTOR: {
9320    for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
9321      SDValue SrcOp = Op.getOperand(i);
9322      if (!isCanonicalized(DAG, SrcOp, MaxDepth - 1))
9323        return false;
9324    }
9325
9326    return true;
9327  }
9328  case ISD::EXTRACT_VECTOR_ELT:
9329  case ISD::EXTRACT_SUBVECTOR: {
9330    return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
9331  }
9332  case ISD::INSERT_VECTOR_ELT: {
9333    return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1) &&
9334           isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1);
9335  }
9336  case ISD::UNDEF:
9337    // Could be anything.
9338    return false;
9339
9340  case ISD::BITCAST: {
    // Hack around the mess we make when legalizing extract_vector_elt.
9342    SDValue Src = Op.getOperand(0);
9343    if (Src.getValueType() == MVT::i16 &&
9344        Src.getOpcode() == ISD::TRUNCATE) {
9345      SDValue TruncSrc = Src.getOperand(0);
9346      if (TruncSrc.getValueType() == MVT::i32 &&
9347          TruncSrc.getOpcode() == ISD::BITCAST &&
9348          TruncSrc.getOperand(0).getValueType() == MVT::v2f16) {
9349        return isCanonicalized(DAG, TruncSrc.getOperand(0), MaxDepth - 1);
9350      }
9351    }
9352
9353    return false;
9354  }
9355  case ISD::INTRINSIC_WO_CHAIN: {
9356    unsigned IntrinsicID
9357      = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
9358    // TODO: Handle more intrinsics
9359    switch (IntrinsicID) {
9360    case Intrinsic::amdgcn_cvt_pkrtz:
9361    case Intrinsic::amdgcn_cubeid:
9362    case Intrinsic::amdgcn_frexp_mant:
9363    case Intrinsic::amdgcn_fdot2:
9364    case Intrinsic::amdgcn_rcp:
9365    case Intrinsic::amdgcn_rsq:
9366    case Intrinsic::amdgcn_rsq_clamp:
9367    case Intrinsic::amdgcn_rcp_legacy:
9368    case Intrinsic::amdgcn_rsq_legacy:
9369    case Intrinsic::amdgcn_trig_preop:
9370      return true;
9371    default:
9372      break;
9373    }
9374
9375    LLVM_FALLTHROUGH;
9376  }
9377  default:
9378    return denormalsEnabledForType(DAG, Op.getValueType()) &&
9379           DAG.isKnownNeverSNaN(Op);
9380  }
9381
9382  llvm_unreachable("invalid operation");
9383}
9384
9385// Constant fold canonicalize.
9386SDValue SITargetLowering::getCanonicalConstantFP(
9387  SelectionDAG &DAG, const SDLoc &SL, EVT VT, const APFloat &C) const {
9388  // Flush denormals to 0 if not enabled.
9389  if (C.isDenormal() && !denormalsEnabledForType(DAG, VT))
9390    return DAG.getConstantFP(0.0, SL, VT);
9391
9392  if (C.isNaN()) {
9393    APFloat CanonicalQNaN = APFloat::getQNaN(C.getSemantics());
9394    if (C.isSignaling()) {
9395      // Quiet a signaling NaN.
9396      // FIXME: Is this supposed to preserve payload bits?
9397      return DAG.getConstantFP(CanonicalQNaN, SL, VT);
9398    }
9399
9400    // Make sure it is the canonical NaN bitpattern.
9401    //
9402    // TODO: Can we use -1 as the canonical NaN value since it's an inline
9403    // immediate?
9404    if (C.bitcastToAPInt() != CanonicalQNaN.bitcastToAPInt())
9405      return DAG.getConstantFP(CanonicalQNaN, SL, VT);
9406  }
9407
9408  // Already canonical.
9409  return DAG.getConstantFP(C, SL, VT);
9410}
9411
9412static bool vectorEltWillFoldAway(SDValue Op) {
9413  return Op.isUndef() || isa<ConstantFPSDNode>(Op);
9414}
9415
9416SDValue SITargetLowering::performFCanonicalizeCombine(
9417  SDNode *N,
9418  DAGCombinerInfo &DCI) const {
9419  SelectionDAG &DAG = DCI.DAG;
9420  SDValue N0 = N->getOperand(0);
9421  EVT VT = N->getValueType(0);
9422
9423  // fcanonicalize undef -> qnan
9424  if (N0.isUndef()) {
9425    APFloat QNaN = APFloat::getQNaN(SelectionDAG::EVTToAPFloatSemantics(VT));
9426    return DAG.getConstantFP(QNaN, SDLoc(N), VT);
9427  }
9428
9429  if (ConstantFPSDNode *CFP = isConstOrConstSplatFP(N0)) {
9430    EVT VT = N->getValueType(0);
9431    return getCanonicalConstantFP(DAG, SDLoc(N), VT, CFP->getValueAPF());
9432  }
9433
9434  // fcanonicalize (build_vector x, k) -> build_vector (fcanonicalize x),
9435  //                                                   (fcanonicalize k)
9436  //
9437  // fcanonicalize (build_vector x, undef) -> build_vector (fcanonicalize x), 0
9438
9439  // TODO: This could be better with wider vectors that will be split to v2f16,
9440  // and to consider uses since there aren't that many packed operations.
9441  if (N0.getOpcode() == ISD::BUILD_VECTOR && VT == MVT::v2f16 &&
9442      isTypeLegal(MVT::v2f16)) {
9443    SDLoc SL(N);
9444    SDValue NewElts[2];
9445    SDValue Lo = N0.getOperand(0);
9446    SDValue Hi = N0.getOperand(1);
9447    EVT EltVT = Lo.getValueType();
9448
9449    if (vectorEltWillFoldAway(Lo) || vectorEltWillFoldAway(Hi)) {
9450      for (unsigned I = 0; I != 2; ++I) {
9451        SDValue Op = N0.getOperand(I);
9452        if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
9453          NewElts[I] = getCanonicalConstantFP(DAG, SL, EltVT,
9454                                              CFP->getValueAPF());
9455        } else if (Op.isUndef()) {
9456          // Handled below based on what the other operand is.
9457          NewElts[I] = Op;
9458        } else {
9459          NewElts[I] = DAG.getNode(ISD::FCANONICALIZE, SL, EltVT, Op);
9460        }
9461      }
9462
      // If one half is undef and the other is constant, prefer a splat vector
      // rather than the normal qNaN. If it's a register, prefer 0.0 since
      // that's cheaper to use and may be free with a packed operation.
      if (NewElts[0].isUndef()) {
        NewElts[0] = isa<ConstantFPSDNode>(NewElts[1]) ?
          NewElts[1] : DAG.getConstantFP(0.0f, SL, EltVT);
      }
9471
9472      if (NewElts[1].isUndef()) {
9473        NewElts[1] = isa<ConstantFPSDNode>(NewElts[0]) ?
9474          NewElts[0] : DAG.getConstantFP(0.0f, SL, EltVT);
9475      }
9476
9477      return DAG.getBuildVector(VT, SL, NewElts);
9478    }
9479  }
9480
9481  unsigned SrcOpc = N0.getOpcode();
9482
9483  // If it's free to do so, push canonicalizes further up the source, which may
9484  // find a canonical source.
9485  //
  // TODO: More opcodes. Note this is unsafe for the _ieee minnum/maxnum with
  // sNaNs.
9488  if (SrcOpc == ISD::FMINNUM || SrcOpc == ISD::FMAXNUM) {
9489    auto *CRHS = dyn_cast<ConstantFPSDNode>(N0.getOperand(1));
9490    if (CRHS && N0.hasOneUse()) {
9491      SDLoc SL(N);
9492      SDValue Canon0 = DAG.getNode(ISD::FCANONICALIZE, SL, VT,
9493                                   N0.getOperand(0));
9494      SDValue Canon1 = getCanonicalConstantFP(DAG, SL, VT, CRHS->getValueAPF());
9495      DCI.AddToWorklist(Canon0.getNode());
9496
9497      return DAG.getNode(N0.getOpcode(), SL, VT, Canon0, Canon1);
9498    }
9499  }
9500
9501  return isCanonicalized(DAG, N0) ? N0 : SDValue();
9502}
9503
9504static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
9505  switch (Opc) {
9506  case ISD::FMAXNUM:
9507  case ISD::FMAXNUM_IEEE:
9508    return AMDGPUISD::FMAX3;
9509  case ISD::SMAX:
9510    return AMDGPUISD::SMAX3;
9511  case ISD::UMAX:
9512    return AMDGPUISD::UMAX3;
9513  case ISD::FMINNUM:
9514  case ISD::FMINNUM_IEEE:
9515    return AMDGPUISD::FMIN3;
9516  case ISD::SMIN:
9517    return AMDGPUISD::SMIN3;
9518  case ISD::UMIN:
9519    return AMDGPUISD::UMIN3;
9520  default:
9521    llvm_unreachable("Not a min/max opcode");
9522  }
9523}
9524
9525SDValue SITargetLowering::performIntMed3ImmCombine(
9526  SelectionDAG &DAG, const SDLoc &SL,
9527  SDValue Op0, SDValue Op1, bool Signed) const {
9528  ConstantSDNode *K1 = dyn_cast<ConstantSDNode>(Op1);
9529  if (!K1)
9530    return SDValue();
9531
9532  ConstantSDNode *K0 = dyn_cast<ConstantSDNode>(Op0.getOperand(1));
9533  if (!K0)
9534    return SDValue();
9535
9536  if (Signed) {
9537    if (K0->getAPIntValue().sge(K1->getAPIntValue()))
9538      return SDValue();
9539  } else {
9540    if (K0->getAPIntValue().uge(K1->getAPIntValue()))
9541      return SDValue();
9542  }
9543
9544  EVT VT = K0->getValueType(0);
9545  unsigned Med3Opc = Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3;
9546  if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16())) {
9547    return DAG.getNode(Med3Opc, SL, VT,
9548                       Op0.getOperand(0), SDValue(K0, 0), SDValue(K1, 0));
9549  }
9550
9551  // If there isn't a 16-bit med3 operation, convert to 32-bit.
9552  MVT NVT = MVT::i32;
9553  unsigned ExtOp = Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
9554
9555  SDValue Tmp1 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(0));
9556  SDValue Tmp2 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(1));
9557  SDValue Tmp3 = DAG.getNode(ExtOp, SL, NVT, Op1);
9558
9559  SDValue Med3 = DAG.getNode(Med3Opc, SL, NVT, Tmp1, Tmp2, Tmp3);
9560  return DAG.getNode(ISD::TRUNCATE, SL, VT, Med3);
9561}
9562
9563static ConstantFPSDNode *getSplatConstantFP(SDValue Op) {
9564  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op))
9565    return C;
9566
9567  if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op)) {
9568    if (ConstantFPSDNode *C = BV->getConstantFPSplatNode())
9569      return C;
9570  }
9571
9572  return nullptr;
9573}
9574
9575SDValue SITargetLowering::performFPMed3ImmCombine(SelectionDAG &DAG,
9576                                                  const SDLoc &SL,
9577                                                  SDValue Op0,
9578                                                  SDValue Op1) const {
9579  ConstantFPSDNode *K1 = getSplatConstantFP(Op1);
9580  if (!K1)
9581    return SDValue();
9582
9583  ConstantFPSDNode *K0 = getSplatConstantFP(Op0.getOperand(1));
9584  if (!K0)
9585    return SDValue();
9586
9587  // Ordered >= (although NaN inputs should have folded away by now).
9588  if (K0->getValueAPF() > K1->getValueAPF())
9589    return SDValue();
9590
9591  const MachineFunction &MF = DAG.getMachineFunction();
9592  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
9593
9594  // TODO: Check IEEE bit enabled?
9595  EVT VT = Op0.getValueType();
9596  if (Info->getMode().DX10Clamp) {
9597    // If dx10_clamp is enabled, NaNs clamp to 0.0. This is the same as the
9598    // hardware fmed3 behavior converting to a min.
9599    // FIXME: Should this be allowing -0.0?
9600    if (K1->isExactlyValue(1.0) && K0->isExactlyValue(0.0))
9601      return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Op0.getOperand(0));
9602  }
9603
9604  // med3 for f16 is only available on gfx9+, and not available for v2f16.
9605  if (VT == MVT::f32 || (VT == MVT::f16 && Subtarget->hasMed3_16())) {
9606    // This isn't safe with signaling NaNs because in IEEE mode, min/max on a
9607    // signaling NaN gives a quiet NaN. The quiet NaN input to the min would
9608    // then give the other result, which is different from med3 with a NaN
9609    // input.
9610    SDValue Var = Op0.getOperand(0);
9611    if (!DAG.isKnownNeverSNaN(Var))
9612      return SDValue();
9613
9614    const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
9615
9616    if ((!K0->hasOneUse() ||
9617         TII->isInlineConstant(K0->getValueAPF().bitcastToAPInt())) &&
9618        (!K1->hasOneUse() ||
9619         TII->isInlineConstant(K1->getValueAPF().bitcastToAPInt()))) {
9620      return DAG.getNode(AMDGPUISD::FMED3, SL, K0->getValueType(0),
9621                         Var, SDValue(K0, 0), SDValue(K1, 0));
9622    }
9623  }
9624
9625  return SDValue();
9626}
9627
9628SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
9629                                               DAGCombinerInfo &DCI) const {
9630  SelectionDAG &DAG = DCI.DAG;
9631
9632  EVT VT = N->getValueType(0);
9633  unsigned Opc = N->getOpcode();
9634  SDValue Op0 = N->getOperand(0);
9635  SDValue Op1 = N->getOperand(1);
9636
9637  // Only do this if the inner op has one use since this will just increase
9638  // register pressure for no benefit.
9639
9640  if (Opc != AMDGPUISD::FMIN_LEGACY && Opc != AMDGPUISD::FMAX_LEGACY &&
9641      !VT.isVector() &&
9642      (VT == MVT::i32 || VT == MVT::f32 ||
9643       ((VT == MVT::f16 || VT == MVT::i16) && Subtarget->hasMin3Max3_16()))) {
9644    // max(max(a, b), c) -> max3(a, b, c)
9645    // min(min(a, b), c) -> min3(a, b, c)
9646    if (Op0.getOpcode() == Opc && Op0.hasOneUse()) {
9647      SDLoc DL(N);
9648      return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc),
9649                         DL,
9650                         N->getValueType(0),
9651                         Op0.getOperand(0),
9652                         Op0.getOperand(1),
9653                         Op1);
9654    }
9655
9656    // Try commuted.
9657    // max(a, max(b, c)) -> max3(a, b, c)
9658    // min(a, min(b, c)) -> min3(a, b, c)
9659    if (Op1.getOpcode() == Opc && Op1.hasOneUse()) {
9660      SDLoc DL(N);
9661      return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc),
9662                         DL,
9663                         N->getValueType(0),
9664                         Op0,
9665                         Op1.getOperand(0),
9666                         Op1.getOperand(1));
9667    }
9668  }
9669
9670  // min(max(x, K0), K1), K0 < K1 -> med3(x, K0, K1)
9671  if (Opc == ISD::SMIN && Op0.getOpcode() == ISD::SMAX && Op0.hasOneUse()) {
9672    if (SDValue Med3 = performIntMed3ImmCombine(DAG, SDLoc(N), Op0, Op1, true))
9673      return Med3;
9674  }
9675
9676  if (Opc == ISD::UMIN && Op0.getOpcode() == ISD::UMAX && Op0.hasOneUse()) {
9677    if (SDValue Med3 = performIntMed3ImmCombine(DAG, SDLoc(N), Op0, Op1, false))
9678      return Med3;
9679  }
9680
9681  // fminnum(fmaxnum(x, K0), K1), K0 < K1 && !is_snan(x) -> fmed3(x, K0, K1)
9682  if (((Opc == ISD::FMINNUM && Op0.getOpcode() == ISD::FMAXNUM) ||
9683       (Opc == ISD::FMINNUM_IEEE && Op0.getOpcode() == ISD::FMAXNUM_IEEE) ||
9684       (Opc == AMDGPUISD::FMIN_LEGACY &&
9685        Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) &&
9686      (VT == MVT::f32 || VT == MVT::f64 ||
9687       (VT == MVT::f16 && Subtarget->has16BitInsts()) ||
9688       (VT == MVT::v2f16 && Subtarget->hasVOP3PInsts())) &&
9689      Op0.hasOneUse()) {
9690    if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1))
9691      return Res;
9692  }
9693
9694  return SDValue();
9695}
9696
9697static bool isClampZeroToOne(SDValue A, SDValue B) {
9698  if (ConstantFPSDNode *CA = dyn_cast<ConstantFPSDNode>(A)) {
9699    if (ConstantFPSDNode *CB = dyn_cast<ConstantFPSDNode>(B)) {
9700      // FIXME: Should this be allowing -0.0?
9701      return (CA->isExactlyValue(0.0) && CB->isExactlyValue(1.0)) ||
9702             (CA->isExactlyValue(1.0) && CB->isExactlyValue(0.0));
9703    }
9704  }
9705
9706  return false;
9707}
9708
9709// FIXME: Should only worry about snans for version with chain.
9710SDValue SITargetLowering::performFMed3Combine(SDNode *N,
9711                                              DAGCombinerInfo &DCI) const {
9712  EVT VT = N->getValueType(0);
9713  // v_med3_f32 and v_max_f32 behave identically wrt denorms, exceptions and
9714  // NaNs. With a NaN input, the order of the operands may change the result.
9715
9716  SelectionDAG &DAG = DCI.DAG;
9717  SDLoc SL(N);
9718
9719  SDValue Src0 = N->getOperand(0);
9720  SDValue Src1 = N->getOperand(1);
9721  SDValue Src2 = N->getOperand(2);
9722
9723  if (isClampZeroToOne(Src0, Src1)) {
9724    // const_a, const_b, x -> clamp is safe in all cases including signaling
9725    // nans.
9726    // FIXME: Should this be allowing -0.0?
9727    return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src2);
9728  }
9729
9730  const MachineFunction &MF = DAG.getMachineFunction();
9731  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
9732
9733  // FIXME: dx10_clamp behavior is assumed in instcombine. Should we really
9734  // bother handling the no-dx10-clamp case?
9735  if (Info->getMode().DX10Clamp) {
9736    // If NaNs are clamped to 0, we are free to reorder the inputs.
9737
9738    if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
9739      std::swap(Src0, Src1);
9740
9741    if (isa<ConstantFPSDNode>(Src1) && !isa<ConstantFPSDNode>(Src2))
9742      std::swap(Src1, Src2);
9743
9744    if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
9745      std::swap(Src0, Src1);
9746
9747    if (isClampZeroToOne(Src1, Src2))
9748      return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src0);
9749  }
9750
9751  return SDValue();
9752}
9753
9754SDValue SITargetLowering::performCvtPkRTZCombine(SDNode *N,
9755                                                 DAGCombinerInfo &DCI) const {
9756  SDValue Src0 = N->getOperand(0);
9757  SDValue Src1 = N->getOperand(1);
9758  if (Src0.isUndef() && Src1.isUndef())
9759    return DCI.DAG.getUNDEF(N->getValueType(0));
9760  return SDValue();
9761}
9762
9763// Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be
9764// expanded into a set of cmp/select instructions.
9765bool SITargetLowering::shouldExpandVectorDynExt(unsigned EltSize,
9766                                                unsigned NumElem,
9767                                                bool IsDivergentIdx) {
9768  if (UseDivergentRegisterIndexing)
9769    return false;
9770
9771  unsigned VecSize = EltSize * NumElem;
9772
9773  // Sub-dword vectors of total size 2 dwords or less have a better default lowering.
9774  if (VecSize <= 64 && EltSize < 32)
9775    return false;
9776
9777  // Always expand the rest of sub-dword instructions, otherwise it will be
9778  // lowered via memory.
9779  if (EltSize < 32)
9780    return true;
9781
9782  // Always do this if var-idx is divergent, otherwise it will become a loop.
9783  if (IsDivergentIdx)
9784    return true;
9785
9786  // Large vectors would yield too many compares and v_cndmask_b32 instructions.
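  // For example, a v4i64 extract costs 4 compares + 8 cndmasks = 12
  // instructions and is expanded, while a v8i64 extract would cost 24 and is
  // left to the default lowering.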
9787  unsigned NumInsts = NumElem /* Number of compares */ +
9788                      ((EltSize + 31) / 32) * NumElem /* Number of cndmasks */;
9789  return NumInsts <= 16;
9790}
9791
9792static bool shouldExpandVectorDynExt(SDNode *N) {
9793  SDValue Idx = N->getOperand(N->getNumOperands() - 1);
9794  if (isa<ConstantSDNode>(Idx))
9795    return false;
9796
9797  SDValue Vec = N->getOperand(0);
9798  EVT VecVT = Vec.getValueType();
9799  EVT EltVT = VecVT.getVectorElementType();
9800  unsigned EltSize = EltVT.getSizeInBits();
9801  unsigned NumElem = VecVT.getVectorNumElements();
9802
9803  return SITargetLowering::shouldExpandVectorDynExt(EltSize, NumElem,
9804                                                    Idx->isDivergent());
9805}
9806
9807SDValue SITargetLowering::performExtractVectorEltCombine(
9808  SDNode *N, DAGCombinerInfo &DCI) const {
9809  SDValue Vec = N->getOperand(0);
9810  SelectionDAG &DAG = DCI.DAG;
9811
9812  EVT VecVT = Vec.getValueType();
9813  EVT EltVT = VecVT.getVectorElementType();
9814
9815  if ((Vec.getOpcode() == ISD::FNEG ||
9816       Vec.getOpcode() == ISD::FABS) && allUsesHaveSourceMods(N)) {
9817    SDLoc SL(N);
9818    EVT EltVT = N->getValueType(0);
9819    SDValue Idx = N->getOperand(1);
9820    SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
9821                              Vec.getOperand(0), Idx);
9822    return DAG.getNode(Vec.getOpcode(), SL, EltVT, Elt);
9823  }
9824
9825  // ScalarRes = EXTRACT_VECTOR_ELT ((vector-BINOP Vec1, Vec2), Idx)
9826  //    =>
9827  // Vec1Elt = EXTRACT_VECTOR_ELT(Vec1, Idx)
9828  // Vec2Elt = EXTRACT_VECTOR_ELT(Vec2, Idx)
9829  // ScalarRes = scalar-BINOP Vec1Elt, Vec2Elt
9830  if (Vec.hasOneUse() && DCI.isBeforeLegalize()) {
9831    SDLoc SL(N);
9832    EVT EltVT = N->getValueType(0);
9833    SDValue Idx = N->getOperand(1);
9834    unsigned Opc = Vec.getOpcode();
9835
9836    switch(Opc) {
9837    default:
9838      break;
9839      // TODO: Support other binary operations.
9840    case ISD::FADD:
9841    case ISD::FSUB:
9842    case ISD::FMUL:
9843    case ISD::ADD:
9844    case ISD::UMIN:
9845    case ISD::UMAX:
9846    case ISD::SMIN:
9847    case ISD::SMAX:
9848    case ISD::FMAXNUM:
9849    case ISD::FMINNUM:
9850    case ISD::FMAXNUM_IEEE:
9851    case ISD::FMINNUM_IEEE: {
9852      SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
9853                                 Vec.getOperand(0), Idx);
9854      SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
9855                                 Vec.getOperand(1), Idx);
9856
9857      DCI.AddToWorklist(Elt0.getNode());
9858      DCI.AddToWorklist(Elt1.getNode());
9859      return DAG.getNode(Opc, SL, EltVT, Elt0, Elt1, Vec->getFlags());
9860    }
9861    }
9862  }
9863
9864  unsigned VecSize = VecVT.getSizeInBits();
9865  unsigned EltSize = EltVT.getSizeInBits();
9866
9867  // EXTRACT_VECTOR_ELT (<n x e>, var-idx) => n x select (e, const-idx)
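  // For example, with a divergent index into a v4i32 this becomes:
  //   e0..e3 = extract_vector_elt v, 0..3
  //   r = select idx==3, e3, (select idx==2, e2, (select idx==1, e1, e0))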
9868  if (::shouldExpandVectorDynExt(N)) {
9869    SDLoc SL(N);
9870    SDValue Idx = N->getOperand(1);
9871    SDValue V;
9872    for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
9873      SDValue IC = DAG.getVectorIdxConstant(I, SL);
9874      SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec, IC);
9875      if (I == 0)
9876        V = Elt;
9877      else
9878        V = DAG.getSelectCC(SL, Idx, IC, Elt, V, ISD::SETEQ);
9879    }
9880    return V;
9881  }
9882
9883  if (!DCI.isBeforeLegalize())
9884    return SDValue();
9885
9886  // Try to turn sub-dword accesses of vectors into accesses of the same 32-bit
9887  // elements. This exposes more load reduction opportunities by replacing
9888  // multiple small extract_vector_elements with a single 32-bit extract.
9889  auto *Idx = dyn_cast<ConstantSDNode>(N->getOperand(1));
9890  if (isa<MemSDNode>(Vec) &&
9891      EltSize <= 16 &&
9892      EltVT.isByteSized() &&
9893      VecSize > 32 &&
9894      VecSize % 32 == 0 &&
9895      Idx) {
9896    EVT NewVT = getEquivalentMemType(*DAG.getContext(), VecVT);
9897
9898    unsigned BitIndex = Idx->getZExtValue() * EltSize;
9899    unsigned EltIdx = BitIndex / 32;
9900    unsigned LeftoverBitIdx = BitIndex % 32;
9901    SDLoc SL(N);
9902
9903    SDValue Cast = DAG.getNode(ISD::BITCAST, SL, NewVT, Vec);
9904    DCI.AddToWorklist(Cast.getNode());
9905
9906    SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Cast,
9907                              DAG.getConstant(EltIdx, SL, MVT::i32));
9908    DCI.AddToWorklist(Elt.getNode());
9909    SDValue Srl = DAG.getNode(ISD::SRL, SL, MVT::i32, Elt,
9910                              DAG.getConstant(LeftoverBitIdx, SL, MVT::i32));
9911    DCI.AddToWorklist(Srl.getNode());
9912
9913    SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, EltVT.changeTypeToInteger(), Srl);
9914    DCI.AddToWorklist(Trunc.getNode());
9915    return DAG.getNode(ISD::BITCAST, SL, EltVT, Trunc);
9916  }
9917
9918  return SDValue();
9919}
9920
9921SDValue
9922SITargetLowering::performInsertVectorEltCombine(SDNode *N,
9923                                                DAGCombinerInfo &DCI) const {
9924  SDValue Vec = N->getOperand(0);
9925  SDValue Idx = N->getOperand(2);
9926  EVT VecVT = Vec.getValueType();
9927  EVT EltVT = VecVT.getVectorElementType();
9928
9929  // INSERT_VECTOR_ELT (<n x e>, var-idx)
9930  // => BUILD_VECTOR n x select (e, const-idx)
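  // For example, inserting into a v4i32 at a divergent index becomes:
  //   build_vector (idx==0 ? ins : v0), (idx==1 ? ins : v1),
  //                (idx==2 ? ins : v2), (idx==3 ? ins : v3)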
9931  if (!::shouldExpandVectorDynExt(N))
9932    return SDValue();
9933
9934  SelectionDAG &DAG = DCI.DAG;
9935  SDLoc SL(N);
9936  SDValue Ins = N->getOperand(1);
9937  EVT IdxVT = Idx.getValueType();
9938
9939  SmallVector<SDValue, 16> Ops;
9940  for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
9941    SDValue IC = DAG.getConstant(I, SL, IdxVT);
9942    SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec, IC);
9943    SDValue V = DAG.getSelectCC(SL, Idx, IC, Ins, Elt, ISD::SETEQ);
9944    Ops.push_back(V);
9945  }
9946
9947  return DAG.getBuildVector(VecVT, SL, Ops);
9948}
9949
9950unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
9951                                          const SDNode *N0,
9952                                          const SDNode *N1) const {
9953  EVT VT = N0->getValueType(0);
9954
9955  // Only do this if we are not trying to support denormals. v_mad_f32 does not
9956  // support denormals ever.
9957  if (((VT == MVT::f32 && !hasFP32Denormals(DAG.getMachineFunction())) ||
9958       (VT == MVT::f16 && !hasFP64FP16Denormals(DAG.getMachineFunction()) &&
9959        getSubtarget()->hasMadF16())) &&
9960       isOperationLegal(ISD::FMAD, VT))
9961    return ISD::FMAD;
9962
9963  const TargetOptions &Options = DAG.getTarget().Options;
9964  if ((Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath ||
9965       (N0->getFlags().hasAllowContract() &&
9966        N1->getFlags().hasAllowContract())) &&
9967      isFMAFasterThanFMulAndFAdd(DAG.getMachineFunction(), VT)) {
9968    return ISD::FMA;
9969  }
9970
9971  return 0;
9972}
9973
9974// For a reassociatable opcode perform:
9975// op x, (op y, z) -> op (op x, z), y, if x and z are uniform
9976SDValue SITargetLowering::reassociateScalarOps(SDNode *N,
9977                                               SelectionDAG &DAG) const {
9978  EVT VT = N->getValueType(0);
9979  if (VT != MVT::i32 && VT != MVT::i64)
9980    return SDValue();
9981
9982  unsigned Opc = N->getOpcode();
9983  SDValue Op0 = N->getOperand(0);
9984  SDValue Op1 = N->getOperand(1);
9985
9986  if (!(Op0->isDivergent() ^ Op1->isDivergent()))
9987    return SDValue();
9988
9989  if (Op0->isDivergent())
9990    std::swap(Op0, Op1);
9991
9992  if (Op1.getOpcode() != Opc || !Op1.hasOneUse())
9993    return SDValue();
9994
9995  SDValue Op2 = Op1.getOperand(1);
9996  Op1 = Op1.getOperand(0);
9997  if (!(Op1->isDivergent() ^ Op2->isDivergent()))
9998    return SDValue();
9999
10000  if (Op1->isDivergent())
10001    std::swap(Op1, Op2);
10002
10003  // If either operand is constant this will conflict with
10004  // DAGCombiner::ReassociateOps().
10005  if (DAG.isConstantIntBuildVectorOrConstantInt(Op0) ||
10006      DAG.isConstantIntBuildVectorOrConstantInt(Op1))
10007    return SDValue();
10008
10009  SDLoc SL(N);
10010  SDValue Add1 = DAG.getNode(Opc, SL, VT, Op0, Op1);
10011  return DAG.getNode(Opc, SL, VT, Add1, Op2);
10012}
10013
10014static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL,
10015                           EVT VT,
10016                           SDValue N0, SDValue N1, SDValue N2,
10017                           bool Signed) {
10018  unsigned MadOpc = Signed ? AMDGPUISD::MAD_I64_I32 : AMDGPUISD::MAD_U64_U32;
10019  SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i1);
10020  SDValue Mad = DAG.getNode(MadOpc, SL, VTs, N0, N1, N2);
10021  return DAG.getNode(ISD::TRUNCATE, SL, VT, Mad);
10022}
10023
10024SDValue SITargetLowering::performAddCombine(SDNode *N,
10025                                            DAGCombinerInfo &DCI) const {
10026  SelectionDAG &DAG = DCI.DAG;
10027  EVT VT = N->getValueType(0);
10028  SDLoc SL(N);
10029  SDValue LHS = N->getOperand(0);
10030  SDValue RHS = N->getOperand(1);
10031
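  // Fold a 64-bit add of a 32x32 multiply into v_mad_u64_u32/v_mad_i64_i32
  // when both multiply operands are known to fit in 32 bits:
  //   add i64 (mul (zext a), (zext b)), c -> mad_u64_u32 a, b, c
  //   add i64 (mul (sext a), (sext b)), c -> mad_i64_i32 a, b, c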
10032  if ((LHS.getOpcode() == ISD::MUL || RHS.getOpcode() == ISD::MUL)
10033      && Subtarget->hasMad64_32() &&
10034      !VT.isVector() && VT.getScalarSizeInBits() > 32 &&
10035      VT.getScalarSizeInBits() <= 64) {
10036    if (LHS.getOpcode() != ISD::MUL)
10037      std::swap(LHS, RHS);
10038
10039    SDValue MulLHS = LHS.getOperand(0);
10040    SDValue MulRHS = LHS.getOperand(1);
10041    SDValue AddRHS = RHS;
10042
10043    // TODO: Maybe restrict if SGPR inputs.
10044    if (numBitsUnsigned(MulLHS, DAG) <= 32 &&
10045        numBitsUnsigned(MulRHS, DAG) <= 32) {
10046      MulLHS = DAG.getZExtOrTrunc(MulLHS, SL, MVT::i32);
10047      MulRHS = DAG.getZExtOrTrunc(MulRHS, SL, MVT::i32);
10048      AddRHS = DAG.getZExtOrTrunc(AddRHS, SL, MVT::i64);
10049      return getMad64_32(DAG, SL, VT, MulLHS, MulRHS, AddRHS, false);
10050    }
10051
10052    if (numBitsSigned(MulLHS, DAG) < 32 && numBitsSigned(MulRHS, DAG) < 32) {
10053      MulLHS = DAG.getSExtOrTrunc(MulLHS, SL, MVT::i32);
10054      MulRHS = DAG.getSExtOrTrunc(MulRHS, SL, MVT::i32);
10055      AddRHS = DAG.getSExtOrTrunc(AddRHS, SL, MVT::i64);
10056      return getMad64_32(DAG, SL, VT, MulLHS, MulRHS, AddRHS, true);
10057    }
10058
10059    return SDValue();
10060  }
10061
10062  if (SDValue V = reassociateScalarOps(N, DAG)) {
10063    return V;
10064  }
10065
10066  if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG())
10067    return SDValue();
10068
10069  // add x, zext (setcc) => addcarry x, 0, setcc
10070  // add x, sext (setcc) => subcarry x, 0, setcc
10071  unsigned Opc = LHS.getOpcode();
10072  if (Opc == ISD::ZERO_EXTEND || Opc == ISD::SIGN_EXTEND ||
10073      Opc == ISD::ANY_EXTEND || Opc == ISD::ADDCARRY)
10074    std::swap(RHS, LHS);
10075
10076  Opc = RHS.getOpcode();
10077  switch (Opc) {
10078  default: break;
10079  case ISD::ZERO_EXTEND:
10080  case ISD::SIGN_EXTEND:
10081  case ISD::ANY_EXTEND: {
10082    auto Cond = RHS.getOperand(0);
10083    // If this won't be a real VOPC output, we would still need to insert an
10084    // extra instruction anyway.
10085    if (!isBoolSGPR(Cond))
10086      break;
10087    SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
10088    SDValue Args[] = { LHS, DAG.getConstant(0, SL, MVT::i32), Cond };
10089    Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::SUBCARRY : ISD::ADDCARRY;
10090    return DAG.getNode(Opc, SL, VTList, Args);
10091  }
10092  case ISD::ADDCARRY: {
10093    // add x, (addcarry y, 0, cc) => addcarry x, y, cc
10094    auto C = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
10095    if (!C || C->getZExtValue() != 0) break;
10096    SDValue Args[] = { LHS, RHS.getOperand(0), RHS.getOperand(2) };
10097    return DAG.getNode(ISD::ADDCARRY, SDLoc(N), RHS->getVTList(), Args);
10098  }
10099  }
10100  return SDValue();
10101}
10102
10103SDValue SITargetLowering::performSubCombine(SDNode *N,
10104                                            DAGCombinerInfo &DCI) const {
10105  SelectionDAG &DAG = DCI.DAG;
10106  EVT VT = N->getValueType(0);
10107
10108  if (VT != MVT::i32)
10109    return SDValue();
10110
10111  SDLoc SL(N);
10112  SDValue LHS = N->getOperand(0);
10113  SDValue RHS = N->getOperand(1);
10114
10115  // sub x, zext (setcc) => subcarry x, 0, setcc
10116  // sub x, sext (setcc) => addcarry x, 0, setcc
10117  unsigned Opc = RHS.getOpcode();
10118  switch (Opc) {
10119  default: break;
10120  case ISD::ZERO_EXTEND:
10121  case ISD::SIGN_EXTEND:
10122  case ISD::ANY_EXTEND: {
10123    auto Cond = RHS.getOperand(0);
10124    // If this won't be a real VOPC output, we would still need to insert an
10125    // extra instruction anyway.
10126    if (!isBoolSGPR(Cond))
10127      break;
10128    SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
10129    SDValue Args[] = { LHS, DAG.getConstant(0, SL, MVT::i32), Cond };
10130    Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::ADDCARRY : ISD::SUBCARRY;
10131    return DAG.getNode(Opc, SL, VTList, Args);
10132  }
10133  }
10134
10135  if (LHS.getOpcode() == ISD::SUBCARRY) {
10136    // sub (subcarry x, 0, cc), y => subcarry x, y, cc
10137    auto C = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
10138    if (!C || !C->isNullValue())
10139      return SDValue();
10140    SDValue Args[] = { LHS.getOperand(0), RHS, LHS.getOperand(2) };
10141    return DAG.getNode(ISD::SUBCARRY, SDLoc(N), LHS->getVTList(), Args);
10142  }
10143  return SDValue();
10144}
10145
10146SDValue SITargetLowering::performAddCarrySubCarryCombine(SDNode *N,
10147  DAGCombinerInfo &DCI) const {
10148
10149  if (N->getValueType(0) != MVT::i32)
10150    return SDValue();
10151
10152  auto C = dyn_cast<ConstantSDNode>(N->getOperand(1));
10153  if (!C || C->getZExtValue() != 0)
10154    return SDValue();
10155
10156  SelectionDAG &DAG = DCI.DAG;
10157  SDValue LHS = N->getOperand(0);
10158
10159  // addcarry (add x, y), 0, cc => addcarry x, y, cc
10160  // subcarry (sub x, y), 0, cc => subcarry x, y, cc
10161  unsigned LHSOpc = LHS.getOpcode();
10162  unsigned Opc = N->getOpcode();
10163  if ((LHSOpc == ISD::ADD && Opc == ISD::ADDCARRY) ||
10164      (LHSOpc == ISD::SUB && Opc == ISD::SUBCARRY)) {
10165    SDValue Args[] = { LHS.getOperand(0), LHS.getOperand(1), N->getOperand(2) };
10166    return DAG.getNode(Opc, SDLoc(N), N->getVTList(), Args);
10167  }
10168  return SDValue();
10169}
10170
10171SDValue SITargetLowering::performFAddCombine(SDNode *N,
10172                                             DAGCombinerInfo &DCI) const {
10173  if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
10174    return SDValue();
10175
10176  SelectionDAG &DAG = DCI.DAG;
10177  EVT VT = N->getValueType(0);
10178
10179  SDLoc SL(N);
10180  SDValue LHS = N->getOperand(0);
10181  SDValue RHS = N->getOperand(1);
10182
10183  // These should really be instruction patterns, but writing patterns with
10184  // source modifiers is a pain.
10185
10186  // fadd (fadd (a, a), b) -> mad 2.0, a, b
10187  if (LHS.getOpcode() == ISD::FADD) {
10188    SDValue A = LHS.getOperand(0);
10189    if (A == LHS.getOperand(1)) {
10190      unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
10191      if (FusedOp != 0) {
10192        const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
10193        return DAG.getNode(FusedOp, SL, VT, A, Two, RHS);
10194      }
10195    }
10196  }
10197
10198  // fadd (b, fadd (a, a)) -> mad 2.0, a, b
10199  if (RHS.getOpcode() == ISD::FADD) {
10200    SDValue A = RHS.getOperand(0);
10201    if (A == RHS.getOperand(1)) {
10202      unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
10203      if (FusedOp != 0) {
10204        const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
10205        return DAG.getNode(FusedOp, SL, VT, A, Two, LHS);
10206      }
10207    }
10208  }
10209
10210  return SDValue();
10211}
10212
10213SDValue SITargetLowering::performFSubCombine(SDNode *N,
10214                                             DAGCombinerInfo &DCI) const {
10215  if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
10216    return SDValue();
10217
10218  SelectionDAG &DAG = DCI.DAG;
10219  SDLoc SL(N);
10220  EVT VT = N->getValueType(0);
10221  assert(!VT.isVector());
10222
10223  // Try to get the fneg to fold into the source modifier. This undoes generic
10224  // DAG combines and folds them into the mad.
10225  //
10226  // Only do this if we are not trying to support denormals. v_mad_f32 does
10227  // not support denormals ever.
10228  SDValue LHS = N->getOperand(0);
10229  SDValue RHS = N->getOperand(1);
10230  if (LHS.getOpcode() == ISD::FADD) {
10231    // (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c)
10232    SDValue A = LHS.getOperand(0);
10233    if (A == LHS.getOperand(1)) {
10234      unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
10235      if (FusedOp != 0){
10236        const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
10237        SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
10238
10239        return DAG.getNode(FusedOp, SL, VT, A, Two, NegRHS);
10240      }
10241    }
10242  }
10243
10244  if (RHS.getOpcode() == ISD::FADD) {
10245    // (fsub c, (fadd a, a)) -> mad -2.0, a, c
10246
10247    SDValue A = RHS.getOperand(0);
10248    if (A == RHS.getOperand(1)) {
10249      unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
10250      if (FusedOp != 0){
10251        const SDValue NegTwo = DAG.getConstantFP(-2.0, SL, VT);
10252        return DAG.getNode(FusedOp, SL, VT, A, NegTwo, LHS);
10253      }
10254    }
10255  }
10256
10257  return SDValue();
10258}
10259
10260SDValue SITargetLowering::performFMACombine(SDNode *N,
10261                                            DAGCombinerInfo &DCI) const {
10262  SelectionDAG &DAG = DCI.DAG;
10263  EVT VT = N->getValueType(0);
10264  SDLoc SL(N);
10265
10266  if (!Subtarget->hasDot2Insts() || VT != MVT::f32)
10267    return SDValue();
10268
10269  // FMA((F32)S0.x, (F32)S1.x, FMA((F32)S0.y, (F32)S1.y, (F32)z)) ->
10270  //   FDOT2((V2F16)S0, (V2F16)S1, (F32)z)
10271  SDValue Op1 = N->getOperand(0);
10272  SDValue Op2 = N->getOperand(1);
10273  SDValue FMA = N->getOperand(2);
10274
10275  if (FMA.getOpcode() != ISD::FMA ||
10276      Op1.getOpcode() != ISD::FP_EXTEND ||
10277      Op2.getOpcode() != ISD::FP_EXTEND)
10278    return SDValue();
10279
10280  // fdot2_f32_f16 always flushes fp32 denormal operand and output to zero,
10281  // regardless of the denorm mode setting. Therefore, unsafe-fp-math/fp-contract
10282  // is sufficient to allow generating fdot2.
10283  const TargetOptions &Options = DAG.getTarget().Options;
10284  if (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath ||
10285      (N->getFlags().hasAllowContract() &&
10286       FMA->getFlags().hasAllowContract())) {
10287    Op1 = Op1.getOperand(0);
10288    Op2 = Op2.getOperand(0);
10289    if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
10290        Op2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
10291      return SDValue();
10292
10293    SDValue Vec1 = Op1.getOperand(0);
10294    SDValue Idx1 = Op1.getOperand(1);
10295    SDValue Vec2 = Op2.getOperand(0);
10296
10297    SDValue FMAOp1 = FMA.getOperand(0);
10298    SDValue FMAOp2 = FMA.getOperand(1);
10299    SDValue FMAAcc = FMA.getOperand(2);
10300
10301    if (FMAOp1.getOpcode() != ISD::FP_EXTEND ||
10302        FMAOp2.getOpcode() != ISD::FP_EXTEND)
10303      return SDValue();
10304
10305    FMAOp1 = FMAOp1.getOperand(0);
10306    FMAOp2 = FMAOp2.getOperand(0);
10307    if (FMAOp1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
10308        FMAOp2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
10309      return SDValue();
10310
10311    SDValue Vec3 = FMAOp1.getOperand(0);
10312    SDValue Vec4 = FMAOp2.getOperand(0);
10313    SDValue Idx2 = FMAOp1.getOperand(1);
10314
10315    if (Idx1 != Op2.getOperand(1) || Idx2 != FMAOp2.getOperand(1) ||
10316        // Idx1 and Idx2 cannot be the same.
10317        Idx1 == Idx2)
10318      return SDValue();
10319
10320    if (Vec1 == Vec2 || Vec3 == Vec4)
10321      return SDValue();
10322
10323    if (Vec1.getValueType() != MVT::v2f16 || Vec2.getValueType() != MVT::v2f16)
10324      return SDValue();
10325
10326    if ((Vec1 == Vec3 && Vec2 == Vec4) ||
10327        (Vec1 == Vec4 && Vec2 == Vec3)) {
10328      return DAG.getNode(AMDGPUISD::FDOT2, SL, MVT::f32, Vec1, Vec2, FMAAcc,
10329                         DAG.getTargetConstant(0, SL, MVT::i1));
10330    }
10331  }
10332  return SDValue();
10333}
10334
10335SDValue SITargetLowering::performSetCCCombine(SDNode *N,
10336                                              DAGCombinerInfo &DCI) const {
10337  SelectionDAG &DAG = DCI.DAG;
10338  SDLoc SL(N);
10339
10340  SDValue LHS = N->getOperand(0);
10341  SDValue RHS = N->getOperand(1);
10342  EVT VT = LHS.getValueType();
10343  ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
10344
10345  auto CRHS = dyn_cast<ConstantSDNode>(RHS);
10346  if (!CRHS) {
10347    CRHS = dyn_cast<ConstantSDNode>(LHS);
10348    if (CRHS) {
10349      std::swap(LHS, RHS);
10350      CC = getSetCCSwappedOperands(CC);
10351    }
10352  }
10353
10354  if (CRHS) {
10355    if (VT == MVT::i32 && LHS.getOpcode() == ISD::SIGN_EXTEND &&
10356        isBoolSGPR(LHS.getOperand(0))) {
10357      // setcc (sext from i1 cc), -1, ne|sgt|ult) => not cc => xor cc, -1
10358      // setcc (sext from i1 cc), -1, eq|sle|uge) => cc
10359      // setcc (sext from i1 cc),  0, eq|sge|ule) => not cc => xor cc, -1
10360      // setcc (sext from i1 cc),  0, ne|ugt|slt) => cc
10361      if ((CRHS->isAllOnesValue() &&
10362           (CC == ISD::SETNE || CC == ISD::SETGT || CC == ISD::SETULT)) ||
10363          (CRHS->isNullValue() &&
10364           (CC == ISD::SETEQ || CC == ISD::SETGE || CC == ISD::SETULE)))
10365        return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
10366                           DAG.getConstant(-1, SL, MVT::i1));
10367      if ((CRHS->isAllOnesValue() &&
10368           (CC == ISD::SETEQ || CC == ISD::SETLE || CC == ISD::SETUGE)) ||
10369          (CRHS->isNullValue() &&
10370           (CC == ISD::SETNE || CC == ISD::SETUGT || CC == ISD::SETLT)))
10371        return LHS.getOperand(0);
10372    }
10373
10374    uint64_t CRHSVal = CRHS->getZExtValue();
10375    if ((CC == ISD::SETEQ || CC == ISD::SETNE) &&
10376        LHS.getOpcode() == ISD::SELECT &&
10377        isa<ConstantSDNode>(LHS.getOperand(1)) &&
10378        isa<ConstantSDNode>(LHS.getOperand(2)) &&
10379        LHS.getConstantOperandVal(1) != LHS.getConstantOperandVal(2) &&
10380        isBoolSGPR(LHS.getOperand(0))) {
10381      // Given CT != FT:
10382      // setcc (select cc, CT, CF), CF, eq => xor cc, -1
10383      // setcc (select cc, CT, CF), CF, ne => cc
10384      // setcc (select cc, CT, CF), CT, ne => xor cc, -1
10385      // setcc (select cc, CT, CF), CT, eq => cc
10386      uint64_t CT = LHS.getConstantOperandVal(1);
10387      uint64_t CF = LHS.getConstantOperandVal(2);
10388
10389      if ((CF == CRHSVal && CC == ISD::SETEQ) ||
10390          (CT == CRHSVal && CC == ISD::SETNE))
10391        return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
10392                           DAG.getConstant(-1, SL, MVT::i1));
10393      if ((CF == CRHSVal && CC == ISD::SETNE) ||
10394          (CT == CRHSVal && CC == ISD::SETEQ))
10395        return LHS.getOperand(0);
10396    }
10397  }
10398
10399  if (VT != MVT::f32 && VT != MVT::f64 && (Subtarget->has16BitInsts() &&
10400                                           VT != MVT::f16))
10401    return SDValue();
10402
10403  // Match isinf/isfinite pattern
10404  // (fcmp oeq (fabs x), inf) -> (fp_class x, (p_infinity | n_infinity))
10405  // (fcmp one (fabs x), inf) -> (fp_class x,
10406  // (p_normal | n_normal | p_subnormal | n_subnormal | p_zero | n_zero))
10407  if ((CC == ISD::SETOEQ || CC == ISD::SETONE) && LHS.getOpcode() == ISD::FABS) {
10408    const ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
10409    if (!CRHS)
10410      return SDValue();
10411
10412    const APFloat &APF = CRHS->getValueAPF();
10413    if (APF.isInfinity() && !APF.isNegative()) {
10414      const unsigned IsInfMask = SIInstrFlags::P_INFINITY |
10415                                 SIInstrFlags::N_INFINITY;
10416      const unsigned IsFiniteMask = SIInstrFlags::N_ZERO |
10417                                    SIInstrFlags::P_ZERO |
10418                                    SIInstrFlags::N_NORMAL |
10419                                    SIInstrFlags::P_NORMAL |
10420                                    SIInstrFlags::N_SUBNORMAL |
10421                                    SIInstrFlags::P_SUBNORMAL;
10422      unsigned Mask = CC == ISD::SETOEQ ? IsInfMask : IsFiniteMask;
10423      return DAG.getNode(AMDGPUISD::FP_CLASS, SL, MVT::i1, LHS.getOperand(0),
10424                         DAG.getConstant(Mask, SL, MVT::i32));
10425    }
10426  }
10427
10428  return SDValue();
10429}
10430
10431SDValue SITargetLowering::performCvtF32UByteNCombine(SDNode *N,
10432                                                     DAGCombinerInfo &DCI) const {
10433  SelectionDAG &DAG = DCI.DAG;
10434  SDLoc SL(N);
10435  unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0;
10436
10437  SDValue Src = N->getOperand(0);
10438  SDValue Shift = N->getOperand(0);
10439
10440  // TODO: Extend type shouldn't matter (assuming legal types).
10441  if (Shift.getOpcode() == ISD::ZERO_EXTEND)
10442    Shift = Shift.getOperand(0);
10443
10444  if (Shift.getOpcode() == ISD::SRL || Shift.getOpcode() == ISD::SHL) {
10445    // cvt_f32_ubyte1 (shl x,  8) -> cvt_f32_ubyte0 x
10446    // cvt_f32_ubyte3 (shl x, 16) -> cvt_f32_ubyte1 x
10447    // cvt_f32_ubyte0 (srl x, 16) -> cvt_f32_ubyte2 x
10448    // cvt_f32_ubyte1 (srl x, 16) -> cvt_f32_ubyte3 x
10449    // cvt_f32_ubyte0 (srl x,  8) -> cvt_f32_ubyte1 x
10450    if (auto *C = dyn_cast<ConstantSDNode>(Shift.getOperand(1))) {
10451      Shift = DAG.getZExtOrTrunc(Shift.getOperand(0),
10452                                 SDLoc(Shift.getOperand(0)), MVT::i32);
10453
10454      unsigned ShiftOffset = 8 * Offset;
10455      if (Shift.getOpcode() == ISD::SHL)
10456        ShiftOffset -= C->getZExtValue();
10457      else
10458        ShiftOffset += C->getZExtValue();
10459
10460      if (ShiftOffset < 32 && (ShiftOffset % 8) == 0) {
10461        return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0 + ShiftOffset / 8, SL,
10462                           MVT::f32, Shift);
10463      }
10464    }
10465  }
10466
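  // Only the byte selected by this cvt_f32_ubyteN is demanded from the source;
  // try to simplify away anything that only affects the other bytes.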
10467  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
10468  APInt DemandedBits = APInt::getBitsSet(32, 8 * Offset, 8 * Offset + 8);
10469  if (TLI.SimplifyDemandedBits(Src, DemandedBits, DCI)) {
10470    // We simplified Src. If this node is not dead, visit it again so it is
10471    // folded properly.
10472    if (N->getOpcode() != ISD::DELETED_NODE)
10473      DCI.AddToWorklist(N);
10474    return SDValue(N, 0);
10475  }
10476
10477  // Handle (or x, (srl y, 8)) pattern when known bits are zero.
10478  if (SDValue DemandedSrc =
10479          TLI.SimplifyMultipleUseDemandedBits(Src, DemandedBits, DAG))
10480    return DAG.getNode(N->getOpcode(), SL, MVT::f32, DemandedSrc);
10481
10482  return SDValue();
10483}
10484
10485SDValue SITargetLowering::performClampCombine(SDNode *N,
10486                                              DAGCombinerInfo &DCI) const {
10487  ConstantFPSDNode *CSrc = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
10488  if (!CSrc)
10489    return SDValue();
10490
10491  const MachineFunction &MF = DCI.DAG.getMachineFunction();
10492  const APFloat &F = CSrc->getValueAPF();
10493  APFloat Zero = APFloat::getZero(F.getSemantics());
10494  if (F < Zero ||
10495      (F.isNaN() && MF.getInfo<SIMachineFunctionInfo>()->getMode().DX10Clamp)) {
10496    return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0));
10497  }
10498
10499  APFloat One(F.getSemantics(), "1.0");
10500  if (F > One)
10501    return DCI.DAG.getConstantFP(One, SDLoc(N), N->getValueType(0));
10502
10503  return SDValue(CSrc, 0);
10504}
10505
10506
10507SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
10508                                            DAGCombinerInfo &DCI) const {
10509  if (getTargetMachine().getOptLevel() == CodeGenOpt::None)
10510    return SDValue();
10511  switch (N->getOpcode()) {
10512  default:
10513    return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
10514  case ISD::ADD:
10515    return performAddCombine(N, DCI);
10516  case ISD::SUB:
10517    return performSubCombine(N, DCI);
10518  case ISD::ADDCARRY:
10519  case ISD::SUBCARRY:
10520    return performAddCarrySubCarryCombine(N, DCI);
10521  case ISD::FADD:
10522    return performFAddCombine(N, DCI);
10523  case ISD::FSUB:
10524    return performFSubCombine(N, DCI);
10525  case ISD::SETCC:
10526    return performSetCCCombine(N, DCI);
10527  case ISD::FMAXNUM:
10528  case ISD::FMINNUM:
10529  case ISD::FMAXNUM_IEEE:
10530  case ISD::FMINNUM_IEEE:
10531  case ISD::SMAX:
10532  case ISD::SMIN:
10533  case ISD::UMAX:
10534  case ISD::UMIN:
10535  case AMDGPUISD::FMIN_LEGACY:
10536  case AMDGPUISD::FMAX_LEGACY:
10537    return performMinMaxCombine(N, DCI);
10538  case ISD::FMA:
10539    return performFMACombine(N, DCI);
10540  case ISD::LOAD: {
10541    if (SDValue Widened = widenLoad(cast<LoadSDNode>(N), DCI))
10542      return Widened;
10543    LLVM_FALLTHROUGH;
10544  }
10545  case ISD::STORE:
10546  case ISD::ATOMIC_LOAD:
10547  case ISD::ATOMIC_STORE:
10548  case ISD::ATOMIC_CMP_SWAP:
10549  case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
10550  case ISD::ATOMIC_SWAP:
10551  case ISD::ATOMIC_LOAD_ADD:
10552  case ISD::ATOMIC_LOAD_SUB:
10553  case ISD::ATOMIC_LOAD_AND:
10554  case ISD::ATOMIC_LOAD_OR:
10555  case ISD::ATOMIC_LOAD_XOR:
10556  case ISD::ATOMIC_LOAD_NAND:
10557  case ISD::ATOMIC_LOAD_MIN:
10558  case ISD::ATOMIC_LOAD_MAX:
10559  case ISD::ATOMIC_LOAD_UMIN:
10560  case ISD::ATOMIC_LOAD_UMAX:
10561  case ISD::ATOMIC_LOAD_FADD:
10562  case AMDGPUISD::ATOMIC_INC:
10563  case AMDGPUISD::ATOMIC_DEC:
10564  case AMDGPUISD::ATOMIC_LOAD_FMIN:
10565  case AMDGPUISD::ATOMIC_LOAD_FMAX: // TODO: Target mem intrinsics.
10566    if (DCI.isBeforeLegalize())
10567      break;
10568    return performMemSDNodeCombine(cast<MemSDNode>(N), DCI);
10569  case ISD::AND:
10570    return performAndCombine(N, DCI);
10571  case ISD::OR:
10572    return performOrCombine(N, DCI);
10573  case ISD::XOR:
10574    return performXorCombine(N, DCI);
10575  case ISD::ZERO_EXTEND:
10576    return performZeroExtendCombine(N, DCI);
10577  case ISD::SIGN_EXTEND_INREG:
10578    return performSignExtendInRegCombine(N , DCI);
10579  case AMDGPUISD::FP_CLASS:
10580    return performClassCombine(N, DCI);
10581  case ISD::FCANONICALIZE:
10582    return performFCanonicalizeCombine(N, DCI);
10583  case AMDGPUISD::RCP:
10584    return performRcpCombine(N, DCI);
10585  case AMDGPUISD::FRACT:
10586  case AMDGPUISD::RSQ:
10587  case AMDGPUISD::RCP_LEGACY:
10588  case AMDGPUISD::RCP_IFLAG:
10589  case AMDGPUISD::RSQ_CLAMP:
10590  case AMDGPUISD::LDEXP: {
10591    // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted
10592    SDValue Src = N->getOperand(0);
10593    if (Src.isUndef())
10594      return Src;
10595    break;
10596  }
10597  case ISD::SINT_TO_FP:
10598  case ISD::UINT_TO_FP:
10599    return performUCharToFloatCombine(N, DCI);
10600  case AMDGPUISD::CVT_F32_UBYTE0:
10601  case AMDGPUISD::CVT_F32_UBYTE1:
10602  case AMDGPUISD::CVT_F32_UBYTE2:
10603  case AMDGPUISD::CVT_F32_UBYTE3:
10604    return performCvtF32UByteNCombine(N, DCI);
10605  case AMDGPUISD::FMED3:
10606    return performFMed3Combine(N, DCI);
10607  case AMDGPUISD::CVT_PKRTZ_F16_F32:
10608    return performCvtPkRTZCombine(N, DCI);
10609  case AMDGPUISD::CLAMP:
10610    return performClampCombine(N, DCI);
10611  case ISD::SCALAR_TO_VECTOR: {
10612    SelectionDAG &DAG = DCI.DAG;
10613    EVT VT = N->getValueType(0);
10614
10615    // v2i16 (scalar_to_vector i16:x) -> v2i16 (bitcast (any_extend i16:x))
10616    if (VT == MVT::v2i16 || VT == MVT::v2f16) {
10617      SDLoc SL(N);
10618      SDValue Src = N->getOperand(0);
10619      EVT EltVT = Src.getValueType();
10620      if (EltVT == MVT::f16)
10621        Src = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Src);
10622
10623      SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Src);
10624      return DAG.getNode(ISD::BITCAST, SL, VT, Ext);
10625    }
10626
10627    break;
10628  }
10629  case ISD::EXTRACT_VECTOR_ELT:
10630    return performExtractVectorEltCombine(N, DCI);
10631  case ISD::INSERT_VECTOR_ELT:
10632    return performInsertVectorEltCombine(N, DCI);
10633  }
10634  return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
10635}
10636
10637/// Helper function for adjustWritemask
10638static unsigned SubIdx2Lane(unsigned Idx) {
10639  switch (Idx) {
10640  default: return 0;
10641  case AMDGPU::sub0: return 0;
10642  case AMDGPU::sub1: return 1;
10643  case AMDGPU::sub2: return 2;
10644  case AMDGPU::sub3: return 3;
10645  case AMDGPU::sub4: return 4; // Possible with TFE/LWE
10646  }
10647}
10648
10649/// Adjust the writemask of MIMG instructions
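/// If only some components of an image load's vdata result are actually
/// extracted, shrink the dmask so the instruction returns just those
/// components. For example, an image load with dmask 0xf whose result is only
/// read through an EXTRACT_SUBREG of sub0 can be rewritten to use dmask 0x1
/// and a single result register.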
10650SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
10651                                          SelectionDAG &DAG) const {
10652  unsigned Opcode = Node->getMachineOpcode();
10653
10654  // Subtract 1 because the vdata output is not a MachineSDNode operand.
10655  int D16Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::d16) - 1;
10656  if (D16Idx >= 0 && Node->getConstantOperandVal(D16Idx))
10657    return Node; // not implemented for D16
10658
10659  SDNode *Users[5] = { nullptr };
10660  unsigned Lane = 0;
10661  unsigned DmaskIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::dmask) - 1;
10662  unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx);
10663  unsigned NewDmask = 0;
10664  unsigned TFEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::tfe) - 1;
10665  unsigned LWEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::lwe) - 1;
10666  bool UsesTFC = (Node->getConstantOperandVal(TFEIdx) ||
10667                  Node->getConstantOperandVal(LWEIdx)) ? 1 : 0;
10668  unsigned TFCLane = 0;
10669  bool HasChain = Node->getNumValues() > 1;
10670
10671  if (OldDmask == 0) {
10672    // These are folded out, but on the off chance it happens, don't assert.
10673    return Node;
10674  }
10675
10676  unsigned OldBitsSet = countPopulation(OldDmask);
10677  // Work out which is the TFE/LWE lane if that is enabled.
10678  if (UsesTFC) {
10679    TFCLane = OldBitsSet;
10680  }
10681
10682  // Try to figure out the used register components
10683  for (SDNode::use_iterator I = Node->use_begin(), E = Node->use_end();
10684       I != E; ++I) {
10685
10686    // Don't look at users of the chain.
10687    if (I.getUse().getResNo() != 0)
10688      continue;
10689
10690    // Abort if we can't understand the usage
10691    if (!I->isMachineOpcode() ||
10692        I->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
10693      return Node;
10694
10695    // Lane means which subreg of %vgpra_vgprb_vgprc_vgprd is used.
10696    // Note that subregs are packed, i.e. Lane==0 is the first bit set
10697    // in OldDmask, so it can be any of X,Y,Z,W; Lane==1 is the second bit
10698    // set, etc.
10699    Lane = SubIdx2Lane(I->getConstantOperandVal(1));
10700
10701    // Check if the use is for the TFE/LWE generated result at VGPRn+1.
10702    if (UsesTFC && Lane == TFCLane) {
10703      Users[Lane] = *I;
10704    } else {
10705      // Set which texture component corresponds to the lane.
10706      unsigned Comp;
10707      for (unsigned i = 0, Dmask = OldDmask; (i <= Lane) && (Dmask != 0); i++) {
10708        Comp = countTrailingZeros(Dmask);
10709        Dmask &= ~(1 << Comp);
10710      }
10711
10712      // Abort if we have more than one user per component.
10713      if (Users[Lane])
10714        return Node;
10715
10716      Users[Lane] = *I;
10717      NewDmask |= 1 << Comp;
10718    }
10719  }
10720
10721  // Don't allow 0 dmask, as hardware assumes one channel enabled.
10722  bool NoChannels = !NewDmask;
10723  if (NoChannels) {
10724    if (!UsesTFC) {
10725      // No uses of the result and not using TFC. Then do nothing.
10726      return Node;
10727    }
10728    // If the original dmask has only one channel, there is nothing to do.
10729    if (OldBitsSet == 1)
10730      return Node;
10731    // Use an arbitrary dmask - required for the instruction to work
10732    NewDmask = 1;
10733  }
10734  // Abort if there's no change
10735  if (NewDmask == OldDmask)
10736    return Node;
10737
10738  unsigned BitsSet = countPopulation(NewDmask);
10739
10740  // Check for TFE or LWE - increase the number of channels by one to account
10741  // for the extra return value
10742  // This will need adjustment for D16 if that is also handled in
10743  // adjustWritemask (this function), but at present D16 is excluded.
10744  unsigned NewChannels = BitsSet + UsesTFC;
10745
10746  int NewOpcode =
10747      AMDGPU::getMaskedMIMGOp(Node->getMachineOpcode(), NewChannels);
10748  assert(NewOpcode != -1 &&
10749         NewOpcode != static_cast<int>(Node->getMachineOpcode()) &&
10750         "failed to find equivalent MIMG op");
10751
10752  // Adjust the writemask in the node
10753  SmallVector<SDValue, 12> Ops;
10754  Ops.insert(Ops.end(), Node->op_begin(), Node->op_begin() + DmaskIdx);
10755  Ops.push_back(DAG.getTargetConstant(NewDmask, SDLoc(Node), MVT::i32));
10756  Ops.insert(Ops.end(), Node->op_begin() + DmaskIdx + 1, Node->op_end());
10757
10758  MVT SVT = Node->getValueType(0).getVectorElementType().getSimpleVT();
10759
10760  MVT ResultVT = NewChannels == 1 ?
10761    SVT : MVT::getVectorVT(SVT, NewChannels == 3 ? 4 :
10762                           NewChannels == 5 ? 8 : NewChannels);
10763  SDVTList NewVTList = HasChain ?
10764    DAG.getVTList(ResultVT, MVT::Other) : DAG.getVTList(ResultVT);
10765
10766
10767  MachineSDNode *NewNode = DAG.getMachineNode(NewOpcode, SDLoc(Node),
10768                                              NewVTList, Ops);
10769
10770  if (HasChain) {
10771    // Update chain.
10772    DAG.setNodeMemRefs(NewNode, Node->memoperands());
10773    DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 1), SDValue(NewNode, 1));
10774  }
10775
10776  if (NewChannels == 1) {
10777    assert(Node->hasNUsesOfValue(1, 0));
10778    SDNode *Copy = DAG.getMachineNode(TargetOpcode::COPY,
10779                                      SDLoc(Node), Users[Lane]->getValueType(0),
10780                                      SDValue(NewNode, 0));
10781    DAG.ReplaceAllUsesWith(Users[Lane], Copy);
10782    return nullptr;
10783  }
10784
10785  // Update the users of the node with the new indices
10786  for (unsigned i = 0, Idx = AMDGPU::sub0; i < 5; ++i) {
10787    SDNode *User = Users[i];
10788    if (!User) {
10789      // Handle the special case of NoChannels. We set NewDmask to 1 above, but
10790      // Users[0] is still nullptr because channel 0 doesn't really have a use.
10791      if (i || !NoChannels)
10792        continue;
10793    } else {
10794      SDValue Op = DAG.getTargetConstant(Idx, SDLoc(User), MVT::i32);
10795      DAG.UpdateNodeOperands(User, SDValue(NewNode, 0), Op);
10796    }
10797
10798    switch (Idx) {
10799    default: break;
10800    case AMDGPU::sub0: Idx = AMDGPU::sub1; break;
10801    case AMDGPU::sub1: Idx = AMDGPU::sub2; break;
10802    case AMDGPU::sub2: Idx = AMDGPU::sub3; break;
10803    case AMDGPU::sub3: Idx = AMDGPU::sub4; break;
10804    }
10805  }
10806
10807  DAG.RemoveDeadNode(Node);
10808  return nullptr;
10809}
10810
10811static bool isFrameIndexOp(SDValue Op) {
10812  if (Op.getOpcode() == ISD::AssertZext)
10813    Op = Op.getOperand(0);
10814
10815  return isa<FrameIndexSDNode>(Op);
10816}
10817
10818/// Legalize target independent instructions (e.g. INSERT_SUBREG)
10819/// with frame index operands.
10820/// LLVM assumes that inputs to these instructions are registers.
10821SDNode *SITargetLowering::legalizeTargetIndependentNode(SDNode *Node,
10822                                                        SelectionDAG &DAG) const {
10823  if (Node->getOpcode() == ISD::CopyToReg) {
10824    RegisterSDNode *DestReg = cast<RegisterSDNode>(Node->getOperand(1));
10825    SDValue SrcVal = Node->getOperand(2);
10826
10827    // Insert a copy to a VReg_1 virtual register so LowerI1Copies doesn't have
10828    // to try understanding copies to physical registers.
10829    if (SrcVal.getValueType() == MVT::i1 &&
10830        Register::isPhysicalRegister(DestReg->getReg())) {
10831      SDLoc SL(Node);
10832      MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
10833      SDValue VReg = DAG.getRegister(
10834        MRI.createVirtualRegister(&AMDGPU::VReg_1RegClass), MVT::i1);
10835
10836      SDNode *Glued = Node->getGluedNode();
10837      SDValue ToVReg
10838        = DAG.getCopyToReg(Node->getOperand(0), SL, VReg, SrcVal,
10839                         SDValue(Glued, Glued ? Glued->getNumValues() - 1 : 0));
10840      SDValue ToResultReg
10841        = DAG.getCopyToReg(ToVReg, SL, SDValue(DestReg, 0),
10842                           VReg, ToVReg.getValue(1));
10843      DAG.ReplaceAllUsesWith(Node, ToResultReg.getNode());
10844      DAG.RemoveDeadNode(Node);
10845      return ToResultReg.getNode();
10846    }
10847  }
10848
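  // For all other target-independent nodes, replace any frame index operands
  // with an S_MOV_B32 of the frame index so the node only sees registers.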
10849  SmallVector<SDValue, 8> Ops;
10850  for (unsigned i = 0; i < Node->getNumOperands(); ++i) {
10851    if (!isFrameIndexOp(Node->getOperand(i))) {
10852      Ops.push_back(Node->getOperand(i));
10853      continue;
10854    }
10855
10856    SDLoc DL(Node);
10857    Ops.push_back(SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL,
10858                                     Node->getOperand(i).getValueType(),
10859                                     Node->getOperand(i)), 0));
10860  }
10861
10862  return DAG.UpdateNodeOperands(Node, Ops);
10863}
10864
10865/// Fold the instructions after selecting them.
10866/// Returns null if users were already updated.
10867SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
10868                                          SelectionDAG &DAG) const {
10869  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
10870  unsigned Opcode = Node->getMachineOpcode();
10871
10872  if (TII->isMIMG(Opcode) && !TII->get(Opcode).mayStore() &&
10873      !TII->isGather4(Opcode)) {
10874    return adjustWritemask(Node, DAG);
10875  }
10876
10877  if (Opcode == AMDGPU::INSERT_SUBREG ||
10878      Opcode == AMDGPU::REG_SEQUENCE) {
10879    legalizeTargetIndependentNode(Node, DAG);
10880    return Node;
10881  }
10882
10883  switch (Opcode) {
10884  case AMDGPU::V_DIV_SCALE_F32:
10885  case AMDGPU::V_DIV_SCALE_F64: {
10886    // Satisfy the operand register constraint when one of the inputs is
10887    // undefined. Ordinarily each undef value will have its own implicit_def of
10888    // a vreg, so force these to use a single register.
10889    SDValue Src0 = Node->getOperand(0);
10890    SDValue Src1 = Node->getOperand(1);
10891    SDValue Src2 = Node->getOperand(2);
10892
10893    if ((Src0.isMachineOpcode() &&
10894         Src0.getMachineOpcode() != AMDGPU::IMPLICIT_DEF) &&
10895        (Src0 == Src1 || Src0 == Src2))
10896      break;
10897
10898    MVT VT = Src0.getValueType().getSimpleVT();
10899    const TargetRegisterClass *RC =
10900        getRegClassFor(VT, Src0.getNode()->isDivergent());
10901
10902    MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
10903    SDValue UndefReg = DAG.getRegister(MRI.createVirtualRegister(RC), VT);
10904
10905    SDValue ImpDef = DAG.getCopyToReg(DAG.getEntryNode(), SDLoc(Node),
10906                                      UndefReg, Src0, SDValue());
10907
10908    // src0 must be the same register as src1 or src2, even if the value is
10909    // undefined, so make sure we don't violate this constraint.
10910    if (Src0.isMachineOpcode() &&
10911        Src0.getMachineOpcode() == AMDGPU::IMPLICIT_DEF) {
10912      if (Src1.isMachineOpcode() &&
10913          Src1.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
10914        Src0 = Src1;
10915      else if (Src2.isMachineOpcode() &&
10916               Src2.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
10917        Src0 = Src2;
10918      else {
10919        assert(Src1.getMachineOpcode() == AMDGPU::IMPLICIT_DEF);
10920        Src0 = UndefReg;
10921        Src1 = UndefReg;
10922      }
10923    } else
10924      break;
10925
10926    SmallVector<SDValue, 4> Ops = { Src0, Src1, Src2 };
10927    for (unsigned I = 3, N = Node->getNumOperands(); I != N; ++I)
10928      Ops.push_back(Node->getOperand(I));
10929
10930    Ops.push_back(ImpDef.getValue(1));
10931    return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
10932  }
10933  default:
10934    break;
10935  }
10936
10937  return Node;
10938}
10939
10940/// Fix up the selected instruction: legalize VOP3 operands for the constant
10941/// bus and replace atomics whose result is unused with their no-return forms.
10942void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
10943                                                     SDNode *Node) const {
10944  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
10945
10946  MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
10947
10948  if (TII->isVOP3(MI.getOpcode())) {
10949    // Make sure constant bus requirements are respected.
10950    TII->legalizeOperandsVOP3(MRI, MI);
10951
10952    // Prefer VGPRs over AGPRs in mAI instructions where possible.
10953    // This saves a chain-copy of registers and better balances register
10954    // use between vgpr and agpr as agpr tuples tend to be big.
10955    if (const MCOperandInfo *OpInfo = MI.getDesc().OpInfo) {
10956      unsigned Opc = MI.getOpcode();
10957      const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
10958      for (auto I : { AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
10959                      AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1) }) {
10960        if (I == -1)
10961          break;
10962        MachineOperand &Op = MI.getOperand(I);
10963        if ((OpInfo[I].RegClass != llvm::AMDGPU::AV_64RegClassID &&
10964             OpInfo[I].RegClass != llvm::AMDGPU::AV_32RegClassID) ||
10965            !Register::isVirtualRegister(Op.getReg()) ||
10966            !TRI->isAGPR(MRI, Op.getReg()))
10967          continue;
10968        auto *Src = MRI.getUniqueVRegDef(Op.getReg());
10969        if (!Src || !Src->isCopy() ||
10970            !TRI->isSGPRReg(MRI, Src->getOperand(1).getReg()))
10971          continue;
10972        auto *RC = TRI->getRegClassForReg(MRI, Op.getReg());
10973        auto *NewRC = TRI->getEquivalentVGPRClass(RC);
10974        // All uses of agpr64 and agpr32 can also accept vgpr except for
10975        // v_accvgpr_read, but we do not produce agpr reads during selection,
10976        // so no use checks are needed.
10977        MRI.setRegClass(Op.getReg(), NewRC);
10978      }
10979    }
10980
10981    return;
10982  }
10983
10984  // Replace unused atomics with the no return version.
10985  int NoRetAtomicOp = AMDGPU::getAtomicNoRetOp(MI.getOpcode());
10986  if (NoRetAtomicOp != -1) {
10987    if (!Node->hasAnyUseOfValue(0)) {
10988      MI.setDesc(TII->get(NoRetAtomicOp));
10989      MI.RemoveOperand(0);
10990      return;
10991    }
10992
10993    // For mubuf_atomic_cmpswap, we need to have tablegen use an extract_subreg
10994    // instruction, because the return type of these instructions is a vec2 of
10995    // the memory type, so it can be tied to the input operand.
10996    // This means these instructions always have a use, so we need to add a
10997    // special case to check if the atomic has only one extract_subreg use,
10998    // which itself has no uses.
10999    if ((Node->hasNUsesOfValue(1, 0) &&
11000         Node->use_begin()->isMachineOpcode() &&
11001         Node->use_begin()->getMachineOpcode() == AMDGPU::EXTRACT_SUBREG &&
11002         !Node->use_begin()->hasAnyUseOfValue(0))) {
11003      Register Def = MI.getOperand(0).getReg();
11004
11005      // Change this into a noret atomic.
11006      MI.setDesc(TII->get(NoRetAtomicOp));
11007      MI.RemoveOperand(0);
11008
11009      // If we only remove the def operand from the atomic instruction, the
11010      // extract_subreg will be left with a use of a vreg without a def.
11011      // So we need to insert an implicit_def to avoid machine verifier
11012      // errors.
11013      BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
11014              TII->get(AMDGPU::IMPLICIT_DEF), Def);
11015    }
11016    return;
11017  }
11018}
11019
11020static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL,
11021                              uint64_t Val) {
11022  SDValue K = DAG.getTargetConstant(Val, DL, MVT::i32);
11023  return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, K), 0);
11024}
11025
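/// Build a 128-bit MUBUF resource descriptor for an ADDR64 access: dwords 0-1
/// hold the 64-bit pointer, dword 2 is zero, and dword 3 holds the upper half
/// of the default resource data format.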
11026MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG,
11027                                                const SDLoc &DL,
11028                                                SDValue Ptr) const {
11029  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
11030
11031  // Build the half of the subregister with the constants before building the
11032  // full 128-bit register. If we are building multiple resource descriptors,
11033  // this will allow CSEing of the 2-component register.
11034  const SDValue Ops0[] = {
11035    DAG.getTargetConstant(AMDGPU::SGPR_64RegClassID, DL, MVT::i32),
11036    buildSMovImm32(DAG, DL, 0),
11037    DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
11038    buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32),
11039    DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32)
11040  };
11041
11042  SDValue SubRegHi = SDValue(DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL,
11043                                                MVT::v2i32, Ops0), 0);
11044
11045  // Combine the constants and the pointer.
11046  const SDValue Ops1[] = {
11047    DAG.getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32),
11048    Ptr,
11049    DAG.getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32),
11050    SubRegHi,
11051    DAG.getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32)
11052  };
11053
11054  return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops1);
11055}
11056
11057/// Return a resource descriptor with the 'Add TID' bit enabled
11058///        The TID (Thread ID) is multiplied by the stride value (bits [61:48]
11059///        of the resource descriptor) to create an offset, which is added to
11060///        the resource pointer.
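///
///        As assembled below, the resulting descriptor layout is:
///          dword0 = Ptr[31:0]
///          dword1 = Ptr[63:32] | RsrcDword1
///          dword2 = RsrcDword2And3[31:0]
///          dword3 = RsrcDword2And3[63:32]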
11061MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG, const SDLoc &DL,
11062                                           SDValue Ptr, uint32_t RsrcDword1,
11063                                           uint64_t RsrcDword2And3) const {
11064  SDValue PtrLo = DAG.getTargetExtractSubreg(AMDGPU::sub0, DL, MVT::i32, Ptr);
11065  SDValue PtrHi = DAG.getTargetExtractSubreg(AMDGPU::sub1, DL, MVT::i32, Ptr);
11066  if (RsrcDword1) {
11067    PtrHi = SDValue(DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, PtrHi,
11068                                     DAG.getConstant(RsrcDword1, DL, MVT::i32)),
11069                    0);
11070  }
11071
11072  SDValue DataLo = buildSMovImm32(DAG, DL,
11073                                  RsrcDword2And3 & UINT64_C(0xFFFFFFFF));
11074  SDValue DataHi = buildSMovImm32(DAG, DL, RsrcDword2And3 >> 32);
11075
11076  const SDValue Ops[] = {
11077    DAG.getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32),
11078    PtrLo,
11079    DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
11080    PtrHi,
11081    DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32),
11082    DataLo,
11083    DAG.getTargetConstant(AMDGPU::sub2, DL, MVT::i32),
11084    DataHi,
11085    DAG.getTargetConstant(AMDGPU::sub3, DL, MVT::i32)
11086  };
11087
11088  return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops);
11089}
11090
11091//===----------------------------------------------------------------------===//
11092//                         SI Inline Assembly Support
11093//===----------------------------------------------------------------------===//
11094
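// A short summary of the single-letter register constraints handled below
// (the switch is authoritative): 's' and 'r' select an SGPR class, 'v' a VGPR
// class, and 'a' an AGPR class (only when the subtarget has MAI instructions),
// each sized according to the operand type.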
11095std::pair<unsigned, const TargetRegisterClass *>
11096SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
11097                                               StringRef Constraint,
11098                                               MVT VT) const {
11099  const TargetRegisterClass *RC = nullptr;
11100  if (Constraint.size() == 1) {
11101    const unsigned BitWidth = VT.getSizeInBits();
11102    switch (Constraint[0]) {
11103    default:
11104      return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
11105    case 's':
11106    case 'r':
11107      switch (BitWidth) {
11108      case 16:
11109        RC = &AMDGPU::SReg_32RegClass;
11110        break;
11111      case 64:
11112        RC = &AMDGPU::SGPR_64RegClass;
11113        break;
11114      default:
11115        RC = SIRegisterInfo::getSGPRClassForBitWidth(BitWidth);
11116        if (!RC)
11117          return std::make_pair(0U, nullptr);
11118        break;
11119      }
11120      break;
11121    case 'v':
11122      switch (BitWidth) {
11123      case 16:
11124        RC = &AMDGPU::VGPR_32RegClass;
11125        break;
11126      default:
11127        RC = SIRegisterInfo::getVGPRClassForBitWidth(BitWidth);
11128        if (!RC)
11129          return std::make_pair(0U, nullptr);
11130        break;
11131      }
11132      break;
11133    case 'a':
11134      if (!Subtarget->hasMAIInsts())
11135        break;
11136      switch (BitWidth) {
11137      case 16:
11138        RC = &AMDGPU::AGPR_32RegClass;
11139        break;
11140      default:
11141        RC = SIRegisterInfo::getAGPRClassForBitWidth(BitWidth);
11142        if (!RC)
11143          return std::make_pair(0U, nullptr);
11144        break;
11145      }
11146      break;
11147    }
    // We actually support i128, i16 and f16 as inline asm parameters
    // even if they are not reported as legal.
11150    if (RC && (isTypeLegal(VT) || VT.SimpleTy == MVT::i128 ||
11151               VT.SimpleTy == MVT::i16 || VT.SimpleTy == MVT::f16))
11152      return std::make_pair(0U, RC);
11153  }
11154
11155  if (Constraint.size() > 1) {
11156    if (Constraint[1] == 'v') {
11157      RC = &AMDGPU::VGPR_32RegClass;
11158    } else if (Constraint[1] == 's') {
11159      RC = &AMDGPU::SGPR_32RegClass;
11160    } else if (Constraint[1] == 'a') {
11161      RC = &AMDGPU::AGPR_32RegClass;
11162    }
11163
11164    if (RC) {
11165      uint32_t Idx;
11166      bool Failed = Constraint.substr(2).getAsInteger(10, Idx);
11167      if (!Failed && Idx < RC->getNumRegs())
11168        return std::make_pair(RC->getRegister(Idx), RC);
11169    }
11170  }
11171
11172  // FIXME: Returns VS_32 for physical SGPR constraints
11173  return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
11174}
11175
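// The immediate constraints recognized below, as validated in
// checkAsmConstraintVal(): 'I' is an inlinable integer literal, 'J' a signed
// 16-bit integer, 'A' an inlinable literal for the operand type, 'B' a signed
// 32-bit integer, 'C' an unsigned 32-bit integer or inlinable integer literal,
// and "DA"/"DB" are 64-bit values split into two 32-bit halves ("DA" requires
// both halves to be inlinable literals, while "DB" accepts any value).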
11176static bool isImmConstraint(StringRef Constraint) {
11177  if (Constraint.size() == 1) {
11178    switch (Constraint[0]) {
11179    default: break;
11180    case 'I':
11181    case 'J':
11182    case 'A':
11183    case 'B':
11184    case 'C':
11185      return true;
11186    }
11187  } else if (Constraint == "DA" ||
11188             Constraint == "DB") {
11189    return true;
11190  }
11191  return false;
11192}
11193
11194SITargetLowering::ConstraintType
11195SITargetLowering::getConstraintType(StringRef Constraint) const {
11196  if (Constraint.size() == 1) {
11197    switch (Constraint[0]) {
11198    default: break;
11199    case 's':
11200    case 'v':
11201    case 'a':
11202      return C_RegisterClass;
11203    }
11204  }
11205  if (isImmConstraint(Constraint)) {
11206    return C_Other;
11207  }
11208  return TargetLowering::getConstraintType(Constraint);
11209}
11210
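// Inlinable integer literals are left untouched (presumably so their
// sign-extended encoding stays recognizable); anything else is truncated to
// the operand size, e.g. a 16-bit operand keeps only its low 16 bits.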
11211static uint64_t clearUnusedBits(uint64_t Val, unsigned Size) {
11212  if (!AMDGPU::isInlinableIntLiteral(Val)) {
11213    Val = Val & maskTrailingOnes<uint64_t>(Size);
11214  }
11215  return Val;
11216}
11217
11218void SITargetLowering::LowerAsmOperandForConstraint(SDValue Op,
11219                                                    std::string &Constraint,
11220                                                    std::vector<SDValue> &Ops,
11221                                                    SelectionDAG &DAG) const {
11222  if (isImmConstraint(Constraint)) {
11223    uint64_t Val;
11224    if (getAsmOperandConstVal(Op, Val) &&
11225        checkAsmConstraintVal(Op, Constraint, Val)) {
11226      Val = clearUnusedBits(Val, Op.getScalarValueSizeInBits());
11227      Ops.push_back(DAG.getTargetConstant(Val, SDLoc(Op), MVT::i64));
11228    }
11229  } else {
11230    TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
11231  }
11232}
11233
11234bool SITargetLowering::getAsmOperandConstVal(SDValue Op, uint64_t &Val) const {
11235  unsigned Size = Op.getScalarValueSizeInBits();
11236  if (Size > 64)
11237    return false;
11238
11239  if (Size == 16 && !Subtarget->has16BitInsts())
11240    return false;
11241
11242  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
11243    Val = C->getSExtValue();
11244    return true;
11245  }
11246  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op)) {
11247    Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
11248    return true;
11249  }
11250  if (BuildVectorSDNode *V = dyn_cast<BuildVectorSDNode>(Op)) {
11251    if (Size != 16 || Op.getNumOperands() != 2)
11252      return false;
11253    if (Op.getOperand(0).isUndef() || Op.getOperand(1).isUndef())
11254      return false;
11255    if (ConstantSDNode *C = V->getConstantSplatNode()) {
11256      Val = C->getSExtValue();
11257      return true;
11258    }
11259    if (ConstantFPSDNode *C = V->getConstantFPSplatNode()) {
11260      Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
11261      return true;
11262    }
11263  }
11264
11265  return false;
11266}
11267
11268bool SITargetLowering::checkAsmConstraintVal(SDValue Op,
11269                                             const std::string &Constraint,
11270                                             uint64_t Val) const {
11271  if (Constraint.size() == 1) {
11272    switch (Constraint[0]) {
11273    case 'I':
11274      return AMDGPU::isInlinableIntLiteral(Val);
11275    case 'J':
11276      return isInt<16>(Val);
11277    case 'A':
11278      return checkAsmConstraintValA(Op, Val);
11279    case 'B':
11280      return isInt<32>(Val);
11281    case 'C':
11282      return isUInt<32>(clearUnusedBits(Val, Op.getScalarValueSizeInBits())) ||
11283             AMDGPU::isInlinableIntLiteral(Val);
11284    default:
11285      break;
11286    }
11287  } else if (Constraint.size() == 2) {
11288    if (Constraint == "DA") {
11289      int64_t HiBits = static_cast<int32_t>(Val >> 32);
11290      int64_t LoBits = static_cast<int32_t>(Val);
11291      return checkAsmConstraintValA(Op, HiBits, 32) &&
11292             checkAsmConstraintValA(Op, LoBits, 32);
11293    }
11294    if (Constraint == "DB") {
11295      return true;
11296    }
11297  }
11298  llvm_unreachable("Invalid asm constraint");
11299}
11300
11301bool SITargetLowering::checkAsmConstraintValA(SDValue Op,
11302                                              uint64_t Val,
11303                                              unsigned MaxSize) const {
11304  unsigned Size = std::min<unsigned>(Op.getScalarValueSizeInBits(), MaxSize);
11305  bool HasInv2Pi = Subtarget->hasInv2PiInlineImm();
11306  if ((Size == 16 && AMDGPU::isInlinableLiteral16(Val, HasInv2Pi)) ||
11307      (Size == 32 && AMDGPU::isInlinableLiteral32(Val, HasInv2Pi)) ||
11308      (Size == 64 && AMDGPU::isInlinableLiteral64(Val, HasInv2Pi))) {
11309    return true;
11310  }
11311  return false;
11312}
11313
11314// Figure out which registers should be reserved for stack access. Only after
11315// the function is legalized do we know all of the non-spill stack objects or if
11316// calls are present.
11317void SITargetLowering::finalizeLowering(MachineFunction &MF) const {
11318  MachineRegisterInfo &MRI = MF.getRegInfo();
11319  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
11320  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
11321  const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
11322
11323  if (Info->isEntryFunction()) {
    // Callable functions have fixed registers used for stack access; only
    // entry functions need the scratch registers computed and reserved here.
11325    reservePrivateMemoryRegs(getTargetMachine(), MF, *TRI, *Info);
11326  }
11327
11328  assert(!TRI->isSubRegister(Info->getScratchRSrcReg(),
11329                             Info->getStackPtrOffsetReg()));
11330  if (Info->getStackPtrOffsetReg() != AMDGPU::SP_REG)
11331    MRI.replaceRegWith(AMDGPU::SP_REG, Info->getStackPtrOffsetReg());
11332
11333  // We need to worry about replacing the default register with itself in case
11334  // of MIR testcases missing the MFI.
11335  if (Info->getScratchRSrcReg() != AMDGPU::PRIVATE_RSRC_REG)
11336    MRI.replaceRegWith(AMDGPU::PRIVATE_RSRC_REG, Info->getScratchRSrcReg());
11337
11338  if (Info->getFrameOffsetReg() != AMDGPU::FP_REG)
11339    MRI.replaceRegWith(AMDGPU::FP_REG, Info->getFrameOffsetReg());
11340
11341  Info->limitOccupancy(MF);
11342
11343  if (ST.isWave32() && !MF.empty()) {
    // Add a VCC_HI def because many instructions are marked as implicitly
    // using VCC, while we may only define VCC_LO. If nothing defines VCC_HI
    // we may end up with a use of an undefined register.
11347
11348    const SIInstrInfo *TII = ST.getInstrInfo();
11349    DebugLoc DL;
11350
11351    MachineBasicBlock &MBB = MF.front();
11352    MachineBasicBlock::iterator I = MBB.getFirstNonDebugInstr();
11353    BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), AMDGPU::VCC_HI);
11354
11355    for (auto &MBB : MF) {
11356      for (auto &MI : MBB) {
11357        TII->fixImplicitOperands(MI);
11358      }
11359    }
11360  }
11361
11362  TargetLoweringBase::finalizeLowering(MF);
11363
  // Allocate a VGPR for future SGPR spills if the
  // "amdgpu-reserve-vgpr-for-sgpr-spill" option is used.
11366  // FIXME: We won't need this hack if we split SGPR allocation from VGPR
11367  if (VGPRReserveforSGPRSpill && !Info->VGPRReservedForSGPRSpill &&
11368      !Info->isEntryFunction() && MF.getFrameInfo().hasStackObjects())
11369    Info->reserveVGPRforSGPRSpills(MF);
11370}
11371
11372void SITargetLowering::computeKnownBitsForFrameIndex(
11373  const int FI, KnownBits &Known, const MachineFunction &MF) const {
11374  TargetLowering::computeKnownBitsForFrameIndex(FI, Known, MF);
11375
11376  // Set the high bits to zero based on the maximum allowed scratch size per
11377  // wave. We can't use vaddr in MUBUF instructions if we don't know the address
11378  // calculation won't overflow, so assume the sign bit is never set.
11379  Known.Zero.setHighBits(getSubtarget()->getKnownHighZeroBitsForFrameIndex());
11380}
11381
11382Align SITargetLowering::computeKnownAlignForTargetInstr(
11383  GISelKnownBits &KB, Register R, const MachineRegisterInfo &MRI,
11384  unsigned Depth) const {
11385  const MachineInstr *MI = MRI.getVRegDef(R);
11386  switch (MI->getOpcode()) {
11387  case AMDGPU::G_INTRINSIC:
11388  case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: {
11389    // FIXME: Can this move to generic code? What about the case where the call
11390    // site specifies a lower alignment?
11391    Intrinsic::ID IID = MI->getIntrinsicID();
11392    LLVMContext &Ctx = KB.getMachineFunction().getFunction().getContext();
11393    AttributeList Attrs = Intrinsic::getAttributes(Ctx, IID);
11394    if (MaybeAlign RetAlign = Attrs.getRetAlignment())
11395      return *RetAlign;
11396    return Align(1);
11397  }
11398  default:
11399    return Align(1);
11400  }
11401}
11402
11403Align SITargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
11404  const Align PrefAlign = TargetLowering::getPrefLoopAlignment(ML);
11405  const Align CacheLineAlign = Align(64);
11406
  // Pre-GFX10 targets did not benefit from loop alignment.
11408  if (!ML || DisableLoopAlignment ||
11409      (getSubtarget()->getGeneration() < AMDGPUSubtarget::GFX10) ||
11410      getSubtarget()->hasInstFwdPrefetchBug())
11411    return PrefAlign;
11412
  // On GFX10 the I$ consists of 4 x 64-byte cache lines.
  // By default the prefetcher keeps one cache line behind and reads two
  // ahead. We can modify this with S_INST_PREFETCH so that larger loops keep
  // two lines behind and one ahead.
  // Therefore we can benefit from aligning loop headers if the loop fits in
  // 192 bytes:
  //  - If the loop fits in 64 bytes it always spans no more than two cache
  //    lines and does not need alignment.
  //  - Else, if the loop is at most 128 bytes, we do not need to modify the
  //    prefetch settings.
  //  - Else, if the loop is at most 192 bytes, we need to keep two lines
  //    behind.
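  // For example, given the logic below: a 100-byte loop is simply aligned to
  // a cache line with no prefetch change; a 160-byte loop is also aligned and,
  // when it has a preheader and a single exit block, gets S_INST_PREFETCH
  // instructions inserted there; a 200-byte loop is left at the default
  // alignment.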
11422
11423  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
11424  const MachineBasicBlock *Header = ML->getHeader();
11425  if (Header->getAlignment() != PrefAlign)
11426    return Header->getAlignment(); // Already processed.
11427
11428  unsigned LoopSize = 0;
11429  for (const MachineBasicBlock *MBB : ML->blocks()) {
    // If an inner loop block is aligned, assume on average half of the
    // alignment size will be added as nops.
11432    if (MBB != Header)
11433      LoopSize += MBB->getAlignment().value() / 2;
11434
11435    for (const MachineInstr &MI : *MBB) {
11436      LoopSize += TII->getInstSizeInBytes(MI);
11437      if (LoopSize > 192)
11438        return PrefAlign;
11439    }
11440  }
11441
11442  if (LoopSize <= 64)
11443    return PrefAlign;
11444
11445  if (LoopSize <= 128)
11446    return CacheLineAlign;
11447
  // If any of the parent loops is surrounded by prefetch instructions, do not
  // insert new ones for the inner loop; that would reset the parent's
  // settings.
11450  for (MachineLoop *P = ML->getParentLoop(); P; P = P->getParentLoop()) {
11451    if (MachineBasicBlock *Exit = P->getExitBlock()) {
11452      auto I = Exit->getFirstNonDebugInstr();
11453      if (I != Exit->end() && I->getOpcode() == AMDGPU::S_INST_PREFETCH)
11454        return CacheLineAlign;
11455    }
11456  }
11457
11458  MachineBasicBlock *Pre = ML->getLoopPreheader();
11459  MachineBasicBlock *Exit = ML->getExitBlock();
11460
11461  if (Pre && Exit) {
11462    BuildMI(*Pre, Pre->getFirstTerminator(), DebugLoc(),
11463            TII->get(AMDGPU::S_INST_PREFETCH))
11464      .addImm(1); // prefetch 2 lines behind PC
11465
11466    BuildMI(*Exit, Exit->getFirstNonDebugInstr(), DebugLoc(),
11467            TII->get(AMDGPU::S_INST_PREFETCH))
11468      .addImm(2); // prefetch 1 line behind PC
11469  }
11470
11471  return CacheLineAlign;
11472}
11473
11474LLVM_ATTRIBUTE_UNUSED
11475static bool isCopyFromRegOfInlineAsm(const SDNode *N) {
11476  assert(N->getOpcode() == ISD::CopyFromReg);
11477  do {
11478    // Follow the chain until we find an INLINEASM node.
11479    N = N->getOperand(0).getNode();
11480    if (N->getOpcode() == ISD::INLINEASM ||
11481        N->getOpcode() == ISD::INLINEASM_BR)
11482      return true;
11483  } while (N->getOpcode() == ISD::CopyFromReg);
11484  return false;
11485}
11486
bool SITargetLowering::isSDNodeSourceOfDivergence(
    const SDNode *N, FunctionLoweringInfo *FLI,
    LegacyDivergenceAnalysis *KDA) const {
  switch (N->getOpcode()) {
  case ISD::CopyFromReg: {
    const RegisterSDNode *R = cast<RegisterSDNode>(N->getOperand(1));
    const MachineRegisterInfo &MRI = FLI->MF->getRegInfo();
    const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
    Register Reg = R->getReg();

    // FIXME: Why does this need to consider isLiveIn?
    if (Reg.isPhysical() || MRI.isLiveIn(Reg))
      return !TRI->isSGPRReg(MRI, Reg);

    if (const Value *V = FLI->getValueFromVirtualReg(R->getReg()))
      return KDA->isDivergent(V);

    assert(Reg == FLI->DemoteRegister || isCopyFromRegOfInlineAsm(N));
    return !TRI->isSGPRReg(MRI, Reg);
  }
  case ISD::LOAD: {
    const LoadSDNode *L = cast<LoadSDNode>(N);
    unsigned AS = L->getAddressSpace();
    // A flat load may access private memory.
    return AS == AMDGPUAS::PRIVATE_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS;
  }
  case ISD::CALLSEQ_END:
    return true;
  case ISD::INTRINSIC_WO_CHAIN:
    return AMDGPU::isIntrinsicSourceOfDivergence(
        cast<ConstantSDNode>(N->getOperand(0))->getZExtValue());
  case ISD::INTRINSIC_W_CHAIN:
    return AMDGPU::isIntrinsicSourceOfDivergence(
        cast<ConstantSDNode>(N->getOperand(1))->getZExtValue());
  }
11528  return false;
11529}
11530
11531bool SITargetLowering::denormalsEnabledForType(const SelectionDAG &DAG,
11532                                               EVT VT) const {
11533  switch (VT.getScalarType().getSimpleVT().SimpleTy) {
11534  case MVT::f32:
11535    return hasFP32Denormals(DAG.getMachineFunction());
11536  case MVT::f64:
11537  case MVT::f16:
11538    return hasFP64FP16Denormals(DAG.getMachineFunction());
11539  default:
11540    return false;
11541  }
11542}
11543
11544bool SITargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
11545                                                    const SelectionDAG &DAG,
11546                                                    bool SNaN,
11547                                                    unsigned Depth) const {
11548  if (Op.getOpcode() == AMDGPUISD::CLAMP) {
11549    const MachineFunction &MF = DAG.getMachineFunction();
11550    const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
11551
11552    if (Info->getMode().DX10Clamp)
11553      return true; // Clamped to 0.
11554    return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
11555  }
11556
11557  return AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(Op, DAG,
11558                                                            SNaN, Depth);
11559}
11560
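// A rough summary of the FAdd cases below: f32 atomic fadd is selected
// natively for LDS (when the subtarget has LDS FP atomics) and for global
// memory (when it has native FADD atomics and the result is unused); other
// FP adds are expanded to a compare-and-swap loop, except f16 which is left
// as-is. Everything else defers to the generic AMDGPU handling.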
11561TargetLowering::AtomicExpansionKind
11562SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
11563  switch (RMW->getOperation()) {
11564  case AtomicRMWInst::FAdd: {
11565    Type *Ty = RMW->getType();
11566
11567    // We don't have a way to support 16-bit atomics now, so just leave them
11568    // as-is.
11569    if (Ty->isHalfTy())
11570      return AtomicExpansionKind::None;
11571
11572    if (!Ty->isFloatTy())
11573      return AtomicExpansionKind::CmpXChg;
11574
    // TODO: We do have these for flat; older targets also had them for buffers.
11576    unsigned AS = RMW->getPointerAddressSpace();
11577
11578    if (AS == AMDGPUAS::GLOBAL_ADDRESS && Subtarget->hasAtomicFaddInsts()) {
11579      return RMW->use_empty() ? AtomicExpansionKind::None :
11580                                AtomicExpansionKind::CmpXChg;
11581    }
11582
11583    return (AS == AMDGPUAS::LOCAL_ADDRESS && Subtarget->hasLDSFPAtomics()) ?
11584      AtomicExpansionKind::None : AtomicExpansionKind::CmpXChg;
11585  }
11586  default:
11587    break;
11588  }
11589
11590  return AMDGPUTargetLowering::shouldExpandAtomicRMWInIR(RMW);
11591}
11592
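// Choose a register class based on divergence: a uniform VReg_1 (i1) value is
// instead placed in the wave-mask SGPR class (SReg_64 for wave64, SReg_32 for
// wave32); otherwise uniform values get the equivalent SGPR class and
// divergent values the equivalent VGPR class.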
11593const TargetRegisterClass *
11594SITargetLowering::getRegClassFor(MVT VT, bool isDivergent) const {
11595  const TargetRegisterClass *RC = TargetLoweringBase::getRegClassFor(VT, false);
11596  const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
11597  if (RC == &AMDGPU::VReg_1RegClass && !isDivergent)
11598    return Subtarget->getWavefrontSize() == 64 ? &AMDGPU::SReg_64RegClass
11599                                               : &AMDGPU::SReg_32RegClass;
11600  if (!TRI->isSGPRClass(RC) && !isDivergent)
11601    return TRI->getEquivalentSGPRClass(RC);
11602  else if (TRI->isSGPRClass(RC) && isDivergent)
11603    return TRI->getEquivalentVGPRClass(RC);
11604
11605  return RC;
11606}
11607
11608// FIXME: This is a workaround for DivergenceAnalysis not understanding always
11609// uniform values (as produced by the mask results of control flow intrinsics)
11610// used outside of divergent blocks. The phi users need to also be treated as
11611// always uniform.
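// Illustrative example (not taken from a specific test): the wave-mask result
// of llvm.amdgcn.if carried through a phi into llvm.amdgcn.end_cf must stay in
// an SGPR, so such a phi is reported as requiring a uniform register even
// though the divergence analysis cannot prove it uniform.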
11612static bool hasCFUser(const Value *V, SmallPtrSet<const Value *, 16> &Visited,
11613                      unsigned WaveSize) {
  // FIXME: We assume we never cast the mask results of a control flow
  // intrinsic.
  // As a compile-time hack, exit early if the type will not be consistent.
11617  IntegerType *IT = dyn_cast<IntegerType>(V->getType());
11618  if (!IT || IT->getBitWidth() != WaveSize)
11619    return false;
11620
11621  if (!isa<Instruction>(V))
11622    return false;
11623  if (!Visited.insert(V).second)
11624    return false;
11625  bool Result = false;
11626  for (auto U : V->users()) {
11627    if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(U)) {
11628      if (V == U->getOperand(1)) {
11629        switch (Intrinsic->getIntrinsicID()) {
11630        default:
11631          Result = false;
11632          break;
11633        case Intrinsic::amdgcn_if_break:
11634        case Intrinsic::amdgcn_if:
11635        case Intrinsic::amdgcn_else:
11636          Result = true;
11637          break;
11638        }
11639      }
11640      if (V == U->getOperand(0)) {
11641        switch (Intrinsic->getIntrinsicID()) {
11642        default:
11643          Result = false;
11644          break;
11645        case Intrinsic::amdgcn_end_cf:
11646        case Intrinsic::amdgcn_loop:
11647          Result = true;
11648          break;
11649        }
11650      }
11651    } else {
11652      Result = hasCFUser(U, Visited, WaveSize);
11653    }
11654    if (Result)
11655      break;
11656  }
11657  return Result;
11658}
11659
11660bool SITargetLowering::requiresUniformRegister(MachineFunction &MF,
11661                                               const Value *V) const {
11662  if (const CallInst *CI = dyn_cast<CallInst>(V)) {
11663    if (CI->isInlineAsm()) {
11664      // FIXME: This cannot give a correct answer. This should only trigger in
11665      // the case where inline asm returns mixed SGPR and VGPR results, used
11666      // outside the defining block. We don't have a specific result to
11667      // consider, so this assumes if any value is SGPR, the overall register
11668      // also needs to be SGPR.
11669      const SIRegisterInfo *SIRI = Subtarget->getRegisterInfo();
11670      TargetLowering::AsmOperandInfoVector TargetConstraints = ParseConstraints(
11671          MF.getDataLayout(), Subtarget->getRegisterInfo(), *CI);
11672      for (auto &TC : TargetConstraints) {
11673        if (TC.Type == InlineAsm::isOutput) {
11674          ComputeConstraintToUse(TC, SDValue());
11675          unsigned AssignedReg;
11676          const TargetRegisterClass *RC;
11677          std::tie(AssignedReg, RC) = getRegForInlineAsmConstraint(
11678              SIRI, TC.ConstraintCode, TC.ConstraintVT);
11679          if (RC) {
11680            MachineRegisterInfo &MRI = MF.getRegInfo();
11681            if (AssignedReg != 0 && SIRI->isSGPRReg(MRI, AssignedReg))
11682              return true;
11683            else if (SIRI->isSGPRClass(RC))
11684              return true;
11685          }
11686        }
11687      }
11688    }
11689  }
11690  SmallPtrSet<const Value *, 16> Visited;
11691  return hasCFUser(V, Visited, Subtarget->getWavefrontSize());
11692}
11693
11694std::pair<int, MVT>
11695SITargetLowering::getTypeLegalizationCost(const DataLayout &DL,
11696                                          Type *Ty) const {
11697  auto Cost = TargetLoweringBase::getTypeLegalizationCost(DL, Ty);
11698  auto Size = DL.getTypeSizeInBits(Ty);
  // The maximum load or store can handle 8 dwords for the scalar unit and 4
  // for the vector ALU. Assume anything above 8 dwords is expensive even if
  // legal.
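  // For example, a 1024-bit vector type gets a legalization-cost factor of
  // (1024 + 255) / 256 = 4 from the adjustment below.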
11702  if (Size <= 256)
11703    return Cost;
11704
11705  Cost.first = (Size + 255) / 256;
11706  return Cost;
11707}
11708