1//===-- SystemZISelLowering.cpp - SystemZ DAG lowering implementation -----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements the SystemZTargetLowering class.
10//
11//===----------------------------------------------------------------------===//
12
13#include "SystemZISelLowering.h"
14#include "SystemZCallingConv.h"
15#include "SystemZConstantPoolValue.h"
16#include "SystemZMachineFunctionInfo.h"
17#include "SystemZTargetMachine.h"
18#include "llvm/CodeGen/CallingConvLower.h"
19#include "llvm/CodeGen/MachineInstrBuilder.h"
20#include "llvm/CodeGen/MachineRegisterInfo.h"
21#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
22#include "llvm/IR/IntrinsicInst.h"
23#include "llvm/IR/Intrinsics.h"
24#include "llvm/IR/IntrinsicsS390.h"
25#include "llvm/Support/CommandLine.h"
26#include "llvm/Support/KnownBits.h"
27#include <cctype>
28
29using namespace llvm;
30
31#define DEBUG_TYPE "systemz-lower"
32
33namespace {
34// Represents information about a comparison.
35struct Comparison {
36  Comparison(SDValue Op0In, SDValue Op1In, SDValue ChainIn)
37    : Op0(Op0In), Op1(Op1In), Chain(ChainIn),
38      Opcode(0), ICmpType(0), CCValid(0), CCMask(0) {}
39
40  // The operands to the comparison.
41  SDValue Op0, Op1;
42
43  // Chain if this is a strict floating-point comparison.
44  SDValue Chain;
45
46  // The opcode that should be used to compare Op0 and Op1.
47  unsigned Opcode;
48
49  // A SystemZICMP value.  Only used for integer comparisons.
50  unsigned ICmpType;
51
52  // The mask of CC values that Opcode can produce.
53  unsigned CCValid;
54
55  // The mask of CC values for which the original condition is true.
56  unsigned CCMask;
57};
58} // end anonymous namespace
59
60// Classify VT as either 32 or 64 bit.
61static bool is32Bit(EVT VT) {
62  switch (VT.getSimpleVT().SimpleTy) {
63  case MVT::i32:
64    return true;
65  case MVT::i64:
66    return false;
67  default:
68    llvm_unreachable("Unsupported type");
69  }
70}
71
72// Return a version of MachineOperand that can be safely used before the
73// final use.
74static MachineOperand earlyUseOperand(MachineOperand Op) {
75  if (Op.isReg())
76    Op.setIsKill(false);
77  return Op;
78}
79
80SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM,
81                                             const SystemZSubtarget &STI)
82    : TargetLowering(TM), Subtarget(STI) {
83  MVT PtrVT = MVT::getIntegerVT(8 * TM.getPointerSize(0));
84
85  // Set up the register classes.
86  if (Subtarget.hasHighWord())
87    addRegisterClass(MVT::i32, &SystemZ::GRX32BitRegClass);
88  else
89    addRegisterClass(MVT::i32, &SystemZ::GR32BitRegClass);
90  addRegisterClass(MVT::i64, &SystemZ::GR64BitRegClass);
91  if (!useSoftFloat()) {
92    if (Subtarget.hasVector()) {
93      addRegisterClass(MVT::f32, &SystemZ::VR32BitRegClass);
94      addRegisterClass(MVT::f64, &SystemZ::VR64BitRegClass);
95    } else {
96      addRegisterClass(MVT::f32, &SystemZ::FP32BitRegClass);
97      addRegisterClass(MVT::f64, &SystemZ::FP64BitRegClass);
98    }
99    if (Subtarget.hasVectorEnhancements1())
100      addRegisterClass(MVT::f128, &SystemZ::VR128BitRegClass);
101    else
102      addRegisterClass(MVT::f128, &SystemZ::FP128BitRegClass);
103
104    if (Subtarget.hasVector()) {
105      addRegisterClass(MVT::v16i8, &SystemZ::VR128BitRegClass);
106      addRegisterClass(MVT::v8i16, &SystemZ::VR128BitRegClass);
107      addRegisterClass(MVT::v4i32, &SystemZ::VR128BitRegClass);
108      addRegisterClass(MVT::v2i64, &SystemZ::VR128BitRegClass);
109      addRegisterClass(MVT::v4f32, &SystemZ::VR128BitRegClass);
110      addRegisterClass(MVT::v2f64, &SystemZ::VR128BitRegClass);
111    }
112  }
113
114  // Compute derived properties from the register classes
115  computeRegisterProperties(Subtarget.getRegisterInfo());
116
117  // Set up special registers.
118  setStackPointerRegisterToSaveRestore(SystemZ::R15D);
119
120  // TODO: It may be better to default to latency-oriented scheduling, however
121  // LLVM's current latency-oriented scheduler can't handle physreg definitions
122  // such as SystemZ has with CC, so set this to the register-pressure
123  // scheduler, because it can.
124  setSchedulingPreference(Sched::RegPressure);
125
126  setBooleanContents(ZeroOrOneBooleanContent);
127  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
128
129  // Instructions are strings of 2-byte aligned 2-byte values.
130  setMinFunctionAlignment(Align(2));
131  // For performance reasons we prefer 16-byte alignment.
132  setPrefFunctionAlignment(Align(16));
133
134  // Handle operations that are handled in a similar way for all types.
135  for (unsigned I = MVT::FIRST_INTEGER_VALUETYPE;
136       I <= MVT::LAST_FP_VALUETYPE;
137       ++I) {
138    MVT VT = MVT::SimpleValueType(I);
139    if (isTypeLegal(VT)) {
140      // Lower SET_CC into an IPM-based sequence.
141      setOperationAction(ISD::SETCC, VT, Custom);
142      setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
143      setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
144
145      // Expand SELECT(C, A, B) into SELECT_CC(X, 0, A, B, NE).
146      setOperationAction(ISD::SELECT, VT, Expand);
147
148      // Lower SELECT_CC and BR_CC into separate comparisons and branches.
149      setOperationAction(ISD::SELECT_CC, VT, Custom);
150      setOperationAction(ISD::BR_CC,     VT, Custom);
151    }
152  }
153
154  // Expand jump table branches as address arithmetic followed by an
155  // indirect jump.
156  setOperationAction(ISD::BR_JT, MVT::Other, Expand);
157
158  // Expand BRCOND into a BR_CC (see above).
159  setOperationAction(ISD::BRCOND, MVT::Other, Expand);
160
161  // Handle integer types.
162  for (unsigned I = MVT::FIRST_INTEGER_VALUETYPE;
163       I <= MVT::LAST_INTEGER_VALUETYPE;
164       ++I) {
165    MVT VT = MVT::SimpleValueType(I);
166    if (isTypeLegal(VT)) {
167      // Expand individual DIV and REMs into DIVREMs.
168      setOperationAction(ISD::SDIV, VT, Expand);
169      setOperationAction(ISD::UDIV, VT, Expand);
170      setOperationAction(ISD::SREM, VT, Expand);
171      setOperationAction(ISD::UREM, VT, Expand);
172      setOperationAction(ISD::SDIVREM, VT, Custom);
173      setOperationAction(ISD::UDIVREM, VT, Custom);
174
175      // Support addition/subtraction with overflow.
176      setOperationAction(ISD::SADDO, VT, Custom);
177      setOperationAction(ISD::SSUBO, VT, Custom);
178
179      // Support addition/subtraction with carry.
180      setOperationAction(ISD::UADDO, VT, Custom);
181      setOperationAction(ISD::USUBO, VT, Custom);
182
183      // Support carry in as value rather than glue.
184      setOperationAction(ISD::ADDCARRY, VT, Custom);
185      setOperationAction(ISD::SUBCARRY, VT, Custom);
186
187      // Lower ATOMIC_LOAD and ATOMIC_STORE into normal volatile loads and
188      // stores, putting a serialization instruction after the stores.
189      setOperationAction(ISD::ATOMIC_LOAD,  VT, Custom);
190      setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
191
192      // Lower ATOMIC_LOAD_SUB into ATOMIC_LOAD_ADD if LAA and LAAG are
193      // available, or if the operand is constant.
194      setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
195
196      // Use POPCNT on z196 and above.
197      if (Subtarget.hasPopulationCount())
198        setOperationAction(ISD::CTPOP, VT, Custom);
199      else
200        setOperationAction(ISD::CTPOP, VT, Expand);
201
202      // No special instructions for these.
203      setOperationAction(ISD::CTTZ,            VT, Expand);
204      setOperationAction(ISD::ROTR,            VT, Expand);
205
206      // Use *MUL_LOHI where possible instead of MULH*.
207      setOperationAction(ISD::MULHS, VT, Expand);
208      setOperationAction(ISD::MULHU, VT, Expand);
209      setOperationAction(ISD::SMUL_LOHI, VT, Custom);
210      setOperationAction(ISD::UMUL_LOHI, VT, Custom);
211
212      // Only z196 and above have native support for conversions to unsigned.
213      // On z10, promoting to i64 doesn't generate an inexact condition for
214      // values that are outside the i32 range but in the i64 range, so use
215      // the default expansion.
216      if (!Subtarget.hasFPExtension())
217        setOperationAction(ISD::FP_TO_UINT, VT, Expand);
218
219      // Mirror those settings for STRICT_FP_TO_[SU]INT.  Note that these all
220      // default to Expand, so need to be modified to Legal where appropriate.
221      setOperationAction(ISD::STRICT_FP_TO_SINT, VT, Legal);
222      if (Subtarget.hasFPExtension())
223        setOperationAction(ISD::STRICT_FP_TO_UINT, VT, Legal);
224
225      // And similarly for STRICT_[SU]INT_TO_FP.
226      setOperationAction(ISD::STRICT_SINT_TO_FP, VT, Legal);
227      if (Subtarget.hasFPExtension())
228        setOperationAction(ISD::STRICT_UINT_TO_FP, VT, Legal);
229    }
230  }
231
232  // Type legalization will convert 8- and 16-bit atomic operations into
233  // forms that operate on i32s (but still keeping the original memory VT).
234  // Lower them into full i32 operations.
235  setOperationAction(ISD::ATOMIC_SWAP,      MVT::i32, Custom);
236  setOperationAction(ISD::ATOMIC_LOAD_ADD,  MVT::i32, Custom);
237  setOperationAction(ISD::ATOMIC_LOAD_SUB,  MVT::i32, Custom);
238  setOperationAction(ISD::ATOMIC_LOAD_AND,  MVT::i32, Custom);
239  setOperationAction(ISD::ATOMIC_LOAD_OR,   MVT::i32, Custom);
240  setOperationAction(ISD::ATOMIC_LOAD_XOR,  MVT::i32, Custom);
241  setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i32, Custom);
242  setOperationAction(ISD::ATOMIC_LOAD_MIN,  MVT::i32, Custom);
243  setOperationAction(ISD::ATOMIC_LOAD_MAX,  MVT::i32, Custom);
244  setOperationAction(ISD::ATOMIC_LOAD_UMIN, MVT::i32, Custom);
245  setOperationAction(ISD::ATOMIC_LOAD_UMAX, MVT::i32, Custom);
246
247  // Even though i128 is not a legal type, we still need to custom lower
248  // the atomic operations in order to exploit SystemZ instructions.
249  setOperationAction(ISD::ATOMIC_LOAD,     MVT::i128, Custom);
250  setOperationAction(ISD::ATOMIC_STORE,    MVT::i128, Custom);
251
252  // We can use the CC result of compare-and-swap to implement
253  // the "success" result of ATOMIC_CMP_SWAP_WITH_SUCCESS.
254  setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i32, Custom);
255  setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i64, Custom);
256  setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
257
258  setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom);
259
260  // Traps are legal, as we will convert them to "j .+2".
261  setOperationAction(ISD::TRAP, MVT::Other, Legal);
262
263  // z10 has instructions for signed but not unsigned FP conversion.
264  // Handle unsigned 32-bit types as signed 64-bit types.
265  if (!Subtarget.hasFPExtension()) {
266    setOperationAction(ISD::UINT_TO_FP, MVT::i32, Promote);
267    setOperationAction(ISD::UINT_TO_FP, MVT::i64, Expand);
268    setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i32, Promote);
269    setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i64, Expand);
270  }
271
272  // We have native support for a 64-bit CTLZ, via FLOGR.
273  setOperationAction(ISD::CTLZ, MVT::i32, Promote);
274  setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Promote);
275  setOperationAction(ISD::CTLZ, MVT::i64, Legal);
276
277  // On z15 we have native support for a 64-bit CTPOP.
278  if (Subtarget.hasMiscellaneousExtensions3()) {
279    setOperationAction(ISD::CTPOP, MVT::i32, Promote);
280    setOperationAction(ISD::CTPOP, MVT::i64, Legal);
281  }
282
283  // Give LowerOperation the chance to replace 64-bit ORs with subregs.
284  setOperationAction(ISD::OR, MVT::i64, Custom);
285
286  // FIXME: Can we support these natively?
287  setOperationAction(ISD::SRL_PARTS, MVT::i64, Expand);
288  setOperationAction(ISD::SHL_PARTS, MVT::i64, Expand);
289  setOperationAction(ISD::SRA_PARTS, MVT::i64, Expand);
290
291  // We have native instructions for i8, i16 and i32 extensions, but not i1.
292  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
293  for (MVT VT : MVT::integer_valuetypes()) {
294    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
295    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
296    setLoadExtAction(ISD::EXTLOAD,  VT, MVT::i1, Promote);
297  }
298
299  // Handle the various types of symbolic address.
300  setOperationAction(ISD::ConstantPool,     PtrVT, Custom);
301  setOperationAction(ISD::GlobalAddress,    PtrVT, Custom);
302  setOperationAction(ISD::GlobalTLSAddress, PtrVT, Custom);
303  setOperationAction(ISD::BlockAddress,     PtrVT, Custom);
304  setOperationAction(ISD::JumpTable,        PtrVT, Custom);
305
306  // We need to handle dynamic allocations specially because of the
307  // 160-byte area at the bottom of the stack.
308  setOperationAction(ISD::DYNAMIC_STACKALLOC, PtrVT, Custom);
309  setOperationAction(ISD::GET_DYNAMIC_AREA_OFFSET, PtrVT, Custom);
310
311  // Use custom expanders so that we can force the function to use
312  // a frame pointer.
313  setOperationAction(ISD::STACKSAVE,    MVT::Other, Custom);
314  setOperationAction(ISD::STACKRESTORE, MVT::Other, Custom);
315
316  // Handle prefetches with PFD or PFDRL.
317  setOperationAction(ISD::PREFETCH, MVT::Other, Custom);
318
319  for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
320    // Assume by default that all vector operations need to be expanded.
321    for (unsigned Opcode = 0; Opcode < ISD::BUILTIN_OP_END; ++Opcode)
322      if (getOperationAction(Opcode, VT) == Legal)
323        setOperationAction(Opcode, VT, Expand);
324
325    // Likewise all truncating stores and extending loads.
326    for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
327      setTruncStoreAction(VT, InnerVT, Expand);
328      setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
329      setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
330      setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
331    }
332
333    if (isTypeLegal(VT)) {
334      // These operations are legal for anything that can be stored in a
335      // vector register, even if there is no native support for the format
336      // as such.  In particular, we can do these for v4f32 even though there
337      // are no specific instructions for that format.
338      setOperationAction(ISD::LOAD, VT, Legal);
339      setOperationAction(ISD::STORE, VT, Legal);
340      setOperationAction(ISD::VSELECT, VT, Legal);
341      setOperationAction(ISD::BITCAST, VT, Legal);
342      setOperationAction(ISD::UNDEF, VT, Legal);
343
344      // Likewise, except that we need to replace the nodes with something
345      // more specific.
346      setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
347      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
348    }
349  }
350
351  // Handle integer vector types.
352  for (MVT VT : MVT::integer_fixedlen_vector_valuetypes()) {
353    if (isTypeLegal(VT)) {
354      // These operations have direct equivalents.
355      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Legal);
356      setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Legal);
357      setOperationAction(ISD::ADD, VT, Legal);
358      setOperationAction(ISD::SUB, VT, Legal);
359      if (VT != MVT::v2i64)
360        setOperationAction(ISD::MUL, VT, Legal);
361      setOperationAction(ISD::AND, VT, Legal);
362      setOperationAction(ISD::OR, VT, Legal);
363      setOperationAction(ISD::XOR, VT, Legal);
364      if (Subtarget.hasVectorEnhancements1())
365        setOperationAction(ISD::CTPOP, VT, Legal);
366      else
367        setOperationAction(ISD::CTPOP, VT, Custom);
368      setOperationAction(ISD::CTTZ, VT, Legal);
369      setOperationAction(ISD::CTLZ, VT, Legal);
370
371      // Convert a GPR scalar to a vector by inserting it into element 0.
372      setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
373
374      // Use a series of unpacks for extensions.
375      setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Custom);
376      setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Custom);
377
378      // Detect shifts by a scalar amount and convert them into
379      // V*_BY_SCALAR.
380      setOperationAction(ISD::SHL, VT, Custom);
381      setOperationAction(ISD::SRA, VT, Custom);
382      setOperationAction(ISD::SRL, VT, Custom);
383
384      // At present ROTL isn't matched by DAGCombiner.  ROTR should be
385      // converted into ROTL.
386      setOperationAction(ISD::ROTL, VT, Expand);
387      setOperationAction(ISD::ROTR, VT, Expand);
388
389      // Map SETCCs onto one of VCE, VCH or VCHL, swapping the operands
390      // and inverting the result as necessary.
391      setOperationAction(ISD::SETCC, VT, Custom);
392      setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
393      if (Subtarget.hasVectorEnhancements1())
394        setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
395    }
396  }
397
398  if (Subtarget.hasVector()) {
399    // There should be no need to check for float types other than v2f64
400    // since <2 x f32> isn't a legal type.
401    setOperationAction(ISD::FP_TO_SINT, MVT::v2i64, Legal);
402    setOperationAction(ISD::FP_TO_SINT, MVT::v2f64, Legal);
403    setOperationAction(ISD::FP_TO_UINT, MVT::v2i64, Legal);
404    setOperationAction(ISD::FP_TO_UINT, MVT::v2f64, Legal);
405    setOperationAction(ISD::SINT_TO_FP, MVT::v2i64, Legal);
406    setOperationAction(ISD::SINT_TO_FP, MVT::v2f64, Legal);
407    setOperationAction(ISD::UINT_TO_FP, MVT::v2i64, Legal);
408    setOperationAction(ISD::UINT_TO_FP, MVT::v2f64, Legal);
409
410    setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2i64, Legal);
411    setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2f64, Legal);
412    setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2i64, Legal);
413    setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2f64, Legal);
414    setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v2i64, Legal);
415    setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v2f64, Legal);
416    setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v2i64, Legal);
417    setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v2f64, Legal);
418  }
419
420  if (Subtarget.hasVectorEnhancements2()) {
421    setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
422    setOperationAction(ISD::FP_TO_SINT, MVT::v4f32, Legal);
423    setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal);
424    setOperationAction(ISD::FP_TO_UINT, MVT::v4f32, Legal);
425    setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
426    setOperationAction(ISD::SINT_TO_FP, MVT::v4f32, Legal);
427    setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Legal);
428    setOperationAction(ISD::UINT_TO_FP, MVT::v4f32, Legal);
429
430    setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v4i32, Legal);
431    setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v4f32, Legal);
432    setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v4i32, Legal);
433    setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v4f32, Legal);
434    setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4i32, Legal);
435    setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4f32, Legal);
436    setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i32, Legal);
437    setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4f32, Legal);
438  }
439
440  // Handle floating-point types.
441  for (unsigned I = MVT::FIRST_FP_VALUETYPE;
442       I <= MVT::LAST_FP_VALUETYPE;
443       ++I) {
444    MVT VT = MVT::SimpleValueType(I);
445    if (isTypeLegal(VT)) {
446      // We can use FI for FRINT.
447      setOperationAction(ISD::FRINT, VT, Legal);
448
449      // We can use the extended form of FI for other rounding operations.
450      if (Subtarget.hasFPExtension()) {
451        setOperationAction(ISD::FNEARBYINT, VT, Legal);
452        setOperationAction(ISD::FFLOOR, VT, Legal);
453        setOperationAction(ISD::FCEIL, VT, Legal);
454        setOperationAction(ISD::FTRUNC, VT, Legal);
455        setOperationAction(ISD::FROUND, VT, Legal);
456      }
457
458      // No special instructions for these.
459      setOperationAction(ISD::FSIN, VT, Expand);
460      setOperationAction(ISD::FCOS, VT, Expand);
461      setOperationAction(ISD::FSINCOS, VT, Expand);
462      setOperationAction(ISD::FREM, VT, Expand);
463      setOperationAction(ISD::FPOW, VT, Expand);
464
465      // Handle constrained floating-point operations.
466      setOperationAction(ISD::STRICT_FADD, VT, Legal);
467      setOperationAction(ISD::STRICT_FSUB, VT, Legal);
468      setOperationAction(ISD::STRICT_FMUL, VT, Legal);
469      setOperationAction(ISD::STRICT_FDIV, VT, Legal);
470      setOperationAction(ISD::STRICT_FMA, VT, Legal);
471      setOperationAction(ISD::STRICT_FSQRT, VT, Legal);
472      setOperationAction(ISD::STRICT_FRINT, VT, Legal);
473      setOperationAction(ISD::STRICT_FP_ROUND, VT, Legal);
474      setOperationAction(ISD::STRICT_FP_EXTEND, VT, Legal);
475      if (Subtarget.hasFPExtension()) {
476        setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal);
477        setOperationAction(ISD::STRICT_FFLOOR, VT, Legal);
478        setOperationAction(ISD::STRICT_FCEIL, VT, Legal);
479        setOperationAction(ISD::STRICT_FROUND, VT, Legal);
480        setOperationAction(ISD::STRICT_FTRUNC, VT, Legal);
481      }
482    }
483  }
484
485  // Handle floating-point vector types.
486  if (Subtarget.hasVector()) {
487    // Scalar-to-vector conversion is just a subreg.
488    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4f32, Legal);
489    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2f64, Legal);
490
491    // Some insertions and extractions can be done directly but others
492    // need to go via integers.
493    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
494    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f64, Custom);
495    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
496    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom);
497
498    // These operations have direct equivalents.
499    setOperationAction(ISD::FADD, MVT::v2f64, Legal);
500    setOperationAction(ISD::FNEG, MVT::v2f64, Legal);
501    setOperationAction(ISD::FSUB, MVT::v2f64, Legal);
502    setOperationAction(ISD::FMUL, MVT::v2f64, Legal);
503    setOperationAction(ISD::FMA, MVT::v2f64, Legal);
504    setOperationAction(ISD::FDIV, MVT::v2f64, Legal);
505    setOperationAction(ISD::FABS, MVT::v2f64, Legal);
506    setOperationAction(ISD::FSQRT, MVT::v2f64, Legal);
507    setOperationAction(ISD::FRINT, MVT::v2f64, Legal);
508    setOperationAction(ISD::FNEARBYINT, MVT::v2f64, Legal);
509    setOperationAction(ISD::FFLOOR, MVT::v2f64, Legal);
510    setOperationAction(ISD::FCEIL, MVT::v2f64, Legal);
511    setOperationAction(ISD::FTRUNC, MVT::v2f64, Legal);
512    setOperationAction(ISD::FROUND, MVT::v2f64, Legal);
513
514    // Handle constrained floating-point operations.
515    setOperationAction(ISD::STRICT_FADD, MVT::v2f64, Legal);
516    setOperationAction(ISD::STRICT_FSUB, MVT::v2f64, Legal);
517    setOperationAction(ISD::STRICT_FMUL, MVT::v2f64, Legal);
518    setOperationAction(ISD::STRICT_FMA, MVT::v2f64, Legal);
519    setOperationAction(ISD::STRICT_FDIV, MVT::v2f64, Legal);
520    setOperationAction(ISD::STRICT_FSQRT, MVT::v2f64, Legal);
521    setOperationAction(ISD::STRICT_FRINT, MVT::v2f64, Legal);
522    setOperationAction(ISD::STRICT_FNEARBYINT, MVT::v2f64, Legal);
523    setOperationAction(ISD::STRICT_FFLOOR, MVT::v2f64, Legal);
524    setOperationAction(ISD::STRICT_FCEIL, MVT::v2f64, Legal);
525    setOperationAction(ISD::STRICT_FTRUNC, MVT::v2f64, Legal);
526    setOperationAction(ISD::STRICT_FROUND, MVT::v2f64, Legal);
527  }
528
529  // The vector enhancements facility 1 has instructions for these.
530  if (Subtarget.hasVectorEnhancements1()) {
531    setOperationAction(ISD::FADD, MVT::v4f32, Legal);
532    setOperationAction(ISD::FNEG, MVT::v4f32, Legal);
533    setOperationAction(ISD::FSUB, MVT::v4f32, Legal);
534    setOperationAction(ISD::FMUL, MVT::v4f32, Legal);
535    setOperationAction(ISD::FMA, MVT::v4f32, Legal);
536    setOperationAction(ISD::FDIV, MVT::v4f32, Legal);
537    setOperationAction(ISD::FABS, MVT::v4f32, Legal);
538    setOperationAction(ISD::FSQRT, MVT::v4f32, Legal);
539    setOperationAction(ISD::FRINT, MVT::v4f32, Legal);
540    setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Legal);
541    setOperationAction(ISD::FFLOOR, MVT::v4f32, Legal);
542    setOperationAction(ISD::FCEIL, MVT::v4f32, Legal);
543    setOperationAction(ISD::FTRUNC, MVT::v4f32, Legal);
544    setOperationAction(ISD::FROUND, MVT::v4f32, Legal);
545
546    setOperationAction(ISD::FMAXNUM, MVT::f64, Legal);
547    setOperationAction(ISD::FMAXIMUM, MVT::f64, Legal);
548    setOperationAction(ISD::FMINNUM, MVT::f64, Legal);
549    setOperationAction(ISD::FMINIMUM, MVT::f64, Legal);
550
551    setOperationAction(ISD::FMAXNUM, MVT::v2f64, Legal);
552    setOperationAction(ISD::FMAXIMUM, MVT::v2f64, Legal);
553    setOperationAction(ISD::FMINNUM, MVT::v2f64, Legal);
554    setOperationAction(ISD::FMINIMUM, MVT::v2f64, Legal);
555
556    setOperationAction(ISD::FMAXNUM, MVT::f32, Legal);
557    setOperationAction(ISD::FMAXIMUM, MVT::f32, Legal);
558    setOperationAction(ISD::FMINNUM, MVT::f32, Legal);
559    setOperationAction(ISD::FMINIMUM, MVT::f32, Legal);
560
561    setOperationAction(ISD::FMAXNUM, MVT::v4f32, Legal);
562    setOperationAction(ISD::FMAXIMUM, MVT::v4f32, Legal);
563    setOperationAction(ISD::FMINNUM, MVT::v4f32, Legal);
564    setOperationAction(ISD::FMINIMUM, MVT::v4f32, Legal);
565
566    setOperationAction(ISD::FMAXNUM, MVT::f128, Legal);
567    setOperationAction(ISD::FMAXIMUM, MVT::f128, Legal);
568    setOperationAction(ISD::FMINNUM, MVT::f128, Legal);
569    setOperationAction(ISD::FMINIMUM, MVT::f128, Legal);
570
571    // Handle constrained floating-point operations.
572    setOperationAction(ISD::STRICT_FADD, MVT::v4f32, Legal);
573    setOperationAction(ISD::STRICT_FSUB, MVT::v4f32, Legal);
574    setOperationAction(ISD::STRICT_FMUL, MVT::v4f32, Legal);
575    setOperationAction(ISD::STRICT_FMA, MVT::v4f32, Legal);
576    setOperationAction(ISD::STRICT_FDIV, MVT::v4f32, Legal);
577    setOperationAction(ISD::STRICT_FSQRT, MVT::v4f32, Legal);
578    setOperationAction(ISD::STRICT_FRINT, MVT::v4f32, Legal);
579    setOperationAction(ISD::STRICT_FNEARBYINT, MVT::v4f32, Legal);
580    setOperationAction(ISD::STRICT_FFLOOR, MVT::v4f32, Legal);
581    setOperationAction(ISD::STRICT_FCEIL, MVT::v4f32, Legal);
582    setOperationAction(ISD::STRICT_FROUND, MVT::v4f32, Legal);
583    setOperationAction(ISD::STRICT_FTRUNC, MVT::v4f32, Legal);
584    for (auto VT : { MVT::f32, MVT::f64, MVT::f128,
585                     MVT::v4f32, MVT::v2f64 }) {
586      setOperationAction(ISD::STRICT_FMAXNUM, VT, Legal);
587      setOperationAction(ISD::STRICT_FMINNUM, VT, Legal);
588      setOperationAction(ISD::STRICT_FMAXIMUM, VT, Legal);
589      setOperationAction(ISD::STRICT_FMINIMUM, VT, Legal);
590    }
591  }
592
593  // We only have fused f128 multiply-addition on vector registers.
594  if (!Subtarget.hasVectorEnhancements1()) {
595    setOperationAction(ISD::FMA, MVT::f128, Expand);
596    setOperationAction(ISD::STRICT_FMA, MVT::f128, Expand);
597  }
598
599  // We don't have a copysign instruction on vector registers.
600  if (Subtarget.hasVectorEnhancements1())
601    setOperationAction(ISD::FCOPYSIGN, MVT::f128, Expand);
602
603  // Needed so that we don't try to implement f128 constant loads using
604  // a load-and-extend of a f80 constant (in cases where the constant
605  // would fit in an f80).
606  for (MVT VT : MVT::fp_valuetypes())
607    setLoadExtAction(ISD::EXTLOAD, VT, MVT::f80, Expand);
608
609  // We don't have extending load instruction on vector registers.
610  if (Subtarget.hasVectorEnhancements1()) {
611    setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f32, Expand);
612    setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f64, Expand);
613  }
614
615  // Floating-point truncation and stores need to be done separately.
616  setTruncStoreAction(MVT::f64,  MVT::f32, Expand);
617  setTruncStoreAction(MVT::f128, MVT::f32, Expand);
618  setTruncStoreAction(MVT::f128, MVT::f64, Expand);
619
620  // We have 64-bit FPR<->GPR moves, but need special handling for
621  // 32-bit forms.
622  if (!Subtarget.hasVector()) {
623    setOperationAction(ISD::BITCAST, MVT::i32, Custom);
624    setOperationAction(ISD::BITCAST, MVT::f32, Custom);
625  }
626
627  // VASTART and VACOPY need to deal with the SystemZ-specific varargs
628  // structure, but VAEND is a no-op.
629  setOperationAction(ISD::VASTART, MVT::Other, Custom);
630  setOperationAction(ISD::VACOPY,  MVT::Other, Custom);
631  setOperationAction(ISD::VAEND,   MVT::Other, Expand);
632
633  // Codes for which we want to perform some z-specific combinations.
634  setTargetDAGCombine(ISD::ZERO_EXTEND);
635  setTargetDAGCombine(ISD::SIGN_EXTEND);
636  setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
637  setTargetDAGCombine(ISD::LOAD);
638  setTargetDAGCombine(ISD::STORE);
639  setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
640  setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
641  setTargetDAGCombine(ISD::FP_ROUND);
642  setTargetDAGCombine(ISD::STRICT_FP_ROUND);
643  setTargetDAGCombine(ISD::FP_EXTEND);
644  setTargetDAGCombine(ISD::SINT_TO_FP);
645  setTargetDAGCombine(ISD::UINT_TO_FP);
646  setTargetDAGCombine(ISD::STRICT_FP_EXTEND);
647  setTargetDAGCombine(ISD::BSWAP);
648  setTargetDAGCombine(ISD::SDIV);
649  setTargetDAGCombine(ISD::UDIV);
650  setTargetDAGCombine(ISD::SREM);
651  setTargetDAGCombine(ISD::UREM);
652  setTargetDAGCombine(ISD::INTRINSIC_VOID);
653  setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN);
654
655  // Handle intrinsics.
656  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
657  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
658
659  // We want to use MVC in preference to even a single load/store pair.
660  MaxStoresPerMemcpy = 0;
661  MaxStoresPerMemcpyOptSize = 0;
662
663  // The main memset sequence is a byte store followed by an MVC.
664  // Two STC or MV..I stores win over that, but the kind of fused stores
665  // generated by target-independent code don't when the byte value is
666  // variable.  E.g.  "STC <reg>;MHI <reg>,257;STH <reg>" is not better
667  // than "STC;MVC".  Handle the choice in target-specific code instead.
668  MaxStoresPerMemset = 0;
669  MaxStoresPerMemsetOptSize = 0;
670
671  // Default to having -disable-strictnode-mutation on
672  IsStrictFPEnabled = true;
673}
674
675bool SystemZTargetLowering::useSoftFloat() const {
676  return Subtarget.hasSoftFloat();
677}
678
679EVT SystemZTargetLowering::getSetCCResultType(const DataLayout &DL,
680                                              LLVMContext &, EVT VT) const {
681  if (!VT.isVector())
682    return MVT::i32;
683  return VT.changeVectorElementTypeToInteger();
684}
685
686bool SystemZTargetLowering::isFMAFasterThanFMulAndFAdd(
687    const MachineFunction &MF, EVT VT) const {
688  VT = VT.getScalarType();
689
690  if (!VT.isSimple())
691    return false;
692
693  switch (VT.getSimpleVT().SimpleTy) {
694  case MVT::f32:
695  case MVT::f64:
696    return true;
697  case MVT::f128:
698    return Subtarget.hasVectorEnhancements1();
699  default:
700    break;
701  }
702
703  return false;
704}
705
706// Return true if the constant can be generated with a vector instruction,
707// such as VGM, VGMB or VREPI.
708bool SystemZVectorConstantInfo::isVectorConstantLegal(
709    const SystemZSubtarget &Subtarget) {
710  const SystemZInstrInfo *TII =
711      static_cast<const SystemZInstrInfo *>(Subtarget.getInstrInfo());
712  if (!Subtarget.hasVector() ||
713      (isFP128 && !Subtarget.hasVectorEnhancements1()))
714    return false;
715
716  // Try using VECTOR GENERATE BYTE MASK.  This is the architecturally-
717  // preferred way of creating all-zero and all-one vectors so give it
718  // priority over other methods below.
719  unsigned Mask = 0;
720  unsigned I = 0;
721  for (; I < SystemZ::VectorBytes; ++I) {
722    uint64_t Byte = IntBits.lshr(I * 8).trunc(8).getZExtValue();
723    if (Byte == 0xff)
724      Mask |= 1ULL << I;
725    else if (Byte != 0)
726      break;
727  }
728  if (I == SystemZ::VectorBytes) {
729    Opcode = SystemZISD::BYTE_MASK;
730    OpVals.push_back(Mask);
731    VecVT = MVT::getVectorVT(MVT::getIntegerVT(8), 16);
732    return true;
733  }
734
735  if (SplatBitSize > 64)
736    return false;
737
738  auto tryValue = [&](uint64_t Value) -> bool {
739    // Try VECTOR REPLICATE IMMEDIATE
740    int64_t SignedValue = SignExtend64(Value, SplatBitSize);
741    if (isInt<16>(SignedValue)) {
742      OpVals.push_back(((unsigned) SignedValue));
743      Opcode = SystemZISD::REPLICATE;
744      VecVT = MVT::getVectorVT(MVT::getIntegerVT(SplatBitSize),
745                               SystemZ::VectorBits / SplatBitSize);
746      return true;
747    }
748    // Try VECTOR GENERATE MASK
749    unsigned Start, End;
750    if (TII->isRxSBGMask(Value, SplatBitSize, Start, End)) {
751      // isRxSBGMask returns the bit numbers for a full 64-bit value, with 0
752      // denoting 1 << 63 and 63 denoting 1.  Convert them to bit numbers for
753      // an SplatBitSize value, so that 0 denotes 1 << (SplatBitSize-1).
754      OpVals.push_back(Start - (64 - SplatBitSize));
755      OpVals.push_back(End - (64 - SplatBitSize));
756      Opcode = SystemZISD::ROTATE_MASK;
757      VecVT = MVT::getVectorVT(MVT::getIntegerVT(SplatBitSize),
758                               SystemZ::VectorBits / SplatBitSize);
759      return true;
760    }
761    return false;
762  };
763
764  // First try assuming that any undefined bits above the highest set bit
765  // and below the lowest set bit are 1s.  This increases the likelihood of
766  // being able to use a sign-extended element value in VECTOR REPLICATE
767  // IMMEDIATE or a wraparound mask in VECTOR GENERATE MASK.
768  uint64_t SplatBitsZ = SplatBits.getZExtValue();
769  uint64_t SplatUndefZ = SplatUndef.getZExtValue();
770  uint64_t Lower =
771      (SplatUndefZ & ((uint64_t(1) << findFirstSet(SplatBitsZ)) - 1));
772  uint64_t Upper =
773      (SplatUndefZ & ~((uint64_t(1) << findLastSet(SplatBitsZ)) - 1));
774  if (tryValue(SplatBitsZ | Upper | Lower))
775    return true;
776
777  // Now try assuming that any undefined bits between the first and
778  // last defined set bits are set.  This increases the chances of
779  // using a non-wraparound mask.
780  uint64_t Middle = SplatUndefZ & ~Upper & ~Lower;
781  return tryValue(SplatBitsZ | Middle);
782}
783
784SystemZVectorConstantInfo::SystemZVectorConstantInfo(APFloat FPImm) {
785  IntBits = FPImm.bitcastToAPInt().zextOrSelf(128);
786  isFP128 = (&FPImm.getSemantics() == &APFloat::IEEEquad());
787
788  // Find the smallest splat.
789  SplatBits = FPImm.bitcastToAPInt();
790  unsigned Width = SplatBits.getBitWidth();
791  while (Width > 8) {
792    unsigned HalfSize = Width / 2;
793    APInt HighValue = SplatBits.lshr(HalfSize).trunc(HalfSize);
794    APInt LowValue = SplatBits.trunc(HalfSize);
795
796    // If the two halves do not match, stop here.
797    if (HighValue != LowValue || 8 > HalfSize)
798      break;
799
800    SplatBits = HighValue;
801    Width = HalfSize;
802  }
803  SplatUndef = 0;
804  SplatBitSize = Width;
805}
806
807SystemZVectorConstantInfo::SystemZVectorConstantInfo(BuildVectorSDNode *BVN) {
808  assert(BVN->isConstant() && "Expected a constant BUILD_VECTOR");
809  bool HasAnyUndefs;
810
811  // Get IntBits by finding the 128 bit splat.
812  BVN->isConstantSplat(IntBits, SplatUndef, SplatBitSize, HasAnyUndefs, 128,
813                       true);
814
815  // Get SplatBits by finding the 8 bit or greater splat.
816  BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs, 8,
817                       true);
818}
819
820bool SystemZTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
821                                         bool ForCodeSize) const {
822  // We can load zero using LZ?R and negative zero using LZ?R;LC?BR.
823  if (Imm.isZero() || Imm.isNegZero())
824    return true;
825
826  return SystemZVectorConstantInfo(Imm).isVectorConstantLegal(Subtarget);
827}
828
829/// Returns true if stack probing through inline assembly is requested.
830bool SystemZTargetLowering::hasInlineStackProbe(MachineFunction &MF) const {
831  // If the function specifically requests inline stack probes, emit them.
832  if (MF.getFunction().hasFnAttribute("probe-stack"))
833    return MF.getFunction().getFnAttribute("probe-stack").getValueAsString() ==
834           "inline-asm";
835  return false;
836}
837
838bool SystemZTargetLowering::isLegalICmpImmediate(int64_t Imm) const {
839  // We can use CGFI or CLGFI.
840  return isInt<32>(Imm) || isUInt<32>(Imm);
841}
842
843bool SystemZTargetLowering::isLegalAddImmediate(int64_t Imm) const {
844  // We can use ALGFI or SLGFI.
845  return isUInt<32>(Imm) || isUInt<32>(-Imm);
846}
847
848bool SystemZTargetLowering::allowsMisalignedMemoryAccesses(
849    EVT VT, unsigned, unsigned, MachineMemOperand::Flags, bool *Fast) const {
850  // Unaligned accesses should never be slower than the expanded version.
851  // We check specifically for aligned accesses in the few cases where
852  // they are required.
853  if (Fast)
854    *Fast = true;
855  return true;
856}
857
858// Information about the addressing mode for a memory access.
859struct AddressingMode {
860  // True if a long displacement is supported.
861  bool LongDisplacement;
862
863  // True if use of index register is supported.
864  bool IndexReg;
865
866  AddressingMode(bool LongDispl, bool IdxReg) :
867    LongDisplacement(LongDispl), IndexReg(IdxReg) {}
868};
869
870// Return the desired addressing mode for a Load which has only one use (in
871// the same block) which is a Store.
872static AddressingMode getLoadStoreAddrMode(bool HasVector,
873                                          Type *Ty) {
874  // With vector support a Load->Store combination may be combined to either
875  // an MVC or vector operations and it seems to work best to allow the
876  // vector addressing mode.
877  if (HasVector)
878    return AddressingMode(false/*LongDispl*/, true/*IdxReg*/);
879
880  // Otherwise only the MVC case is special.
881  bool MVC = Ty->isIntegerTy(8);
882  return AddressingMode(!MVC/*LongDispl*/, !MVC/*IdxReg*/);
883}
884
885// Return the addressing mode which seems most desirable given an LLVM
886// Instruction pointer.
887static AddressingMode
888supportedAddressingMode(Instruction *I, bool HasVector) {
889  if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
890    switch (II->getIntrinsicID()) {
891    default: break;
892    case Intrinsic::memset:
893    case Intrinsic::memmove:
894    case Intrinsic::memcpy:
895      return AddressingMode(false/*LongDispl*/, false/*IdxReg*/);
896    }
897  }
898
899  if (isa<LoadInst>(I) && I->hasOneUse()) {
900    auto *SingleUser = cast<Instruction>(*I->user_begin());
901    if (SingleUser->getParent() == I->getParent()) {
902      if (isa<ICmpInst>(SingleUser)) {
903        if (auto *C = dyn_cast<ConstantInt>(SingleUser->getOperand(1)))
904          if (C->getBitWidth() <= 64 &&
905              (isInt<16>(C->getSExtValue()) || isUInt<16>(C->getZExtValue())))
906            // Comparison of memory with 16 bit signed / unsigned immediate
907            return AddressingMode(false/*LongDispl*/, false/*IdxReg*/);
908      } else if (isa<StoreInst>(SingleUser))
909        // Load->Store
910        return getLoadStoreAddrMode(HasVector, I->getType());
911    }
912  } else if (auto *StoreI = dyn_cast<StoreInst>(I)) {
913    if (auto *LoadI = dyn_cast<LoadInst>(StoreI->getValueOperand()))
914      if (LoadI->hasOneUse() && LoadI->getParent() == I->getParent())
915        // Load->Store
916        return getLoadStoreAddrMode(HasVector, LoadI->getType());
917  }
918
919  if (HasVector && (isa<LoadInst>(I) || isa<StoreInst>(I))) {
920
921    // * Use LDE instead of LE/LEY for z13 to avoid partial register
922    //   dependencies (LDE only supports small offsets).
923    // * Utilize the vector registers to hold floating point
924    //   values (vector load / store instructions only support small
925    //   offsets).
926
927    Type *MemAccessTy = (isa<LoadInst>(I) ? I->getType() :
928                         I->getOperand(0)->getType());
929    bool IsFPAccess = MemAccessTy->isFloatingPointTy();
930    bool IsVectorAccess = MemAccessTy->isVectorTy();
931
932    // A store of an extracted vector element will be combined into a VSTE type
933    // instruction.
934    if (!IsVectorAccess && isa<StoreInst>(I)) {
935      Value *DataOp = I->getOperand(0);
936      if (isa<ExtractElementInst>(DataOp))
937        IsVectorAccess = true;
938    }
939
940    // A load which gets inserted into a vector element will be combined into a
941    // VLE type instruction.
942    if (!IsVectorAccess && isa<LoadInst>(I) && I->hasOneUse()) {
943      User *LoadUser = *I->user_begin();
944      if (isa<InsertElementInst>(LoadUser))
945        IsVectorAccess = true;
946    }
947
948    if (IsFPAccess || IsVectorAccess)
949      return AddressingMode(false/*LongDispl*/, true/*IdxReg*/);
950  }
951
952  return AddressingMode(true/*LongDispl*/, true/*IdxReg*/);
953}
954
955bool SystemZTargetLowering::isLegalAddressingMode(const DataLayout &DL,
956       const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I) const {
957  // Punt on globals for now, although they can be used in limited
958  // RELATIVE LONG cases.
959  if (AM.BaseGV)
960    return false;
961
962  // Require a 20-bit signed offset.
963  if (!isInt<20>(AM.BaseOffs))
964    return false;
965
966  AddressingMode SupportedAM(true, true);
967  if (I != nullptr)
968    SupportedAM = supportedAddressingMode(I, Subtarget.hasVector());
969
970  if (!SupportedAM.LongDisplacement && !isUInt<12>(AM.BaseOffs))
971    return false;
972
973  if (!SupportedAM.IndexReg)
974    // No indexing allowed.
975    return AM.Scale == 0;
976  else
977    // Indexing is OK but no scale factor can be applied.
978    return AM.Scale == 0 || AM.Scale == 1;
979}
980
981bool SystemZTargetLowering::isTruncateFree(Type *FromType, Type *ToType) const {
982  if (!FromType->isIntegerTy() || !ToType->isIntegerTy())
983    return false;
984  unsigned FromBits = FromType->getPrimitiveSizeInBits();
985  unsigned ToBits = ToType->getPrimitiveSizeInBits();
986  return FromBits > ToBits;
987}
988
989bool SystemZTargetLowering::isTruncateFree(EVT FromVT, EVT ToVT) const {
990  if (!FromVT.isInteger() || !ToVT.isInteger())
991    return false;
992  unsigned FromBits = FromVT.getSizeInBits();
993  unsigned ToBits = ToVT.getSizeInBits();
994  return FromBits > ToBits;
995}
996
997//===----------------------------------------------------------------------===//
998// Inline asm support
999//===----------------------------------------------------------------------===//
1000
1001TargetLowering::ConstraintType
1002SystemZTargetLowering::getConstraintType(StringRef Constraint) const {
1003  if (Constraint.size() == 1) {
1004    switch (Constraint[0]) {
1005    case 'a': // Address register
1006    case 'd': // Data register (equivalent to 'r')
1007    case 'f': // Floating-point register
1008    case 'h': // High-part register
1009    case 'r': // General-purpose register
1010    case 'v': // Vector register
1011      return C_RegisterClass;
1012
1013    case 'Q': // Memory with base and unsigned 12-bit displacement
1014    case 'R': // Likewise, plus an index
1015    case 'S': // Memory with base and signed 20-bit displacement
1016    case 'T': // Likewise, plus an index
1017    case 'm': // Equivalent to 'T'.
1018      return C_Memory;
1019
1020    case 'I': // Unsigned 8-bit constant
1021    case 'J': // Unsigned 12-bit constant
1022    case 'K': // Signed 16-bit constant
1023    case 'L': // Signed 20-bit displacement (on all targets we support)
1024    case 'M': // 0x7fffffff
1025      return C_Immediate;
1026
1027    default:
1028      break;
1029    }
1030  }
1031  return TargetLowering::getConstraintType(Constraint);
1032}
1033
1034TargetLowering::ConstraintWeight SystemZTargetLowering::
1035getSingleConstraintMatchWeight(AsmOperandInfo &info,
1036                               const char *constraint) const {
1037  ConstraintWeight weight = CW_Invalid;
1038  Value *CallOperandVal = info.CallOperandVal;
1039  // If we don't have a value, we can't do a match,
1040  // but allow it at the lowest weight.
1041  if (!CallOperandVal)
1042    return CW_Default;
1043  Type *type = CallOperandVal->getType();
1044  // Look at the constraint type.
1045  switch (*constraint) {
1046  default:
1047    weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
1048    break;
1049
1050  case 'a': // Address register
1051  case 'd': // Data register (equivalent to 'r')
1052  case 'h': // High-part register
1053  case 'r': // General-purpose register
1054    if (CallOperandVal->getType()->isIntegerTy())
1055      weight = CW_Register;
1056    break;
1057
1058  case 'f': // Floating-point register
1059    if (type->isFloatingPointTy())
1060      weight = CW_Register;
1061    break;
1062
1063  case 'v': // Vector register
1064    if ((type->isVectorTy() || type->isFloatingPointTy()) &&
1065        Subtarget.hasVector())
1066      weight = CW_Register;
1067    break;
1068
1069  case 'I': // Unsigned 8-bit constant
1070    if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
1071      if (isUInt<8>(C->getZExtValue()))
1072        weight = CW_Constant;
1073    break;
1074
1075  case 'J': // Unsigned 12-bit constant
1076    if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
1077      if (isUInt<12>(C->getZExtValue()))
1078        weight = CW_Constant;
1079    break;
1080
1081  case 'K': // Signed 16-bit constant
1082    if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
1083      if (isInt<16>(C->getSExtValue()))
1084        weight = CW_Constant;
1085    break;
1086
1087  case 'L': // Signed 20-bit displacement (on all targets we support)
1088    if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
1089      if (isInt<20>(C->getSExtValue()))
1090        weight = CW_Constant;
1091    break;
1092
1093  case 'M': // 0x7fffffff
1094    if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
1095      if (C->getZExtValue() == 0x7fffffff)
1096        weight = CW_Constant;
1097    break;
1098  }
1099  return weight;
1100}
1101
1102// Parse a "{tNNN}" register constraint for which the register type "t"
1103// has already been verified.  MC is the class associated with "t" and
1104// Map maps 0-based register numbers to LLVM register numbers.
1105static std::pair<unsigned, const TargetRegisterClass *>
1106parseRegisterNumber(StringRef Constraint, const TargetRegisterClass *RC,
1107                    const unsigned *Map, unsigned Size) {
1108  assert(*(Constraint.end()-1) == '}' && "Missing '}'");
1109  if (isdigit(Constraint[2])) {
1110    unsigned Index;
1111    bool Failed =
1112        Constraint.slice(2, Constraint.size() - 1).getAsInteger(10, Index);
1113    if (!Failed && Index < Size && Map[Index])
1114      return std::make_pair(Map[Index], RC);
1115  }
1116  return std::make_pair(0U, nullptr);
1117}
1118
1119std::pair<unsigned, const TargetRegisterClass *>
1120SystemZTargetLowering::getRegForInlineAsmConstraint(
1121    const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const {
1122  if (Constraint.size() == 1) {
1123    // GCC Constraint Letters
1124    switch (Constraint[0]) {
1125    default: break;
1126    case 'd': // Data register (equivalent to 'r')
1127    case 'r': // General-purpose register
1128      if (VT == MVT::i64)
1129        return std::make_pair(0U, &SystemZ::GR64BitRegClass);
1130      else if (VT == MVT::i128)
1131        return std::make_pair(0U, &SystemZ::GR128BitRegClass);
1132      return std::make_pair(0U, &SystemZ::GR32BitRegClass);
1133
1134    case 'a': // Address register
1135      if (VT == MVT::i64)
1136        return std::make_pair(0U, &SystemZ::ADDR64BitRegClass);
1137      else if (VT == MVT::i128)
1138        return std::make_pair(0U, &SystemZ::ADDR128BitRegClass);
1139      return std::make_pair(0U, &SystemZ::ADDR32BitRegClass);
1140
1141    case 'h': // High-part register (an LLVM extension)
1142      return std::make_pair(0U, &SystemZ::GRH32BitRegClass);
1143
1144    case 'f': // Floating-point register
1145      if (!useSoftFloat()) {
1146        if (VT == MVT::f64)
1147          return std::make_pair(0U, &SystemZ::FP64BitRegClass);
1148        else if (VT == MVT::f128)
1149          return std::make_pair(0U, &SystemZ::FP128BitRegClass);
1150        return std::make_pair(0U, &SystemZ::FP32BitRegClass);
1151      }
1152      break;
1153    case 'v': // Vector register
1154      if (Subtarget.hasVector()) {
1155        if (VT == MVT::f32)
1156          return std::make_pair(0U, &SystemZ::VR32BitRegClass);
1157        if (VT == MVT::f64)
1158          return std::make_pair(0U, &SystemZ::VR64BitRegClass);
1159        return std::make_pair(0U, &SystemZ::VR128BitRegClass);
1160      }
1161      break;
1162    }
1163  }
1164  if (Constraint.size() > 0 && Constraint[0] == '{') {
1165    // We need to override the default register parsing for GPRs and FPRs
1166    // because the interpretation depends on VT.  The internal names of
1167    // the registers are also different from the external names
1168    // (F0D and F0S instead of F0, etc.).
1169    if (Constraint[1] == 'r') {
1170      if (VT == MVT::i32)
1171        return parseRegisterNumber(Constraint, &SystemZ::GR32BitRegClass,
1172                                   SystemZMC::GR32Regs, 16);
1173      if (VT == MVT::i128)
1174        return parseRegisterNumber(Constraint, &SystemZ::GR128BitRegClass,
1175                                   SystemZMC::GR128Regs, 16);
1176      return parseRegisterNumber(Constraint, &SystemZ::GR64BitRegClass,
1177                                 SystemZMC::GR64Regs, 16);
1178    }
1179    if (Constraint[1] == 'f') {
1180      if (useSoftFloat())
1181        return std::make_pair(
1182            0u, static_cast<const TargetRegisterClass *>(nullptr));
1183      if (VT == MVT::f32)
1184        return parseRegisterNumber(Constraint, &SystemZ::FP32BitRegClass,
1185                                   SystemZMC::FP32Regs, 16);
1186      if (VT == MVT::f128)
1187        return parseRegisterNumber(Constraint, &SystemZ::FP128BitRegClass,
1188                                   SystemZMC::FP128Regs, 16);
1189      return parseRegisterNumber(Constraint, &SystemZ::FP64BitRegClass,
1190                                 SystemZMC::FP64Regs, 16);
1191    }
1192    if (Constraint[1] == 'v') {
1193      if (!Subtarget.hasVector())
1194        return std::make_pair(
1195            0u, static_cast<const TargetRegisterClass *>(nullptr));
1196      if (VT == MVT::f32)
1197        return parseRegisterNumber(Constraint, &SystemZ::VR32BitRegClass,
1198                                   SystemZMC::VR32Regs, 32);
1199      if (VT == MVT::f64)
1200        return parseRegisterNumber(Constraint, &SystemZ::VR64BitRegClass,
1201                                   SystemZMC::VR64Regs, 32);
1202      return parseRegisterNumber(Constraint, &SystemZ::VR128BitRegClass,
1203                                 SystemZMC::VR128Regs, 32);
1204    }
1205  }
1206  return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
1207}
1208
1209// FIXME? Maybe this could be a TableGen attribute on some registers and
1210// this table could be generated automatically from RegInfo.
1211Register SystemZTargetLowering::getRegisterByName(const char *RegName, LLT VT,
1212                                                  const MachineFunction &MF) const {
1213
1214  Register Reg = StringSwitch<Register>(RegName)
1215                   .Case("r15", SystemZ::R15D)
1216                   .Default(0);
1217  if (Reg)
1218    return Reg;
1219  report_fatal_error("Invalid register name global variable");
1220}
1221
1222void SystemZTargetLowering::
1223LowerAsmOperandForConstraint(SDValue Op, std::string &Constraint,
1224                             std::vector<SDValue> &Ops,
1225                             SelectionDAG &DAG) const {
1226  // Only support length 1 constraints for now.
1227  if (Constraint.length() == 1) {
1228    switch (Constraint[0]) {
1229    case 'I': // Unsigned 8-bit constant
1230      if (auto *C = dyn_cast<ConstantSDNode>(Op))
1231        if (isUInt<8>(C->getZExtValue()))
1232          Ops.push_back(DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
1233                                              Op.getValueType()));
1234      return;
1235
1236    case 'J': // Unsigned 12-bit constant
1237      if (auto *C = dyn_cast<ConstantSDNode>(Op))
1238        if (isUInt<12>(C->getZExtValue()))
1239          Ops.push_back(DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
1240                                              Op.getValueType()));
1241      return;
1242
1243    case 'K': // Signed 16-bit constant
1244      if (auto *C = dyn_cast<ConstantSDNode>(Op))
1245        if (isInt<16>(C->getSExtValue()))
1246          Ops.push_back(DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op),
1247                                              Op.getValueType()));
1248      return;
1249
1250    case 'L': // Signed 20-bit displacement (on all targets we support)
1251      if (auto *C = dyn_cast<ConstantSDNode>(Op))
1252        if (isInt<20>(C->getSExtValue()))
1253          Ops.push_back(DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op),
1254                                              Op.getValueType()));
1255      return;
1256
1257    case 'M': // 0x7fffffff
1258      if (auto *C = dyn_cast<ConstantSDNode>(Op))
1259        if (C->getZExtValue() == 0x7fffffff)
1260          Ops.push_back(DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
1261                                              Op.getValueType()));
1262      return;
1263    }
1264  }
1265  TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
1266}
1267
1268//===----------------------------------------------------------------------===//
1269// Calling conventions
1270//===----------------------------------------------------------------------===//
1271
1272#include "SystemZGenCallingConv.inc"
1273
1274const MCPhysReg *SystemZTargetLowering::getScratchRegisters(
1275  CallingConv::ID) const {
1276  static const MCPhysReg ScratchRegs[] = { SystemZ::R0D, SystemZ::R1D,
1277                                           SystemZ::R14D, 0 };
1278  return ScratchRegs;
1279}
1280
1281bool SystemZTargetLowering::allowTruncateForTailCall(Type *FromType,
1282                                                     Type *ToType) const {
1283  return isTruncateFree(FromType, ToType);
1284}
1285
1286bool SystemZTargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
1287  return CI->isTailCall();
1288}
1289
1290// We do not yet support 128-bit single-element vector types.  If the user
1291// attempts to use such types as function argument or return type, prefer
1292// to error out instead of emitting code violating the ABI.
1293static void VerifyVectorType(MVT VT, EVT ArgVT) {
1294  if (ArgVT.isVector() && !VT.isVector())
1295    report_fatal_error("Unsupported vector argument or return type");
1296}
1297
1298static void VerifyVectorTypes(const SmallVectorImpl<ISD::InputArg> &Ins) {
1299  for (unsigned i = 0; i < Ins.size(); ++i)
1300    VerifyVectorType(Ins[i].VT, Ins[i].ArgVT);
1301}
1302
1303static void VerifyVectorTypes(const SmallVectorImpl<ISD::OutputArg> &Outs) {
1304  for (unsigned i = 0; i < Outs.size(); ++i)
1305    VerifyVectorType(Outs[i].VT, Outs[i].ArgVT);
1306}
1307
1308// Value is a value that has been passed to us in the location described by VA
1309// (and so has type VA.getLocVT()).  Convert Value to VA.getValVT(), chaining
1310// any loads onto Chain.
1311static SDValue convertLocVTToValVT(SelectionDAG &DAG, const SDLoc &DL,
1312                                   CCValAssign &VA, SDValue Chain,
1313                                   SDValue Value) {
1314  // If the argument has been promoted from a smaller type, insert an
1315  // assertion to capture this.
1316  if (VA.getLocInfo() == CCValAssign::SExt)
1317    Value = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), Value,
1318                        DAG.getValueType(VA.getValVT()));
1319  else if (VA.getLocInfo() == CCValAssign::ZExt)
1320    Value = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), Value,
1321                        DAG.getValueType(VA.getValVT()));
1322
1323  if (VA.isExtInLoc())
1324    Value = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Value);
1325  else if (VA.getLocInfo() == CCValAssign::BCvt) {
1326    // If this is a short vector argument loaded from the stack,
1327    // extend from i64 to full vector size and then bitcast.
1328    assert(VA.getLocVT() == MVT::i64);
1329    assert(VA.getValVT().isVector());
1330    Value = DAG.getBuildVector(MVT::v2i64, DL, {Value, DAG.getUNDEF(MVT::i64)});
1331    Value = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Value);
1332  } else
1333    assert(VA.getLocInfo() == CCValAssign::Full && "Unsupported getLocInfo");
1334  return Value;
1335}
1336
1337// Value is a value of type VA.getValVT() that we need to copy into
1338// the location described by VA.  Return a copy of Value converted to
1339// VA.getValVT().  The caller is responsible for handling indirect values.
1340static SDValue convertValVTToLocVT(SelectionDAG &DAG, const SDLoc &DL,
1341                                   CCValAssign &VA, SDValue Value) {
1342  switch (VA.getLocInfo()) {
1343  case CCValAssign::SExt:
1344    return DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Value);
1345  case CCValAssign::ZExt:
1346    return DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Value);
1347  case CCValAssign::AExt:
1348    return DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Value);
1349  case CCValAssign::BCvt:
1350    // If this is a short vector argument to be stored to the stack,
1351    // bitcast to v2i64 and then extract first element.
1352    assert(VA.getLocVT() == MVT::i64);
1353    assert(VA.getValVT().isVector());
1354    Value = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Value);
1355    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VA.getLocVT(), Value,
1356                       DAG.getConstant(0, DL, MVT::i32));
1357  case CCValAssign::Full:
1358    return Value;
1359  default:
1360    llvm_unreachable("Unhandled getLocInfo()");
1361  }
1362}
1363
1364SDValue SystemZTargetLowering::LowerFormalArguments(
1365    SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
1366    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
1367    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
1368  MachineFunction &MF = DAG.getMachineFunction();
1369  MachineFrameInfo &MFI = MF.getFrameInfo();
1370  MachineRegisterInfo &MRI = MF.getRegInfo();
1371  SystemZMachineFunctionInfo *FuncInfo =
1372      MF.getInfo<SystemZMachineFunctionInfo>();
1373  auto *TFL =
1374      static_cast<const SystemZFrameLowering *>(Subtarget.getFrameLowering());
1375  EVT PtrVT = getPointerTy(DAG.getDataLayout());
1376
1377  // Detect unsupported vector argument types.
1378  if (Subtarget.hasVector())
1379    VerifyVectorTypes(Ins);
1380
1381  // Assign locations to all of the incoming arguments.
1382  SmallVector<CCValAssign, 16> ArgLocs;
1383  SystemZCCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
1384  CCInfo.AnalyzeFormalArguments(Ins, CC_SystemZ);
1385
1386  unsigned NumFixedGPRs = 0;
1387  unsigned NumFixedFPRs = 0;
1388  for (unsigned I = 0, E = ArgLocs.size(); I != E; ++I) {
1389    SDValue ArgValue;
1390    CCValAssign &VA = ArgLocs[I];
1391    EVT LocVT = VA.getLocVT();
1392    if (VA.isRegLoc()) {
1393      // Arguments passed in registers
1394      const TargetRegisterClass *RC;
1395      switch (LocVT.getSimpleVT().SimpleTy) {
1396      default:
1397        // Integers smaller than i64 should be promoted to i64.
1398        llvm_unreachable("Unexpected argument type");
1399      case MVT::i32:
1400        NumFixedGPRs += 1;
1401        RC = &SystemZ::GR32BitRegClass;
1402        break;
1403      case MVT::i64:
1404        NumFixedGPRs += 1;
1405        RC = &SystemZ::GR64BitRegClass;
1406        break;
1407      case MVT::f32:
1408        NumFixedFPRs += 1;
1409        RC = &SystemZ::FP32BitRegClass;
1410        break;
1411      case MVT::f64:
1412        NumFixedFPRs += 1;
1413        RC = &SystemZ::FP64BitRegClass;
1414        break;
1415      case MVT::v16i8:
1416      case MVT::v8i16:
1417      case MVT::v4i32:
1418      case MVT::v2i64:
1419      case MVT::v4f32:
1420      case MVT::v2f64:
1421        RC = &SystemZ::VR128BitRegClass;
1422        break;
1423      }
1424
1425      Register VReg = MRI.createVirtualRegister(RC);
1426      MRI.addLiveIn(VA.getLocReg(), VReg);
1427      ArgValue = DAG.getCopyFromReg(Chain, DL, VReg, LocVT);
1428    } else {
1429      assert(VA.isMemLoc() && "Argument not register or memory");
1430
1431      // Create the frame index object for this incoming parameter.
1432      int FI = MFI.CreateFixedObject(LocVT.getSizeInBits() / 8,
1433                                     VA.getLocMemOffset(), true);
1434
1435      // Create the SelectionDAG nodes corresponding to a load
1436      // from this parameter.  Unpromoted ints and floats are
1437      // passed as right-justified 8-byte values.
1438      SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
1439      if (VA.getLocVT() == MVT::i32 || VA.getLocVT() == MVT::f32)
1440        FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN,
1441                          DAG.getIntPtrConstant(4, DL));
1442      ArgValue = DAG.getLoad(LocVT, DL, Chain, FIN,
1443                             MachinePointerInfo::getFixedStack(MF, FI));
1444    }
1445
1446    // Convert the value of the argument register into the value that's
1447    // being passed.
1448    if (VA.getLocInfo() == CCValAssign::Indirect) {
1449      InVals.push_back(DAG.getLoad(VA.getValVT(), DL, Chain, ArgValue,
1450                                   MachinePointerInfo()));
1451      // If the original argument was split (e.g. i128), we need
1452      // to load all parts of it here (using the same address).
1453      unsigned ArgIndex = Ins[I].OrigArgIndex;
1454      assert (Ins[I].PartOffset == 0);
1455      while (I + 1 != E && Ins[I + 1].OrigArgIndex == ArgIndex) {
1456        CCValAssign &PartVA = ArgLocs[I + 1];
1457        unsigned PartOffset = Ins[I + 1].PartOffset;
1458        SDValue Address = DAG.getNode(ISD::ADD, DL, PtrVT, ArgValue,
1459                                      DAG.getIntPtrConstant(PartOffset, DL));
1460        InVals.push_back(DAG.getLoad(PartVA.getValVT(), DL, Chain, Address,
1461                                     MachinePointerInfo()));
1462        ++I;
1463      }
1464    } else
1465      InVals.push_back(convertLocVTToValVT(DAG, DL, VA, Chain, ArgValue));
1466  }
1467
1468  if (IsVarArg) {
1469    // Save the number of non-varargs registers for later use by va_start, etc.
1470    FuncInfo->setVarArgsFirstGPR(NumFixedGPRs);
1471    FuncInfo->setVarArgsFirstFPR(NumFixedFPRs);
1472
1473    // Likewise the address (in the form of a frame index) of where the
1474    // first stack vararg would be.  The 1-byte size here is arbitrary.
1475    int64_t StackSize = CCInfo.getNextStackOffset();
1476    FuncInfo->setVarArgsFrameIndex(MFI.CreateFixedObject(1, StackSize, true));
1477
1478    // ...and a similar frame index for the caller-allocated save area
1479    // that will be used to store the incoming registers.
1480    int64_t RegSaveOffset =
1481      -SystemZMC::CallFrameSize + TFL->getRegSpillOffset(MF, SystemZ::R2D) - 16;
1482    unsigned RegSaveIndex = MFI.CreateFixedObject(1, RegSaveOffset, true);
1483    FuncInfo->setRegSaveFrameIndex(RegSaveIndex);
1484
1485    // Store the FPR varargs in the reserved frame slots.  (We store the
1486    // GPRs as part of the prologue.)
1487    if (NumFixedFPRs < SystemZ::NumArgFPRs && !useSoftFloat()) {
1488      SDValue MemOps[SystemZ::NumArgFPRs];
1489      for (unsigned I = NumFixedFPRs; I < SystemZ::NumArgFPRs; ++I) {
1490        unsigned Offset = TFL->getRegSpillOffset(MF, SystemZ::ArgFPRs[I]);
1491        int FI =
1492          MFI.CreateFixedObject(8, -SystemZMC::CallFrameSize + Offset, true);
1493        SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
1494        unsigned VReg = MF.addLiveIn(SystemZ::ArgFPRs[I],
1495                                     &SystemZ::FP64BitRegClass);
1496        SDValue ArgValue = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f64);
1497        MemOps[I] = DAG.getStore(ArgValue.getValue(1), DL, ArgValue, FIN,
1498                                 MachinePointerInfo::getFixedStack(MF, FI));
1499      }
1500      // Join the stores, which are independent of one another.
1501      Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
1502                          makeArrayRef(&MemOps[NumFixedFPRs],
1503                                       SystemZ::NumArgFPRs-NumFixedFPRs));
1504    }
1505  }
1506
1507  return Chain;
1508}
1509
1510static bool canUseSiblingCall(const CCState &ArgCCInfo,
1511                              SmallVectorImpl<CCValAssign> &ArgLocs,
1512                              SmallVectorImpl<ISD::OutputArg> &Outs) {
1513  // Punt if there are any indirect or stack arguments, or if the call
1514  // needs the callee-saved argument register R6, or if the call uses
1515  // the callee-saved register arguments SwiftSelf and SwiftError.
1516  for (unsigned I = 0, E = ArgLocs.size(); I != E; ++I) {
1517    CCValAssign &VA = ArgLocs[I];
1518    if (VA.getLocInfo() == CCValAssign::Indirect)
1519      return false;
1520    if (!VA.isRegLoc())
1521      return false;
1522    Register Reg = VA.getLocReg();
1523    if (Reg == SystemZ::R6H || Reg == SystemZ::R6L || Reg == SystemZ::R6D)
1524      return false;
1525    if (Outs[I].Flags.isSwiftSelf() || Outs[I].Flags.isSwiftError())
1526      return false;
1527  }
1528  return true;
1529}
1530
1531SDValue
1532SystemZTargetLowering::LowerCall(CallLoweringInfo &CLI,
1533                                 SmallVectorImpl<SDValue> &InVals) const {
1534  SelectionDAG &DAG = CLI.DAG;
1535  SDLoc &DL = CLI.DL;
1536  SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
1537  SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
1538  SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
1539  SDValue Chain = CLI.Chain;
1540  SDValue Callee = CLI.Callee;
1541  bool &IsTailCall = CLI.IsTailCall;
1542  CallingConv::ID CallConv = CLI.CallConv;
1543  bool IsVarArg = CLI.IsVarArg;
1544  MachineFunction &MF = DAG.getMachineFunction();
1545  EVT PtrVT = getPointerTy(MF.getDataLayout());
1546
1547  // Detect unsupported vector argument and return types.
1548  if (Subtarget.hasVector()) {
1549    VerifyVectorTypes(Outs);
1550    VerifyVectorTypes(Ins);
1551  }
1552
1553  // Analyze the operands of the call, assigning locations to each operand.
1554  SmallVector<CCValAssign, 16> ArgLocs;
1555  SystemZCCState ArgCCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
1556  ArgCCInfo.AnalyzeCallOperands(Outs, CC_SystemZ);
1557
1558  // We don't support GuaranteedTailCallOpt, only automatically-detected
1559  // sibling calls.
1560  if (IsTailCall && !canUseSiblingCall(ArgCCInfo, ArgLocs, Outs))
1561    IsTailCall = false;
1562
1563  // Get a count of how many bytes are to be pushed on the stack.
1564  unsigned NumBytes = ArgCCInfo.getNextStackOffset();
1565
1566  // Mark the start of the call.
1567  if (!IsTailCall)
1568    Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, DL);
1569
1570  // Copy argument values to their designated locations.
1571  SmallVector<std::pair<unsigned, SDValue>, 9> RegsToPass;
1572  SmallVector<SDValue, 8> MemOpChains;
1573  SDValue StackPtr;
1574  for (unsigned I = 0, E = ArgLocs.size(); I != E; ++I) {
1575    CCValAssign &VA = ArgLocs[I];
1576    SDValue ArgValue = OutVals[I];
1577
1578    if (VA.getLocInfo() == CCValAssign::Indirect) {
1579      // Store the argument in a stack slot and pass its address.
1580      SDValue SpillSlot = DAG.CreateStackTemporary(Outs[I].ArgVT);
1581      int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
1582      MemOpChains.push_back(
1583          DAG.getStore(Chain, DL, ArgValue, SpillSlot,
1584                       MachinePointerInfo::getFixedStack(MF, FI)));
1585      // If the original argument was split (e.g. i128), we need
1586      // to store all parts of it here (and pass just one address).
1587      unsigned ArgIndex = Outs[I].OrigArgIndex;
1588      assert (Outs[I].PartOffset == 0);
1589      while (I + 1 != E && Outs[I + 1].OrigArgIndex == ArgIndex) {
1590        SDValue PartValue = OutVals[I + 1];
1591        unsigned PartOffset = Outs[I + 1].PartOffset;
1592        SDValue Address = DAG.getNode(ISD::ADD, DL, PtrVT, SpillSlot,
1593                                      DAG.getIntPtrConstant(PartOffset, DL));
1594        MemOpChains.push_back(
1595            DAG.getStore(Chain, DL, PartValue, Address,
1596                         MachinePointerInfo::getFixedStack(MF, FI)));
1597        ++I;
1598      }
1599      ArgValue = SpillSlot;
1600    } else
1601      ArgValue = convertValVTToLocVT(DAG, DL, VA, ArgValue);
1602
1603    if (VA.isRegLoc())
1604      // Queue up the argument copies and emit them at the end.
1605      RegsToPass.push_back(std::make_pair(VA.getLocReg(), ArgValue));
1606    else {
1607      assert(VA.isMemLoc() && "Argument not register or memory");
1608
1609      // Work out the address of the stack slot.  Unpromoted ints and
1610      // floats are passed as right-justified 8-byte values.
1611      if (!StackPtr.getNode())
1612        StackPtr = DAG.getCopyFromReg(Chain, DL, SystemZ::R15D, PtrVT);
1613      unsigned Offset = SystemZMC::CallFrameSize + VA.getLocMemOffset();
1614      if (VA.getLocVT() == MVT::i32 || VA.getLocVT() == MVT::f32)
1615        Offset += 4;
1616      SDValue Address = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr,
1617                                    DAG.getIntPtrConstant(Offset, DL));
1618
1619      // Emit the store.
1620      MemOpChains.push_back(
1621          DAG.getStore(Chain, DL, ArgValue, Address, MachinePointerInfo()));
1622    }
1623  }
1624
1625  // Join the stores, which are independent of one another.
1626  if (!MemOpChains.empty())
1627    Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
1628
1629  // Accept direct calls by converting symbolic call addresses to the
1630  // associated Target* opcodes.  Force %r1 to be used for indirect
1631  // tail calls.
1632  SDValue Glue;
1633  if (auto *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
1634    Callee = DAG.getTargetGlobalAddress(G->getGlobal(), DL, PtrVT);
1635    Callee = DAG.getNode(SystemZISD::PCREL_WRAPPER, DL, PtrVT, Callee);
1636  } else if (auto *E = dyn_cast<ExternalSymbolSDNode>(Callee)) {
1637    Callee = DAG.getTargetExternalSymbol(E->getSymbol(), PtrVT);
1638    Callee = DAG.getNode(SystemZISD::PCREL_WRAPPER, DL, PtrVT, Callee);
1639  } else if (IsTailCall) {
1640    Chain = DAG.getCopyToReg(Chain, DL, SystemZ::R1D, Callee, Glue);
1641    Glue = Chain.getValue(1);
1642    Callee = DAG.getRegister(SystemZ::R1D, Callee.getValueType());
1643  }
1644
1645  // Build a sequence of copy-to-reg nodes, chained and glued together.
1646  for (unsigned I = 0, E = RegsToPass.size(); I != E; ++I) {
1647    Chain = DAG.getCopyToReg(Chain, DL, RegsToPass[I].first,
1648                             RegsToPass[I].second, Glue);
1649    Glue = Chain.getValue(1);
1650  }
1651
1652  // The first call operand is the chain and the second is the target address.
1653  SmallVector<SDValue, 8> Ops;
1654  Ops.push_back(Chain);
1655  Ops.push_back(Callee);
1656
1657  // Add argument registers to the end of the list so that they are
1658  // known live into the call.
1659  for (unsigned I = 0, E = RegsToPass.size(); I != E; ++I)
1660    Ops.push_back(DAG.getRegister(RegsToPass[I].first,
1661                                  RegsToPass[I].second.getValueType()));
1662
1663  // Add a register mask operand representing the call-preserved registers.
1664  const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
1665  const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
1666  assert(Mask && "Missing call preserved mask for calling convention");
1667  Ops.push_back(DAG.getRegisterMask(Mask));
1668
1669  // Glue the call to the argument copies, if any.
1670  if (Glue.getNode())
1671    Ops.push_back(Glue);
1672
1673  // Emit the call.
1674  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
1675  if (IsTailCall)
1676    return DAG.getNode(SystemZISD::SIBCALL, DL, NodeTys, Ops);
1677  Chain = DAG.getNode(SystemZISD::CALL, DL, NodeTys, Ops);
1678  DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge);
1679  Glue = Chain.getValue(1);
1680
1681  // Mark the end of the call, which is glued to the call itself.
1682  Chain = DAG.getCALLSEQ_END(Chain,
1683                             DAG.getConstant(NumBytes, DL, PtrVT, true),
1684                             DAG.getConstant(0, DL, PtrVT, true),
1685                             Glue, DL);
1686  Glue = Chain.getValue(1);
1687
1688  // Assign locations to each value returned by this call.
1689  SmallVector<CCValAssign, 16> RetLocs;
1690  CCState RetCCInfo(CallConv, IsVarArg, MF, RetLocs, *DAG.getContext());
1691  RetCCInfo.AnalyzeCallResult(Ins, RetCC_SystemZ);
1692
1693  // Copy all of the result registers out of their specified physreg.
1694  for (unsigned I = 0, E = RetLocs.size(); I != E; ++I) {
1695    CCValAssign &VA = RetLocs[I];
1696
1697    // Copy the value out, gluing the copy to the end of the call sequence.
1698    SDValue RetValue = DAG.getCopyFromReg(Chain, DL, VA.getLocReg(),
1699                                          VA.getLocVT(), Glue);
1700    Chain = RetValue.getValue(1);
1701    Glue = RetValue.getValue(2);
1702
1703    // Convert the value of the return register into the value that's
1704    // being returned.
1705    InVals.push_back(convertLocVTToValVT(DAG, DL, VA, Chain, RetValue));
1706  }
1707
1708  return Chain;
1709}
1710
1711bool SystemZTargetLowering::
1712CanLowerReturn(CallingConv::ID CallConv,
1713               MachineFunction &MF, bool isVarArg,
1714               const SmallVectorImpl<ISD::OutputArg> &Outs,
1715               LLVMContext &Context) const {
1716  // Detect unsupported vector return types.
1717  if (Subtarget.hasVector())
1718    VerifyVectorTypes(Outs);
1719
1720  // Special case that we cannot easily detect in RetCC_SystemZ since
1721  // i128 is not a legal type.
1722  for (auto &Out : Outs)
1723    if (Out.ArgVT == MVT::i128)
1724      return false;
1725
1726  SmallVector<CCValAssign, 16> RetLocs;
1727  CCState RetCCInfo(CallConv, isVarArg, MF, RetLocs, Context);
1728  return RetCCInfo.CheckReturn(Outs, RetCC_SystemZ);
1729}
1730
1731SDValue
1732SystemZTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
1733                                   bool IsVarArg,
1734                                   const SmallVectorImpl<ISD::OutputArg> &Outs,
1735                                   const SmallVectorImpl<SDValue> &OutVals,
1736                                   const SDLoc &DL, SelectionDAG &DAG) const {
1737  MachineFunction &MF = DAG.getMachineFunction();
1738
1739  // Detect unsupported vector return types.
1740  if (Subtarget.hasVector())
1741    VerifyVectorTypes(Outs);
1742
1743  // Assign locations to each returned value.
1744  SmallVector<CCValAssign, 16> RetLocs;
1745  CCState RetCCInfo(CallConv, IsVarArg, MF, RetLocs, *DAG.getContext());
1746  RetCCInfo.AnalyzeReturn(Outs, RetCC_SystemZ);
1747
1748  // Quick exit for void returns
1749  if (RetLocs.empty())
1750    return DAG.getNode(SystemZISD::RET_FLAG, DL, MVT::Other, Chain);
1751
1752  if (CallConv == CallingConv::GHC)
1753    report_fatal_error("GHC functions return void only");
1754
1755  // Copy the result values into the output registers.
1756  SDValue Glue;
1757  SmallVector<SDValue, 4> RetOps;
1758  RetOps.push_back(Chain);
1759  for (unsigned I = 0, E = RetLocs.size(); I != E; ++I) {
1760    CCValAssign &VA = RetLocs[I];
1761    SDValue RetValue = OutVals[I];
1762
1763    // Make the return register live on exit.
1764    assert(VA.isRegLoc() && "Can only return in registers!");
1765
1766    // Promote the value as required.
1767    RetValue = convertValVTToLocVT(DAG, DL, VA, RetValue);
1768
1769    // Chain and glue the copies together.
1770    Register Reg = VA.getLocReg();
1771    Chain = DAG.getCopyToReg(Chain, DL, Reg, RetValue, Glue);
1772    Glue = Chain.getValue(1);
1773    RetOps.push_back(DAG.getRegister(Reg, VA.getLocVT()));
1774  }
1775
1776  // Update chain and glue.
1777  RetOps[0] = Chain;
1778  if (Glue.getNode())
1779    RetOps.push_back(Glue);
1780
1781  return DAG.getNode(SystemZISD::RET_FLAG, DL, MVT::Other, RetOps);
1782}
1783
1784// Return true if Op is an intrinsic node with chain that returns the CC value
1785// as its only (other) argument.  Provide the associated SystemZISD opcode and
1786// the mask of valid CC values if so.
1787static bool isIntrinsicWithCCAndChain(SDValue Op, unsigned &Opcode,
1788                                      unsigned &CCValid) {
1789  unsigned Id = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
1790  switch (Id) {
1791  case Intrinsic::s390_tbegin:
1792    Opcode = SystemZISD::TBEGIN;
1793    CCValid = SystemZ::CCMASK_TBEGIN;
1794    return true;
1795
1796  case Intrinsic::s390_tbegin_nofloat:
1797    Opcode = SystemZISD::TBEGIN_NOFLOAT;
1798    CCValid = SystemZ::CCMASK_TBEGIN;
1799    return true;
1800
1801  case Intrinsic::s390_tend:
1802    Opcode = SystemZISD::TEND;
1803    CCValid = SystemZ::CCMASK_TEND;
1804    return true;
1805
1806  default:
1807    return false;
1808  }
1809}
1810
1811// Return true if Op is an intrinsic node without chain that returns the
1812// CC value as its final argument.  Provide the associated SystemZISD
1813// opcode and the mask of valid CC values if so.
1814static bool isIntrinsicWithCC(SDValue Op, unsigned &Opcode, unsigned &CCValid) {
1815  unsigned Id = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
1816  switch (Id) {
1817  case Intrinsic::s390_vpkshs:
1818  case Intrinsic::s390_vpksfs:
1819  case Intrinsic::s390_vpksgs:
1820    Opcode = SystemZISD::PACKS_CC;
1821    CCValid = SystemZ::CCMASK_VCMP;
1822    return true;
1823
1824  case Intrinsic::s390_vpklshs:
1825  case Intrinsic::s390_vpklsfs:
1826  case Intrinsic::s390_vpklsgs:
1827    Opcode = SystemZISD::PACKLS_CC;
1828    CCValid = SystemZ::CCMASK_VCMP;
1829    return true;
1830
1831  case Intrinsic::s390_vceqbs:
1832  case Intrinsic::s390_vceqhs:
1833  case Intrinsic::s390_vceqfs:
1834  case Intrinsic::s390_vceqgs:
1835    Opcode = SystemZISD::VICMPES;
1836    CCValid = SystemZ::CCMASK_VCMP;
1837    return true;
1838
1839  case Intrinsic::s390_vchbs:
1840  case Intrinsic::s390_vchhs:
1841  case Intrinsic::s390_vchfs:
1842  case Intrinsic::s390_vchgs:
1843    Opcode = SystemZISD::VICMPHS;
1844    CCValid = SystemZ::CCMASK_VCMP;
1845    return true;
1846
1847  case Intrinsic::s390_vchlbs:
1848  case Intrinsic::s390_vchlhs:
1849  case Intrinsic::s390_vchlfs:
1850  case Intrinsic::s390_vchlgs:
1851    Opcode = SystemZISD::VICMPHLS;
1852    CCValid = SystemZ::CCMASK_VCMP;
1853    return true;
1854
1855  case Intrinsic::s390_vtm:
1856    Opcode = SystemZISD::VTM;
1857    CCValid = SystemZ::CCMASK_VCMP;
1858    return true;
1859
1860  case Intrinsic::s390_vfaebs:
1861  case Intrinsic::s390_vfaehs:
1862  case Intrinsic::s390_vfaefs:
1863    Opcode = SystemZISD::VFAE_CC;
1864    CCValid = SystemZ::CCMASK_ANY;
1865    return true;
1866
1867  case Intrinsic::s390_vfaezbs:
1868  case Intrinsic::s390_vfaezhs:
1869  case Intrinsic::s390_vfaezfs:
1870    Opcode = SystemZISD::VFAEZ_CC;
1871    CCValid = SystemZ::CCMASK_ANY;
1872    return true;
1873
1874  case Intrinsic::s390_vfeebs:
1875  case Intrinsic::s390_vfeehs:
1876  case Intrinsic::s390_vfeefs:
1877    Opcode = SystemZISD::VFEE_CC;
1878    CCValid = SystemZ::CCMASK_ANY;
1879    return true;
1880
1881  case Intrinsic::s390_vfeezbs:
1882  case Intrinsic::s390_vfeezhs:
1883  case Intrinsic::s390_vfeezfs:
1884    Opcode = SystemZISD::VFEEZ_CC;
1885    CCValid = SystemZ::CCMASK_ANY;
1886    return true;
1887
1888  case Intrinsic::s390_vfenebs:
1889  case Intrinsic::s390_vfenehs:
1890  case Intrinsic::s390_vfenefs:
1891    Opcode = SystemZISD::VFENE_CC;
1892    CCValid = SystemZ::CCMASK_ANY;
1893    return true;
1894
1895  case Intrinsic::s390_vfenezbs:
1896  case Intrinsic::s390_vfenezhs:
1897  case Intrinsic::s390_vfenezfs:
1898    Opcode = SystemZISD::VFENEZ_CC;
1899    CCValid = SystemZ::CCMASK_ANY;
1900    return true;
1901
1902  case Intrinsic::s390_vistrbs:
1903  case Intrinsic::s390_vistrhs:
1904  case Intrinsic::s390_vistrfs:
1905    Opcode = SystemZISD::VISTR_CC;
1906    CCValid = SystemZ::CCMASK_0 | SystemZ::CCMASK_3;
1907    return true;
1908
1909  case Intrinsic::s390_vstrcbs:
1910  case Intrinsic::s390_vstrchs:
1911  case Intrinsic::s390_vstrcfs:
1912    Opcode = SystemZISD::VSTRC_CC;
1913    CCValid = SystemZ::CCMASK_ANY;
1914    return true;
1915
1916  case Intrinsic::s390_vstrczbs:
1917  case Intrinsic::s390_vstrczhs:
1918  case Intrinsic::s390_vstrczfs:
1919    Opcode = SystemZISD::VSTRCZ_CC;
1920    CCValid = SystemZ::CCMASK_ANY;
1921    return true;
1922
1923  case Intrinsic::s390_vstrsb:
1924  case Intrinsic::s390_vstrsh:
1925  case Intrinsic::s390_vstrsf:
1926    Opcode = SystemZISD::VSTRS_CC;
1927    CCValid = SystemZ::CCMASK_ANY;
1928    return true;
1929
1930  case Intrinsic::s390_vstrszb:
1931  case Intrinsic::s390_vstrszh:
1932  case Intrinsic::s390_vstrszf:
1933    Opcode = SystemZISD::VSTRSZ_CC;
1934    CCValid = SystemZ::CCMASK_ANY;
1935    return true;
1936
1937  case Intrinsic::s390_vfcedbs:
1938  case Intrinsic::s390_vfcesbs:
1939    Opcode = SystemZISD::VFCMPES;
1940    CCValid = SystemZ::CCMASK_VCMP;
1941    return true;
1942
1943  case Intrinsic::s390_vfchdbs:
1944  case Intrinsic::s390_vfchsbs:
1945    Opcode = SystemZISD::VFCMPHS;
1946    CCValid = SystemZ::CCMASK_VCMP;
1947    return true;
1948
1949  case Intrinsic::s390_vfchedbs:
1950  case Intrinsic::s390_vfchesbs:
1951    Opcode = SystemZISD::VFCMPHES;
1952    CCValid = SystemZ::CCMASK_VCMP;
1953    return true;
1954
1955  case Intrinsic::s390_vftcidb:
1956  case Intrinsic::s390_vftcisb:
1957    Opcode = SystemZISD::VFTCI;
1958    CCValid = SystemZ::CCMASK_VCMP;
1959    return true;
1960
1961  case Intrinsic::s390_tdc:
1962    Opcode = SystemZISD::TDC;
1963    CCValid = SystemZ::CCMASK_TDC;
1964    return true;
1965
1966  default:
1967    return false;
1968  }
1969}
1970
1971// Emit an intrinsic with chain and an explicit CC register result.
1972static SDNode *emitIntrinsicWithCCAndChain(SelectionDAG &DAG, SDValue Op,
1973                                           unsigned Opcode) {
1974  // Copy all operands except the intrinsic ID.
1975  unsigned NumOps = Op.getNumOperands();
1976  SmallVector<SDValue, 6> Ops;
1977  Ops.reserve(NumOps - 1);
1978  Ops.push_back(Op.getOperand(0));
1979  for (unsigned I = 2; I < NumOps; ++I)
1980    Ops.push_back(Op.getOperand(I));
1981
1982  assert(Op->getNumValues() == 2 && "Expected only CC result and chain");
1983  SDVTList RawVTs = DAG.getVTList(MVT::i32, MVT::Other);
1984  SDValue Intr = DAG.getNode(Opcode, SDLoc(Op), RawVTs, Ops);
1985  SDValue OldChain = SDValue(Op.getNode(), 1);
1986  SDValue NewChain = SDValue(Intr.getNode(), 1);
1987  DAG.ReplaceAllUsesOfValueWith(OldChain, NewChain);
1988  return Intr.getNode();
1989}
1990
1991// Emit an intrinsic with an explicit CC register result.
1992static SDNode *emitIntrinsicWithCC(SelectionDAG &DAG, SDValue Op,
1993                                   unsigned Opcode) {
1994  // Copy all operands except the intrinsic ID.
1995  unsigned NumOps = Op.getNumOperands();
1996  SmallVector<SDValue, 6> Ops;
1997  Ops.reserve(NumOps - 1);
1998  for (unsigned I = 1; I < NumOps; ++I)
1999    Ops.push_back(Op.getOperand(I));
2000
2001  SDValue Intr = DAG.getNode(Opcode, SDLoc(Op), Op->getVTList(), Ops);
2002  return Intr.getNode();
2003}
2004
2005// CC is a comparison that will be implemented using an integer or
2006// floating-point comparison.  Return the condition code mask for
2007// a branch on true.  In the integer case, CCMASK_CMP_UO is set for
2008// unsigned comparisons and clear for signed ones.  In the floating-point
2009// case, CCMASK_CMP_UO has its normal mask meaning (unordered).
2010static unsigned CCMaskForCondCode(ISD::CondCode CC) {
2011#define CONV(X) \
2012  case ISD::SET##X: return SystemZ::CCMASK_CMP_##X; \
2013  case ISD::SETO##X: return SystemZ::CCMASK_CMP_##X; \
2014  case ISD::SETU##X: return SystemZ::CCMASK_CMP_UO | SystemZ::CCMASK_CMP_##X
2015
2016  switch (CC) {
2017  default:
2018    llvm_unreachable("Invalid integer condition!");
2019
2020  CONV(EQ);
2021  CONV(NE);
2022  CONV(GT);
2023  CONV(GE);
2024  CONV(LT);
2025  CONV(LE);
2026
2027  case ISD::SETO:  return SystemZ::CCMASK_CMP_O;
2028  case ISD::SETUO: return SystemZ::CCMASK_CMP_UO;
2029  }
2030#undef CONV
2031}
2032
2033// If C can be converted to a comparison against zero, adjust the operands
2034// as necessary.
2035static void adjustZeroCmp(SelectionDAG &DAG, const SDLoc &DL, Comparison &C) {
2036  if (C.ICmpType == SystemZICMP::UnsignedOnly)
2037    return;
2038
2039  auto *ConstOp1 = dyn_cast<ConstantSDNode>(C.Op1.getNode());
2040  if (!ConstOp1)
2041    return;
2042
2043  int64_t Value = ConstOp1->getSExtValue();
2044  if ((Value == -1 && C.CCMask == SystemZ::CCMASK_CMP_GT) ||
2045      (Value == -1 && C.CCMask == SystemZ::CCMASK_CMP_LE) ||
2046      (Value == 1 && C.CCMask == SystemZ::CCMASK_CMP_LT) ||
2047      (Value == 1 && C.CCMask == SystemZ::CCMASK_CMP_GE)) {
2048    C.CCMask ^= SystemZ::CCMASK_CMP_EQ;
2049    C.Op1 = DAG.getConstant(0, DL, C.Op1.getValueType());
2050  }
2051}
2052
2053// If a comparison described by C is suitable for CLI(Y), CHHSI or CLHHSI,
2054// adjust the operands as necessary.
2055static void adjustSubwordCmp(SelectionDAG &DAG, const SDLoc &DL,
2056                             Comparison &C) {
2057  // For us to make any changes, it must a comparison between a single-use
2058  // load and a constant.
2059  if (!C.Op0.hasOneUse() ||
2060      C.Op0.getOpcode() != ISD::LOAD ||
2061      C.Op1.getOpcode() != ISD::Constant)
2062    return;
2063
2064  // We must have an 8- or 16-bit load.
2065  auto *Load = cast<LoadSDNode>(C.Op0);
2066  unsigned NumBits = Load->getMemoryVT().getSizeInBits();
2067  if ((NumBits != 8 && NumBits != 16) ||
2068      NumBits != Load->getMemoryVT().getStoreSizeInBits())
2069    return;
2070
2071  // The load must be an extending one and the constant must be within the
2072  // range of the unextended value.
2073  auto *ConstOp1 = cast<ConstantSDNode>(C.Op1);
2074  uint64_t Value = ConstOp1->getZExtValue();
2075  uint64_t Mask = (1 << NumBits) - 1;
2076  if (Load->getExtensionType() == ISD::SEXTLOAD) {
2077    // Make sure that ConstOp1 is in range of C.Op0.
2078    int64_t SignedValue = ConstOp1->getSExtValue();
2079    if (uint64_t(SignedValue) + (uint64_t(1) << (NumBits - 1)) > Mask)
2080      return;
2081    if (C.ICmpType != SystemZICMP::SignedOnly) {
2082      // Unsigned comparison between two sign-extended values is equivalent
2083      // to unsigned comparison between two zero-extended values.
2084      Value &= Mask;
2085    } else if (NumBits == 8) {
2086      // Try to treat the comparison as unsigned, so that we can use CLI.
2087      // Adjust CCMask and Value as necessary.
2088      if (Value == 0 && C.CCMask == SystemZ::CCMASK_CMP_LT)
2089        // Test whether the high bit of the byte is set.
2090        Value = 127, C.CCMask = SystemZ::CCMASK_CMP_GT;
2091      else if (Value == 0 && C.CCMask == SystemZ::CCMASK_CMP_GE)
2092        // Test whether the high bit of the byte is clear.
2093        Value = 128, C.CCMask = SystemZ::CCMASK_CMP_LT;
2094      else
2095        // No instruction exists for this combination.
2096        return;
2097      C.ICmpType = SystemZICMP::UnsignedOnly;
2098    }
2099  } else if (Load->getExtensionType() == ISD::ZEXTLOAD) {
2100    if (Value > Mask)
2101      return;
2102    // If the constant is in range, we can use any comparison.
2103    C.ICmpType = SystemZICMP::Any;
2104  } else
2105    return;
2106
2107  // Make sure that the first operand is an i32 of the right extension type.
2108  ISD::LoadExtType ExtType = (C.ICmpType == SystemZICMP::SignedOnly ?
2109                              ISD::SEXTLOAD :
2110                              ISD::ZEXTLOAD);
2111  if (C.Op0.getValueType() != MVT::i32 ||
2112      Load->getExtensionType() != ExtType) {
2113    C.Op0 = DAG.getExtLoad(ExtType, SDLoc(Load), MVT::i32, Load->getChain(),
2114                           Load->getBasePtr(), Load->getPointerInfo(),
2115                           Load->getMemoryVT(), Load->getAlignment(),
2116                           Load->getMemOperand()->getFlags());
2117    // Update the chain uses.
2118    DAG.ReplaceAllUsesOfValueWith(SDValue(Load, 1), C.Op0.getValue(1));
2119  }
2120
2121  // Make sure that the second operand is an i32 with the right value.
2122  if (C.Op1.getValueType() != MVT::i32 ||
2123      Value != ConstOp1->getZExtValue())
2124    C.Op1 = DAG.getConstant(Value, DL, MVT::i32);
2125}
2126
2127// Return true if Op is either an unextended load, or a load suitable
2128// for integer register-memory comparisons of type ICmpType.
2129static bool isNaturalMemoryOperand(SDValue Op, unsigned ICmpType) {
2130  auto *Load = dyn_cast<LoadSDNode>(Op.getNode());
2131  if (Load) {
2132    // There are no instructions to compare a register with a memory byte.
2133    if (Load->getMemoryVT() == MVT::i8)
2134      return false;
2135    // Otherwise decide on extension type.
2136    switch (Load->getExtensionType()) {
2137    case ISD::NON_EXTLOAD:
2138      return true;
2139    case ISD::SEXTLOAD:
2140      return ICmpType != SystemZICMP::UnsignedOnly;
2141    case ISD::ZEXTLOAD:
2142      return ICmpType != SystemZICMP::SignedOnly;
2143    default:
2144      break;
2145    }
2146  }
2147  return false;
2148}
2149
2150// Return true if it is better to swap the operands of C.
2151static bool shouldSwapCmpOperands(const Comparison &C) {
2152  // Leave f128 comparisons alone, since they have no memory forms.
2153  if (C.Op0.getValueType() == MVT::f128)
2154    return false;
2155
2156  // Always keep a floating-point constant second, since comparisons with
2157  // zero can use LOAD TEST and comparisons with other constants make a
2158  // natural memory operand.
2159  if (isa<ConstantFPSDNode>(C.Op1))
2160    return false;
2161
2162  // Never swap comparisons with zero since there are many ways to optimize
2163  // those later.
2164  auto *ConstOp1 = dyn_cast<ConstantSDNode>(C.Op1);
2165  if (ConstOp1 && ConstOp1->getZExtValue() == 0)
2166    return false;
2167
2168  // Also keep natural memory operands second if the loaded value is
2169  // only used here.  Several comparisons have memory forms.
2170  if (isNaturalMemoryOperand(C.Op1, C.ICmpType) && C.Op1.hasOneUse())
2171    return false;
2172
2173  // Look for cases where Cmp0 is a single-use load and Cmp1 isn't.
2174  // In that case we generally prefer the memory to be second.
2175  if (isNaturalMemoryOperand(C.Op0, C.ICmpType) && C.Op0.hasOneUse()) {
2176    // The only exceptions are when the second operand is a constant and
2177    // we can use things like CHHSI.
2178    if (!ConstOp1)
2179      return true;
2180    // The unsigned memory-immediate instructions can handle 16-bit
2181    // unsigned integers.
2182    if (C.ICmpType != SystemZICMP::SignedOnly &&
2183        isUInt<16>(ConstOp1->getZExtValue()))
2184      return false;
2185    // The signed memory-immediate instructions can handle 16-bit
2186    // signed integers.
2187    if (C.ICmpType != SystemZICMP::UnsignedOnly &&
2188        isInt<16>(ConstOp1->getSExtValue()))
2189      return false;
2190    return true;
2191  }
2192
2193  // Try to promote the use of CGFR and CLGFR.
2194  unsigned Opcode0 = C.Op0.getOpcode();
2195  if (C.ICmpType != SystemZICMP::UnsignedOnly && Opcode0 == ISD::SIGN_EXTEND)
2196    return true;
2197  if (C.ICmpType != SystemZICMP::SignedOnly && Opcode0 == ISD::ZERO_EXTEND)
2198    return true;
2199  if (C.ICmpType != SystemZICMP::SignedOnly &&
2200      Opcode0 == ISD::AND &&
2201      C.Op0.getOperand(1).getOpcode() == ISD::Constant &&
2202      cast<ConstantSDNode>(C.Op0.getOperand(1))->getZExtValue() == 0xffffffff)
2203    return true;
2204
2205  return false;
2206}
2207
2208// Check whether C tests for equality between X and Y and whether X - Y
2209// or Y - X is also computed.  In that case it's better to compare the
2210// result of the subtraction against zero.
2211static void adjustForSubtraction(SelectionDAG &DAG, const SDLoc &DL,
2212                                 Comparison &C) {
2213  if (C.CCMask == SystemZ::CCMASK_CMP_EQ ||
2214      C.CCMask == SystemZ::CCMASK_CMP_NE) {
2215    for (auto I = C.Op0->use_begin(), E = C.Op0->use_end(); I != E; ++I) {
2216      SDNode *N = *I;
2217      if (N->getOpcode() == ISD::SUB &&
2218          ((N->getOperand(0) == C.Op0 && N->getOperand(1) == C.Op1) ||
2219           (N->getOperand(0) == C.Op1 && N->getOperand(1) == C.Op0))) {
2220        C.Op0 = SDValue(N, 0);
2221        C.Op1 = DAG.getConstant(0, DL, N->getValueType(0));
2222        return;
2223      }
2224    }
2225  }
2226}
2227
2228// Check whether C compares a floating-point value with zero and if that
2229// floating-point value is also negated.  In this case we can use the
2230// negation to set CC, so avoiding separate LOAD AND TEST and
2231// LOAD (NEGATIVE/COMPLEMENT) instructions.
2232static void adjustForFNeg(Comparison &C) {
2233  // This optimization is invalid for strict comparisons, since FNEG
2234  // does not raise any exceptions.
2235  if (C.Chain)
2236    return;
2237  auto *C1 = dyn_cast<ConstantFPSDNode>(C.Op1);
2238  if (C1 && C1->isZero()) {
2239    for (auto I = C.Op0->use_begin(), E = C.Op0->use_end(); I != E; ++I) {
2240      SDNode *N = *I;
2241      if (N->getOpcode() == ISD::FNEG) {
2242        C.Op0 = SDValue(N, 0);
2243        C.CCMask = SystemZ::reverseCCMask(C.CCMask);
2244        return;
2245      }
2246    }
2247  }
2248}
2249
2250// Check whether C compares (shl X, 32) with 0 and whether X is
2251// also sign-extended.  In that case it is better to test the result
2252// of the sign extension using LTGFR.
2253//
2254// This case is important because InstCombine transforms a comparison
2255// with (sext (trunc X)) into a comparison with (shl X, 32).
2256static void adjustForLTGFR(Comparison &C) {
2257  // Check for a comparison between (shl X, 32) and 0.
2258  if (C.Op0.getOpcode() == ISD::SHL &&
2259      C.Op0.getValueType() == MVT::i64 &&
2260      C.Op1.getOpcode() == ISD::Constant &&
2261      cast<ConstantSDNode>(C.Op1)->getZExtValue() == 0) {
2262    auto *C1 = dyn_cast<ConstantSDNode>(C.Op0.getOperand(1));
2263    if (C1 && C1->getZExtValue() == 32) {
2264      SDValue ShlOp0 = C.Op0.getOperand(0);
2265      // See whether X has any SIGN_EXTEND_INREG uses.
2266      for (auto I = ShlOp0->use_begin(), E = ShlOp0->use_end(); I != E; ++I) {
2267        SDNode *N = *I;
2268        if (N->getOpcode() == ISD::SIGN_EXTEND_INREG &&
2269            cast<VTSDNode>(N->getOperand(1))->getVT() == MVT::i32) {
2270          C.Op0 = SDValue(N, 0);
2271          return;
2272        }
2273      }
2274    }
2275  }
2276}
2277
2278// If C compares the truncation of an extending load, try to compare
2279// the untruncated value instead.  This exposes more opportunities to
2280// reuse CC.
2281static void adjustICmpTruncate(SelectionDAG &DAG, const SDLoc &DL,
2282                               Comparison &C) {
2283  if (C.Op0.getOpcode() == ISD::TRUNCATE &&
2284      C.Op0.getOperand(0).getOpcode() == ISD::LOAD &&
2285      C.Op1.getOpcode() == ISD::Constant &&
2286      cast<ConstantSDNode>(C.Op1)->getZExtValue() == 0) {
2287    auto *L = cast<LoadSDNode>(C.Op0.getOperand(0));
2288    if (L->getMemoryVT().getStoreSizeInBits() <= C.Op0.getValueSizeInBits()) {
2289      unsigned Type = L->getExtensionType();
2290      if ((Type == ISD::ZEXTLOAD && C.ICmpType != SystemZICMP::SignedOnly) ||
2291          (Type == ISD::SEXTLOAD && C.ICmpType != SystemZICMP::UnsignedOnly)) {
2292        C.Op0 = C.Op0.getOperand(0);
2293        C.Op1 = DAG.getConstant(0, DL, C.Op0.getValueType());
2294      }
2295    }
2296  }
2297}
2298
2299// Return true if shift operation N has an in-range constant shift value.
2300// Store it in ShiftVal if so.
2301static bool isSimpleShift(SDValue N, unsigned &ShiftVal) {
2302  auto *Shift = dyn_cast<ConstantSDNode>(N.getOperand(1));
2303  if (!Shift)
2304    return false;
2305
2306  uint64_t Amount = Shift->getZExtValue();
2307  if (Amount >= N.getValueSizeInBits())
2308    return false;
2309
2310  ShiftVal = Amount;
2311  return true;
2312}
2313
2314// Check whether an AND with Mask is suitable for a TEST UNDER MASK
2315// instruction and whether the CC value is descriptive enough to handle
2316// a comparison of type Opcode between the AND result and CmpVal.
2317// CCMask says which comparison result is being tested and BitSize is
2318// the number of bits in the operands.  If TEST UNDER MASK can be used,
2319// return the corresponding CC mask, otherwise return 0.
2320static unsigned getTestUnderMaskCond(unsigned BitSize, unsigned CCMask,
2321                                     uint64_t Mask, uint64_t CmpVal,
2322                                     unsigned ICmpType) {
2323  assert(Mask != 0 && "ANDs with zero should have been removed by now");
2324
2325  // Check whether the mask is suitable for TMHH, TMHL, TMLH or TMLL.
2326  if (!SystemZ::isImmLL(Mask) && !SystemZ::isImmLH(Mask) &&
2327      !SystemZ::isImmHL(Mask) && !SystemZ::isImmHH(Mask))
2328    return 0;
2329
2330  // Work out the masks for the lowest and highest bits.
2331  unsigned HighShift = 63 - countLeadingZeros(Mask);
2332  uint64_t High = uint64_t(1) << HighShift;
2333  uint64_t Low = uint64_t(1) << countTrailingZeros(Mask);
2334
2335  // Signed ordered comparisons are effectively unsigned if the sign
2336  // bit is dropped.
2337  bool EffectivelyUnsigned = (ICmpType != SystemZICMP::SignedOnly);
2338
2339  // Check for equality comparisons with 0, or the equivalent.
2340  if (CmpVal == 0) {
2341    if (CCMask == SystemZ::CCMASK_CMP_EQ)
2342      return SystemZ::CCMASK_TM_ALL_0;
2343    if (CCMask == SystemZ::CCMASK_CMP_NE)
2344      return SystemZ::CCMASK_TM_SOME_1;
2345  }
2346  if (EffectivelyUnsigned && CmpVal > 0 && CmpVal <= Low) {
2347    if (CCMask == SystemZ::CCMASK_CMP_LT)
2348      return SystemZ::CCMASK_TM_ALL_0;
2349    if (CCMask == SystemZ::CCMASK_CMP_GE)
2350      return SystemZ::CCMASK_TM_SOME_1;
2351  }
2352  if (EffectivelyUnsigned && CmpVal < Low) {
2353    if (CCMask == SystemZ::CCMASK_CMP_LE)
2354      return SystemZ::CCMASK_TM_ALL_0;
2355    if (CCMask == SystemZ::CCMASK_CMP_GT)
2356      return SystemZ::CCMASK_TM_SOME_1;
2357  }
2358
2359  // Check for equality comparisons with the mask, or the equivalent.
2360  if (CmpVal == Mask) {
2361    if (CCMask == SystemZ::CCMASK_CMP_EQ)
2362      return SystemZ::CCMASK_TM_ALL_1;
2363    if (CCMask == SystemZ::CCMASK_CMP_NE)
2364      return SystemZ::CCMASK_TM_SOME_0;
2365  }
2366  if (EffectivelyUnsigned && CmpVal >= Mask - Low && CmpVal < Mask) {
2367    if (CCMask == SystemZ::CCMASK_CMP_GT)
2368      return SystemZ::CCMASK_TM_ALL_1;
2369    if (CCMask == SystemZ::CCMASK_CMP_LE)
2370      return SystemZ::CCMASK_TM_SOME_0;
2371  }
2372  if (EffectivelyUnsigned && CmpVal > Mask - Low && CmpVal <= Mask) {
2373    if (CCMask == SystemZ::CCMASK_CMP_GE)
2374      return SystemZ::CCMASK_TM_ALL_1;
2375    if (CCMask == SystemZ::CCMASK_CMP_LT)
2376      return SystemZ::CCMASK_TM_SOME_0;
2377  }
2378
2379  // Check for ordered comparisons with the top bit.
2380  if (EffectivelyUnsigned && CmpVal >= Mask - High && CmpVal < High) {
2381    if (CCMask == SystemZ::CCMASK_CMP_LE)
2382      return SystemZ::CCMASK_TM_MSB_0;
2383    if (CCMask == SystemZ::CCMASK_CMP_GT)
2384      return SystemZ::CCMASK_TM_MSB_1;
2385  }
2386  if (EffectivelyUnsigned && CmpVal > Mask - High && CmpVal <= High) {
2387    if (CCMask == SystemZ::CCMASK_CMP_LT)
2388      return SystemZ::CCMASK_TM_MSB_0;
2389    if (CCMask == SystemZ::CCMASK_CMP_GE)
2390      return SystemZ::CCMASK_TM_MSB_1;
2391  }
2392
2393  // If there are just two bits, we can do equality checks for Low and High
2394  // as well.
2395  if (Mask == Low + High) {
2396    if (CCMask == SystemZ::CCMASK_CMP_EQ && CmpVal == Low)
2397      return SystemZ::CCMASK_TM_MIXED_MSB_0;
2398    if (CCMask == SystemZ::CCMASK_CMP_NE && CmpVal == Low)
2399      return SystemZ::CCMASK_TM_MIXED_MSB_0 ^ SystemZ::CCMASK_ANY;
2400    if (CCMask == SystemZ::CCMASK_CMP_EQ && CmpVal == High)
2401      return SystemZ::CCMASK_TM_MIXED_MSB_1;
2402    if (CCMask == SystemZ::CCMASK_CMP_NE && CmpVal == High)
2403      return SystemZ::CCMASK_TM_MIXED_MSB_1 ^ SystemZ::CCMASK_ANY;
2404  }
2405
2406  // Looks like we've exhausted our options.
2407  return 0;
2408}
2409
2410// See whether C can be implemented as a TEST UNDER MASK instruction.
2411// Update the arguments with the TM version if so.
2412static void adjustForTestUnderMask(SelectionDAG &DAG, const SDLoc &DL,
2413                                   Comparison &C) {
2414  // Check that we have a comparison with a constant.
2415  auto *ConstOp1 = dyn_cast<ConstantSDNode>(C.Op1);
2416  if (!ConstOp1)
2417    return;
2418  uint64_t CmpVal = ConstOp1->getZExtValue();
2419
2420  // Check whether the nonconstant input is an AND with a constant mask.
2421  Comparison NewC(C);
2422  uint64_t MaskVal;
2423  ConstantSDNode *Mask = nullptr;
2424  if (C.Op0.getOpcode() == ISD::AND) {
2425    NewC.Op0 = C.Op0.getOperand(0);
2426    NewC.Op1 = C.Op0.getOperand(1);
2427    Mask = dyn_cast<ConstantSDNode>(NewC.Op1);
2428    if (!Mask)
2429      return;
2430    MaskVal = Mask->getZExtValue();
2431  } else {
2432    // There is no instruction to compare with a 64-bit immediate
2433    // so use TMHH instead if possible.  We need an unsigned ordered
2434    // comparison with an i64 immediate.
2435    if (NewC.Op0.getValueType() != MVT::i64 ||
2436        NewC.CCMask == SystemZ::CCMASK_CMP_EQ ||
2437        NewC.CCMask == SystemZ::CCMASK_CMP_NE ||
2438        NewC.ICmpType == SystemZICMP::SignedOnly)
2439      return;
2440    // Convert LE and GT comparisons into LT and GE.
2441    if (NewC.CCMask == SystemZ::CCMASK_CMP_LE ||
2442        NewC.CCMask == SystemZ::CCMASK_CMP_GT) {
2443      if (CmpVal == uint64_t(-1))
2444        return;
2445      CmpVal += 1;
2446      NewC.CCMask ^= SystemZ::CCMASK_CMP_EQ;
2447    }
2448    // If the low N bits of Op1 are zero than the low N bits of Op0 can
2449    // be masked off without changing the result.
2450    MaskVal = -(CmpVal & -CmpVal);
2451    NewC.ICmpType = SystemZICMP::UnsignedOnly;
2452  }
2453  if (!MaskVal)
2454    return;
2455
2456  // Check whether the combination of mask, comparison value and comparison
2457  // type are suitable.
2458  unsigned BitSize = NewC.Op0.getValueSizeInBits();
2459  unsigned NewCCMask, ShiftVal;
2460  if (NewC.ICmpType != SystemZICMP::SignedOnly &&
2461      NewC.Op0.getOpcode() == ISD::SHL &&
2462      isSimpleShift(NewC.Op0, ShiftVal) &&
2463      (MaskVal >> ShiftVal != 0) &&
2464      ((CmpVal >> ShiftVal) << ShiftVal) == CmpVal &&
2465      (NewCCMask = getTestUnderMaskCond(BitSize, NewC.CCMask,
2466                                        MaskVal >> ShiftVal,
2467                                        CmpVal >> ShiftVal,
2468                                        SystemZICMP::Any))) {
2469    NewC.Op0 = NewC.Op0.getOperand(0);
2470    MaskVal >>= ShiftVal;
2471  } else if (NewC.ICmpType != SystemZICMP::SignedOnly &&
2472             NewC.Op0.getOpcode() == ISD::SRL &&
2473             isSimpleShift(NewC.Op0, ShiftVal) &&
2474             (MaskVal << ShiftVal != 0) &&
2475             ((CmpVal << ShiftVal) >> ShiftVal) == CmpVal &&
2476             (NewCCMask = getTestUnderMaskCond(BitSize, NewC.CCMask,
2477                                               MaskVal << ShiftVal,
2478                                               CmpVal << ShiftVal,
2479                                               SystemZICMP::UnsignedOnly))) {
2480    NewC.Op0 = NewC.Op0.getOperand(0);
2481    MaskVal <<= ShiftVal;
2482  } else {
2483    NewCCMask = getTestUnderMaskCond(BitSize, NewC.CCMask, MaskVal, CmpVal,
2484                                     NewC.ICmpType);
2485    if (!NewCCMask)
2486      return;
2487  }
2488
2489  // Go ahead and make the change.
2490  C.Opcode = SystemZISD::TM;
2491  C.Op0 = NewC.Op0;
2492  if (Mask && Mask->getZExtValue() == MaskVal)
2493    C.Op1 = SDValue(Mask, 0);
2494  else
2495    C.Op1 = DAG.getConstant(MaskVal, DL, C.Op0.getValueType());
2496  C.CCValid = SystemZ::CCMASK_TM;
2497  C.CCMask = NewCCMask;
2498}
2499
2500// See whether the comparison argument contains a redundant AND
2501// and remove it if so.  This sometimes happens due to the generic
2502// BRCOND expansion.
2503static void adjustForRedundantAnd(SelectionDAG &DAG, const SDLoc &DL,
2504                                  Comparison &C) {
2505  if (C.Op0.getOpcode() != ISD::AND)
2506    return;
2507  auto *Mask = dyn_cast<ConstantSDNode>(C.Op0.getOperand(1));
2508  if (!Mask)
2509    return;
2510  KnownBits Known = DAG.computeKnownBits(C.Op0.getOperand(0));
2511  if ((~Known.Zero).getZExtValue() & ~Mask->getZExtValue())
2512    return;
2513
2514  C.Op0 = C.Op0.getOperand(0);
2515}
2516
2517// Return a Comparison that tests the condition-code result of intrinsic
2518// node Call against constant integer CC using comparison code Cond.
2519// Opcode is the opcode of the SystemZISD operation for the intrinsic
2520// and CCValid is the set of possible condition-code results.
2521static Comparison getIntrinsicCmp(SelectionDAG &DAG, unsigned Opcode,
2522                                  SDValue Call, unsigned CCValid, uint64_t CC,
2523                                  ISD::CondCode Cond) {
2524  Comparison C(Call, SDValue(), SDValue());
2525  C.Opcode = Opcode;
2526  C.CCValid = CCValid;
2527  if (Cond == ISD::SETEQ)
2528    // bit 3 for CC==0, bit 0 for CC==3, always false for CC>3.
2529    C.CCMask = CC < 4 ? 1 << (3 - CC) : 0;
2530  else if (Cond == ISD::SETNE)
2531    // ...and the inverse of that.
2532    C.CCMask = CC < 4 ? ~(1 << (3 - CC)) : -1;
2533  else if (Cond == ISD::SETLT || Cond == ISD::SETULT)
2534    // bits above bit 3 for CC==0 (always false), bits above bit 0 for CC==3,
2535    // always true for CC>3.
2536    C.CCMask = CC < 4 ? ~0U << (4 - CC) : -1;
2537  else if (Cond == ISD::SETGE || Cond == ISD::SETUGE)
2538    // ...and the inverse of that.
2539    C.CCMask = CC < 4 ? ~(~0U << (4 - CC)) : 0;
2540  else if (Cond == ISD::SETLE || Cond == ISD::SETULE)
2541    // bit 3 and above for CC==0, bit 0 and above for CC==3 (always true),
2542    // always true for CC>3.
2543    C.CCMask = CC < 4 ? ~0U << (3 - CC) : -1;
2544  else if (Cond == ISD::SETGT || Cond == ISD::SETUGT)
2545    // ...and the inverse of that.
2546    C.CCMask = CC < 4 ? ~(~0U << (3 - CC)) : 0;
2547  else
2548    llvm_unreachable("Unexpected integer comparison type");
2549  C.CCMask &= CCValid;
2550  return C;
2551}
2552
2553// Decide how to implement a comparison of type Cond between CmpOp0 with CmpOp1.
2554static Comparison getCmp(SelectionDAG &DAG, SDValue CmpOp0, SDValue CmpOp1,
2555                         ISD::CondCode Cond, const SDLoc &DL,
2556                         SDValue Chain = SDValue(),
2557                         bool IsSignaling = false) {
2558  if (CmpOp1.getOpcode() == ISD::Constant) {
2559    assert(!Chain);
2560    uint64_t Constant = cast<ConstantSDNode>(CmpOp1)->getZExtValue();
2561    unsigned Opcode, CCValid;
2562    if (CmpOp0.getOpcode() == ISD::INTRINSIC_W_CHAIN &&
2563        CmpOp0.getResNo() == 0 && CmpOp0->hasNUsesOfValue(1, 0) &&
2564        isIntrinsicWithCCAndChain(CmpOp0, Opcode, CCValid))
2565      return getIntrinsicCmp(DAG, Opcode, CmpOp0, CCValid, Constant, Cond);
2566    if (CmpOp0.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
2567        CmpOp0.getResNo() == CmpOp0->getNumValues() - 1 &&
2568        isIntrinsicWithCC(CmpOp0, Opcode, CCValid))
2569      return getIntrinsicCmp(DAG, Opcode, CmpOp0, CCValid, Constant, Cond);
2570  }
2571  Comparison C(CmpOp0, CmpOp1, Chain);
2572  C.CCMask = CCMaskForCondCode(Cond);
2573  if (C.Op0.getValueType().isFloatingPoint()) {
2574    C.CCValid = SystemZ::CCMASK_FCMP;
2575    if (!C.Chain)
2576      C.Opcode = SystemZISD::FCMP;
2577    else if (!IsSignaling)
2578      C.Opcode = SystemZISD::STRICT_FCMP;
2579    else
2580      C.Opcode = SystemZISD::STRICT_FCMPS;
2581    adjustForFNeg(C);
2582  } else {
2583    assert(!C.Chain);
2584    C.CCValid = SystemZ::CCMASK_ICMP;
2585    C.Opcode = SystemZISD::ICMP;
2586    // Choose the type of comparison.  Equality and inequality tests can
2587    // use either signed or unsigned comparisons.  The choice also doesn't
2588    // matter if both sign bits are known to be clear.  In those cases we
2589    // want to give the main isel code the freedom to choose whichever
2590    // form fits best.
2591    if (C.CCMask == SystemZ::CCMASK_CMP_EQ ||
2592        C.CCMask == SystemZ::CCMASK_CMP_NE ||
2593        (DAG.SignBitIsZero(C.Op0) && DAG.SignBitIsZero(C.Op1)))
2594      C.ICmpType = SystemZICMP::Any;
2595    else if (C.CCMask & SystemZ::CCMASK_CMP_UO)
2596      C.ICmpType = SystemZICMP::UnsignedOnly;
2597    else
2598      C.ICmpType = SystemZICMP::SignedOnly;
2599    C.CCMask &= ~SystemZ::CCMASK_CMP_UO;
2600    adjustForRedundantAnd(DAG, DL, C);
2601    adjustZeroCmp(DAG, DL, C);
2602    adjustSubwordCmp(DAG, DL, C);
2603    adjustForSubtraction(DAG, DL, C);
2604    adjustForLTGFR(C);
2605    adjustICmpTruncate(DAG, DL, C);
2606  }
2607
2608  if (shouldSwapCmpOperands(C)) {
2609    std::swap(C.Op0, C.Op1);
2610    C.CCMask = SystemZ::reverseCCMask(C.CCMask);
2611  }
2612
2613  adjustForTestUnderMask(DAG, DL, C);
2614  return C;
2615}
2616
2617// Emit the comparison instruction described by C.
2618static SDValue emitCmp(SelectionDAG &DAG, const SDLoc &DL, Comparison &C) {
2619  if (!C.Op1.getNode()) {
2620    SDNode *Node;
2621    switch (C.Op0.getOpcode()) {
2622    case ISD::INTRINSIC_W_CHAIN:
2623      Node = emitIntrinsicWithCCAndChain(DAG, C.Op0, C.Opcode);
2624      return SDValue(Node, 0);
2625    case ISD::INTRINSIC_WO_CHAIN:
2626      Node = emitIntrinsicWithCC(DAG, C.Op0, C.Opcode);
2627      return SDValue(Node, Node->getNumValues() - 1);
2628    default:
2629      llvm_unreachable("Invalid comparison operands");
2630    }
2631  }
2632  if (C.Opcode == SystemZISD::ICMP)
2633    return DAG.getNode(SystemZISD::ICMP, DL, MVT::i32, C.Op0, C.Op1,
2634                       DAG.getTargetConstant(C.ICmpType, DL, MVT::i32));
2635  if (C.Opcode == SystemZISD::TM) {
2636    bool RegisterOnly = (bool(C.CCMask & SystemZ::CCMASK_TM_MIXED_MSB_0) !=
2637                         bool(C.CCMask & SystemZ::CCMASK_TM_MIXED_MSB_1));
2638    return DAG.getNode(SystemZISD::TM, DL, MVT::i32, C.Op0, C.Op1,
2639                       DAG.getTargetConstant(RegisterOnly, DL, MVT::i32));
2640  }
2641  if (C.Chain) {
2642    SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
2643    return DAG.getNode(C.Opcode, DL, VTs, C.Chain, C.Op0, C.Op1);
2644  }
2645  return DAG.getNode(C.Opcode, DL, MVT::i32, C.Op0, C.Op1);
2646}
2647
2648// Implement a 32-bit *MUL_LOHI operation by extending both operands to
2649// 64 bits.  Extend is the extension type to use.  Store the high part
2650// in Hi and the low part in Lo.
2651static void lowerMUL_LOHI32(SelectionDAG &DAG, const SDLoc &DL, unsigned Extend,
2652                            SDValue Op0, SDValue Op1, SDValue &Hi,
2653                            SDValue &Lo) {
2654  Op0 = DAG.getNode(Extend, DL, MVT::i64, Op0);
2655  Op1 = DAG.getNode(Extend, DL, MVT::i64, Op1);
2656  SDValue Mul = DAG.getNode(ISD::MUL, DL, MVT::i64, Op0, Op1);
2657  Hi = DAG.getNode(ISD::SRL, DL, MVT::i64, Mul,
2658                   DAG.getConstant(32, DL, MVT::i64));
2659  Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Hi);
2660  Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Mul);
2661}
2662
2663// Lower a binary operation that produces two VT results, one in each
2664// half of a GR128 pair.  Op0 and Op1 are the VT operands to the operation,
2665// and Opcode performs the GR128 operation.  Store the even register result
2666// in Even and the odd register result in Odd.
2667static void lowerGR128Binary(SelectionDAG &DAG, const SDLoc &DL, EVT VT,
2668                             unsigned Opcode, SDValue Op0, SDValue Op1,
2669                             SDValue &Even, SDValue &Odd) {
2670  SDValue Result = DAG.getNode(Opcode, DL, MVT::Untyped, Op0, Op1);
2671  bool Is32Bit = is32Bit(VT);
2672  Even = DAG.getTargetExtractSubreg(SystemZ::even128(Is32Bit), DL, VT, Result);
2673  Odd = DAG.getTargetExtractSubreg(SystemZ::odd128(Is32Bit), DL, VT, Result);
2674}
2675
2676// Return an i32 value that is 1 if the CC value produced by CCReg is
2677// in the mask CCMask and 0 otherwise.  CC is known to have a value
2678// in CCValid, so other values can be ignored.
2679static SDValue emitSETCC(SelectionDAG &DAG, const SDLoc &DL, SDValue CCReg,
2680                         unsigned CCValid, unsigned CCMask) {
2681  SDValue Ops[] = {DAG.getConstant(1, DL, MVT::i32),
2682                   DAG.getConstant(0, DL, MVT::i32),
2683                   DAG.getTargetConstant(CCValid, DL, MVT::i32),
2684                   DAG.getTargetConstant(CCMask, DL, MVT::i32), CCReg};
2685  return DAG.getNode(SystemZISD::SELECT_CCMASK, DL, MVT::i32, Ops);
2686}
2687
2688// Return the SystemISD vector comparison operation for CC, or 0 if it cannot
2689// be done directly.  Mode is CmpMode::Int for integer comparisons, CmpMode::FP
2690// for regular floating-point comparisons, CmpMode::StrictFP for strict (quiet)
2691// floating-point comparisons, and CmpMode::SignalingFP for strict signaling
2692// floating-point comparisons.
2693enum class CmpMode { Int, FP, StrictFP, SignalingFP };
2694static unsigned getVectorComparison(ISD::CondCode CC, CmpMode Mode) {
2695  switch (CC) {
2696  case ISD::SETOEQ:
2697  case ISD::SETEQ:
2698    switch (Mode) {
2699    case CmpMode::Int:         return SystemZISD::VICMPE;
2700    case CmpMode::FP:          return SystemZISD::VFCMPE;
2701    case CmpMode::StrictFP:    return SystemZISD::STRICT_VFCMPE;
2702    case CmpMode::SignalingFP: return SystemZISD::STRICT_VFCMPES;
2703    }
2704    llvm_unreachable("Bad mode");
2705
2706  case ISD::SETOGE:
2707  case ISD::SETGE:
2708    switch (Mode) {
2709    case CmpMode::Int:         return 0;
2710    case CmpMode::FP:          return SystemZISD::VFCMPHE;
2711    case CmpMode::StrictFP:    return SystemZISD::STRICT_VFCMPHE;
2712    case CmpMode::SignalingFP: return SystemZISD::STRICT_VFCMPHES;
2713    }
2714    llvm_unreachable("Bad mode");
2715
2716  case ISD::SETOGT:
2717  case ISD::SETGT:
2718    switch (Mode) {
2719    case CmpMode::Int:         return SystemZISD::VICMPH;
2720    case CmpMode::FP:          return SystemZISD::VFCMPH;
2721    case CmpMode::StrictFP:    return SystemZISD::STRICT_VFCMPH;
2722    case CmpMode::SignalingFP: return SystemZISD::STRICT_VFCMPHS;
2723    }
2724    llvm_unreachable("Bad mode");
2725
2726  case ISD::SETUGT:
2727    switch (Mode) {
2728    case CmpMode::Int:         return SystemZISD::VICMPHL;
2729    case CmpMode::FP:          return 0;
2730    case CmpMode::StrictFP:    return 0;
2731    case CmpMode::SignalingFP: return 0;
2732    }
2733    llvm_unreachable("Bad mode");
2734
2735  default:
2736    return 0;
2737  }
2738}
2739
2740// Return the SystemZISD vector comparison operation for CC or its inverse,
2741// or 0 if neither can be done directly.  Indicate in Invert whether the
2742// result is for the inverse of CC.  Mode is as above.
2743static unsigned getVectorComparisonOrInvert(ISD::CondCode CC, CmpMode Mode,
2744                                            bool &Invert) {
2745  if (unsigned Opcode = getVectorComparison(CC, Mode)) {
2746    Invert = false;
2747    return Opcode;
2748  }
2749
2750  CC = ISD::getSetCCInverse(CC, Mode == CmpMode::Int ? MVT::i32 : MVT::f32);
2751  if (unsigned Opcode = getVectorComparison(CC, Mode)) {
2752    Invert = true;
2753    return Opcode;
2754  }
2755
2756  return 0;
2757}
2758
2759// Return a v2f64 that contains the extended form of elements Start and Start+1
2760// of v4f32 value Op.  If Chain is nonnull, return the strict form.
2761static SDValue expandV4F32ToV2F64(SelectionDAG &DAG, int Start, const SDLoc &DL,
2762                                  SDValue Op, SDValue Chain) {
2763  int Mask[] = { Start, -1, Start + 1, -1 };
2764  Op = DAG.getVectorShuffle(MVT::v4f32, DL, Op, DAG.getUNDEF(MVT::v4f32), Mask);
2765  if (Chain) {
2766    SDVTList VTs = DAG.getVTList(MVT::v2f64, MVT::Other);
2767    return DAG.getNode(SystemZISD::STRICT_VEXTEND, DL, VTs, Chain, Op);
2768  }
2769  return DAG.getNode(SystemZISD::VEXTEND, DL, MVT::v2f64, Op);
2770}
2771
2772// Build a comparison of vectors CmpOp0 and CmpOp1 using opcode Opcode,
2773// producing a result of type VT.  If Chain is nonnull, return the strict form.
2774SDValue SystemZTargetLowering::getVectorCmp(SelectionDAG &DAG, unsigned Opcode,
2775                                            const SDLoc &DL, EVT VT,
2776                                            SDValue CmpOp0,
2777                                            SDValue CmpOp1,
2778                                            SDValue Chain) const {
2779  // There is no hardware support for v4f32 (unless we have the vector
2780  // enhancements facility 1), so extend the vector into two v2f64s
2781  // and compare those.
2782  if (CmpOp0.getValueType() == MVT::v4f32 &&
2783      !Subtarget.hasVectorEnhancements1()) {
2784    SDValue H0 = expandV4F32ToV2F64(DAG, 0, DL, CmpOp0, Chain);
2785    SDValue L0 = expandV4F32ToV2F64(DAG, 2, DL, CmpOp0, Chain);
2786    SDValue H1 = expandV4F32ToV2F64(DAG, 0, DL, CmpOp1, Chain);
2787    SDValue L1 = expandV4F32ToV2F64(DAG, 2, DL, CmpOp1, Chain);
2788    if (Chain) {
2789      SDVTList VTs = DAG.getVTList(MVT::v2i64, MVT::Other);
2790      SDValue HRes = DAG.getNode(Opcode, DL, VTs, Chain, H0, H1);
2791      SDValue LRes = DAG.getNode(Opcode, DL, VTs, Chain, L0, L1);
2792      SDValue Res = DAG.getNode(SystemZISD::PACK, DL, VT, HRes, LRes);
2793      SDValue Chains[6] = { H0.getValue(1), L0.getValue(1),
2794                            H1.getValue(1), L1.getValue(1),
2795                            HRes.getValue(1), LRes.getValue(1) };
2796      SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
2797      SDValue Ops[2] = { Res, NewChain };
2798      return DAG.getMergeValues(Ops, DL);
2799    }
2800    SDValue HRes = DAG.getNode(Opcode, DL, MVT::v2i64, H0, H1);
2801    SDValue LRes = DAG.getNode(Opcode, DL, MVT::v2i64, L0, L1);
2802    return DAG.getNode(SystemZISD::PACK, DL, VT, HRes, LRes);
2803  }
2804  if (Chain) {
2805    SDVTList VTs = DAG.getVTList(VT, MVT::Other);
2806    return DAG.getNode(Opcode, DL, VTs, Chain, CmpOp0, CmpOp1);
2807  }
2808  return DAG.getNode(Opcode, DL, VT, CmpOp0, CmpOp1);
2809}
2810
2811// Lower a vector comparison of type CC between CmpOp0 and CmpOp1, producing
2812// an integer mask of type VT.  If Chain is nonnull, we have a strict
2813// floating-point comparison.  If in addition IsSignaling is true, we have
2814// a strict signaling floating-point comparison.
2815SDValue SystemZTargetLowering::lowerVectorSETCC(SelectionDAG &DAG,
2816                                                const SDLoc &DL, EVT VT,
2817                                                ISD::CondCode CC,
2818                                                SDValue CmpOp0,
2819                                                SDValue CmpOp1,
2820                                                SDValue Chain,
2821                                                bool IsSignaling) const {
2822  bool IsFP = CmpOp0.getValueType().isFloatingPoint();
2823  assert (!Chain || IsFP);
2824  assert (!IsSignaling || Chain);
2825  CmpMode Mode = IsSignaling ? CmpMode::SignalingFP :
2826                 Chain ? CmpMode::StrictFP : IsFP ? CmpMode::FP : CmpMode::Int;
2827  bool Invert = false;
2828  SDValue Cmp;
2829  switch (CC) {
2830    // Handle tests for order using (or (ogt y x) (oge x y)).
2831  case ISD::SETUO:
2832    Invert = true;
2833    LLVM_FALLTHROUGH;
2834  case ISD::SETO: {
2835    assert(IsFP && "Unexpected integer comparison");
2836    SDValue LT = getVectorCmp(DAG, getVectorComparison(ISD::SETOGT, Mode),
2837                              DL, VT, CmpOp1, CmpOp0, Chain);
2838    SDValue GE = getVectorCmp(DAG, getVectorComparison(ISD::SETOGE, Mode),
2839                              DL, VT, CmpOp0, CmpOp1, Chain);
2840    Cmp = DAG.getNode(ISD::OR, DL, VT, LT, GE);
2841    if (Chain)
2842      Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
2843                          LT.getValue(1), GE.getValue(1));
2844    break;
2845  }
2846
2847    // Handle <> tests using (or (ogt y x) (ogt x y)).
2848  case ISD::SETUEQ:
2849    Invert = true;
2850    LLVM_FALLTHROUGH;
2851  case ISD::SETONE: {
2852    assert(IsFP && "Unexpected integer comparison");
2853    SDValue LT = getVectorCmp(DAG, getVectorComparison(ISD::SETOGT, Mode),
2854                              DL, VT, CmpOp1, CmpOp0, Chain);
2855    SDValue GT = getVectorCmp(DAG, getVectorComparison(ISD::SETOGT, Mode),
2856                              DL, VT, CmpOp0, CmpOp1, Chain);
2857    Cmp = DAG.getNode(ISD::OR, DL, VT, LT, GT);
2858    if (Chain)
2859      Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
2860                          LT.getValue(1), GT.getValue(1));
2861    break;
2862  }
2863
2864    // Otherwise a single comparison is enough.  It doesn't really
2865    // matter whether we try the inversion or the swap first, since
2866    // there are no cases where both work.
2867  default:
2868    if (unsigned Opcode = getVectorComparisonOrInvert(CC, Mode, Invert))
2869      Cmp = getVectorCmp(DAG, Opcode, DL, VT, CmpOp0, CmpOp1, Chain);
2870    else {
2871      CC = ISD::getSetCCSwappedOperands(CC);
2872      if (unsigned Opcode = getVectorComparisonOrInvert(CC, Mode, Invert))
2873        Cmp = getVectorCmp(DAG, Opcode, DL, VT, CmpOp1, CmpOp0, Chain);
2874      else
2875        llvm_unreachable("Unhandled comparison");
2876    }
2877    if (Chain)
2878      Chain = Cmp.getValue(1);
2879    break;
2880  }
2881  if (Invert) {
2882    SDValue Mask =
2883      DAG.getSplatBuildVector(VT, DL, DAG.getConstant(-1, DL, MVT::i64));
2884    Cmp = DAG.getNode(ISD::XOR, DL, VT, Cmp, Mask);
2885  }
2886  if (Chain && Chain.getNode() != Cmp.getNode()) {
2887    SDValue Ops[2] = { Cmp, Chain };
2888    Cmp = DAG.getMergeValues(Ops, DL);
2889  }
2890  return Cmp;
2891}
2892
2893SDValue SystemZTargetLowering::lowerSETCC(SDValue Op,
2894                                          SelectionDAG &DAG) const {
2895  SDValue CmpOp0   = Op.getOperand(0);
2896  SDValue CmpOp1   = Op.getOperand(1);
2897  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
2898  SDLoc DL(Op);
2899  EVT VT = Op.getValueType();
2900  if (VT.isVector())
2901    return lowerVectorSETCC(DAG, DL, VT, CC, CmpOp0, CmpOp1);
2902
2903  Comparison C(getCmp(DAG, CmpOp0, CmpOp1, CC, DL));
2904  SDValue CCReg = emitCmp(DAG, DL, C);
2905  return emitSETCC(DAG, DL, CCReg, C.CCValid, C.CCMask);
2906}
2907
2908SDValue SystemZTargetLowering::lowerSTRICT_FSETCC(SDValue Op,
2909                                                  SelectionDAG &DAG,
2910                                                  bool IsSignaling) const {
2911  SDValue Chain    = Op.getOperand(0);
2912  SDValue CmpOp0   = Op.getOperand(1);
2913  SDValue CmpOp1   = Op.getOperand(2);
2914  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(3))->get();
2915  SDLoc DL(Op);
2916  EVT VT = Op.getNode()->getValueType(0);
2917  if (VT.isVector()) {
2918    SDValue Res = lowerVectorSETCC(DAG, DL, VT, CC, CmpOp0, CmpOp1,
2919                                   Chain, IsSignaling);
2920    return Res.getValue(Op.getResNo());
2921  }
2922
2923  Comparison C(getCmp(DAG, CmpOp0, CmpOp1, CC, DL, Chain, IsSignaling));
2924  SDValue CCReg = emitCmp(DAG, DL, C);
2925  CCReg->setFlags(Op->getFlags());
2926  SDValue Result = emitSETCC(DAG, DL, CCReg, C.CCValid, C.CCMask);
2927  SDValue Ops[2] = { Result, CCReg.getValue(1) };
2928  return DAG.getMergeValues(Ops, DL);
2929}
2930
2931SDValue SystemZTargetLowering::lowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
2932  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
2933  SDValue CmpOp0   = Op.getOperand(2);
2934  SDValue CmpOp1   = Op.getOperand(3);
2935  SDValue Dest     = Op.getOperand(4);
2936  SDLoc DL(Op);
2937
2938  Comparison C(getCmp(DAG, CmpOp0, CmpOp1, CC, DL));
2939  SDValue CCReg = emitCmp(DAG, DL, C);
2940  return DAG.getNode(
2941      SystemZISD::BR_CCMASK, DL, Op.getValueType(), Op.getOperand(0),
2942      DAG.getTargetConstant(C.CCValid, DL, MVT::i32),
2943      DAG.getTargetConstant(C.CCMask, DL, MVT::i32), Dest, CCReg);
2944}
2945
2946// Return true if Pos is CmpOp and Neg is the negative of CmpOp,
2947// allowing Pos and Neg to be wider than CmpOp.
2948static bool isAbsolute(SDValue CmpOp, SDValue Pos, SDValue Neg) {
2949  return (Neg.getOpcode() == ISD::SUB &&
2950          Neg.getOperand(0).getOpcode() == ISD::Constant &&
2951          cast<ConstantSDNode>(Neg.getOperand(0))->getZExtValue() == 0 &&
2952          Neg.getOperand(1) == Pos &&
2953          (Pos == CmpOp ||
2954           (Pos.getOpcode() == ISD::SIGN_EXTEND &&
2955            Pos.getOperand(0) == CmpOp)));
2956}
2957
2958// Return the absolute or negative absolute of Op; IsNegative decides which.
2959static SDValue getAbsolute(SelectionDAG &DAG, const SDLoc &DL, SDValue Op,
2960                           bool IsNegative) {
2961  Op = DAG.getNode(SystemZISD::IABS, DL, Op.getValueType(), Op);
2962  if (IsNegative)
2963    Op = DAG.getNode(ISD::SUB, DL, Op.getValueType(),
2964                     DAG.getConstant(0, DL, Op.getValueType()), Op);
2965  return Op;
2966}
2967
2968SDValue SystemZTargetLowering::lowerSELECT_CC(SDValue Op,
2969                                              SelectionDAG &DAG) const {
2970  SDValue CmpOp0   = Op.getOperand(0);
2971  SDValue CmpOp1   = Op.getOperand(1);
2972  SDValue TrueOp   = Op.getOperand(2);
2973  SDValue FalseOp  = Op.getOperand(3);
2974  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
2975  SDLoc DL(Op);
2976
2977  Comparison C(getCmp(DAG, CmpOp0, CmpOp1, CC, DL));
2978
2979  // Check for absolute and negative-absolute selections, including those
2980  // where the comparison value is sign-extended (for LPGFR and LNGFR).
2981  // This check supplements the one in DAGCombiner.
2982  if (C.Opcode == SystemZISD::ICMP &&
2983      C.CCMask != SystemZ::CCMASK_CMP_EQ &&
2984      C.CCMask != SystemZ::CCMASK_CMP_NE &&
2985      C.Op1.getOpcode() == ISD::Constant &&
2986      cast<ConstantSDNode>(C.Op1)->getZExtValue() == 0) {
2987    if (isAbsolute(C.Op0, TrueOp, FalseOp))
2988      return getAbsolute(DAG, DL, TrueOp, C.CCMask & SystemZ::CCMASK_CMP_LT);
2989    if (isAbsolute(C.Op0, FalseOp, TrueOp))
2990      return getAbsolute(DAG, DL, FalseOp, C.CCMask & SystemZ::CCMASK_CMP_GT);
2991  }
2992
2993  SDValue CCReg = emitCmp(DAG, DL, C);
2994  SDValue Ops[] = {TrueOp, FalseOp,
2995                   DAG.getTargetConstant(C.CCValid, DL, MVT::i32),
2996                   DAG.getTargetConstant(C.CCMask, DL, MVT::i32), CCReg};
2997
2998  return DAG.getNode(SystemZISD::SELECT_CCMASK, DL, Op.getValueType(), Ops);
2999}
3000
3001SDValue SystemZTargetLowering::lowerGlobalAddress(GlobalAddressSDNode *Node,
3002                                                  SelectionDAG &DAG) const {
3003  SDLoc DL(Node);
3004  const GlobalValue *GV = Node->getGlobal();
3005  int64_t Offset = Node->getOffset();
3006  EVT PtrVT = getPointerTy(DAG.getDataLayout());
3007  CodeModel::Model CM = DAG.getTarget().getCodeModel();
3008
3009  SDValue Result;
3010  if (Subtarget.isPC32DBLSymbol(GV, CM)) {
3011    if (isInt<32>(Offset)) {
3012      // Assign anchors at 1<<12 byte boundaries.
3013      uint64_t Anchor = Offset & ~uint64_t(0xfff);
3014      Result = DAG.getTargetGlobalAddress(GV, DL, PtrVT, Anchor);
3015      Result = DAG.getNode(SystemZISD::PCREL_WRAPPER, DL, PtrVT, Result);
3016
3017      // The offset can be folded into the address if it is aligned to a
3018      // halfword.
3019      Offset -= Anchor;
3020      if (Offset != 0 && (Offset & 1) == 0) {
3021        SDValue Full =
3022          DAG.getTargetGlobalAddress(GV, DL, PtrVT, Anchor + Offset);
3023        Result = DAG.getNode(SystemZISD::PCREL_OFFSET, DL, PtrVT, Full, Result);
3024        Offset = 0;
3025      }
3026    } else {
3027      // Conservatively load a constant offset greater than 32 bits into a
3028      // register below.
3029      Result = DAG.getTargetGlobalAddress(GV, DL, PtrVT);
3030      Result = DAG.getNode(SystemZISD::PCREL_WRAPPER, DL, PtrVT, Result);
3031    }
3032  } else {
3033    Result = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, SystemZII::MO_GOT);
3034    Result = DAG.getNode(SystemZISD::PCREL_WRAPPER, DL, PtrVT, Result);
3035    Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
3036                         MachinePointerInfo::getGOT(DAG.getMachineFunction()));
3037  }
3038
3039  // If there was a non-zero offset that we didn't fold, create an explicit
3040  // addition for it.
3041  if (Offset != 0)
3042    Result = DAG.getNode(ISD::ADD, DL, PtrVT, Result,
3043                         DAG.getConstant(Offset, DL, PtrVT));
3044
3045  return Result;
3046}
3047
3048SDValue SystemZTargetLowering::lowerTLSGetOffset(GlobalAddressSDNode *Node,
3049                                                 SelectionDAG &DAG,
3050                                                 unsigned Opcode,
3051                                                 SDValue GOTOffset) const {
3052  SDLoc DL(Node);
3053  EVT PtrVT = getPointerTy(DAG.getDataLayout());
3054  SDValue Chain = DAG.getEntryNode();
3055  SDValue Glue;
3056
3057  if (DAG.getMachineFunction().getFunction().getCallingConv() ==
3058      CallingConv::GHC)
3059    report_fatal_error("In GHC calling convention TLS is not supported");
3060
3061  // __tls_get_offset takes the GOT offset in %r2 and the GOT in %r12.
3062  SDValue GOT = DAG.getGLOBAL_OFFSET_TABLE(PtrVT);
3063  Chain = DAG.getCopyToReg(Chain, DL, SystemZ::R12D, GOT, Glue);
3064  Glue = Chain.getValue(1);
3065  Chain = DAG.getCopyToReg(Chain, DL, SystemZ::R2D, GOTOffset, Glue);
3066  Glue = Chain.getValue(1);
3067
3068  // The first call operand is the chain and the second is the TLS symbol.
3069  SmallVector<SDValue, 8> Ops;
3070  Ops.push_back(Chain);
3071  Ops.push_back(DAG.getTargetGlobalAddress(Node->getGlobal(), DL,
3072                                           Node->getValueType(0),
3073                                           0, 0));
3074
3075  // Add argument registers to the end of the list so that they are
3076  // known live into the call.
3077  Ops.push_back(DAG.getRegister(SystemZ::R2D, PtrVT));
3078  Ops.push_back(DAG.getRegister(SystemZ::R12D, PtrVT));
3079
3080  // Add a register mask operand representing the call-preserved registers.
3081  const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
3082  const uint32_t *Mask =
3083      TRI->getCallPreservedMask(DAG.getMachineFunction(), CallingConv::C);
3084  assert(Mask && "Missing call preserved mask for calling convention");
3085  Ops.push_back(DAG.getRegisterMask(Mask));
3086
3087  // Glue the call to the argument copies.
3088  Ops.push_back(Glue);
3089
3090  // Emit the call.
3091  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
3092  Chain = DAG.getNode(Opcode, DL, NodeTys, Ops);
3093  Glue = Chain.getValue(1);
3094
3095  // Copy the return value from %r2.
3096  return DAG.getCopyFromReg(Chain, DL, SystemZ::R2D, PtrVT, Glue);
3097}
3098
3099SDValue SystemZTargetLowering::lowerThreadPointer(const SDLoc &DL,
3100                                                  SelectionDAG &DAG) const {
3101  SDValue Chain = DAG.getEntryNode();
3102  EVT PtrVT = getPointerTy(DAG.getDataLayout());
3103
3104  // The high part of the thread pointer is in access register 0.
3105  SDValue TPHi = DAG.getCopyFromReg(Chain, DL, SystemZ::A0, MVT::i32);
3106  TPHi = DAG.getNode(ISD::ANY_EXTEND, DL, PtrVT, TPHi);
3107
3108  // The low part of the thread pointer is in access register 1.
3109  SDValue TPLo = DAG.getCopyFromReg(Chain, DL, SystemZ::A1, MVT::i32);
3110  TPLo = DAG.getNode(ISD::ZERO_EXTEND, DL, PtrVT, TPLo);
3111
3112  // Merge them into a single 64-bit address.
3113  SDValue TPHiShifted = DAG.getNode(ISD::SHL, DL, PtrVT, TPHi,
3114                                    DAG.getConstant(32, DL, PtrVT));
3115  return DAG.getNode(ISD::OR, DL, PtrVT, TPHiShifted, TPLo);
3116}
3117
3118SDValue SystemZTargetLowering::lowerGlobalTLSAddress(GlobalAddressSDNode *Node,
3119                                                     SelectionDAG &DAG) const {
3120  if (DAG.getTarget().useEmulatedTLS())
3121    return LowerToTLSEmulatedModel(Node, DAG);
3122  SDLoc DL(Node);
3123  const GlobalValue *GV = Node->getGlobal();
3124  EVT PtrVT = getPointerTy(DAG.getDataLayout());
3125  TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
3126
3127  if (DAG.getMachineFunction().getFunction().getCallingConv() ==
3128      CallingConv::GHC)
3129    report_fatal_error("In GHC calling convention TLS is not supported");
3130
3131  SDValue TP = lowerThreadPointer(DL, DAG);
3132
3133  // Get the offset of GA from the thread pointer, based on the TLS model.
3134  SDValue Offset;
3135  switch (model) {
3136    case TLSModel::GeneralDynamic: {
3137      // Load the GOT offset of the tls_index (module ID / per-symbol offset).
3138      SystemZConstantPoolValue *CPV =
3139        SystemZConstantPoolValue::Create(GV, SystemZCP::TLSGD);
3140
3141      Offset = DAG.getConstantPool(CPV, PtrVT, Align(8));
3142      Offset = DAG.getLoad(
3143          PtrVT, DL, DAG.getEntryNode(), Offset,
3144          MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
3145
3146      // Call __tls_get_offset to retrieve the offset.
3147      Offset = lowerTLSGetOffset(Node, DAG, SystemZISD::TLS_GDCALL, Offset);
3148      break;
3149    }
3150
3151    case TLSModel::LocalDynamic: {
3152      // Load the GOT offset of the module ID.
3153      SystemZConstantPoolValue *CPV =
3154        SystemZConstantPoolValue::Create(GV, SystemZCP::TLSLDM);
3155
3156      Offset = DAG.getConstantPool(CPV, PtrVT, Align(8));
3157      Offset = DAG.getLoad(
3158          PtrVT, DL, DAG.getEntryNode(), Offset,
3159          MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
3160
3161      // Call __tls_get_offset to retrieve the module base offset.
3162      Offset = lowerTLSGetOffset(Node, DAG, SystemZISD::TLS_LDCALL, Offset);
3163
3164      // Note: The SystemZLDCleanupPass will remove redundant computations
3165      // of the module base offset.  Count total number of local-dynamic
3166      // accesses to trigger execution of that pass.
3167      SystemZMachineFunctionInfo* MFI =
3168        DAG.getMachineFunction().getInfo<SystemZMachineFunctionInfo>();
3169      MFI->incNumLocalDynamicTLSAccesses();
3170
3171      // Add the per-symbol offset.
3172      CPV = SystemZConstantPoolValue::Create(GV, SystemZCP::DTPOFF);
3173
3174      SDValue DTPOffset = DAG.getConstantPool(CPV, PtrVT, Align(8));
3175      DTPOffset = DAG.getLoad(
3176          PtrVT, DL, DAG.getEntryNode(), DTPOffset,
3177          MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
3178
3179      Offset = DAG.getNode(ISD::ADD, DL, PtrVT, Offset, DTPOffset);
3180      break;
3181    }
3182
3183    case TLSModel::InitialExec: {
3184      // Load the offset from the GOT.
3185      Offset = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0,
3186                                          SystemZII::MO_INDNTPOFF);
3187      Offset = DAG.getNode(SystemZISD::PCREL_WRAPPER, DL, PtrVT, Offset);
3188      Offset =
3189          DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Offset,
3190                      MachinePointerInfo::getGOT(DAG.getMachineFunction()));
3191      break;
3192    }
3193
3194    case TLSModel::LocalExec: {
3195      // Force the offset into the constant pool and load it from there.
3196      SystemZConstantPoolValue *CPV =
3197        SystemZConstantPoolValue::Create(GV, SystemZCP::NTPOFF);
3198
3199      Offset = DAG.getConstantPool(CPV, PtrVT, Align(8));
3200      Offset = DAG.getLoad(
3201          PtrVT, DL, DAG.getEntryNode(), Offset,
3202          MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
3203      break;
3204    }
3205  }
3206
3207  // Add the base and offset together.
3208  return DAG.getNode(ISD::ADD, DL, PtrVT, TP, Offset);
3209}
3210
3211SDValue SystemZTargetLowering::lowerBlockAddress(BlockAddressSDNode *Node,
3212                                                 SelectionDAG &DAG) const {
3213  SDLoc DL(Node);
3214  const BlockAddress *BA = Node->getBlockAddress();
3215  int64_t Offset = Node->getOffset();
3216  EVT PtrVT = getPointerTy(DAG.getDataLayout());
3217
3218  SDValue Result = DAG.getTargetBlockAddress(BA, PtrVT, Offset);
3219  Result = DAG.getNode(SystemZISD::PCREL_WRAPPER, DL, PtrVT, Result);
3220  return Result;
3221}
3222
3223SDValue SystemZTargetLowering::lowerJumpTable(JumpTableSDNode *JT,
3224                                              SelectionDAG &DAG) const {
3225  SDLoc DL(JT);
3226  EVT PtrVT = getPointerTy(DAG.getDataLayout());
3227  SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), PtrVT);
3228
3229  // Use LARL to load the address of the table.
3230  return DAG.getNode(SystemZISD::PCREL_WRAPPER, DL, PtrVT, Result);
3231}
3232
3233SDValue SystemZTargetLowering::lowerConstantPool(ConstantPoolSDNode *CP,
3234                                                 SelectionDAG &DAG) const {
3235  SDLoc DL(CP);
3236  EVT PtrVT = getPointerTy(DAG.getDataLayout());
3237
3238  SDValue Result;
3239  if (CP->isMachineConstantPoolEntry())
3240    Result =
3241        DAG.getTargetConstantPool(CP->getMachineCPVal(), PtrVT, CP->getAlign());
3242  else
3243    Result = DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlign(),
3244                                       CP->getOffset());
3245
3246  // Use LARL to load the address of the constant pool entry.
3247  return DAG.getNode(SystemZISD::PCREL_WRAPPER, DL, PtrVT, Result);
3248}
3249
3250SDValue SystemZTargetLowering::lowerFRAMEADDR(SDValue Op,
3251                                              SelectionDAG &DAG) const {
3252  auto *TFL =
3253      static_cast<const SystemZFrameLowering *>(Subtarget.getFrameLowering());
3254  MachineFunction &MF = DAG.getMachineFunction();
3255  MachineFrameInfo &MFI = MF.getFrameInfo();
3256  MFI.setFrameAddressIsTaken(true);
3257
3258  SDLoc DL(Op);
3259  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
3260  EVT PtrVT = getPointerTy(DAG.getDataLayout());
3261
3262  // Return null if the back chain is not present.
3263  bool HasBackChain = MF.getFunction().hasFnAttribute("backchain");
3264  if (TFL->usePackedStack(MF) && !HasBackChain)
3265    return DAG.getConstant(0, DL, PtrVT);
3266
3267  // By definition, the frame address is the address of the back chain.
3268  int BackChainIdx = TFL->getOrCreateFramePointerSaveIndex(MF);
3269  SDValue BackChain = DAG.getFrameIndex(BackChainIdx, PtrVT);
3270
3271  // FIXME The frontend should detect this case.
3272  if (Depth > 0) {
3273    report_fatal_error("Unsupported stack frame traversal count");
3274  }
3275
3276  return BackChain;
3277}
3278
3279SDValue SystemZTargetLowering::lowerRETURNADDR(SDValue Op,
3280                                               SelectionDAG &DAG) const {
3281  MachineFunction &MF = DAG.getMachineFunction();
3282  MachineFrameInfo &MFI = MF.getFrameInfo();
3283  MFI.setReturnAddressIsTaken(true);
3284
3285  if (verifyReturnAddressArgumentIsConstant(Op, DAG))
3286    return SDValue();
3287
3288  SDLoc DL(Op);
3289  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
3290  EVT PtrVT = getPointerTy(DAG.getDataLayout());
3291
3292  // FIXME The frontend should detect this case.
3293  if (Depth > 0) {
3294    report_fatal_error("Unsupported stack frame traversal count");
3295  }
3296
3297  // Return R14D, which has the return address. Mark it an implicit live-in.
3298  unsigned LinkReg = MF.addLiveIn(SystemZ::R14D, &SystemZ::GR64BitRegClass);
3299  return DAG.getCopyFromReg(DAG.getEntryNode(), DL, LinkReg, PtrVT);
3300}
3301
3302SDValue SystemZTargetLowering::lowerBITCAST(SDValue Op,
3303                                            SelectionDAG &DAG) const {
3304  SDLoc DL(Op);
3305  SDValue In = Op.getOperand(0);
3306  EVT InVT = In.getValueType();
3307  EVT ResVT = Op.getValueType();
3308
3309  // Convert loads directly.  This is normally done by DAGCombiner,
3310  // but we need this case for bitcasts that are created during lowering
3311  // and which are then lowered themselves.
3312  if (auto *LoadN = dyn_cast<LoadSDNode>(In))
3313    if (ISD::isNormalLoad(LoadN)) {
3314      SDValue NewLoad = DAG.getLoad(ResVT, DL, LoadN->getChain(),
3315                                    LoadN->getBasePtr(), LoadN->getMemOperand());
3316      // Update the chain uses.
3317      DAG.ReplaceAllUsesOfValueWith(SDValue(LoadN, 1), NewLoad.getValue(1));
3318      return NewLoad;
3319    }
3320
3321  if (InVT == MVT::i32 && ResVT == MVT::f32) {
3322    SDValue In64;
3323    if (Subtarget.hasHighWord()) {
3324      SDNode *U64 = DAG.getMachineNode(TargetOpcode::IMPLICIT_DEF, DL,
3325                                       MVT::i64);
3326      In64 = DAG.getTargetInsertSubreg(SystemZ::subreg_h32, DL,
3327                                       MVT::i64, SDValue(U64, 0), In);
3328    } else {
3329      In64 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, In);
3330      In64 = DAG.getNode(ISD::SHL, DL, MVT::i64, In64,
3331                         DAG.getConstant(32, DL, MVT::i64));
3332    }
3333    SDValue Out64 = DAG.getNode(ISD::BITCAST, DL, MVT::f64, In64);
3334    return DAG.getTargetExtractSubreg(SystemZ::subreg_h32,
3335                                      DL, MVT::f32, Out64);
3336  }
3337  if (InVT == MVT::f32 && ResVT == MVT::i32) {
3338    SDNode *U64 = DAG.getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, MVT::f64);
3339    SDValue In64 = DAG.getTargetInsertSubreg(SystemZ::subreg_h32, DL,
3340                                             MVT::f64, SDValue(U64, 0), In);
3341    SDValue Out64 = DAG.getNode(ISD::BITCAST, DL, MVT::i64, In64);
3342    if (Subtarget.hasHighWord())
3343      return DAG.getTargetExtractSubreg(SystemZ::subreg_h32, DL,
3344                                        MVT::i32, Out64);
3345    SDValue Shift = DAG.getNode(ISD::SRL, DL, MVT::i64, Out64,
3346                                DAG.getConstant(32, DL, MVT::i64));
3347    return DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Shift);
3348  }
3349  llvm_unreachable("Unexpected bitcast combination");
3350}
3351
3352SDValue SystemZTargetLowering::lowerVASTART(SDValue Op,
3353                                            SelectionDAG &DAG) const {
3354  MachineFunction &MF = DAG.getMachineFunction();
3355  SystemZMachineFunctionInfo *FuncInfo =
3356    MF.getInfo<SystemZMachineFunctionInfo>();
3357  EVT PtrVT = getPointerTy(DAG.getDataLayout());
3358
3359  SDValue Chain   = Op.getOperand(0);
3360  SDValue Addr    = Op.getOperand(1);
3361  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
3362  SDLoc DL(Op);
3363
3364  // The initial values of each field.
3365  const unsigned NumFields = 4;
3366  SDValue Fields[NumFields] = {
3367    DAG.getConstant(FuncInfo->getVarArgsFirstGPR(), DL, PtrVT),
3368    DAG.getConstant(FuncInfo->getVarArgsFirstFPR(), DL, PtrVT),
3369    DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT),
3370    DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT)
3371  };
3372
3373  // Store each field into its respective slot.
3374  SDValue MemOps[NumFields];
3375  unsigned Offset = 0;
3376  for (unsigned I = 0; I < NumFields; ++I) {
3377    SDValue FieldAddr = Addr;
3378    if (Offset != 0)
3379      FieldAddr = DAG.getNode(ISD::ADD, DL, PtrVT, FieldAddr,
3380                              DAG.getIntPtrConstant(Offset, DL));
3381    MemOps[I] = DAG.getStore(Chain, DL, Fields[I], FieldAddr,
3382                             MachinePointerInfo(SV, Offset));
3383    Offset += 8;
3384  }
3385  return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
3386}
3387
3388SDValue SystemZTargetLowering::lowerVACOPY(SDValue Op,
3389                                           SelectionDAG &DAG) const {
3390  SDValue Chain      = Op.getOperand(0);
3391  SDValue DstPtr     = Op.getOperand(1);
3392  SDValue SrcPtr     = Op.getOperand(2);
3393  const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
3394  const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
3395  SDLoc DL(Op);
3396
3397  return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr, DAG.getIntPtrConstant(32, DL),
3398                       Align(8), /*isVolatile*/ false, /*AlwaysInline*/ false,
3399                       /*isTailCall*/ false, MachinePointerInfo(DstSV),
3400                       MachinePointerInfo(SrcSV));
3401}
3402
3403SDValue SystemZTargetLowering::
3404lowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const {
3405  const TargetFrameLowering *TFI = Subtarget.getFrameLowering();
3406  MachineFunction &MF = DAG.getMachineFunction();
3407  bool RealignOpt = !MF.getFunction().hasFnAttribute("no-realign-stack");
3408  bool StoreBackchain = MF.getFunction().hasFnAttribute("backchain");
3409
3410  SDValue Chain = Op.getOperand(0);
3411  SDValue Size  = Op.getOperand(1);
3412  SDValue Align = Op.getOperand(2);
3413  SDLoc DL(Op);
3414
3415  // If user has set the no alignment function attribute, ignore
3416  // alloca alignments.
3417  uint64_t AlignVal = (RealignOpt ?
3418                       dyn_cast<ConstantSDNode>(Align)->getZExtValue() : 0);
3419
3420  uint64_t StackAlign = TFI->getStackAlignment();
3421  uint64_t RequiredAlign = std::max(AlignVal, StackAlign);
3422  uint64_t ExtraAlignSpace = RequiredAlign - StackAlign;
3423
3424  unsigned SPReg = getStackPointerRegisterToSaveRestore();
3425  SDValue NeededSpace = Size;
3426
3427  // Get a reference to the stack pointer.
3428  SDValue OldSP = DAG.getCopyFromReg(Chain, DL, SPReg, MVT::i64);
3429
3430  // If we need a backchain, save it now.
3431  SDValue Backchain;
3432  if (StoreBackchain)
3433    Backchain = DAG.getLoad(MVT::i64, DL, Chain, OldSP, MachinePointerInfo());
3434
3435  // Add extra space for alignment if needed.
3436  if (ExtraAlignSpace)
3437    NeededSpace = DAG.getNode(ISD::ADD, DL, MVT::i64, NeededSpace,
3438                              DAG.getConstant(ExtraAlignSpace, DL, MVT::i64));
3439
3440  // Get the new stack pointer value.
3441  SDValue NewSP;
3442  if (hasInlineStackProbe(MF)) {
3443    NewSP = DAG.getNode(SystemZISD::PROBED_ALLOCA, DL,
3444                DAG.getVTList(MVT::i64, MVT::Other), Chain, OldSP, NeededSpace);
3445    Chain = NewSP.getValue(1);
3446  }
3447  else {
3448    NewSP = DAG.getNode(ISD::SUB, DL, MVT::i64, OldSP, NeededSpace);
3449    // Copy the new stack pointer back.
3450    Chain = DAG.getCopyToReg(Chain, DL, SPReg, NewSP);
3451  }
3452
3453  // The allocated data lives above the 160 bytes allocated for the standard
3454  // frame, plus any outgoing stack arguments.  We don't know how much that
3455  // amounts to yet, so emit a special ADJDYNALLOC placeholder.
3456  SDValue ArgAdjust = DAG.getNode(SystemZISD::ADJDYNALLOC, DL, MVT::i64);
3457  SDValue Result = DAG.getNode(ISD::ADD, DL, MVT::i64, NewSP, ArgAdjust);
3458
3459  // Dynamically realign if needed.
3460  if (RequiredAlign > StackAlign) {
3461    Result =
3462      DAG.getNode(ISD::ADD, DL, MVT::i64, Result,
3463                  DAG.getConstant(ExtraAlignSpace, DL, MVT::i64));
3464    Result =
3465      DAG.getNode(ISD::AND, DL, MVT::i64, Result,
3466                  DAG.getConstant(~(RequiredAlign - 1), DL, MVT::i64));
3467  }
3468
3469  if (StoreBackchain)
3470    Chain = DAG.getStore(Chain, DL, Backchain, NewSP, MachinePointerInfo());
3471
3472  SDValue Ops[2] = { Result, Chain };
3473  return DAG.getMergeValues(Ops, DL);
3474}
3475
3476SDValue SystemZTargetLowering::lowerGET_DYNAMIC_AREA_OFFSET(
3477    SDValue Op, SelectionDAG &DAG) const {
3478  SDLoc DL(Op);
3479
3480  return DAG.getNode(SystemZISD::ADJDYNALLOC, DL, MVT::i64);
3481}
3482
3483SDValue SystemZTargetLowering::lowerSMUL_LOHI(SDValue Op,
3484                                              SelectionDAG &DAG) const {
3485  EVT VT = Op.getValueType();
3486  SDLoc DL(Op);
3487  SDValue Ops[2];
3488  if (is32Bit(VT))
3489    // Just do a normal 64-bit multiplication and extract the results.
3490    // We define this so that it can be used for constant division.
3491    lowerMUL_LOHI32(DAG, DL, ISD::SIGN_EXTEND, Op.getOperand(0),
3492                    Op.getOperand(1), Ops[1], Ops[0]);
3493  else if (Subtarget.hasMiscellaneousExtensions2())
3494    // SystemZISD::SMUL_LOHI returns the low result in the odd register and
3495    // the high result in the even register.  ISD::SMUL_LOHI is defined to
3496    // return the low half first, so the results are in reverse order.
3497    lowerGR128Binary(DAG, DL, VT, SystemZISD::SMUL_LOHI,
3498                     Op.getOperand(0), Op.getOperand(1), Ops[1], Ops[0]);
3499  else {
3500    // Do a full 128-bit multiplication based on SystemZISD::UMUL_LOHI:
3501    //
3502    //   (ll * rl) + ((lh * rl) << 64) + ((ll * rh) << 64)
3503    //
3504    // but using the fact that the upper halves are either all zeros
3505    // or all ones:
3506    //
3507    //   (ll * rl) - ((lh & rl) << 64) - ((ll & rh) << 64)
3508    //
3509    // and grouping the right terms together since they are quicker than the
3510    // multiplication:
3511    //
3512    //   (ll * rl) - (((lh & rl) + (ll & rh)) << 64)
3513    SDValue C63 = DAG.getConstant(63, DL, MVT::i64);
3514    SDValue LL = Op.getOperand(0);
3515    SDValue RL = Op.getOperand(1);
3516    SDValue LH = DAG.getNode(ISD::SRA, DL, VT, LL, C63);
3517    SDValue RH = DAG.getNode(ISD::SRA, DL, VT, RL, C63);
3518    // SystemZISD::UMUL_LOHI returns the low result in the odd register and
3519    // the high result in the even register.  ISD::SMUL_LOHI is defined to
3520    // return the low half first, so the results are in reverse order.
3521    lowerGR128Binary(DAG, DL, VT, SystemZISD::UMUL_LOHI,
3522                     LL, RL, Ops[1], Ops[0]);
3523    SDValue NegLLTimesRH = DAG.getNode(ISD::AND, DL, VT, LL, RH);
3524    SDValue NegLHTimesRL = DAG.getNode(ISD::AND, DL, VT, LH, RL);
3525    SDValue NegSum = DAG.getNode(ISD::ADD, DL, VT, NegLLTimesRH, NegLHTimesRL);
3526    Ops[1] = DAG.getNode(ISD::SUB, DL, VT, Ops[1], NegSum);
3527  }
3528  return DAG.getMergeValues(Ops, DL);
3529}
3530
3531SDValue SystemZTargetLowering::lowerUMUL_LOHI(SDValue Op,
3532                                              SelectionDAG &DAG) const {
3533  EVT VT = Op.getValueType();
3534  SDLoc DL(Op);
3535  SDValue Ops[2];
3536  if (is32Bit(VT))
3537    // Just do a normal 64-bit multiplication and extract the results.
3538    // We define this so that it can be used for constant division.
3539    lowerMUL_LOHI32(DAG, DL, ISD::ZERO_EXTEND, Op.getOperand(0),
3540                    Op.getOperand(1), Ops[1], Ops[0]);
3541  else
3542    // SystemZISD::UMUL_LOHI returns the low result in the odd register and
3543    // the high result in the even register.  ISD::UMUL_LOHI is defined to
3544    // return the low half first, so the results are in reverse order.
3545    lowerGR128Binary(DAG, DL, VT, SystemZISD::UMUL_LOHI,
3546                     Op.getOperand(0), Op.getOperand(1), Ops[1], Ops[0]);
3547  return DAG.getMergeValues(Ops, DL);
3548}
3549
3550SDValue SystemZTargetLowering::lowerSDIVREM(SDValue Op,
3551                                            SelectionDAG &DAG) const {
3552  SDValue Op0 = Op.getOperand(0);
3553  SDValue Op1 = Op.getOperand(1);
3554  EVT VT = Op.getValueType();
3555  SDLoc DL(Op);
3556
3557  // We use DSGF for 32-bit division.  This means the first operand must
3558  // always be 64-bit, and the second operand should be 32-bit whenever
3559  // that is possible, to improve performance.
3560  if (is32Bit(VT))
3561    Op0 = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, Op0);
3562  else if (DAG.ComputeNumSignBits(Op1) > 32)
3563    Op1 = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Op1);
3564
3565  // DSG(F) returns the remainder in the even register and the
3566  // quotient in the odd register.
3567  SDValue Ops[2];
3568  lowerGR128Binary(DAG, DL, VT, SystemZISD::SDIVREM, Op0, Op1, Ops[1], Ops[0]);
3569  return DAG.getMergeValues(Ops, DL);
3570}
3571
3572SDValue SystemZTargetLowering::lowerUDIVREM(SDValue Op,
3573                                            SelectionDAG &DAG) const {
3574  EVT VT = Op.getValueType();
3575  SDLoc DL(Op);
3576
3577  // DL(G) returns the remainder in the even register and the
3578  // quotient in the odd register.
3579  SDValue Ops[2];
3580  lowerGR128Binary(DAG, DL, VT, SystemZISD::UDIVREM,
3581                   Op.getOperand(0), Op.getOperand(1), Ops[1], Ops[0]);
3582  return DAG.getMergeValues(Ops, DL);
3583}
3584
3585SDValue SystemZTargetLowering::lowerOR(SDValue Op, SelectionDAG &DAG) const {
3586  assert(Op.getValueType() == MVT::i64 && "Should be 64-bit operation");
3587
3588  // Get the known-zero masks for each operand.
3589  SDValue Ops[] = {Op.getOperand(0), Op.getOperand(1)};
3590  KnownBits Known[2] = {DAG.computeKnownBits(Ops[0]),
3591                        DAG.computeKnownBits(Ops[1])};
3592
3593  // See if the upper 32 bits of one operand and the lower 32 bits of the
3594  // other are known zero.  They are the low and high operands respectively.
3595  uint64_t Masks[] = { Known[0].Zero.getZExtValue(),
3596                       Known[1].Zero.getZExtValue() };
3597  unsigned High, Low;
3598  if ((Masks[0] >> 32) == 0xffffffff && uint32_t(Masks[1]) == 0xffffffff)
3599    High = 1, Low = 0;
3600  else if ((Masks[1] >> 32) == 0xffffffff && uint32_t(Masks[0]) == 0xffffffff)
3601    High = 0, Low = 1;
3602  else
3603    return Op;
3604
3605  SDValue LowOp = Ops[Low];
3606  SDValue HighOp = Ops[High];
3607
3608  // If the high part is a constant, we're better off using IILH.
3609  if (HighOp.getOpcode() == ISD::Constant)
3610    return Op;
3611
3612  // If the low part is a constant that is outside the range of LHI,
3613  // then we're better off using IILF.
3614  if (LowOp.getOpcode() == ISD::Constant) {
3615    int64_t Value = int32_t(cast<ConstantSDNode>(LowOp)->getZExtValue());
3616    if (!isInt<16>(Value))
3617      return Op;
3618  }
3619
3620  // Check whether the high part is an AND that doesn't change the
3621  // high 32 bits and just masks out low bits.  We can skip it if so.
3622  if (HighOp.getOpcode() == ISD::AND &&
3623      HighOp.getOperand(1).getOpcode() == ISD::Constant) {
3624    SDValue HighOp0 = HighOp.getOperand(0);
3625    uint64_t Mask = cast<ConstantSDNode>(HighOp.getOperand(1))->getZExtValue();
3626    if (DAG.MaskedValueIsZero(HighOp0, APInt(64, ~(Mask | 0xffffffff))))
3627      HighOp = HighOp0;
3628  }
3629
3630  // Take advantage of the fact that all GR32 operations only change the
3631  // low 32 bits by truncating Low to an i32 and inserting it directly
3632  // using a subreg.  The interesting cases are those where the truncation
3633  // can be folded.
3634  SDLoc DL(Op);
3635  SDValue Low32 = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, LowOp);
3636  return DAG.getTargetInsertSubreg(SystemZ::subreg_l32, DL,
3637                                   MVT::i64, HighOp, Low32);
3638}
3639
3640// Lower SADDO/SSUBO/UADDO/USUBO nodes.
3641SDValue SystemZTargetLowering::lowerXALUO(SDValue Op,
3642                                          SelectionDAG &DAG) const {
3643  SDNode *N = Op.getNode();
3644  SDValue LHS = N->getOperand(0);
3645  SDValue RHS = N->getOperand(1);
3646  SDLoc DL(N);
3647  unsigned BaseOp = 0;
3648  unsigned CCValid = 0;
3649  unsigned CCMask = 0;
3650
3651  switch (Op.getOpcode()) {
3652  default: llvm_unreachable("Unknown instruction!");
3653  case ISD::SADDO:
3654    BaseOp = SystemZISD::SADDO;
3655    CCValid = SystemZ::CCMASK_ARITH;
3656    CCMask = SystemZ::CCMASK_ARITH_OVERFLOW;
3657    break;
3658  case ISD::SSUBO:
3659    BaseOp = SystemZISD::SSUBO;
3660    CCValid = SystemZ::CCMASK_ARITH;
3661    CCMask = SystemZ::CCMASK_ARITH_OVERFLOW;
3662    break;
3663  case ISD::UADDO:
3664    BaseOp = SystemZISD::UADDO;
3665    CCValid = SystemZ::CCMASK_LOGICAL;
3666    CCMask = SystemZ::CCMASK_LOGICAL_CARRY;
3667    break;
3668  case ISD::USUBO:
3669    BaseOp = SystemZISD::USUBO;
3670    CCValid = SystemZ::CCMASK_LOGICAL;
3671    CCMask = SystemZ::CCMASK_LOGICAL_BORROW;
3672    break;
3673  }
3674
3675  SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32);
3676  SDValue Result = DAG.getNode(BaseOp, DL, VTs, LHS, RHS);
3677
3678  SDValue SetCC = emitSETCC(DAG, DL, Result.getValue(1), CCValid, CCMask);
3679  if (N->getValueType(1) == MVT::i1)
3680    SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
3681
3682  return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Result, SetCC);
3683}
3684
3685static bool isAddCarryChain(SDValue Carry) {
3686  while (Carry.getOpcode() == ISD::ADDCARRY)
3687    Carry = Carry.getOperand(2);
3688  return Carry.getOpcode() == ISD::UADDO;
3689}
3690
3691static bool isSubBorrowChain(SDValue Carry) {
3692  while (Carry.getOpcode() == ISD::SUBCARRY)
3693    Carry = Carry.getOperand(2);
3694  return Carry.getOpcode() == ISD::USUBO;
3695}
3696
3697// Lower ADDCARRY/SUBCARRY nodes.
3698SDValue SystemZTargetLowering::lowerADDSUBCARRY(SDValue Op,
3699                                                SelectionDAG &DAG) const {
3700
3701  SDNode *N = Op.getNode();
3702  MVT VT = N->getSimpleValueType(0);
3703
3704  // Let legalize expand this if it isn't a legal type yet.
3705  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
3706    return SDValue();
3707
3708  SDValue LHS = N->getOperand(0);
3709  SDValue RHS = N->getOperand(1);
3710  SDValue Carry = Op.getOperand(2);
3711  SDLoc DL(N);
3712  unsigned BaseOp = 0;
3713  unsigned CCValid = 0;
3714  unsigned CCMask = 0;
3715
3716  switch (Op.getOpcode()) {
3717  default: llvm_unreachable("Unknown instruction!");
3718  case ISD::ADDCARRY:
3719    if (!isAddCarryChain(Carry))
3720      return SDValue();
3721
3722    BaseOp = SystemZISD::ADDCARRY;
3723    CCValid = SystemZ::CCMASK_LOGICAL;
3724    CCMask = SystemZ::CCMASK_LOGICAL_CARRY;
3725    break;
3726  case ISD::SUBCARRY:
3727    if (!isSubBorrowChain(Carry))
3728      return SDValue();
3729
3730    BaseOp = SystemZISD::SUBCARRY;
3731    CCValid = SystemZ::CCMASK_LOGICAL;
3732    CCMask = SystemZ::CCMASK_LOGICAL_BORROW;
3733    break;
3734  }
3735
3736  // Set the condition code from the carry flag.
3737  Carry = DAG.getNode(SystemZISD::GET_CCMASK, DL, MVT::i32, Carry,
3738                      DAG.getConstant(CCValid, DL, MVT::i32),
3739                      DAG.getConstant(CCMask, DL, MVT::i32));
3740
3741  SDVTList VTs = DAG.getVTList(VT, MVT::i32);
3742  SDValue Result = DAG.getNode(BaseOp, DL, VTs, LHS, RHS, Carry);
3743
3744  SDValue SetCC = emitSETCC(DAG, DL, Result.getValue(1), CCValid, CCMask);
3745  if (N->getValueType(1) == MVT::i1)
3746    SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
3747
3748  return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Result, SetCC);
3749}
3750
3751SDValue SystemZTargetLowering::lowerCTPOP(SDValue Op,
3752                                          SelectionDAG &DAG) const {
3753  EVT VT = Op.getValueType();
3754  SDLoc DL(Op);
3755  Op = Op.getOperand(0);
3756
3757  // Handle vector types via VPOPCT.
3758  if (VT.isVector()) {
3759    Op = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Op);
3760    Op = DAG.getNode(SystemZISD::POPCNT, DL, MVT::v16i8, Op);
3761    switch (VT.getScalarSizeInBits()) {
3762    case 8:
3763      break;
3764    case 16: {
3765      Op = DAG.getNode(ISD::BITCAST, DL, VT, Op);
3766      SDValue Shift = DAG.getConstant(8, DL, MVT::i32);
3767      SDValue Tmp = DAG.getNode(SystemZISD::VSHL_BY_SCALAR, DL, VT, Op, Shift);
3768      Op = DAG.getNode(ISD::ADD, DL, VT, Op, Tmp);
3769      Op = DAG.getNode(SystemZISD::VSRL_BY_SCALAR, DL, VT, Op, Shift);
3770      break;
3771    }
3772    case 32: {
3773      SDValue Tmp = DAG.getSplatBuildVector(MVT::v16i8, DL,
3774                                            DAG.getConstant(0, DL, MVT::i32));
3775      Op = DAG.getNode(SystemZISD::VSUM, DL, VT, Op, Tmp);
3776      break;
3777    }
3778    case 64: {
3779      SDValue Tmp = DAG.getSplatBuildVector(MVT::v16i8, DL,
3780                                            DAG.getConstant(0, DL, MVT::i32));
3781      Op = DAG.getNode(SystemZISD::VSUM, DL, MVT::v4i32, Op, Tmp);
3782      Op = DAG.getNode(SystemZISD::VSUM, DL, VT, Op, Tmp);
3783      break;
3784    }
3785    default:
3786      llvm_unreachable("Unexpected type");
3787    }
3788    return Op;
3789  }
3790
3791  // Get the known-zero mask for the operand.
3792  KnownBits Known = DAG.computeKnownBits(Op);
3793  unsigned NumSignificantBits = Known.getMaxValue().getActiveBits();
3794  if (NumSignificantBits == 0)
3795    return DAG.getConstant(0, DL, VT);
3796
3797  // Skip known-zero high parts of the operand.
3798  int64_t OrigBitSize = VT.getSizeInBits();
3799  int64_t BitSize = (int64_t)1 << Log2_32_Ceil(NumSignificantBits);
3800  BitSize = std::min(BitSize, OrigBitSize);
3801
3802  // The POPCNT instruction counts the number of bits in each byte.
3803  Op = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Op);
3804  Op = DAG.getNode(SystemZISD::POPCNT, DL, MVT::i64, Op);
3805  Op = DAG.getNode(ISD::TRUNCATE, DL, VT, Op);
3806
3807  // Add up per-byte counts in a binary tree.  All bits of Op at
3808  // position larger than BitSize remain zero throughout.
3809  for (int64_t I = BitSize / 2; I >= 8; I = I / 2) {
3810    SDValue Tmp = DAG.getNode(ISD::SHL, DL, VT, Op, DAG.getConstant(I, DL, VT));
3811    if (BitSize != OrigBitSize)
3812      Tmp = DAG.getNode(ISD::AND, DL, VT, Tmp,
3813                        DAG.getConstant(((uint64_t)1 << BitSize) - 1, DL, VT));
3814    Op = DAG.getNode(ISD::ADD, DL, VT, Op, Tmp);
3815  }
3816
3817  // Extract overall result from high byte.
3818  if (BitSize > 8)
3819    Op = DAG.getNode(ISD::SRL, DL, VT, Op,
3820                     DAG.getConstant(BitSize - 8, DL, VT));
3821
3822  return Op;
3823}
3824
3825SDValue SystemZTargetLowering::lowerATOMIC_FENCE(SDValue Op,
3826                                                 SelectionDAG &DAG) const {
3827  SDLoc DL(Op);
3828  AtomicOrdering FenceOrdering = static_cast<AtomicOrdering>(
3829    cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue());
3830  SyncScope::ID FenceSSID = static_cast<SyncScope::ID>(
3831    cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());
3832
3833  // The only fence that needs an instruction is a sequentially-consistent
3834  // cross-thread fence.
3835  if (FenceOrdering == AtomicOrdering::SequentiallyConsistent &&
3836      FenceSSID == SyncScope::System) {
3837    return SDValue(DAG.getMachineNode(SystemZ::Serialize, DL, MVT::Other,
3838                                      Op.getOperand(0)),
3839                   0);
3840  }
3841
3842  // MEMBARRIER is a compiler barrier; it codegens to a no-op.
3843  return DAG.getNode(SystemZISD::MEMBARRIER, DL, MVT::Other, Op.getOperand(0));
3844}
3845
3846// Op is an atomic load.  Lower it into a normal volatile load.
3847SDValue SystemZTargetLowering::lowerATOMIC_LOAD(SDValue Op,
3848                                                SelectionDAG &DAG) const {
3849  auto *Node = cast<AtomicSDNode>(Op.getNode());
3850  return DAG.getExtLoad(ISD::EXTLOAD, SDLoc(Op), Op.getValueType(),
3851                        Node->getChain(), Node->getBasePtr(),
3852                        Node->getMemoryVT(), Node->getMemOperand());
3853}
3854
3855// Op is an atomic store.  Lower it into a normal volatile store.
3856SDValue SystemZTargetLowering::lowerATOMIC_STORE(SDValue Op,
3857                                                 SelectionDAG &DAG) const {
3858  auto *Node = cast<AtomicSDNode>(Op.getNode());
3859  SDValue Chain = DAG.getTruncStore(Node->getChain(), SDLoc(Op), Node->getVal(),
3860                                    Node->getBasePtr(), Node->getMemoryVT(),
3861                                    Node->getMemOperand());
3862  // We have to enforce sequential consistency by performing a
3863  // serialization operation after the store.
3864  if (Node->getOrdering() == AtomicOrdering::SequentiallyConsistent)
3865    Chain = SDValue(DAG.getMachineNode(SystemZ::Serialize, SDLoc(Op),
3866                                       MVT::Other, Chain), 0);
3867  return Chain;
3868}
3869
3870// Op is an 8-, 16-bit or 32-bit ATOMIC_LOAD_* operation.  Lower the first
3871// two into the fullword ATOMIC_LOADW_* operation given by Opcode.
3872SDValue SystemZTargetLowering::lowerATOMIC_LOAD_OP(SDValue Op,
3873                                                   SelectionDAG &DAG,
3874                                                   unsigned Opcode) const {
3875  auto *Node = cast<AtomicSDNode>(Op.getNode());
3876
3877  // 32-bit operations need no code outside the main loop.
3878  EVT NarrowVT = Node->getMemoryVT();
3879  EVT WideVT = MVT::i32;
3880  if (NarrowVT == WideVT)
3881    return Op;
3882
3883  int64_t BitSize = NarrowVT.getSizeInBits();
3884  SDValue ChainIn = Node->getChain();
3885  SDValue Addr = Node->getBasePtr();
3886  SDValue Src2 = Node->getVal();
3887  MachineMemOperand *MMO = Node->getMemOperand();
3888  SDLoc DL(Node);
3889  EVT PtrVT = Addr.getValueType();
3890
3891  // Convert atomic subtracts of constants into additions.
3892  if (Opcode == SystemZISD::ATOMIC_LOADW_SUB)
3893    if (auto *Const = dyn_cast<ConstantSDNode>(Src2)) {
3894      Opcode = SystemZISD::ATOMIC_LOADW_ADD;
3895      Src2 = DAG.getConstant(-Const->getSExtValue(), DL, Src2.getValueType());
3896    }
3897
3898  // Get the address of the containing word.
3899  SDValue AlignedAddr = DAG.getNode(ISD::AND, DL, PtrVT, Addr,
3900                                    DAG.getConstant(-4, DL, PtrVT));
3901
3902  // Get the number of bits that the word must be rotated left in order
3903  // to bring the field to the top bits of a GR32.
3904  SDValue BitShift = DAG.getNode(ISD::SHL, DL, PtrVT, Addr,
3905                                 DAG.getConstant(3, DL, PtrVT));
3906  BitShift = DAG.getNode(ISD::TRUNCATE, DL, WideVT, BitShift);
3907
3908  // Get the complementing shift amount, for rotating a field in the top
3909  // bits back to its proper position.
3910  SDValue NegBitShift = DAG.getNode(ISD::SUB, DL, WideVT,
3911                                    DAG.getConstant(0, DL, WideVT), BitShift);
3912
3913  // Extend the source operand to 32 bits and prepare it for the inner loop.
3914  // ATOMIC_SWAPW uses RISBG to rotate the field left, but all other
3915  // operations require the source to be shifted in advance.  (This shift
3916  // can be folded if the source is constant.)  For AND and NAND, the lower
3917  // bits must be set, while for other opcodes they should be left clear.
3918  if (Opcode != SystemZISD::ATOMIC_SWAPW)
3919    Src2 = DAG.getNode(ISD::SHL, DL, WideVT, Src2,
3920                       DAG.getConstant(32 - BitSize, DL, WideVT));
3921  if (Opcode == SystemZISD::ATOMIC_LOADW_AND ||
3922      Opcode == SystemZISD::ATOMIC_LOADW_NAND)
3923    Src2 = DAG.getNode(ISD::OR, DL, WideVT, Src2,
3924                       DAG.getConstant(uint32_t(-1) >> BitSize, DL, WideVT));
3925
3926  // Construct the ATOMIC_LOADW_* node.
3927  SDVTList VTList = DAG.getVTList(WideVT, MVT::Other);
3928  SDValue Ops[] = { ChainIn, AlignedAddr, Src2, BitShift, NegBitShift,
3929                    DAG.getConstant(BitSize, DL, WideVT) };
3930  SDValue AtomicOp = DAG.getMemIntrinsicNode(Opcode, DL, VTList, Ops,
3931                                             NarrowVT, MMO);
3932
3933  // Rotate the result of the final CS so that the field is in the lower
3934  // bits of a GR32, then truncate it.
3935  SDValue ResultShift = DAG.getNode(ISD::ADD, DL, WideVT, BitShift,
3936                                    DAG.getConstant(BitSize, DL, WideVT));
3937  SDValue Result = DAG.getNode(ISD::ROTL, DL, WideVT, AtomicOp, ResultShift);
3938
3939  SDValue RetOps[2] = { Result, AtomicOp.getValue(1) };
3940  return DAG.getMergeValues(RetOps, DL);
3941}
3942
3943// Op is an ATOMIC_LOAD_SUB operation.  Lower 8- and 16-bit operations
3944// into ATOMIC_LOADW_SUBs and decide whether to convert 32- and 64-bit
3945// operations into additions.
3946SDValue SystemZTargetLowering::lowerATOMIC_LOAD_SUB(SDValue Op,
3947                                                    SelectionDAG &DAG) const {
3948  auto *Node = cast<AtomicSDNode>(Op.getNode());
3949  EVT MemVT = Node->getMemoryVT();
3950  if (MemVT == MVT::i32 || MemVT == MVT::i64) {
3951    // A full-width operation.
3952    assert(Op.getValueType() == MemVT && "Mismatched VTs");
3953    SDValue Src2 = Node->getVal();
3954    SDValue NegSrc2;
3955    SDLoc DL(Src2);
3956
3957    if (auto *Op2 = dyn_cast<ConstantSDNode>(Src2)) {
3958      // Use an addition if the operand is constant and either LAA(G) is
3959      // available or the negative value is in the range of A(G)FHI.
3960      int64_t Value = (-Op2->getAPIntValue()).getSExtValue();
3961      if (isInt<32>(Value) || Subtarget.hasInterlockedAccess1())
3962        NegSrc2 = DAG.getConstant(Value, DL, MemVT);
3963    } else if (Subtarget.hasInterlockedAccess1())
3964      // Use LAA(G) if available.
3965      NegSrc2 = DAG.getNode(ISD::SUB, DL, MemVT, DAG.getConstant(0, DL, MemVT),
3966                            Src2);
3967
3968    if (NegSrc2.getNode())
3969      return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, DL, MemVT,
3970                           Node->getChain(), Node->getBasePtr(), NegSrc2,
3971                           Node->getMemOperand());
3972
3973    // Use the node as-is.
3974    return Op;
3975  }
3976
3977  return lowerATOMIC_LOAD_OP(Op, DAG, SystemZISD::ATOMIC_LOADW_SUB);
3978}
3979
3980// Lower 8/16/32/64-bit ATOMIC_CMP_SWAP_WITH_SUCCESS node.
3981SDValue SystemZTargetLowering::lowerATOMIC_CMP_SWAP(SDValue Op,
3982                                                    SelectionDAG &DAG) const {
3983  auto *Node = cast<AtomicSDNode>(Op.getNode());
3984  SDValue ChainIn = Node->getOperand(0);
3985  SDValue Addr = Node->getOperand(1);
3986  SDValue CmpVal = Node->getOperand(2);
3987  SDValue SwapVal = Node->getOperand(3);
3988  MachineMemOperand *MMO = Node->getMemOperand();
3989  SDLoc DL(Node);
3990
3991  // We have native support for 32-bit and 64-bit compare and swap, but we
3992  // still need to expand extracting the "success" result from the CC.
3993  EVT NarrowVT = Node->getMemoryVT();
3994  EVT WideVT = NarrowVT == MVT::i64 ? MVT::i64 : MVT::i32;
3995  if (NarrowVT == WideVT) {
3996    SDVTList Tys = DAG.getVTList(WideVT, MVT::i32, MVT::Other);
3997    SDValue Ops[] = { ChainIn, Addr, CmpVal, SwapVal };
3998    SDValue AtomicOp = DAG.getMemIntrinsicNode(SystemZISD::ATOMIC_CMP_SWAP,
3999                                               DL, Tys, Ops, NarrowVT, MMO);
4000    SDValue Success = emitSETCC(DAG, DL, AtomicOp.getValue(1),
4001                                SystemZ::CCMASK_CS, SystemZ::CCMASK_CS_EQ);
4002
4003    DAG.ReplaceAllUsesOfValueWith(Op.getValue(0), AtomicOp.getValue(0));
4004    DAG.ReplaceAllUsesOfValueWith(Op.getValue(1), Success);
4005    DAG.ReplaceAllUsesOfValueWith(Op.getValue(2), AtomicOp.getValue(2));
4006    return SDValue();
4007  }
4008
4009  // Convert 8-bit and 16-bit compare and swap to a loop, implemented
4010  // via a fullword ATOMIC_CMP_SWAPW operation.
4011  int64_t BitSize = NarrowVT.getSizeInBits();
4012  EVT PtrVT = Addr.getValueType();
4013
4014  // Get the address of the containing word.
4015  SDValue AlignedAddr = DAG.getNode(ISD::AND, DL, PtrVT, Addr,
4016                                    DAG.getConstant(-4, DL, PtrVT));
4017
4018  // Get the number of bits that the word must be rotated left in order
4019  // to bring the field to the top bits of a GR32.
4020  SDValue BitShift = DAG.getNode(ISD::SHL, DL, PtrVT, Addr,
4021                                 DAG.getConstant(3, DL, PtrVT));
4022  BitShift = DAG.getNode(ISD::TRUNCATE, DL, WideVT, BitShift);
4023
4024  // Get the complementing shift amount, for rotating a field in the top
4025  // bits back to its proper position.
4026  SDValue NegBitShift = DAG.getNode(ISD::SUB, DL, WideVT,
4027                                    DAG.getConstant(0, DL, WideVT), BitShift);
4028
4029  // Construct the ATOMIC_CMP_SWAPW node.
4030  SDVTList VTList = DAG.getVTList(WideVT, MVT::i32, MVT::Other);
4031  SDValue Ops[] = { ChainIn, AlignedAddr, CmpVal, SwapVal, BitShift,
4032                    NegBitShift, DAG.getConstant(BitSize, DL, WideVT) };
4033  SDValue AtomicOp = DAG.getMemIntrinsicNode(SystemZISD::ATOMIC_CMP_SWAPW, DL,
4034                                             VTList, Ops, NarrowVT, MMO);
4035  SDValue Success = emitSETCC(DAG, DL, AtomicOp.getValue(1),
4036                              SystemZ::CCMASK_ICMP, SystemZ::CCMASK_CMP_EQ);
4037
4038  DAG.ReplaceAllUsesOfValueWith(Op.getValue(0), AtomicOp.getValue(0));
4039  DAG.ReplaceAllUsesOfValueWith(Op.getValue(1), Success);
4040  DAG.ReplaceAllUsesOfValueWith(Op.getValue(2), AtomicOp.getValue(2));
4041  return SDValue();
4042}
4043
4044MachineMemOperand::Flags
4045SystemZTargetLowering::getTargetMMOFlags(const Instruction &I) const {
4046  // Because of how we convert atomic_load and atomic_store to normal loads and
4047  // stores in the DAG, we need to ensure that the MMOs are marked volatile
4048  // since DAGCombine hasn't been updated to account for atomic, but non
4049  // volatile loads.  (See D57601)
4050  if (auto *SI = dyn_cast<StoreInst>(&I))
4051    if (SI->isAtomic())
4052      return MachineMemOperand::MOVolatile;
4053  if (auto *LI = dyn_cast<LoadInst>(&I))
4054    if (LI->isAtomic())
4055      return MachineMemOperand::MOVolatile;
4056  if (auto *AI = dyn_cast<AtomicRMWInst>(&I))
4057    if (AI->isAtomic())
4058      return MachineMemOperand::MOVolatile;
4059  if (auto *AI = dyn_cast<AtomicCmpXchgInst>(&I))
4060    if (AI->isAtomic())
4061      return MachineMemOperand::MOVolatile;
4062  return MachineMemOperand::MONone;
4063}
4064
4065SDValue SystemZTargetLowering::lowerSTACKSAVE(SDValue Op,
4066                                              SelectionDAG &DAG) const {
4067  MachineFunction &MF = DAG.getMachineFunction();
4068  MF.getInfo<SystemZMachineFunctionInfo>()->setManipulatesSP(true);
4069  if (MF.getFunction().getCallingConv() == CallingConv::GHC)
4070    report_fatal_error("Variable-sized stack allocations are not supported "
4071                       "in GHC calling convention");
4072  return DAG.getCopyFromReg(Op.getOperand(0), SDLoc(Op),
4073                            SystemZ::R15D, Op.getValueType());
4074}
4075
4076SDValue SystemZTargetLowering::lowerSTACKRESTORE(SDValue Op,
4077                                                 SelectionDAG &DAG) const {
4078  MachineFunction &MF = DAG.getMachineFunction();
4079  MF.getInfo<SystemZMachineFunctionInfo>()->setManipulatesSP(true);
4080  bool StoreBackchain = MF.getFunction().hasFnAttribute("backchain");
4081
4082  if (MF.getFunction().getCallingConv() == CallingConv::GHC)
4083    report_fatal_error("Variable-sized stack allocations are not supported "
4084                       "in GHC calling convention");
4085
4086  SDValue Chain = Op.getOperand(0);
4087  SDValue NewSP = Op.getOperand(1);
4088  SDValue Backchain;
4089  SDLoc DL(Op);
4090
4091  if (StoreBackchain) {
4092    SDValue OldSP = DAG.getCopyFromReg(Chain, DL, SystemZ::R15D, MVT::i64);
4093    Backchain = DAG.getLoad(MVT::i64, DL, Chain, OldSP, MachinePointerInfo());
4094  }
4095
4096  Chain = DAG.getCopyToReg(Chain, DL, SystemZ::R15D, NewSP);
4097
4098  if (StoreBackchain)
4099    Chain = DAG.getStore(Chain, DL, Backchain, NewSP, MachinePointerInfo());
4100
4101  return Chain;
4102}
4103
4104SDValue SystemZTargetLowering::lowerPREFETCH(SDValue Op,
4105                                             SelectionDAG &DAG) const {
4106  bool IsData = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue();
4107  if (!IsData)
4108    // Just preserve the chain.
4109    return Op.getOperand(0);
4110
4111  SDLoc DL(Op);
4112  bool IsWrite = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
4113  unsigned Code = IsWrite ? SystemZ::PFD_WRITE : SystemZ::PFD_READ;
4114  auto *Node = cast<MemIntrinsicSDNode>(Op.getNode());
4115  SDValue Ops[] = {Op.getOperand(0), DAG.getTargetConstant(Code, DL, MVT::i32),
4116                   Op.getOperand(1)};
4117  return DAG.getMemIntrinsicNode(SystemZISD::PREFETCH, DL,
4118                                 Node->getVTList(), Ops,
4119                                 Node->getMemoryVT(), Node->getMemOperand());
4120}
4121
4122// Convert condition code in CCReg to an i32 value.
4123static SDValue getCCResult(SelectionDAG &DAG, SDValue CCReg) {
4124  SDLoc DL(CCReg);
4125  SDValue IPM = DAG.getNode(SystemZISD::IPM, DL, MVT::i32, CCReg);
4126  return DAG.getNode(ISD::SRL, DL, MVT::i32, IPM,
4127                     DAG.getConstant(SystemZ::IPM_CC, DL, MVT::i32));
4128}
4129
4130SDValue
4131SystemZTargetLowering::lowerINTRINSIC_W_CHAIN(SDValue Op,
4132                                              SelectionDAG &DAG) const {
4133  unsigned Opcode, CCValid;
4134  if (isIntrinsicWithCCAndChain(Op, Opcode, CCValid)) {
4135    assert(Op->getNumValues() == 2 && "Expected only CC result and chain");
4136    SDNode *Node = emitIntrinsicWithCCAndChain(DAG, Op, Opcode);
4137    SDValue CC = getCCResult(DAG, SDValue(Node, 0));
4138    DAG.ReplaceAllUsesOfValueWith(SDValue(Op.getNode(), 0), CC);
4139    return SDValue();
4140  }
4141
4142  return SDValue();
4143}
4144
4145SDValue
4146SystemZTargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op,
4147                                               SelectionDAG &DAG) const {
4148  unsigned Opcode, CCValid;
4149  if (isIntrinsicWithCC(Op, Opcode, CCValid)) {
4150    SDNode *Node = emitIntrinsicWithCC(DAG, Op, Opcode);
4151    if (Op->getNumValues() == 1)
4152      return getCCResult(DAG, SDValue(Node, 0));
4153    assert(Op->getNumValues() == 2 && "Expected a CC and non-CC result");
4154    return DAG.getNode(ISD::MERGE_VALUES, SDLoc(Op), Op->getVTList(),
4155                       SDValue(Node, 0), getCCResult(DAG, SDValue(Node, 1)));
4156  }
4157
4158  unsigned Id = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
4159  switch (Id) {
4160  case Intrinsic::thread_pointer:
4161    return lowerThreadPointer(SDLoc(Op), DAG);
4162
4163  case Intrinsic::s390_vpdi:
4164    return DAG.getNode(SystemZISD::PERMUTE_DWORDS, SDLoc(Op), Op.getValueType(),
4165                       Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
4166
4167  case Intrinsic::s390_vperm:
4168    return DAG.getNode(SystemZISD::PERMUTE, SDLoc(Op), Op.getValueType(),
4169                       Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
4170
4171  case Intrinsic::s390_vuphb:
4172  case Intrinsic::s390_vuphh:
4173  case Intrinsic::s390_vuphf:
4174    return DAG.getNode(SystemZISD::UNPACK_HIGH, SDLoc(Op), Op.getValueType(),
4175                       Op.getOperand(1));
4176
4177  case Intrinsic::s390_vuplhb:
4178  case Intrinsic::s390_vuplhh:
4179  case Intrinsic::s390_vuplhf:
4180    return DAG.getNode(SystemZISD::UNPACKL_HIGH, SDLoc(Op), Op.getValueType(),
4181                       Op.getOperand(1));
4182
4183  case Intrinsic::s390_vuplb:
4184  case Intrinsic::s390_vuplhw:
4185  case Intrinsic::s390_vuplf:
4186    return DAG.getNode(SystemZISD::UNPACK_LOW, SDLoc(Op), Op.getValueType(),
4187                       Op.getOperand(1));
4188
4189  case Intrinsic::s390_vupllb:
4190  case Intrinsic::s390_vupllh:
4191  case Intrinsic::s390_vupllf:
4192    return DAG.getNode(SystemZISD::UNPACKL_LOW, SDLoc(Op), Op.getValueType(),
4193                       Op.getOperand(1));
4194
4195  case Intrinsic::s390_vsumb:
4196  case Intrinsic::s390_vsumh:
4197  case Intrinsic::s390_vsumgh:
4198  case Intrinsic::s390_vsumgf:
4199  case Intrinsic::s390_vsumqf:
4200  case Intrinsic::s390_vsumqg:
4201    return DAG.getNode(SystemZISD::VSUM, SDLoc(Op), Op.getValueType(),
4202                       Op.getOperand(1), Op.getOperand(2));
4203  }
4204
4205  return SDValue();
4206}
4207
4208namespace {
4209// Says that SystemZISD operation Opcode can be used to perform the equivalent
4210// of a VPERM with permute vector Bytes.  If Opcode takes three operands,
4211// Operand is the constant third operand, otherwise it is the number of
4212// bytes in each element of the result.
4213struct Permute {
4214  unsigned Opcode;
4215  unsigned Operand;
4216  unsigned char Bytes[SystemZ::VectorBytes];
4217};
4218}
4219
4220static const Permute PermuteForms[] = {
4221  // VMRHG
4222  { SystemZISD::MERGE_HIGH, 8,
4223    { 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23 } },
4224  // VMRHF
4225  { SystemZISD::MERGE_HIGH, 4,
4226    { 0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23 } },
4227  // VMRHH
4228  { SystemZISD::MERGE_HIGH, 2,
4229    { 0, 1, 16, 17, 2, 3, 18, 19, 4, 5, 20, 21, 6, 7, 22, 23 } },
4230  // VMRHB
4231  { SystemZISD::MERGE_HIGH, 1,
4232    { 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23 } },
4233  // VMRLG
4234  { SystemZISD::MERGE_LOW, 8,
4235    { 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31 } },
4236  // VMRLF
4237  { SystemZISD::MERGE_LOW, 4,
4238    { 8, 9, 10, 11, 24, 25, 26, 27, 12, 13, 14, 15, 28, 29, 30, 31 } },
4239  // VMRLH
4240  { SystemZISD::MERGE_LOW, 2,
4241    { 8, 9, 24, 25, 10, 11, 26, 27, 12, 13, 28, 29, 14, 15, 30, 31 } },
4242  // VMRLB
4243  { SystemZISD::MERGE_LOW, 1,
4244    { 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31 } },
4245  // VPKG
4246  { SystemZISD::PACK, 4,
4247    { 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 } },
4248  // VPKF
4249  { SystemZISD::PACK, 2,
4250    { 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 } },
4251  // VPKH
4252  { SystemZISD::PACK, 1,
4253    { 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 } },
4254  // VPDI V1, V2, 4  (low half of V1, high half of V2)
4255  { SystemZISD::PERMUTE_DWORDS, 4,
4256    { 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23 } },
4257  // VPDI V1, V2, 1  (high half of V1, low half of V2)
4258  { SystemZISD::PERMUTE_DWORDS, 1,
4259    { 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 } }
4260};
4261
4262// Called after matching a vector shuffle against a particular pattern.
4263// Both the original shuffle and the pattern have two vector operands.
4264// OpNos[0] is the operand of the original shuffle that should be used for
4265// operand 0 of the pattern, or -1 if operand 0 of the pattern can be anything.
4266// OpNos[1] is the same for operand 1 of the pattern.  Resolve these -1s and
4267// set OpNo0 and OpNo1 to the shuffle operands that should actually be used
4268// for operands 0 and 1 of the pattern.
4269static bool chooseShuffleOpNos(int *OpNos, unsigned &OpNo0, unsigned &OpNo1) {
4270  if (OpNos[0] < 0) {
4271    if (OpNos[1] < 0)
4272      return false;
4273    OpNo0 = OpNo1 = OpNos[1];
4274  } else if (OpNos[1] < 0) {
4275    OpNo0 = OpNo1 = OpNos[0];
4276  } else {
4277    OpNo0 = OpNos[0];
4278    OpNo1 = OpNos[1];
4279  }
4280  return true;
4281}
4282
4283// Bytes is a VPERM-like permute vector, except that -1 is used for
4284// undefined bytes.  Return true if the VPERM can be implemented using P.
4285// When returning true set OpNo0 to the VPERM operand that should be
4286// used for operand 0 of P and likewise OpNo1 for operand 1 of P.
4287//
4288// For example, if swapping the VPERM operands allows P to match, OpNo0
4289// will be 1 and OpNo1 will be 0.  If instead Bytes only refers to one
4290// operand, but rewriting it to use two duplicated operands allows it to
4291// match P, then OpNo0 and OpNo1 will be the same.
4292static bool matchPermute(const SmallVectorImpl<int> &Bytes, const Permute &P,
4293                         unsigned &OpNo0, unsigned &OpNo1) {
4294  int OpNos[] = { -1, -1 };
4295  for (unsigned I = 0; I < SystemZ::VectorBytes; ++I) {
4296    int Elt = Bytes[I];
4297    if (Elt >= 0) {
4298      // Make sure that the two permute vectors use the same suboperand
4299      // byte number.  Only the operand numbers (the high bits) are
4300      // allowed to differ.
4301      if ((Elt ^ P.Bytes[I]) & (SystemZ::VectorBytes - 1))
4302        return false;
4303      int ModelOpNo = P.Bytes[I] / SystemZ::VectorBytes;
4304      int RealOpNo = unsigned(Elt) / SystemZ::VectorBytes;
4305      // Make sure that the operand mappings are consistent with previous
4306      // elements.
4307      if (OpNos[ModelOpNo] == 1 - RealOpNo)
4308        return false;
4309      OpNos[ModelOpNo] = RealOpNo;
4310    }
4311  }
4312  return chooseShuffleOpNos(OpNos, OpNo0, OpNo1);
4313}
4314
4315// As above, but search for a matching permute.
4316static const Permute *matchPermute(const SmallVectorImpl<int> &Bytes,
4317                                   unsigned &OpNo0, unsigned &OpNo1) {
4318  for (auto &P : PermuteForms)
4319    if (matchPermute(Bytes, P, OpNo0, OpNo1))
4320      return &P;
4321  return nullptr;
4322}
4323
4324// Bytes is a VPERM-like permute vector, except that -1 is used for
4325// undefined bytes.  This permute is an operand of an outer permute.
4326// See whether redistributing the -1 bytes gives a shuffle that can be
4327// implemented using P.  If so, set Transform to a VPERM-like permute vector
4328// that, when applied to the result of P, gives the original permute in Bytes.
4329static bool matchDoublePermute(const SmallVectorImpl<int> &Bytes,
4330                               const Permute &P,
4331                               SmallVectorImpl<int> &Transform) {
4332  unsigned To = 0;
4333  for (unsigned From = 0; From < SystemZ::VectorBytes; ++From) {
4334    int Elt = Bytes[From];
4335    if (Elt < 0)
4336      // Byte number From of the result is undefined.
4337      Transform[From] = -1;
4338    else {
4339      while (P.Bytes[To] != Elt) {
4340        To += 1;
4341        if (To == SystemZ::VectorBytes)
4342          return false;
4343      }
4344      Transform[From] = To;
4345    }
4346  }
4347  return true;
4348}
4349
4350// As above, but search for a matching permute.
4351static const Permute *matchDoublePermute(const SmallVectorImpl<int> &Bytes,
4352                                         SmallVectorImpl<int> &Transform) {
4353  for (auto &P : PermuteForms)
4354    if (matchDoublePermute(Bytes, P, Transform))
4355      return &P;
4356  return nullptr;
4357}
4358
4359// Convert the mask of the given shuffle op into a byte-level mask,
4360// as if it had type vNi8.
4361static bool getVPermMask(SDValue ShuffleOp,
4362                         SmallVectorImpl<int> &Bytes) {
4363  EVT VT = ShuffleOp.getValueType();
4364  unsigned NumElements = VT.getVectorNumElements();
4365  unsigned BytesPerElement = VT.getVectorElementType().getStoreSize();
4366
4367  if (auto *VSN = dyn_cast<ShuffleVectorSDNode>(ShuffleOp)) {
4368    Bytes.resize(NumElements * BytesPerElement, -1);
4369    for (unsigned I = 0; I < NumElements; ++I) {
4370      int Index = VSN->getMaskElt(I);
4371      if (Index >= 0)
4372        for (unsigned J = 0; J < BytesPerElement; ++J)
4373          Bytes[I * BytesPerElement + J] = Index * BytesPerElement + J;
4374    }
4375    return true;
4376  }
4377  if (SystemZISD::SPLAT == ShuffleOp.getOpcode() &&
4378      isa<ConstantSDNode>(ShuffleOp.getOperand(1))) {
4379    unsigned Index = ShuffleOp.getConstantOperandVal(1);
4380    Bytes.resize(NumElements * BytesPerElement, -1);
4381    for (unsigned I = 0; I < NumElements; ++I)
4382      for (unsigned J = 0; J < BytesPerElement; ++J)
4383        Bytes[I * BytesPerElement + J] = Index * BytesPerElement + J;
4384    return true;
4385  }
4386  return false;
4387}
4388
4389// Bytes is a VPERM-like permute vector, except that -1 is used for
4390// undefined bytes.  See whether bytes [Start, Start + BytesPerElement) of
4391// the result come from a contiguous sequence of bytes from one input.
4392// Set Base to the selector for the first byte if so.
4393static bool getShuffleInput(const SmallVectorImpl<int> &Bytes, unsigned Start,
4394                            unsigned BytesPerElement, int &Base) {
4395  Base = -1;
4396  for (unsigned I = 0; I < BytesPerElement; ++I) {
4397    if (Bytes[Start + I] >= 0) {
4398      unsigned Elem = Bytes[Start + I];
4399      if (Base < 0) {
4400        Base = Elem - I;
4401        // Make sure the bytes would come from one input operand.
4402        if (unsigned(Base) % Bytes.size() + BytesPerElement > Bytes.size())
4403          return false;
4404      } else if (unsigned(Base) != Elem - I)
4405        return false;
4406    }
4407  }
4408  return true;
4409}
4410
4411// Bytes is a VPERM-like permute vector, except that -1 is used for
4412// undefined bytes.  Return true if it can be performed using VSLDB.
4413// When returning true, set StartIndex to the shift amount and OpNo0
4414// and OpNo1 to the VPERM operands that should be used as the first
4415// and second shift operand respectively.
4416static bool isShlDoublePermute(const SmallVectorImpl<int> &Bytes,
4417                               unsigned &StartIndex, unsigned &OpNo0,
4418                               unsigned &OpNo1) {
4419  int OpNos[] = { -1, -1 };
4420  int Shift = -1;
4421  for (unsigned I = 0; I < 16; ++I) {
4422    int Index = Bytes[I];
4423    if (Index >= 0) {
4424      int ExpectedShift = (Index - I) % SystemZ::VectorBytes;
4425      int ModelOpNo = unsigned(ExpectedShift + I) / SystemZ::VectorBytes;
4426      int RealOpNo = unsigned(Index) / SystemZ::VectorBytes;
4427      if (Shift < 0)
4428        Shift = ExpectedShift;
4429      else if (Shift != ExpectedShift)
4430        return false;
4431      // Make sure that the operand mappings are consistent with previous
4432      // elements.
4433      if (OpNos[ModelOpNo] == 1 - RealOpNo)
4434        return false;
4435      OpNos[ModelOpNo] = RealOpNo;
4436    }
4437  }
4438  StartIndex = Shift;
4439  return chooseShuffleOpNos(OpNos, OpNo0, OpNo1);
4440}
4441
4442// Create a node that performs P on operands Op0 and Op1, casting the
4443// operands to the appropriate type.  The type of the result is determined by P.
4444static SDValue getPermuteNode(SelectionDAG &DAG, const SDLoc &DL,
4445                              const Permute &P, SDValue Op0, SDValue Op1) {
4446  // VPDI (PERMUTE_DWORDS) always operates on v2i64s.  The input
4447  // elements of a PACK are twice as wide as the outputs.
4448  unsigned InBytes = (P.Opcode == SystemZISD::PERMUTE_DWORDS ? 8 :
4449                      P.Opcode == SystemZISD::PACK ? P.Operand * 2 :
4450                      P.Operand);
4451  // Cast both operands to the appropriate type.
4452  MVT InVT = MVT::getVectorVT(MVT::getIntegerVT(InBytes * 8),
4453                              SystemZ::VectorBytes / InBytes);
4454  Op0 = DAG.getNode(ISD::BITCAST, DL, InVT, Op0);
4455  Op1 = DAG.getNode(ISD::BITCAST, DL, InVT, Op1);
4456  SDValue Op;
4457  if (P.Opcode == SystemZISD::PERMUTE_DWORDS) {
4458    SDValue Op2 = DAG.getTargetConstant(P.Operand, DL, MVT::i32);
4459    Op = DAG.getNode(SystemZISD::PERMUTE_DWORDS, DL, InVT, Op0, Op1, Op2);
4460  } else if (P.Opcode == SystemZISD::PACK) {
4461    MVT OutVT = MVT::getVectorVT(MVT::getIntegerVT(P.Operand * 8),
4462                                 SystemZ::VectorBytes / P.Operand);
4463    Op = DAG.getNode(SystemZISD::PACK, DL, OutVT, Op0, Op1);
4464  } else {
4465    Op = DAG.getNode(P.Opcode, DL, InVT, Op0, Op1);
4466  }
4467  return Op;
4468}
4469
4470static bool isZeroVector(SDValue N) {
4471  if (N->getOpcode() == ISD::BITCAST)
4472    N = N->getOperand(0);
4473  if (N->getOpcode() == ISD::SPLAT_VECTOR)
4474    if (auto *Op = dyn_cast<ConstantSDNode>(N->getOperand(0)))
4475      return Op->getZExtValue() == 0;
4476  return ISD::isBuildVectorAllZeros(N.getNode());
4477}
4478
4479// Return the index of the zero/undef vector, or UINT32_MAX if not found.
4480static uint32_t findZeroVectorIdx(SDValue *Ops, unsigned Num) {
4481  for (unsigned I = 0; I < Num ; I++)
4482    if (isZeroVector(Ops[I]))
4483      return I;
4484  return UINT32_MAX;
4485}
4486
4487// Bytes is a VPERM-like permute vector, except that -1 is used for
4488// undefined bytes.  Implement it on operands Ops[0] and Ops[1] using
4489// VSLDB or VPERM.
4490static SDValue getGeneralPermuteNode(SelectionDAG &DAG, const SDLoc &DL,
4491                                     SDValue *Ops,
4492                                     const SmallVectorImpl<int> &Bytes) {
4493  for (unsigned I = 0; I < 2; ++I)
4494    Ops[I] = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Ops[I]);
4495
4496  // First see whether VSLDB can be used.
4497  unsigned StartIndex, OpNo0, OpNo1;
4498  if (isShlDoublePermute(Bytes, StartIndex, OpNo0, OpNo1))
4499    return DAG.getNode(SystemZISD::SHL_DOUBLE, DL, MVT::v16i8, Ops[OpNo0],
4500                       Ops[OpNo1],
4501                       DAG.getTargetConstant(StartIndex, DL, MVT::i32));
4502
4503  // Fall back on VPERM.  Construct an SDNode for the permute vector.  Try to
4504  // eliminate a zero vector by reusing any zero index in the permute vector.
4505  unsigned ZeroVecIdx = findZeroVectorIdx(&Ops[0], 2);
4506  if (ZeroVecIdx != UINT32_MAX) {
4507    bool MaskFirst = true;
4508    int ZeroIdx = -1;
4509    for (unsigned I = 0; I < SystemZ::VectorBytes; ++I) {
4510      unsigned OpNo = unsigned(Bytes[I]) / SystemZ::VectorBytes;
4511      unsigned Byte = unsigned(Bytes[I]) % SystemZ::VectorBytes;
4512      if (OpNo == ZeroVecIdx && I == 0) {
4513        // If the first byte is zero, use mask as first operand.
4514        ZeroIdx = 0;
4515        break;
4516      }
4517      if (OpNo != ZeroVecIdx && Byte == 0) {
4518        // If mask contains a zero, use it by placing that vector first.
4519        ZeroIdx = I + SystemZ::VectorBytes;
4520        MaskFirst = false;
4521        break;
4522      }
4523    }
4524    if (ZeroIdx != -1) {
4525      SDValue IndexNodes[SystemZ::VectorBytes];
4526      for (unsigned I = 0; I < SystemZ::VectorBytes; ++I) {
4527        if (Bytes[I] >= 0) {
4528          unsigned OpNo = unsigned(Bytes[I]) / SystemZ::VectorBytes;
4529          unsigned Byte = unsigned(Bytes[I]) % SystemZ::VectorBytes;
4530          if (OpNo == ZeroVecIdx)
4531            IndexNodes[I] = DAG.getConstant(ZeroIdx, DL, MVT::i32);
4532          else {
4533            unsigned BIdx = MaskFirst ? Byte + SystemZ::VectorBytes : Byte;
4534            IndexNodes[I] = DAG.getConstant(BIdx, DL, MVT::i32);
4535          }
4536        } else
4537          IndexNodes[I] = DAG.getUNDEF(MVT::i32);
4538      }
4539      SDValue Mask = DAG.getBuildVector(MVT::v16i8, DL, IndexNodes);
4540      SDValue Src = ZeroVecIdx == 0 ? Ops[1] : Ops[0];
4541      if (MaskFirst)
4542        return DAG.getNode(SystemZISD::PERMUTE, DL, MVT::v16i8, Mask, Src,
4543                           Mask);
4544      else
4545        return DAG.getNode(SystemZISD::PERMUTE, DL, MVT::v16i8, Src, Mask,
4546                           Mask);
4547    }
4548  }
4549
4550  SDValue IndexNodes[SystemZ::VectorBytes];
4551  for (unsigned I = 0; I < SystemZ::VectorBytes; ++I)
4552    if (Bytes[I] >= 0)
4553      IndexNodes[I] = DAG.getConstant(Bytes[I], DL, MVT::i32);
4554    else
4555      IndexNodes[I] = DAG.getUNDEF(MVT::i32);
4556  SDValue Op2 = DAG.getBuildVector(MVT::v16i8, DL, IndexNodes);
4557  return DAG.getNode(SystemZISD::PERMUTE, DL, MVT::v16i8, Ops[0],
4558                     (!Ops[1].isUndef() ? Ops[1] : Ops[0]), Op2);
4559}
4560
4561namespace {
4562// Describes a general N-operand vector shuffle.
4563struct GeneralShuffle {
4564  GeneralShuffle(EVT vt) : VT(vt), UnpackFromEltSize(UINT_MAX) {}
4565  void addUndef();
4566  bool add(SDValue, unsigned);
4567  SDValue getNode(SelectionDAG &, const SDLoc &);
4568  void tryPrepareForUnpack();
4569  bool unpackWasPrepared() { return UnpackFromEltSize <= 4; }
4570  SDValue insertUnpackIfPrepared(SelectionDAG &DAG, const SDLoc &DL, SDValue Op);
4571
4572  // The operands of the shuffle.
4573  SmallVector<SDValue, SystemZ::VectorBytes> Ops;
4574
4575  // Index I is -1 if byte I of the result is undefined.  Otherwise the
4576  // result comes from byte Bytes[I] % SystemZ::VectorBytes of operand
4577  // Bytes[I] / SystemZ::VectorBytes.
4578  SmallVector<int, SystemZ::VectorBytes> Bytes;
4579
4580  // The type of the shuffle result.
4581  EVT VT;
4582
4583  // Holds a value of 1, 2 or 4 if a final unpack has been prepared for.
4584  unsigned UnpackFromEltSize;
4585};
4586}
4587
4588// Add an extra undefined element to the shuffle.
4589void GeneralShuffle::addUndef() {
4590  unsigned BytesPerElement = VT.getVectorElementType().getStoreSize();
4591  for (unsigned I = 0; I < BytesPerElement; ++I)
4592    Bytes.push_back(-1);
4593}
4594
4595// Add an extra element to the shuffle, taking it from element Elem of Op.
4596// A null Op indicates a vector input whose value will be calculated later;
4597// there is at most one such input per shuffle and it always has the same
4598// type as the result. Aborts and returns false if the source vector elements
4599// of an EXTRACT_VECTOR_ELT are smaller than the destination elements. Per
4600// LLVM they become implicitly extended, but this is rare and not optimized.
4601bool GeneralShuffle::add(SDValue Op, unsigned Elem) {
4602  unsigned BytesPerElement = VT.getVectorElementType().getStoreSize();
4603
4604  // The source vector can have wider elements than the result,
4605  // either through an explicit TRUNCATE or because of type legalization.
4606  // We want the least significant part.
4607  EVT FromVT = Op.getNode() ? Op.getValueType() : VT;
4608  unsigned FromBytesPerElement = FromVT.getVectorElementType().getStoreSize();
4609
4610  // Return false if the source elements are smaller than their destination
4611  // elements.
4612  if (FromBytesPerElement < BytesPerElement)
4613    return false;
4614
4615  unsigned Byte = ((Elem * FromBytesPerElement) % SystemZ::VectorBytes +
4616                   (FromBytesPerElement - BytesPerElement));
4617
4618  // Look through things like shuffles and bitcasts.
4619  while (Op.getNode()) {
4620    if (Op.getOpcode() == ISD::BITCAST)
4621      Op = Op.getOperand(0);
4622    else if (Op.getOpcode() == ISD::VECTOR_SHUFFLE && Op.hasOneUse()) {
4623      // See whether the bytes we need come from a contiguous part of one
4624      // operand.
4625      SmallVector<int, SystemZ::VectorBytes> OpBytes;
4626      if (!getVPermMask(Op, OpBytes))
4627        break;
4628      int NewByte;
4629      if (!getShuffleInput(OpBytes, Byte, BytesPerElement, NewByte))
4630        break;
4631      if (NewByte < 0) {
4632        addUndef();
4633        return true;
4634      }
4635      Op = Op.getOperand(unsigned(NewByte) / SystemZ::VectorBytes);
4636      Byte = unsigned(NewByte) % SystemZ::VectorBytes;
4637    } else if (Op.isUndef()) {
4638      addUndef();
4639      return true;
4640    } else
4641      break;
4642  }
4643
4644  // Make sure that the source of the extraction is in Ops.
4645  unsigned OpNo = 0;
4646  for (; OpNo < Ops.size(); ++OpNo)
4647    if (Ops[OpNo] == Op)
4648      break;
4649  if (OpNo == Ops.size())
4650    Ops.push_back(Op);
4651
4652  // Add the element to Bytes.
4653  unsigned Base = OpNo * SystemZ::VectorBytes + Byte;
4654  for (unsigned I = 0; I < BytesPerElement; ++I)
4655    Bytes.push_back(Base + I);
4656
4657  return true;
4658}
4659
4660// Return SDNodes for the completed shuffle.
4661SDValue GeneralShuffle::getNode(SelectionDAG &DAG, const SDLoc &DL) {
4662  assert(Bytes.size() == SystemZ::VectorBytes && "Incomplete vector");
4663
4664  if (Ops.size() == 0)
4665    return DAG.getUNDEF(VT);
4666
4667  // Use a single unpack if possible as the last operation.
4668  tryPrepareForUnpack();
4669
4670  // Make sure that there are at least two shuffle operands.
4671  if (Ops.size() == 1)
4672    Ops.push_back(DAG.getUNDEF(MVT::v16i8));
4673
4674  // Create a tree of shuffles, deferring root node until after the loop.
4675  // Try to redistribute the undefined elements of non-root nodes so that
4676  // the non-root shuffles match something like a pack or merge, then adjust
4677  // the parent node's permute vector to compensate for the new order.
4678  // Among other things, this copes with vectors like <2 x i16> that were
4679  // padded with undefined elements during type legalization.
4680  //
4681  // In the best case this redistribution will lead to the whole tree
4682  // using packs and merges.  It should rarely be a loss in other cases.
4683  unsigned Stride = 1;
4684  for (; Stride * 2 < Ops.size(); Stride *= 2) {
4685    for (unsigned I = 0; I < Ops.size() - Stride; I += Stride * 2) {
4686      SDValue SubOps[] = { Ops[I], Ops[I + Stride] };
4687
4688      // Create a mask for just these two operands.
4689      SmallVector<int, SystemZ::VectorBytes> NewBytes(SystemZ::VectorBytes);
4690      for (unsigned J = 0; J < SystemZ::VectorBytes; ++J) {
4691        unsigned OpNo = unsigned(Bytes[J]) / SystemZ::VectorBytes;
4692        unsigned Byte = unsigned(Bytes[J]) % SystemZ::VectorBytes;
4693        if (OpNo == I)
4694          NewBytes[J] = Byte;
4695        else if (OpNo == I + Stride)
4696          NewBytes[J] = SystemZ::VectorBytes + Byte;
4697        else
4698          NewBytes[J] = -1;
4699      }
4700      // See if it would be better to reorganize NewMask to avoid using VPERM.
4701      SmallVector<int, SystemZ::VectorBytes> NewBytesMap(SystemZ::VectorBytes);
4702      if (const Permute *P = matchDoublePermute(NewBytes, NewBytesMap)) {
4703        Ops[I] = getPermuteNode(DAG, DL, *P, SubOps[0], SubOps[1]);
4704        // Applying NewBytesMap to Ops[I] gets back to NewBytes.
4705        for (unsigned J = 0; J < SystemZ::VectorBytes; ++J) {
4706          if (NewBytes[J] >= 0) {
4707            assert(unsigned(NewBytesMap[J]) < SystemZ::VectorBytes &&
4708                   "Invalid double permute");
4709            Bytes[J] = I * SystemZ::VectorBytes + NewBytesMap[J];
4710          } else
4711            assert(NewBytesMap[J] < 0 && "Invalid double permute");
4712        }
4713      } else {
4714        // Just use NewBytes on the operands.
4715        Ops[I] = getGeneralPermuteNode(DAG, DL, SubOps, NewBytes);
4716        for (unsigned J = 0; J < SystemZ::VectorBytes; ++J)
4717          if (NewBytes[J] >= 0)
4718            Bytes[J] = I * SystemZ::VectorBytes + J;
4719      }
4720    }
4721  }
4722
4723  // Now we just have 2 inputs.  Put the second operand in Ops[1].
4724  if (Stride > 1) {
4725    Ops[1] = Ops[Stride];
4726    for (unsigned I = 0; I < SystemZ::VectorBytes; ++I)
4727      if (Bytes[I] >= int(SystemZ::VectorBytes))
4728        Bytes[I] -= (Stride - 1) * SystemZ::VectorBytes;
4729  }
4730
4731  // Look for an instruction that can do the permute without resorting
4732  // to VPERM.
4733  unsigned OpNo0, OpNo1;
4734  SDValue Op;
4735  if (unpackWasPrepared() && Ops[1].isUndef())
4736    Op = Ops[0];
4737  else if (const Permute *P = matchPermute(Bytes, OpNo0, OpNo1))
4738    Op = getPermuteNode(DAG, DL, *P, Ops[OpNo0], Ops[OpNo1]);
4739  else
4740    Op = getGeneralPermuteNode(DAG, DL, &Ops[0], Bytes);
4741
4742  Op = insertUnpackIfPrepared(DAG, DL, Op);
4743
4744  return DAG.getNode(ISD::BITCAST, DL, VT, Op);
4745}
4746
4747#ifndef NDEBUG
4748static void dumpBytes(const SmallVectorImpl<int> &Bytes, std::string Msg) {
4749  dbgs() << Msg.c_str() << " { ";
4750  for (unsigned i = 0; i < Bytes.size(); i++)
4751    dbgs() << Bytes[i] << " ";
4752  dbgs() << "}\n";
4753}
4754#endif
4755
4756// If the Bytes vector matches an unpack operation, prepare to do the unpack
4757// after all else by removing the zero vector and the effect of the unpack on
4758// Bytes.
4759void GeneralShuffle::tryPrepareForUnpack() {
4760  uint32_t ZeroVecOpNo = findZeroVectorIdx(&Ops[0], Ops.size());
4761  if (ZeroVecOpNo == UINT32_MAX || Ops.size() == 1)
4762    return;
4763
4764  // Only do this if removing the zero vector reduces the depth, otherwise
4765  // the critical path will increase with the final unpack.
4766  if (Ops.size() > 2 &&
4767      Log2_32_Ceil(Ops.size()) == Log2_32_Ceil(Ops.size() - 1))
4768    return;
4769
4770  // Find an unpack that would allow removing the zero vector from Ops.
4771  UnpackFromEltSize = 1;
4772  for (; UnpackFromEltSize <= 4; UnpackFromEltSize *= 2) {
4773    bool MatchUnpack = true;
4774    SmallVector<int, SystemZ::VectorBytes> SrcBytes;
4775    for (unsigned Elt = 0; Elt < SystemZ::VectorBytes; Elt++) {
4776      unsigned ToEltSize = UnpackFromEltSize * 2;
4777      bool IsZextByte = (Elt % ToEltSize) < UnpackFromEltSize;
4778      if (!IsZextByte)
4779        SrcBytes.push_back(Bytes[Elt]);
4780      if (Bytes[Elt] != -1) {
4781        unsigned OpNo = unsigned(Bytes[Elt]) / SystemZ::VectorBytes;
4782        if (IsZextByte != (OpNo == ZeroVecOpNo)) {
4783          MatchUnpack = false;
4784          break;
4785        }
4786      }
4787    }
4788    if (MatchUnpack) {
4789      if (Ops.size() == 2) {
4790        // Don't use unpack if a single source operand needs rearrangement.
4791        for (unsigned i = 0; i < SystemZ::VectorBytes / 2; i++)
4792          if (SrcBytes[i] != -1 && SrcBytes[i] % 16 != int(i)) {
4793            UnpackFromEltSize = UINT_MAX;
4794            return;
4795          }
4796      }
4797      break;
4798    }
4799  }
4800  if (UnpackFromEltSize > 4)
4801    return;
4802
4803  LLVM_DEBUG(dbgs() << "Preparing for final unpack of element size "
4804             << UnpackFromEltSize << ". Zero vector is Op#" << ZeroVecOpNo
4805             << ".\n";
4806             dumpBytes(Bytes, "Original Bytes vector:"););
4807
4808  // Apply the unpack in reverse to the Bytes array.
4809  unsigned B = 0;
4810  for (unsigned Elt = 0; Elt < SystemZ::VectorBytes;) {
4811    Elt += UnpackFromEltSize;
4812    for (unsigned i = 0; i < UnpackFromEltSize; i++, Elt++, B++)
4813      Bytes[B] = Bytes[Elt];
4814  }
4815  while (B < SystemZ::VectorBytes)
4816    Bytes[B++] = -1;
4817
4818  // Remove the zero vector from Ops
4819  Ops.erase(&Ops[ZeroVecOpNo]);
4820  for (unsigned I = 0; I < SystemZ::VectorBytes; ++I)
4821    if (Bytes[I] >= 0) {
4822      unsigned OpNo = unsigned(Bytes[I]) / SystemZ::VectorBytes;
4823      if (OpNo > ZeroVecOpNo)
4824        Bytes[I] -= SystemZ::VectorBytes;
4825    }
4826
4827  LLVM_DEBUG(dumpBytes(Bytes, "Resulting Bytes vector, zero vector removed:");
4828             dbgs() << "\n";);
4829}
4830
4831SDValue GeneralShuffle::insertUnpackIfPrepared(SelectionDAG &DAG,
4832                                               const SDLoc &DL,
4833                                               SDValue Op) {
4834  if (!unpackWasPrepared())
4835    return Op;
4836  unsigned InBits = UnpackFromEltSize * 8;
4837  EVT InVT = MVT::getVectorVT(MVT::getIntegerVT(InBits),
4838                                SystemZ::VectorBits / InBits);
4839  SDValue PackedOp = DAG.getNode(ISD::BITCAST, DL, InVT, Op);
4840  unsigned OutBits = InBits * 2;
4841  EVT OutVT = MVT::getVectorVT(MVT::getIntegerVT(OutBits),
4842                               SystemZ::VectorBits / OutBits);
4843  return DAG.getNode(SystemZISD::UNPACKL_HIGH, DL, OutVT, PackedOp);
4844}
4845
4846// Return true if the given BUILD_VECTOR is a scalar-to-vector conversion.
4847static bool isScalarToVector(SDValue Op) {
4848  for (unsigned I = 1, E = Op.getNumOperands(); I != E; ++I)
4849    if (!Op.getOperand(I).isUndef())
4850      return false;
4851  return true;
4852}
4853
4854// Return a vector of type VT that contains Value in the first element.
4855// The other elements don't matter.
4856static SDValue buildScalarToVector(SelectionDAG &DAG, const SDLoc &DL, EVT VT,
4857                                   SDValue Value) {
4858  // If we have a constant, replicate it to all elements and let the
4859  // BUILD_VECTOR lowering take care of it.
4860  if (Value.getOpcode() == ISD::Constant ||
4861      Value.getOpcode() == ISD::ConstantFP) {
4862    SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Value);
4863    return DAG.getBuildVector(VT, DL, Ops);
4864  }
4865  if (Value.isUndef())
4866    return DAG.getUNDEF(VT);
4867  return DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Value);
4868}
4869
4870// Return a vector of type VT in which Op0 is in element 0 and Op1 is in
4871// element 1.  Used for cases in which replication is cheap.
4872static SDValue buildMergeScalars(SelectionDAG &DAG, const SDLoc &DL, EVT VT,
4873                                 SDValue Op0, SDValue Op1) {
4874  if (Op0.isUndef()) {
4875    if (Op1.isUndef())
4876      return DAG.getUNDEF(VT);
4877    return DAG.getNode(SystemZISD::REPLICATE, DL, VT, Op1);
4878  }
4879  if (Op1.isUndef())
4880    return DAG.getNode(SystemZISD::REPLICATE, DL, VT, Op0);
4881  return DAG.getNode(SystemZISD::MERGE_HIGH, DL, VT,
4882                     buildScalarToVector(DAG, DL, VT, Op0),
4883                     buildScalarToVector(DAG, DL, VT, Op1));
4884}
4885
4886// Extend GPR scalars Op0 and Op1 to doublewords and return a v2i64
4887// vector for them.
4888static SDValue joinDwords(SelectionDAG &DAG, const SDLoc &DL, SDValue Op0,
4889                          SDValue Op1) {
4890  if (Op0.isUndef() && Op1.isUndef())
4891    return DAG.getUNDEF(MVT::v2i64);
4892  // If one of the two inputs is undefined then replicate the other one,
4893  // in order to avoid using another register unnecessarily.
4894  if (Op0.isUndef())
4895    Op0 = Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Op1);
4896  else if (Op1.isUndef())
4897    Op0 = Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Op0);
4898  else {
4899    Op0 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Op0);
4900    Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Op1);
4901  }
4902  return DAG.getNode(SystemZISD::JOIN_DWORDS, DL, MVT::v2i64, Op0, Op1);
4903}
4904
4905// If a BUILD_VECTOR contains some EXTRACT_VECTOR_ELTs, it's usually
4906// better to use VECTOR_SHUFFLEs on them, only using BUILD_VECTOR for
4907// the non-EXTRACT_VECTOR_ELT elements.  See if the given BUILD_VECTOR
4908// would benefit from this representation and return it if so.
4909static SDValue tryBuildVectorShuffle(SelectionDAG &DAG,
4910                                     BuildVectorSDNode *BVN) {
4911  EVT VT = BVN->getValueType(0);
4912  unsigned NumElements = VT.getVectorNumElements();
4913
4914  // Represent the BUILD_VECTOR as an N-operand VECTOR_SHUFFLE-like operation
4915  // on byte vectors.  If there are non-EXTRACT_VECTOR_ELT elements that still
4916  // need a BUILD_VECTOR, add an additional placeholder operand for that
4917  // BUILD_VECTOR and store its operands in ResidueOps.
4918  GeneralShuffle GS(VT);
4919  SmallVector<SDValue, SystemZ::VectorBytes> ResidueOps;
4920  bool FoundOne = false;
4921  for (unsigned I = 0; I < NumElements; ++I) {
4922    SDValue Op = BVN->getOperand(I);
4923    if (Op.getOpcode() == ISD::TRUNCATE)
4924      Op = Op.getOperand(0);
4925    if (Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
4926        Op.getOperand(1).getOpcode() == ISD::Constant) {
4927      unsigned Elem = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
4928      if (!GS.add(Op.getOperand(0), Elem))
4929        return SDValue();
4930      FoundOne = true;
4931    } else if (Op.isUndef()) {
4932      GS.addUndef();
4933    } else {
4934      if (!GS.add(SDValue(), ResidueOps.size()))
4935        return SDValue();
4936      ResidueOps.push_back(BVN->getOperand(I));
4937    }
4938  }
4939
4940  // Nothing to do if there are no EXTRACT_VECTOR_ELTs.
4941  if (!FoundOne)
4942    return SDValue();
4943
4944  // Create the BUILD_VECTOR for the remaining elements, if any.
4945  if (!ResidueOps.empty()) {
4946    while (ResidueOps.size() < NumElements)
4947      ResidueOps.push_back(DAG.getUNDEF(ResidueOps[0].getValueType()));
4948    for (auto &Op : GS.Ops) {
4949      if (!Op.getNode()) {
4950        Op = DAG.getBuildVector(VT, SDLoc(BVN), ResidueOps);
4951        break;
4952      }
4953    }
4954  }
4955  return GS.getNode(DAG, SDLoc(BVN));
4956}
4957
4958bool SystemZTargetLowering::isVectorElementLoad(SDValue Op) const {
4959  if (Op.getOpcode() == ISD::LOAD && cast<LoadSDNode>(Op)->isUnindexed())
4960    return true;
4961  if (Subtarget.hasVectorEnhancements2() && Op.getOpcode() == SystemZISD::LRV)
4962    return true;
4963  return false;
4964}
4965
4966// Combine GPR scalar values Elems into a vector of type VT.
4967SDValue
4968SystemZTargetLowering::buildVector(SelectionDAG &DAG, const SDLoc &DL, EVT VT,
4969                                   SmallVectorImpl<SDValue> &Elems) const {
4970  // See whether there is a single replicated value.
4971  SDValue Single;
4972  unsigned int NumElements = Elems.size();
4973  unsigned int Count = 0;
4974  for (auto Elem : Elems) {
4975    if (!Elem.isUndef()) {
4976      if (!Single.getNode())
4977        Single = Elem;
4978      else if (Elem != Single) {
4979        Single = SDValue();
4980        break;
4981      }
4982      Count += 1;
4983    }
4984  }
4985  // There are three cases here:
4986  //
4987  // - if the only defined element is a loaded one, the best sequence
4988  //   is a replicating load.
4989  //
4990  // - otherwise, if the only defined element is an i64 value, we will
4991  //   end up with the same VLVGP sequence regardless of whether we short-cut
4992  //   for replication or fall through to the later code.
4993  //
4994  // - otherwise, if the only defined element is an i32 or smaller value,
4995  //   we would need 2 instructions to replicate it: VLVGP followed by VREPx.
4996  //   This is only a win if the single defined element is used more than once.
4997  //   In other cases we're better off using a single VLVGx.
4998  if (Single.getNode() && (Count > 1 || isVectorElementLoad(Single)))
4999    return DAG.getNode(SystemZISD::REPLICATE, DL, VT, Single);
5000
5001  // If all elements are loads, use VLREP/VLEs (below).
5002  bool AllLoads = true;
5003  for (auto Elem : Elems)
5004    if (!isVectorElementLoad(Elem)) {
5005      AllLoads = false;
5006      break;
5007    }
5008
5009  // The best way of building a v2i64 from two i64s is to use VLVGP.
5010  if (VT == MVT::v2i64 && !AllLoads)
5011    return joinDwords(DAG, DL, Elems[0], Elems[1]);
5012
5013  // Use a 64-bit merge high to combine two doubles.
5014  if (VT == MVT::v2f64 && !AllLoads)
5015    return buildMergeScalars(DAG, DL, VT, Elems[0], Elems[1]);
5016
5017  // Build v4f32 values directly from the FPRs:
5018  //
5019  //   <Axxx> <Bxxx> <Cxxxx> <Dxxx>
5020  //         V              V         VMRHF
5021  //      <ABxx>         <CDxx>
5022  //                V                 VMRHG
5023  //              <ABCD>
5024  if (VT == MVT::v4f32 && !AllLoads) {
5025    SDValue Op01 = buildMergeScalars(DAG, DL, VT, Elems[0], Elems[1]);
5026    SDValue Op23 = buildMergeScalars(DAG, DL, VT, Elems[2], Elems[3]);
5027    // Avoid unnecessary undefs by reusing the other operand.
5028    if (Op01.isUndef())
5029      Op01 = Op23;
5030    else if (Op23.isUndef())
5031      Op23 = Op01;
5032    // Merging identical replications is a no-op.
5033    if (Op01.getOpcode() == SystemZISD::REPLICATE && Op01 == Op23)
5034      return Op01;
5035    Op01 = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Op01);
5036    Op23 = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Op23);
5037    SDValue Op = DAG.getNode(SystemZISD::MERGE_HIGH,
5038                             DL, MVT::v2i64, Op01, Op23);
5039    return DAG.getNode(ISD::BITCAST, DL, VT, Op);
5040  }
5041
5042  // Collect the constant terms.
5043  SmallVector<SDValue, SystemZ::VectorBytes> Constants(NumElements, SDValue());
5044  SmallVector<bool, SystemZ::VectorBytes> Done(NumElements, false);
5045
5046  unsigned NumConstants = 0;
5047  for (unsigned I = 0; I < NumElements; ++I) {
5048    SDValue Elem = Elems[I];
5049    if (Elem.getOpcode() == ISD::Constant ||
5050        Elem.getOpcode() == ISD::ConstantFP) {
5051      NumConstants += 1;
5052      Constants[I] = Elem;
5053      Done[I] = true;
5054    }
5055  }
5056  // If there was at least one constant, fill in the other elements of
5057  // Constants with undefs to get a full vector constant and use that
5058  // as the starting point.
5059  SDValue Result;
5060  SDValue ReplicatedVal;
5061  if (NumConstants > 0) {
5062    for (unsigned I = 0; I < NumElements; ++I)
5063      if (!Constants[I].getNode())
5064        Constants[I] = DAG.getUNDEF(Elems[I].getValueType());
5065    Result = DAG.getBuildVector(VT, DL, Constants);
5066  } else {
5067    // Otherwise try to use VLREP or VLVGP to start the sequence in order to
5068    // avoid a false dependency on any previous contents of the vector
5069    // register.
5070
5071    // Use a VLREP if at least one element is a load. Make sure to replicate
5072    // the load with the most elements having its value.
5073    std::map<const SDNode*, unsigned> UseCounts;
5074    SDNode *LoadMaxUses = nullptr;
5075    for (unsigned I = 0; I < NumElements; ++I)
5076      if (isVectorElementLoad(Elems[I])) {
5077        SDNode *Ld = Elems[I].getNode();
5078        UseCounts[Ld]++;
5079        if (LoadMaxUses == nullptr || UseCounts[LoadMaxUses] < UseCounts[Ld])
5080          LoadMaxUses = Ld;
5081      }
5082    if (LoadMaxUses != nullptr) {
5083      ReplicatedVal = SDValue(LoadMaxUses, 0);
5084      Result = DAG.getNode(SystemZISD::REPLICATE, DL, VT, ReplicatedVal);
5085    } else {
5086      // Try to use VLVGP.
5087      unsigned I1 = NumElements / 2 - 1;
5088      unsigned I2 = NumElements - 1;
5089      bool Def1 = !Elems[I1].isUndef();
5090      bool Def2 = !Elems[I2].isUndef();
5091      if (Def1 || Def2) {
5092        SDValue Elem1 = Elems[Def1 ? I1 : I2];
5093        SDValue Elem2 = Elems[Def2 ? I2 : I1];
5094        Result = DAG.getNode(ISD::BITCAST, DL, VT,
5095                             joinDwords(DAG, DL, Elem1, Elem2));
5096        Done[I1] = true;
5097        Done[I2] = true;
5098      } else
5099        Result = DAG.getUNDEF(VT);
5100    }
5101  }
5102
5103  // Use VLVGx to insert the other elements.
5104  for (unsigned I = 0; I < NumElements; ++I)
5105    if (!Done[I] && !Elems[I].isUndef() && Elems[I] != ReplicatedVal)
5106      Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, Result, Elems[I],
5107                           DAG.getConstant(I, DL, MVT::i32));
5108  return Result;
5109}
5110
5111SDValue SystemZTargetLowering::lowerBUILD_VECTOR(SDValue Op,
5112                                                 SelectionDAG &DAG) const {
5113  auto *BVN = cast<BuildVectorSDNode>(Op.getNode());
5114  SDLoc DL(Op);
5115  EVT VT = Op.getValueType();
5116
5117  if (BVN->isConstant()) {
5118    if (SystemZVectorConstantInfo(BVN).isVectorConstantLegal(Subtarget))
5119      return Op;
5120
5121    // Fall back to loading it from memory.
5122    return SDValue();
5123  }
5124
5125  // See if we should use shuffles to construct the vector from other vectors.
5126  if (SDValue Res = tryBuildVectorShuffle(DAG, BVN))
5127    return Res;
5128
5129  // Detect SCALAR_TO_VECTOR conversions.
5130  if (isOperationLegal(ISD::SCALAR_TO_VECTOR, VT) && isScalarToVector(Op))
5131    return buildScalarToVector(DAG, DL, VT, Op.getOperand(0));
5132
5133  // Otherwise use buildVector to build the vector up from GPRs.
5134  unsigned NumElements = Op.getNumOperands();
5135  SmallVector<SDValue, SystemZ::VectorBytes> Ops(NumElements);
5136  for (unsigned I = 0; I < NumElements; ++I)
5137    Ops[I] = Op.getOperand(I);
5138  return buildVector(DAG, DL, VT, Ops);
5139}
5140
5141SDValue SystemZTargetLowering::lowerVECTOR_SHUFFLE(SDValue Op,
5142                                                   SelectionDAG &DAG) const {
5143  auto *VSN = cast<ShuffleVectorSDNode>(Op.getNode());
5144  SDLoc DL(Op);
5145  EVT VT = Op.getValueType();
5146  unsigned NumElements = VT.getVectorNumElements();
5147
5148  if (VSN->isSplat()) {
5149    SDValue Op0 = Op.getOperand(0);
5150    unsigned Index = VSN->getSplatIndex();
5151    assert(Index < VT.getVectorNumElements() &&
5152           "Splat index should be defined and in first operand");
5153    // See whether the value we're splatting is directly available as a scalar.
5154    if ((Index == 0 && Op0.getOpcode() == ISD::SCALAR_TO_VECTOR) ||
5155        Op0.getOpcode() == ISD::BUILD_VECTOR)
5156      return DAG.getNode(SystemZISD::REPLICATE, DL, VT, Op0.getOperand(Index));
5157    // Otherwise keep it as a vector-to-vector operation.
5158    return DAG.getNode(SystemZISD::SPLAT, DL, VT, Op.getOperand(0),
5159                       DAG.getTargetConstant(Index, DL, MVT::i32));
5160  }
5161
5162  GeneralShuffle GS(VT);
5163  for (unsigned I = 0; I < NumElements; ++I) {
5164    int Elt = VSN->getMaskElt(I);
5165    if (Elt < 0)
5166      GS.addUndef();
5167    else if (!GS.add(Op.getOperand(unsigned(Elt) / NumElements),
5168                     unsigned(Elt) % NumElements))
5169      return SDValue();
5170  }
5171  return GS.getNode(DAG, SDLoc(VSN));
5172}
5173
5174SDValue SystemZTargetLowering::lowerSCALAR_TO_VECTOR(SDValue Op,
5175                                                     SelectionDAG &DAG) const {
5176  SDLoc DL(Op);
5177  // Just insert the scalar into element 0 of an undefined vector.
5178  return DAG.getNode(ISD::INSERT_VECTOR_ELT, DL,
5179                     Op.getValueType(), DAG.getUNDEF(Op.getValueType()),
5180                     Op.getOperand(0), DAG.getConstant(0, DL, MVT::i32));
5181}
5182
5183SDValue SystemZTargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
5184                                                      SelectionDAG &DAG) const {
5185  // Handle insertions of floating-point values.
5186  SDLoc DL(Op);
5187  SDValue Op0 = Op.getOperand(0);
5188  SDValue Op1 = Op.getOperand(1);
5189  SDValue Op2 = Op.getOperand(2);
5190  EVT VT = Op.getValueType();
5191
5192  // Insertions into constant indices of a v2f64 can be done using VPDI.
5193  // However, if the inserted value is a bitcast or a constant then it's
5194  // better to use GPRs, as below.
5195  if (VT == MVT::v2f64 &&
5196      Op1.getOpcode() != ISD::BITCAST &&
5197      Op1.getOpcode() != ISD::ConstantFP &&
5198      Op2.getOpcode() == ISD::Constant) {
5199    uint64_t Index = cast<ConstantSDNode>(Op2)->getZExtValue();
5200    unsigned Mask = VT.getVectorNumElements() - 1;
5201    if (Index <= Mask)
5202      return Op;
5203  }
5204
5205  // Otherwise bitcast to the equivalent integer form and insert via a GPR.
5206  MVT IntVT = MVT::getIntegerVT(VT.getScalarSizeInBits());
5207  MVT IntVecVT = MVT::getVectorVT(IntVT, VT.getVectorNumElements());
5208  SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, IntVecVT,
5209                            DAG.getNode(ISD::BITCAST, DL, IntVecVT, Op0),
5210                            DAG.getNode(ISD::BITCAST, DL, IntVT, Op1), Op2);
5211  return DAG.getNode(ISD::BITCAST, DL, VT, Res);
5212}
5213
5214SDValue
5215SystemZTargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
5216                                               SelectionDAG &DAG) const {
5217  // Handle extractions of floating-point values.
5218  SDLoc DL(Op);
5219  SDValue Op0 = Op.getOperand(0);
5220  SDValue Op1 = Op.getOperand(1);
5221  EVT VT = Op.getValueType();
5222  EVT VecVT = Op0.getValueType();
5223
5224  // Extractions of constant indices can be done directly.
5225  if (auto *CIndexN = dyn_cast<ConstantSDNode>(Op1)) {
5226    uint64_t Index = CIndexN->getZExtValue();
5227    unsigned Mask = VecVT.getVectorNumElements() - 1;
5228    if (Index <= Mask)
5229      return Op;
5230  }
5231
5232  // Otherwise bitcast to the equivalent integer form and extract via a GPR.
5233  MVT IntVT = MVT::getIntegerVT(VT.getSizeInBits());
5234  MVT IntVecVT = MVT::getVectorVT(IntVT, VecVT.getVectorNumElements());
5235  SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, IntVT,
5236                            DAG.getNode(ISD::BITCAST, DL, IntVecVT, Op0), Op1);
5237  return DAG.getNode(ISD::BITCAST, DL, VT, Res);
5238}
5239
5240SDValue SystemZTargetLowering::
5241lowerSIGN_EXTEND_VECTOR_INREG(SDValue Op, SelectionDAG &DAG) const {
5242  SDValue PackedOp = Op.getOperand(0);
5243  EVT OutVT = Op.getValueType();
5244  EVT InVT = PackedOp.getValueType();
5245  unsigned ToBits = OutVT.getScalarSizeInBits();
5246  unsigned FromBits = InVT.getScalarSizeInBits();
5247  do {
5248    FromBits *= 2;
5249    EVT OutVT = MVT::getVectorVT(MVT::getIntegerVT(FromBits),
5250                                 SystemZ::VectorBits / FromBits);
5251    PackedOp =
5252      DAG.getNode(SystemZISD::UNPACK_HIGH, SDLoc(PackedOp), OutVT, PackedOp);
5253  } while (FromBits != ToBits);
5254  return PackedOp;
5255}
5256
5257// Lower a ZERO_EXTEND_VECTOR_INREG to a vector shuffle with a zero vector.
5258SDValue SystemZTargetLowering::
5259lowerZERO_EXTEND_VECTOR_INREG(SDValue Op, SelectionDAG &DAG) const {
5260  SDValue PackedOp = Op.getOperand(0);
5261  SDLoc DL(Op);
5262  EVT OutVT = Op.getValueType();
5263  EVT InVT = PackedOp.getValueType();
5264  unsigned InNumElts = InVT.getVectorNumElements();
5265  unsigned OutNumElts = OutVT.getVectorNumElements();
5266  unsigned NumInPerOut = InNumElts / OutNumElts;
5267
5268  SDValue ZeroVec =
5269    DAG.getSplatVector(InVT, DL, DAG.getConstant(0, DL, InVT.getScalarType()));
5270
5271  SmallVector<int, 16> Mask(InNumElts);
5272  unsigned ZeroVecElt = InNumElts;
5273  for (unsigned PackedElt = 0; PackedElt < OutNumElts; PackedElt++) {
5274    unsigned MaskElt = PackedElt * NumInPerOut;
5275    unsigned End = MaskElt + NumInPerOut - 1;
5276    for (; MaskElt < End; MaskElt++)
5277      Mask[MaskElt] = ZeroVecElt++;
5278    Mask[MaskElt] = PackedElt;
5279  }
5280  SDValue Shuf = DAG.getVectorShuffle(InVT, DL, PackedOp, ZeroVec, Mask);
5281  return DAG.getNode(ISD::BITCAST, DL, OutVT, Shuf);
5282}
5283
5284SDValue SystemZTargetLowering::lowerShift(SDValue Op, SelectionDAG &DAG,
5285                                          unsigned ByScalar) const {
5286  // Look for cases where a vector shift can use the *_BY_SCALAR form.
5287  SDValue Op0 = Op.getOperand(0);
5288  SDValue Op1 = Op.getOperand(1);
5289  SDLoc DL(Op);
5290  EVT VT = Op.getValueType();
5291  unsigned ElemBitSize = VT.getScalarSizeInBits();
5292
5293  // See whether the shift vector is a splat represented as BUILD_VECTOR.
5294  if (auto *BVN = dyn_cast<BuildVectorSDNode>(Op1)) {
5295    APInt SplatBits, SplatUndef;
5296    unsigned SplatBitSize;
5297    bool HasAnyUndefs;
5298    // Check for constant splats.  Use ElemBitSize as the minimum element
5299    // width and reject splats that need wider elements.
5300    if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs,
5301                             ElemBitSize, true) &&
5302        SplatBitSize == ElemBitSize) {
5303      SDValue Shift = DAG.getConstant(SplatBits.getZExtValue() & 0xfff,
5304                                      DL, MVT::i32);
5305      return DAG.getNode(ByScalar, DL, VT, Op0, Shift);
5306    }
5307    // Check for variable splats.
5308    BitVector UndefElements;
5309    SDValue Splat = BVN->getSplatValue(&UndefElements);
5310    if (Splat) {
5311      // Since i32 is the smallest legal type, we either need a no-op
5312      // or a truncation.
5313      SDValue Shift = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Splat);
5314      return DAG.getNode(ByScalar, DL, VT, Op0, Shift);
5315    }
5316  }
5317
5318  // See whether the shift vector is a splat represented as SHUFFLE_VECTOR,
5319  // and the shift amount is directly available in a GPR.
5320  if (auto *VSN = dyn_cast<ShuffleVectorSDNode>(Op1)) {
5321    if (VSN->isSplat()) {
5322      SDValue VSNOp0 = VSN->getOperand(0);
5323      unsigned Index = VSN->getSplatIndex();
5324      assert(Index < VT.getVectorNumElements() &&
5325             "Splat index should be defined and in first operand");
5326      if ((Index == 0 && VSNOp0.getOpcode() == ISD::SCALAR_TO_VECTOR) ||
5327          VSNOp0.getOpcode() == ISD::BUILD_VECTOR) {
5328        // Since i32 is the smallest legal type, we either need a no-op
5329        // or a truncation.
5330        SDValue Shift = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32,
5331                                    VSNOp0.getOperand(Index));
5332        return DAG.getNode(ByScalar, DL, VT, Op0, Shift);
5333      }
5334    }
5335  }
5336
5337  // Otherwise just treat the current form as legal.
5338  return Op;
5339}
5340
5341SDValue SystemZTargetLowering::LowerOperation(SDValue Op,
5342                                              SelectionDAG &DAG) const {
5343  switch (Op.getOpcode()) {
5344  case ISD::FRAMEADDR:
5345    return lowerFRAMEADDR(Op, DAG);
5346  case ISD::RETURNADDR:
5347    return lowerRETURNADDR(Op, DAG);
5348  case ISD::BR_CC:
5349    return lowerBR_CC(Op, DAG);
5350  case ISD::SELECT_CC:
5351    return lowerSELECT_CC(Op, DAG);
5352  case ISD::SETCC:
5353    return lowerSETCC(Op, DAG);
5354  case ISD::STRICT_FSETCC:
5355    return lowerSTRICT_FSETCC(Op, DAG, false);
5356  case ISD::STRICT_FSETCCS:
5357    return lowerSTRICT_FSETCC(Op, DAG, true);
5358  case ISD::GlobalAddress:
5359    return lowerGlobalAddress(cast<GlobalAddressSDNode>(Op), DAG);
5360  case ISD::GlobalTLSAddress:
5361    return lowerGlobalTLSAddress(cast<GlobalAddressSDNode>(Op), DAG);
5362  case ISD::BlockAddress:
5363    return lowerBlockAddress(cast<BlockAddressSDNode>(Op), DAG);
5364  case ISD::JumpTable:
5365    return lowerJumpTable(cast<JumpTableSDNode>(Op), DAG);
5366  case ISD::ConstantPool:
5367    return lowerConstantPool(cast<ConstantPoolSDNode>(Op), DAG);
5368  case ISD::BITCAST:
5369    return lowerBITCAST(Op, DAG);
5370  case ISD::VASTART:
5371    return lowerVASTART(Op, DAG);
5372  case ISD::VACOPY:
5373    return lowerVACOPY(Op, DAG);
5374  case ISD::DYNAMIC_STACKALLOC:
5375    return lowerDYNAMIC_STACKALLOC(Op, DAG);
5376  case ISD::GET_DYNAMIC_AREA_OFFSET:
5377    return lowerGET_DYNAMIC_AREA_OFFSET(Op, DAG);
5378  case ISD::SMUL_LOHI:
5379    return lowerSMUL_LOHI(Op, DAG);
5380  case ISD::UMUL_LOHI:
5381    return lowerUMUL_LOHI(Op, DAG);
5382  case ISD::SDIVREM:
5383    return lowerSDIVREM(Op, DAG);
5384  case ISD::UDIVREM:
5385    return lowerUDIVREM(Op, DAG);
5386  case ISD::SADDO:
5387  case ISD::SSUBO:
5388  case ISD::UADDO:
5389  case ISD::USUBO:
5390    return lowerXALUO(Op, DAG);
5391  case ISD::ADDCARRY:
5392  case ISD::SUBCARRY:
5393    return lowerADDSUBCARRY(Op, DAG);
5394  case ISD::OR:
5395    return lowerOR(Op, DAG);
5396  case ISD::CTPOP:
5397    return lowerCTPOP(Op, DAG);
5398  case ISD::ATOMIC_FENCE:
5399    return lowerATOMIC_FENCE(Op, DAG);
5400  case ISD::ATOMIC_SWAP:
5401    return lowerATOMIC_LOAD_OP(Op, DAG, SystemZISD::ATOMIC_SWAPW);
5402  case ISD::ATOMIC_STORE:
5403    return lowerATOMIC_STORE(Op, DAG);
5404  case ISD::ATOMIC_LOAD:
5405    return lowerATOMIC_LOAD(Op, DAG);
5406  case ISD::ATOMIC_LOAD_ADD:
5407    return lowerATOMIC_LOAD_OP(Op, DAG, SystemZISD::ATOMIC_LOADW_ADD);
5408  case ISD::ATOMIC_LOAD_SUB:
5409    return lowerATOMIC_LOAD_SUB(Op, DAG);
5410  case ISD::ATOMIC_LOAD_AND:
5411    return lowerATOMIC_LOAD_OP(Op, DAG, SystemZISD::ATOMIC_LOADW_AND);
5412  case ISD::ATOMIC_LOAD_OR:
5413    return lowerATOMIC_LOAD_OP(Op, DAG, SystemZISD::ATOMIC_LOADW_OR);
5414  case ISD::ATOMIC_LOAD_XOR:
5415    return lowerATOMIC_LOAD_OP(Op, DAG, SystemZISD::ATOMIC_LOADW_XOR);
5416  case ISD::ATOMIC_LOAD_NAND:
5417    return lowerATOMIC_LOAD_OP(Op, DAG, SystemZISD::ATOMIC_LOADW_NAND);
5418  case ISD::ATOMIC_LOAD_MIN:
5419    return lowerATOMIC_LOAD_OP(Op, DAG, SystemZISD::ATOMIC_LOADW_MIN);
5420  case ISD::ATOMIC_LOAD_MAX:
5421    return lowerATOMIC_LOAD_OP(Op, DAG, SystemZISD::ATOMIC_LOADW_MAX);
5422  case ISD::ATOMIC_LOAD_UMIN:
5423    return lowerATOMIC_LOAD_OP(Op, DAG, SystemZISD::ATOMIC_LOADW_UMIN);
5424  case ISD::ATOMIC_LOAD_UMAX:
5425    return lowerATOMIC_LOAD_OP(Op, DAG, SystemZISD::ATOMIC_LOADW_UMAX);
5426  case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
5427    return lowerATOMIC_CMP_SWAP(Op, DAG);
5428  case ISD::STACKSAVE:
5429    return lowerSTACKSAVE(Op, DAG);
5430  case ISD::STACKRESTORE:
5431    return lowerSTACKRESTORE(Op, DAG);
5432  case ISD::PREFETCH:
5433    return lowerPREFETCH(Op, DAG);
5434  case ISD::INTRINSIC_W_CHAIN:
5435    return lowerINTRINSIC_W_CHAIN(Op, DAG);
5436  case ISD::INTRINSIC_WO_CHAIN:
5437    return lowerINTRINSIC_WO_CHAIN(Op, DAG);
5438  case ISD::BUILD_VECTOR:
5439    return lowerBUILD_VECTOR(Op, DAG);
5440  case ISD::VECTOR_SHUFFLE:
5441    return lowerVECTOR_SHUFFLE(Op, DAG);
5442  case ISD::SCALAR_TO_VECTOR:
5443    return lowerSCALAR_TO_VECTOR(Op, DAG);
5444  case ISD::INSERT_VECTOR_ELT:
5445    return lowerINSERT_VECTOR_ELT(Op, DAG);
5446  case ISD::EXTRACT_VECTOR_ELT:
5447    return lowerEXTRACT_VECTOR_ELT(Op, DAG);
5448  case ISD::SIGN_EXTEND_VECTOR_INREG:
5449    return lowerSIGN_EXTEND_VECTOR_INREG(Op, DAG);
5450  case ISD::ZERO_EXTEND_VECTOR_INREG:
5451    return lowerZERO_EXTEND_VECTOR_INREG(Op, DAG);
5452  case ISD::SHL:
5453    return lowerShift(Op, DAG, SystemZISD::VSHL_BY_SCALAR);
5454  case ISD::SRL:
5455    return lowerShift(Op, DAG, SystemZISD::VSRL_BY_SCALAR);
5456  case ISD::SRA:
5457    return lowerShift(Op, DAG, SystemZISD::VSRA_BY_SCALAR);
5458  default:
5459    llvm_unreachable("Unexpected node to lower");
5460  }
5461}
5462
5463// Lower operations with invalid operand or result types (currently used
5464// only for 128-bit integer types).
5465
5466static SDValue lowerI128ToGR128(SelectionDAG &DAG, SDValue In) {
5467  SDLoc DL(In);
5468  SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i64, In,
5469                           DAG.getIntPtrConstant(0, DL));
5470  SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i64, In,
5471                           DAG.getIntPtrConstant(1, DL));
5472  SDNode *Pair = DAG.getMachineNode(SystemZ::PAIR128, DL,
5473                                    MVT::Untyped, Hi, Lo);
5474  return SDValue(Pair, 0);
5475}
5476
5477static SDValue lowerGR128ToI128(SelectionDAG &DAG, SDValue In) {
5478  SDLoc DL(In);
5479  SDValue Hi = DAG.getTargetExtractSubreg(SystemZ::subreg_h64,
5480                                          DL, MVT::i64, In);
5481  SDValue Lo = DAG.getTargetExtractSubreg(SystemZ::subreg_l64,
5482                                          DL, MVT::i64, In);
5483  return DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i128, Lo, Hi);
5484}
5485
5486void
5487SystemZTargetLowering::LowerOperationWrapper(SDNode *N,
5488                                             SmallVectorImpl<SDValue> &Results,
5489                                             SelectionDAG &DAG) const {
5490  switch (N->getOpcode()) {
5491  case ISD::ATOMIC_LOAD: {
5492    SDLoc DL(N);
5493    SDVTList Tys = DAG.getVTList(MVT::Untyped, MVT::Other);
5494    SDValue Ops[] = { N->getOperand(0), N->getOperand(1) };
5495    MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
5496    SDValue Res = DAG.getMemIntrinsicNode(SystemZISD::ATOMIC_LOAD_128,
5497                                          DL, Tys, Ops, MVT::i128, MMO);
5498    Results.push_back(lowerGR128ToI128(DAG, Res));
5499    Results.push_back(Res.getValue(1));
5500    break;
5501  }
5502  case ISD::ATOMIC_STORE: {
5503    SDLoc DL(N);
5504    SDVTList Tys = DAG.getVTList(MVT::Other);
5505    SDValue Ops[] = { N->getOperand(0),
5506                      lowerI128ToGR128(DAG, N->getOperand(2)),
5507                      N->getOperand(1) };
5508    MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
5509    SDValue Res = DAG.getMemIntrinsicNode(SystemZISD::ATOMIC_STORE_128,
5510                                          DL, Tys, Ops, MVT::i128, MMO);
5511    // We have to enforce sequential consistency by performing a
5512    // serialization operation after the store.
5513    if (cast<AtomicSDNode>(N)->getOrdering() ==
5514        AtomicOrdering::SequentiallyConsistent)
5515      Res = SDValue(DAG.getMachineNode(SystemZ::Serialize, DL,
5516                                       MVT::Other, Res), 0);
5517    Results.push_back(Res);
5518    break;
5519  }
5520  case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: {
5521    SDLoc DL(N);
5522    SDVTList Tys = DAG.getVTList(MVT::Untyped, MVT::i32, MVT::Other);
5523    SDValue Ops[] = { N->getOperand(0), N->getOperand(1),
5524                      lowerI128ToGR128(DAG, N->getOperand(2)),
5525                      lowerI128ToGR128(DAG, N->getOperand(3)) };
5526    MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
5527    SDValue Res = DAG.getMemIntrinsicNode(SystemZISD::ATOMIC_CMP_SWAP_128,
5528                                          DL, Tys, Ops, MVT::i128, MMO);
5529    SDValue Success = emitSETCC(DAG, DL, Res.getValue(1),
5530                                SystemZ::CCMASK_CS, SystemZ::CCMASK_CS_EQ);
5531    Success = DAG.getZExtOrTrunc(Success, DL, N->getValueType(1));
5532    Results.push_back(lowerGR128ToI128(DAG, Res));
5533    Results.push_back(Success);
5534    Results.push_back(Res.getValue(2));
5535    break;
5536  }
5537  default:
5538    llvm_unreachable("Unexpected node to lower");
5539  }
5540}
5541
5542void
5543SystemZTargetLowering::ReplaceNodeResults(SDNode *N,
5544                                          SmallVectorImpl<SDValue> &Results,
5545                                          SelectionDAG &DAG) const {
5546  return LowerOperationWrapper(N, Results, DAG);
5547}
5548
5549const char *SystemZTargetLowering::getTargetNodeName(unsigned Opcode) const {
5550#define OPCODE(NAME) case SystemZISD::NAME: return "SystemZISD::" #NAME
5551  switch ((SystemZISD::NodeType)Opcode) {
5552    case SystemZISD::FIRST_NUMBER: break;
5553    OPCODE(RET_FLAG);
5554    OPCODE(CALL);
5555    OPCODE(SIBCALL);
5556    OPCODE(TLS_GDCALL);
5557    OPCODE(TLS_LDCALL);
5558    OPCODE(PCREL_WRAPPER);
5559    OPCODE(PCREL_OFFSET);
5560    OPCODE(IABS);
5561    OPCODE(ICMP);
5562    OPCODE(FCMP);
5563    OPCODE(STRICT_FCMP);
5564    OPCODE(STRICT_FCMPS);
5565    OPCODE(TM);
5566    OPCODE(BR_CCMASK);
5567    OPCODE(SELECT_CCMASK);
5568    OPCODE(ADJDYNALLOC);
5569    OPCODE(PROBED_ALLOCA);
5570    OPCODE(POPCNT);
5571    OPCODE(SMUL_LOHI);
5572    OPCODE(UMUL_LOHI);
5573    OPCODE(SDIVREM);
5574    OPCODE(UDIVREM);
5575    OPCODE(SADDO);
5576    OPCODE(SSUBO);
5577    OPCODE(UADDO);
5578    OPCODE(USUBO);
5579    OPCODE(ADDCARRY);
5580    OPCODE(SUBCARRY);
5581    OPCODE(GET_CCMASK);
5582    OPCODE(MVC);
5583    OPCODE(MVC_LOOP);
5584    OPCODE(NC);
5585    OPCODE(NC_LOOP);
5586    OPCODE(OC);
5587    OPCODE(OC_LOOP);
5588    OPCODE(XC);
5589    OPCODE(XC_LOOP);
5590    OPCODE(CLC);
5591    OPCODE(CLC_LOOP);
5592    OPCODE(STPCPY);
5593    OPCODE(STRCMP);
5594    OPCODE(SEARCH_STRING);
5595    OPCODE(IPM);
5596    OPCODE(MEMBARRIER);
5597    OPCODE(TBEGIN);
5598    OPCODE(TBEGIN_NOFLOAT);
5599    OPCODE(TEND);
5600    OPCODE(BYTE_MASK);
5601    OPCODE(ROTATE_MASK);
5602    OPCODE(REPLICATE);
5603    OPCODE(JOIN_DWORDS);
5604    OPCODE(SPLAT);
5605    OPCODE(MERGE_HIGH);
5606    OPCODE(MERGE_LOW);
5607    OPCODE(SHL_DOUBLE);
5608    OPCODE(PERMUTE_DWORDS);
5609    OPCODE(PERMUTE);
5610    OPCODE(PACK);
5611    OPCODE(PACKS_CC);
5612    OPCODE(PACKLS_CC);
5613    OPCODE(UNPACK_HIGH);
5614    OPCODE(UNPACKL_HIGH);
5615    OPCODE(UNPACK_LOW);
5616    OPCODE(UNPACKL_LOW);
5617    OPCODE(VSHL_BY_SCALAR);
5618    OPCODE(VSRL_BY_SCALAR);
5619    OPCODE(VSRA_BY_SCALAR);
5620    OPCODE(VSUM);
5621    OPCODE(VICMPE);
5622    OPCODE(VICMPH);
5623    OPCODE(VICMPHL);
5624    OPCODE(VICMPES);
5625    OPCODE(VICMPHS);
5626    OPCODE(VICMPHLS);
5627    OPCODE(VFCMPE);
5628    OPCODE(STRICT_VFCMPE);
5629    OPCODE(STRICT_VFCMPES);
5630    OPCODE(VFCMPH);
5631    OPCODE(STRICT_VFCMPH);
5632    OPCODE(STRICT_VFCMPHS);
5633    OPCODE(VFCMPHE);
5634    OPCODE(STRICT_VFCMPHE);
5635    OPCODE(STRICT_VFCMPHES);
5636    OPCODE(VFCMPES);
5637    OPCODE(VFCMPHS);
5638    OPCODE(VFCMPHES);
5639    OPCODE(VFTCI);
5640    OPCODE(VEXTEND);
5641    OPCODE(STRICT_VEXTEND);
5642    OPCODE(VROUND);
5643    OPCODE(STRICT_VROUND);
5644    OPCODE(VTM);
5645    OPCODE(VFAE_CC);
5646    OPCODE(VFAEZ_CC);
5647    OPCODE(VFEE_CC);
5648    OPCODE(VFEEZ_CC);
5649    OPCODE(VFENE_CC);
5650    OPCODE(VFENEZ_CC);
5651    OPCODE(VISTR_CC);
5652    OPCODE(VSTRC_CC);
5653    OPCODE(VSTRCZ_CC);
5654    OPCODE(VSTRS_CC);
5655    OPCODE(VSTRSZ_CC);
5656    OPCODE(TDC);
5657    OPCODE(ATOMIC_SWAPW);
5658    OPCODE(ATOMIC_LOADW_ADD);
5659    OPCODE(ATOMIC_LOADW_SUB);
5660    OPCODE(ATOMIC_LOADW_AND);
5661    OPCODE(ATOMIC_LOADW_OR);
5662    OPCODE(ATOMIC_LOADW_XOR);
5663    OPCODE(ATOMIC_LOADW_NAND);
5664    OPCODE(ATOMIC_LOADW_MIN);
5665    OPCODE(ATOMIC_LOADW_MAX);
5666    OPCODE(ATOMIC_LOADW_UMIN);
5667    OPCODE(ATOMIC_LOADW_UMAX);
5668    OPCODE(ATOMIC_CMP_SWAPW);
5669    OPCODE(ATOMIC_CMP_SWAP);
5670    OPCODE(ATOMIC_LOAD_128);
5671    OPCODE(ATOMIC_STORE_128);
5672    OPCODE(ATOMIC_CMP_SWAP_128);
5673    OPCODE(LRV);
5674    OPCODE(STRV);
5675    OPCODE(VLER);
5676    OPCODE(VSTER);
5677    OPCODE(PREFETCH);
5678  }
5679  return nullptr;
5680#undef OPCODE
5681}
5682
5683// Return true if VT is a vector whose elements are a whole number of bytes
5684// in width. Also check for presence of vector support.
5685bool SystemZTargetLowering::canTreatAsByteVector(EVT VT) const {
5686  if (!Subtarget.hasVector())
5687    return false;
5688
5689  return VT.isVector() && VT.getScalarSizeInBits() % 8 == 0 && VT.isSimple();
5690}
5691
5692// Try to simplify an EXTRACT_VECTOR_ELT from a vector of type VecVT
5693// producing a result of type ResVT.  Op is a possibly bitcast version
5694// of the input vector and Index is the index (based on type VecVT) that
5695// should be extracted.  Return the new extraction if a simplification
5696// was possible or if Force is true.
5697SDValue SystemZTargetLowering::combineExtract(const SDLoc &DL, EVT ResVT,
5698                                              EVT VecVT, SDValue Op,
5699                                              unsigned Index,
5700                                              DAGCombinerInfo &DCI,
5701                                              bool Force) const {
5702  SelectionDAG &DAG = DCI.DAG;
5703
5704  // The number of bytes being extracted.
5705  unsigned BytesPerElement = VecVT.getVectorElementType().getStoreSize();
5706
5707  for (;;) {
5708    unsigned Opcode = Op.getOpcode();
5709    if (Opcode == ISD::BITCAST)
5710      // Look through bitcasts.
5711      Op = Op.getOperand(0);
5712    else if ((Opcode == ISD::VECTOR_SHUFFLE || Opcode == SystemZISD::SPLAT) &&
5713             canTreatAsByteVector(Op.getValueType())) {
5714      // Get a VPERM-like permute mask and see whether the bytes covered
5715      // by the extracted element are a contiguous sequence from one
5716      // source operand.
5717      SmallVector<int, SystemZ::VectorBytes> Bytes;
5718      if (!getVPermMask(Op, Bytes))
5719        break;
5720      int First;
5721      if (!getShuffleInput(Bytes, Index * BytesPerElement,
5722                           BytesPerElement, First))
5723        break;
5724      if (First < 0)
5725        return DAG.getUNDEF(ResVT);
5726      // Make sure the contiguous sequence starts at a multiple of the
5727      // original element size.
5728      unsigned Byte = unsigned(First) % Bytes.size();
5729      if (Byte % BytesPerElement != 0)
5730        break;
5731      // We can get the extracted value directly from an input.
5732      Index = Byte / BytesPerElement;
5733      Op = Op.getOperand(unsigned(First) / Bytes.size());
5734      Force = true;
5735    } else if (Opcode == ISD::BUILD_VECTOR &&
5736               canTreatAsByteVector(Op.getValueType())) {
5737      // We can only optimize this case if the BUILD_VECTOR elements are
5738      // at least as wide as the extracted value.
5739      EVT OpVT = Op.getValueType();
5740      unsigned OpBytesPerElement = OpVT.getVectorElementType().getStoreSize();
5741      if (OpBytesPerElement < BytesPerElement)
5742        break;
5743      // Make sure that the least-significant bit of the extracted value
5744      // is the least significant bit of an input.
5745      unsigned End = (Index + 1) * BytesPerElement;
5746      if (End % OpBytesPerElement != 0)
5747        break;
5748      // We're extracting the low part of one operand of the BUILD_VECTOR.
5749      Op = Op.getOperand(End / OpBytesPerElement - 1);
5750      if (!Op.getValueType().isInteger()) {
5751        EVT VT = MVT::getIntegerVT(Op.getValueSizeInBits());
5752        Op = DAG.getNode(ISD::BITCAST, DL, VT, Op);
5753        DCI.AddToWorklist(Op.getNode());
5754      }
5755      EVT VT = MVT::getIntegerVT(ResVT.getSizeInBits());
5756      Op = DAG.getNode(ISD::TRUNCATE, DL, VT, Op);
5757      if (VT != ResVT) {
5758        DCI.AddToWorklist(Op.getNode());
5759        Op = DAG.getNode(ISD::BITCAST, DL, ResVT, Op);
5760      }
5761      return Op;
5762    } else if ((Opcode == ISD::SIGN_EXTEND_VECTOR_INREG ||
5763                Opcode == ISD::ZERO_EXTEND_VECTOR_INREG ||
5764                Opcode == ISD::ANY_EXTEND_VECTOR_INREG) &&
5765               canTreatAsByteVector(Op.getValueType()) &&
5766               canTreatAsByteVector(Op.getOperand(0).getValueType())) {
5767      // Make sure that only the unextended bits are significant.
5768      EVT ExtVT = Op.getValueType();
5769      EVT OpVT = Op.getOperand(0).getValueType();
5770      unsigned ExtBytesPerElement = ExtVT.getVectorElementType().getStoreSize();
5771      unsigned OpBytesPerElement = OpVT.getVectorElementType().getStoreSize();
5772      unsigned Byte = Index * BytesPerElement;
5773      unsigned SubByte = Byte % ExtBytesPerElement;
5774      unsigned MinSubByte = ExtBytesPerElement - OpBytesPerElement;
5775      if (SubByte < MinSubByte ||
5776          SubByte + BytesPerElement > ExtBytesPerElement)
5777        break;
5778      // Get the byte offset of the unextended element
5779      Byte = Byte / ExtBytesPerElement * OpBytesPerElement;
5780      // ...then add the byte offset relative to that element.
5781      Byte += SubByte - MinSubByte;
5782      if (Byte % BytesPerElement != 0)
5783        break;
5784      Op = Op.getOperand(0);
5785      Index = Byte / BytesPerElement;
5786      Force = true;
5787    } else
5788      break;
5789  }
5790  if (Force) {
5791    if (Op.getValueType() != VecVT) {
5792      Op = DAG.getNode(ISD::BITCAST, DL, VecVT, Op);
5793      DCI.AddToWorklist(Op.getNode());
5794    }
5795    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResVT, Op,
5796                       DAG.getConstant(Index, DL, MVT::i32));
5797  }
5798  return SDValue();
5799}
5800
5801// Optimize vector operations in scalar value Op on the basis that Op
5802// is truncated to TruncVT.
5803SDValue SystemZTargetLowering::combineTruncateExtract(
5804    const SDLoc &DL, EVT TruncVT, SDValue Op, DAGCombinerInfo &DCI) const {
5805  // If we have (trunc (extract_vector_elt X, Y)), try to turn it into
5806  // (extract_vector_elt (bitcast X), Y'), where (bitcast X) has elements
5807  // of type TruncVT.
5808  if (Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
5809      TruncVT.getSizeInBits() % 8 == 0) {
5810    SDValue Vec = Op.getOperand(0);
5811    EVT VecVT = Vec.getValueType();
5812    if (canTreatAsByteVector(VecVT)) {
5813      if (auto *IndexN = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
5814        unsigned BytesPerElement = VecVT.getVectorElementType().getStoreSize();
5815        unsigned TruncBytes = TruncVT.getStoreSize();
5816        if (BytesPerElement % TruncBytes == 0) {
5817          // Calculate the value of Y' in the above description.  We are
5818          // splitting the original elements into Scale equal-sized pieces
5819          // and for truncation purposes want the last (least-significant)
5820          // of these pieces for IndexN.  This is easiest to do by calculating
5821          // the start index of the following element and then subtracting 1.
5822          unsigned Scale = BytesPerElement / TruncBytes;
5823          unsigned NewIndex = (IndexN->getZExtValue() + 1) * Scale - 1;
5824
5825          // Defer the creation of the bitcast from X to combineExtract,
5826          // which might be able to optimize the extraction.
5827          VecVT = MVT::getVectorVT(MVT::getIntegerVT(TruncBytes * 8),
5828                                   VecVT.getStoreSize() / TruncBytes);
5829          EVT ResVT = (TruncBytes < 4 ? MVT::i32 : TruncVT);
5830          return combineExtract(DL, ResVT, VecVT, Vec, NewIndex, DCI, true);
5831        }
5832      }
5833    }
5834  }
5835  return SDValue();
5836}
5837
5838SDValue SystemZTargetLowering::combineZERO_EXTEND(
5839    SDNode *N, DAGCombinerInfo &DCI) const {
5840  // Convert (zext (select_ccmask C1, C2)) into (select_ccmask C1', C2')
5841  SelectionDAG &DAG = DCI.DAG;
5842  SDValue N0 = N->getOperand(0);
5843  EVT VT = N->getValueType(0);
5844  if (N0.getOpcode() == SystemZISD::SELECT_CCMASK) {
5845    auto *TrueOp = dyn_cast<ConstantSDNode>(N0.getOperand(0));
5846    auto *FalseOp = dyn_cast<ConstantSDNode>(N0.getOperand(1));
5847    if (TrueOp && FalseOp) {
5848      SDLoc DL(N0);
5849      SDValue Ops[] = { DAG.getConstant(TrueOp->getZExtValue(), DL, VT),
5850                        DAG.getConstant(FalseOp->getZExtValue(), DL, VT),
5851                        N0.getOperand(2), N0.getOperand(3), N0.getOperand(4) };
5852      SDValue NewSelect = DAG.getNode(SystemZISD::SELECT_CCMASK, DL, VT, Ops);
5853      // If N0 has multiple uses, change other uses as well.
5854      if (!N0.hasOneUse()) {
5855        SDValue TruncSelect =
5856          DAG.getNode(ISD::TRUNCATE, DL, N0.getValueType(), NewSelect);
5857        DCI.CombineTo(N0.getNode(), TruncSelect);
5858      }
5859      return NewSelect;
5860    }
5861  }
5862  return SDValue();
5863}
5864
5865SDValue SystemZTargetLowering::combineSIGN_EXTEND_INREG(
5866    SDNode *N, DAGCombinerInfo &DCI) const {
5867  // Convert (sext_in_reg (setcc LHS, RHS, COND), i1)
5868  // and (sext_in_reg (any_extend (setcc LHS, RHS, COND)), i1)
5869  // into (select_cc LHS, RHS, -1, 0, COND)
5870  SelectionDAG &DAG = DCI.DAG;
5871  SDValue N0 = N->getOperand(0);
5872  EVT VT = N->getValueType(0);
5873  EVT EVT = cast<VTSDNode>(N->getOperand(1))->getVT();
5874  if (N0.hasOneUse() && N0.getOpcode() == ISD::ANY_EXTEND)
5875    N0 = N0.getOperand(0);
5876  if (EVT == MVT::i1 && N0.hasOneUse() && N0.getOpcode() == ISD::SETCC) {
5877    SDLoc DL(N0);
5878    SDValue Ops[] = { N0.getOperand(0), N0.getOperand(1),
5879                      DAG.getConstant(-1, DL, VT), DAG.getConstant(0, DL, VT),
5880                      N0.getOperand(2) };
5881    return DAG.getNode(ISD::SELECT_CC, DL, VT, Ops);
5882  }
5883  return SDValue();
5884}
5885
5886SDValue SystemZTargetLowering::combineSIGN_EXTEND(
5887    SDNode *N, DAGCombinerInfo &DCI) const {
5888  // Convert (sext (ashr (shl X, C1), C2)) to
5889  // (ashr (shl (anyext X), C1'), C2')), since wider shifts are as
5890  // cheap as narrower ones.
5891  SelectionDAG &DAG = DCI.DAG;
5892  SDValue N0 = N->getOperand(0);
5893  EVT VT = N->getValueType(0);
5894  if (N0.hasOneUse() && N0.getOpcode() == ISD::SRA) {
5895    auto *SraAmt = dyn_cast<ConstantSDNode>(N0.getOperand(1));
5896    SDValue Inner = N0.getOperand(0);
5897    if (SraAmt && Inner.hasOneUse() && Inner.getOpcode() == ISD::SHL) {
5898      if (auto *ShlAmt = dyn_cast<ConstantSDNode>(Inner.getOperand(1))) {
5899        unsigned Extra = (VT.getSizeInBits() - N0.getValueSizeInBits());
5900        unsigned NewShlAmt = ShlAmt->getZExtValue() + Extra;
5901        unsigned NewSraAmt = SraAmt->getZExtValue() + Extra;
5902        EVT ShiftVT = N0.getOperand(1).getValueType();
5903        SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, SDLoc(Inner), VT,
5904                                  Inner.getOperand(0));
5905        SDValue Shl = DAG.getNode(ISD::SHL, SDLoc(Inner), VT, Ext,
5906                                  DAG.getConstant(NewShlAmt, SDLoc(Inner),
5907                                                  ShiftVT));
5908        return DAG.getNode(ISD::SRA, SDLoc(N0), VT, Shl,
5909                           DAG.getConstant(NewSraAmt, SDLoc(N0), ShiftVT));
5910      }
5911    }
5912  }
5913  return SDValue();
5914}
5915
5916SDValue SystemZTargetLowering::combineMERGE(
5917    SDNode *N, DAGCombinerInfo &DCI) const {
5918  SelectionDAG &DAG = DCI.DAG;
5919  unsigned Opcode = N->getOpcode();
5920  SDValue Op0 = N->getOperand(0);
5921  SDValue Op1 = N->getOperand(1);
5922  if (Op0.getOpcode() == ISD::BITCAST)
5923    Op0 = Op0.getOperand(0);
5924  if (ISD::isBuildVectorAllZeros(Op0.getNode())) {
5925    // (z_merge_* 0, 0) -> 0.  This is mostly useful for using VLLEZF
5926    // for v4f32.
5927    if (Op1 == N->getOperand(0))
5928      return Op1;
5929    // (z_merge_? 0, X) -> (z_unpackl_? 0, X).
5930    EVT VT = Op1.getValueType();
5931    unsigned ElemBytes = VT.getVectorElementType().getStoreSize();
5932    if (ElemBytes <= 4) {
5933      Opcode = (Opcode == SystemZISD::MERGE_HIGH ?
5934                SystemZISD::UNPACKL_HIGH : SystemZISD::UNPACKL_LOW);
5935      EVT InVT = VT.changeVectorElementTypeToInteger();
5936      EVT OutVT = MVT::getVectorVT(MVT::getIntegerVT(ElemBytes * 16),
5937                                   SystemZ::VectorBytes / ElemBytes / 2);
5938      if (VT != InVT) {
5939        Op1 = DAG.getNode(ISD::BITCAST, SDLoc(N), InVT, Op1);
5940        DCI.AddToWorklist(Op1.getNode());
5941      }
5942      SDValue Op = DAG.getNode(Opcode, SDLoc(N), OutVT, Op1);
5943      DCI.AddToWorklist(Op.getNode());
5944      return DAG.getNode(ISD::BITCAST, SDLoc(N), VT, Op);
5945    }
5946  }
5947  return SDValue();
5948}
5949
5950SDValue SystemZTargetLowering::combineLOAD(
5951    SDNode *N, DAGCombinerInfo &DCI) const {
5952  SelectionDAG &DAG = DCI.DAG;
5953  EVT LdVT = N->getValueType(0);
5954  if (LdVT.isVector() || LdVT.isInteger())
5955    return SDValue();
5956  // Transform a scalar load that is REPLICATEd as well as having other
5957  // use(s) to the form where the other use(s) use the first element of the
5958  // REPLICATE instead of the load. Otherwise instruction selection will not
5959  // produce a VLREP. Avoid extracting to a GPR, so only do this for floating
5960  // point loads.
5961
5962  SDValue Replicate;
5963  SmallVector<SDNode*, 8> OtherUses;
5964  for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
5965       UI != UE; ++UI) {
5966    if (UI->getOpcode() == SystemZISD::REPLICATE) {
5967      if (Replicate)
5968        return SDValue(); // Should never happen
5969      Replicate = SDValue(*UI, 0);
5970    }
5971    else if (UI.getUse().getResNo() == 0)
5972      OtherUses.push_back(*UI);
5973  }
5974  if (!Replicate || OtherUses.empty())
5975    return SDValue();
5976
5977  SDLoc DL(N);
5978  SDValue Extract0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, LdVT,
5979                              Replicate, DAG.getConstant(0, DL, MVT::i32));
5980  // Update uses of the loaded Value while preserving old chains.
5981  for (SDNode *U : OtherUses) {
5982    SmallVector<SDValue, 8> Ops;
5983    for (SDValue Op : U->ops())
5984      Ops.push_back((Op.getNode() == N && Op.getResNo() == 0) ? Extract0 : Op);
5985    DAG.UpdateNodeOperands(U, Ops);
5986  }
5987  return SDValue(N, 0);
5988}
5989
5990bool SystemZTargetLowering::canLoadStoreByteSwapped(EVT VT) const {
5991  if (VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64)
5992    return true;
5993  if (Subtarget.hasVectorEnhancements2())
5994    if (VT == MVT::v8i16 || VT == MVT::v4i32 || VT == MVT::v2i64)
5995      return true;
5996  return false;
5997}
5998
5999static bool isVectorElementSwap(ArrayRef<int> M, EVT VT) {
6000  if (!VT.isVector() || !VT.isSimple() ||
6001      VT.getSizeInBits() != 128 ||
6002      VT.getScalarSizeInBits() % 8 != 0)
6003    return false;
6004
6005  unsigned NumElts = VT.getVectorNumElements();
6006  for (unsigned i = 0; i < NumElts; ++i) {
6007    if (M[i] < 0) continue; // ignore UNDEF indices
6008    if ((unsigned) M[i] != NumElts - 1 - i)
6009      return false;
6010  }
6011
6012  return true;
6013}
6014
6015SDValue SystemZTargetLowering::combineSTORE(
6016    SDNode *N, DAGCombinerInfo &DCI) const {
6017  SelectionDAG &DAG = DCI.DAG;
6018  auto *SN = cast<StoreSDNode>(N);
6019  auto &Op1 = N->getOperand(1);
6020  EVT MemVT = SN->getMemoryVT();
6021  // If we have (truncstoreiN (extract_vector_elt X, Y), Z) then it is better
6022  // for the extraction to be done on a vMiN value, so that we can use VSTE.
6023  // If X has wider elements then convert it to:
6024  // (truncstoreiN (extract_vector_elt (bitcast X), Y2), Z).
6025  if (MemVT.isInteger() && SN->isTruncatingStore()) {
6026    if (SDValue Value =
6027            combineTruncateExtract(SDLoc(N), MemVT, SN->getValue(), DCI)) {
6028      DCI.AddToWorklist(Value.getNode());
6029
6030      // Rewrite the store with the new form of stored value.
6031      return DAG.getTruncStore(SN->getChain(), SDLoc(SN), Value,
6032                               SN->getBasePtr(), SN->getMemoryVT(),
6033                               SN->getMemOperand());
6034    }
6035  }
6036  // Combine STORE (BSWAP) into STRVH/STRV/STRVG/VSTBR
6037  if (!SN->isTruncatingStore() &&
6038      Op1.getOpcode() == ISD::BSWAP &&
6039      Op1.getNode()->hasOneUse() &&
6040      canLoadStoreByteSwapped(Op1.getValueType())) {
6041
6042      SDValue BSwapOp = Op1.getOperand(0);
6043
6044      if (BSwapOp.getValueType() == MVT::i16)
6045        BSwapOp = DAG.getNode(ISD::ANY_EXTEND, SDLoc(N), MVT::i32, BSwapOp);
6046
6047      SDValue Ops[] = {
6048        N->getOperand(0), BSwapOp, N->getOperand(2)
6049      };
6050
6051      return
6052        DAG.getMemIntrinsicNode(SystemZISD::STRV, SDLoc(N), DAG.getVTList(MVT::Other),
6053                                Ops, MemVT, SN->getMemOperand());
6054    }
6055  // Combine STORE (element-swap) into VSTER
6056  if (!SN->isTruncatingStore() &&
6057      Op1.getOpcode() == ISD::VECTOR_SHUFFLE &&
6058      Op1.getNode()->hasOneUse() &&
6059      Subtarget.hasVectorEnhancements2()) {
6060    ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op1.getNode());
6061    ArrayRef<int> ShuffleMask = SVN->getMask();
6062    if (isVectorElementSwap(ShuffleMask, Op1.getValueType())) {
6063      SDValue Ops[] = {
6064        N->getOperand(0), Op1.getOperand(0), N->getOperand(2)
6065      };
6066
6067      return DAG.getMemIntrinsicNode(SystemZISD::VSTER, SDLoc(N),
6068                                     DAG.getVTList(MVT::Other),
6069                                     Ops, MemVT, SN->getMemOperand());
6070    }
6071  }
6072
6073  return SDValue();
6074}
6075
6076SDValue SystemZTargetLowering::combineVECTOR_SHUFFLE(
6077    SDNode *N, DAGCombinerInfo &DCI) const {
6078  SelectionDAG &DAG = DCI.DAG;
6079  // Combine element-swap (LOAD) into VLER
6080  if (ISD::isNON_EXTLoad(N->getOperand(0).getNode()) &&
6081      N->getOperand(0).hasOneUse() &&
6082      Subtarget.hasVectorEnhancements2()) {
6083    ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N);
6084    ArrayRef<int> ShuffleMask = SVN->getMask();
6085    if (isVectorElementSwap(ShuffleMask, N->getValueType(0))) {
6086      SDValue Load = N->getOperand(0);
6087      LoadSDNode *LD = cast<LoadSDNode>(Load);
6088
6089      // Create the element-swapping load.
6090      SDValue Ops[] = {
6091        LD->getChain(),    // Chain
6092        LD->getBasePtr()   // Ptr
6093      };
6094      SDValue ESLoad =
6095        DAG.getMemIntrinsicNode(SystemZISD::VLER, SDLoc(N),
6096                                DAG.getVTList(LD->getValueType(0), MVT::Other),
6097                                Ops, LD->getMemoryVT(), LD->getMemOperand());
6098
6099      // First, combine the VECTOR_SHUFFLE away.  This makes the value produced
6100      // by the load dead.
6101      DCI.CombineTo(N, ESLoad);
6102
6103      // Next, combine the load away, we give it a bogus result value but a real
6104      // chain result.  The result value is dead because the shuffle is dead.
6105      DCI.CombineTo(Load.getNode(), ESLoad, ESLoad.getValue(1));
6106
6107      // Return N so it doesn't get rechecked!
6108      return SDValue(N, 0);
6109    }
6110  }
6111
6112  return SDValue();
6113}
6114
6115SDValue SystemZTargetLowering::combineEXTRACT_VECTOR_ELT(
6116    SDNode *N, DAGCombinerInfo &DCI) const {
6117  SelectionDAG &DAG = DCI.DAG;
6118
6119  if (!Subtarget.hasVector())
6120    return SDValue();
6121
6122  // Look through bitcasts that retain the number of vector elements.
6123  SDValue Op = N->getOperand(0);
6124  if (Op.getOpcode() == ISD::BITCAST &&
6125      Op.getValueType().isVector() &&
6126      Op.getOperand(0).getValueType().isVector() &&
6127      Op.getValueType().getVectorNumElements() ==
6128      Op.getOperand(0).getValueType().getVectorNumElements())
6129    Op = Op.getOperand(0);
6130
6131  // Pull BSWAP out of a vector extraction.
6132  if (Op.getOpcode() == ISD::BSWAP && Op.hasOneUse()) {
6133    EVT VecVT = Op.getValueType();
6134    EVT EltVT = VecVT.getVectorElementType();
6135    Op = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N), EltVT,
6136                     Op.getOperand(0), N->getOperand(1));
6137    DCI.AddToWorklist(Op.getNode());
6138    Op = DAG.getNode(ISD::BSWAP, SDLoc(N), EltVT, Op);
6139    if (EltVT != N->getValueType(0)) {
6140      DCI.AddToWorklist(Op.getNode());
6141      Op = DAG.getNode(ISD::BITCAST, SDLoc(N), N->getValueType(0), Op);
6142    }
6143    return Op;
6144  }
6145
6146  // Try to simplify a vector extraction.
6147  if (auto *IndexN = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
6148    SDValue Op0 = N->getOperand(0);
6149    EVT VecVT = Op0.getValueType();
6150    return combineExtract(SDLoc(N), N->getValueType(0), VecVT, Op0,
6151                          IndexN->getZExtValue(), DCI, false);
6152  }
6153  return SDValue();
6154}
6155
6156SDValue SystemZTargetLowering::combineJOIN_DWORDS(
6157    SDNode *N, DAGCombinerInfo &DCI) const {
6158  SelectionDAG &DAG = DCI.DAG;
6159  // (join_dwords X, X) == (replicate X)
6160  if (N->getOperand(0) == N->getOperand(1))
6161    return DAG.getNode(SystemZISD::REPLICATE, SDLoc(N), N->getValueType(0),
6162                       N->getOperand(0));
6163  return SDValue();
6164}
6165
6166static SDValue MergeInputChains(SDNode *N1, SDNode *N2) {
6167  SDValue Chain1 = N1->getOperand(0);
6168  SDValue Chain2 = N2->getOperand(0);
6169
6170  // Trivial case: both nodes take the same chain.
6171  if (Chain1 == Chain2)
6172    return Chain1;
6173
6174  // FIXME - we could handle more complex cases via TokenFactor,
6175  // assuming we can verify that this would not create a cycle.
6176  return SDValue();
6177}
6178
6179SDValue SystemZTargetLowering::combineFP_ROUND(
6180    SDNode *N, DAGCombinerInfo &DCI) const {
6181
6182  if (!Subtarget.hasVector())
6183    return SDValue();
6184
6185  // (fpround (extract_vector_elt X 0))
6186  // (fpround (extract_vector_elt X 1)) ->
6187  // (extract_vector_elt (VROUND X) 0)
6188  // (extract_vector_elt (VROUND X) 2)
6189  //
6190  // This is a special case since the target doesn't really support v2f32s.
6191  unsigned OpNo = N->isStrictFPOpcode() ? 1 : 0;
6192  SelectionDAG &DAG = DCI.DAG;
6193  SDValue Op0 = N->getOperand(OpNo);
6194  if (N->getValueType(0) == MVT::f32 &&
6195      Op0.hasOneUse() &&
6196      Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
6197      Op0.getOperand(0).getValueType() == MVT::v2f64 &&
6198      Op0.getOperand(1).getOpcode() == ISD::Constant &&
6199      cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue() == 0) {
6200    SDValue Vec = Op0.getOperand(0);
6201    for (auto *U : Vec->uses()) {
6202      if (U != Op0.getNode() &&
6203          U->hasOneUse() &&
6204          U->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
6205          U->getOperand(0) == Vec &&
6206          U->getOperand(1).getOpcode() == ISD::Constant &&
6207          cast<ConstantSDNode>(U->getOperand(1))->getZExtValue() == 1) {
6208        SDValue OtherRound = SDValue(*U->use_begin(), 0);
6209        if (OtherRound.getOpcode() == N->getOpcode() &&
6210            OtherRound.getOperand(OpNo) == SDValue(U, 0) &&
6211            OtherRound.getValueType() == MVT::f32) {
6212          SDValue VRound, Chain;
6213          if (N->isStrictFPOpcode()) {
6214            Chain = MergeInputChains(N, OtherRound.getNode());
6215            if (!Chain)
6216              continue;
6217            VRound = DAG.getNode(SystemZISD::STRICT_VROUND, SDLoc(N),
6218                                 {MVT::v4f32, MVT::Other}, {Chain, Vec});
6219            Chain = VRound.getValue(1);
6220          } else
6221            VRound = DAG.getNode(SystemZISD::VROUND, SDLoc(N),
6222                                 MVT::v4f32, Vec);
6223          DCI.AddToWorklist(VRound.getNode());
6224          SDValue Extract1 =
6225            DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(U), MVT::f32,
6226                        VRound, DAG.getConstant(2, SDLoc(U), MVT::i32));
6227          DCI.AddToWorklist(Extract1.getNode());
6228          DAG.ReplaceAllUsesOfValueWith(OtherRound, Extract1);
6229          if (Chain)
6230            DAG.ReplaceAllUsesOfValueWith(OtherRound.getValue(1), Chain);
6231          SDValue Extract0 =
6232            DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(Op0), MVT::f32,
6233                        VRound, DAG.getConstant(0, SDLoc(Op0), MVT::i32));
6234          if (Chain)
6235            return DAG.getNode(ISD::MERGE_VALUES, SDLoc(Op0),
6236                               N->getVTList(), Extract0, Chain);
6237          return Extract0;
6238        }
6239      }
6240    }
6241  }
6242  return SDValue();
6243}
6244
6245SDValue SystemZTargetLowering::combineFP_EXTEND(
6246    SDNode *N, DAGCombinerInfo &DCI) const {
6247
6248  if (!Subtarget.hasVector())
6249    return SDValue();
6250
6251  // (fpextend (extract_vector_elt X 0))
6252  // (fpextend (extract_vector_elt X 2)) ->
6253  // (extract_vector_elt (VEXTEND X) 0)
6254  // (extract_vector_elt (VEXTEND X) 1)
6255  //
6256  // This is a special case since the target doesn't really support v2f32s.
6257  unsigned OpNo = N->isStrictFPOpcode() ? 1 : 0;
6258  SelectionDAG &DAG = DCI.DAG;
6259  SDValue Op0 = N->getOperand(OpNo);
6260  if (N->getValueType(0) == MVT::f64 &&
6261      Op0.hasOneUse() &&
6262      Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
6263      Op0.getOperand(0).getValueType() == MVT::v4f32 &&
6264      Op0.getOperand(1).getOpcode() == ISD::Constant &&
6265      cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue() == 0) {
6266    SDValue Vec = Op0.getOperand(0);
6267    for (auto *U : Vec->uses()) {
6268      if (U != Op0.getNode() &&
6269          U->hasOneUse() &&
6270          U->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
6271          U->getOperand(0) == Vec &&
6272          U->getOperand(1).getOpcode() == ISD::Constant &&
6273          cast<ConstantSDNode>(U->getOperand(1))->getZExtValue() == 2) {
6274        SDValue OtherExtend = SDValue(*U->use_begin(), 0);
6275        if (OtherExtend.getOpcode() == N->getOpcode() &&
6276            OtherExtend.getOperand(OpNo) == SDValue(U, 0) &&
6277            OtherExtend.getValueType() == MVT::f64) {
6278          SDValue VExtend, Chain;
6279          if (N->isStrictFPOpcode()) {
6280            Chain = MergeInputChains(N, OtherExtend.getNode());
6281            if (!Chain)
6282              continue;
6283            VExtend = DAG.getNode(SystemZISD::STRICT_VEXTEND, SDLoc(N),
6284                                  {MVT::v2f64, MVT::Other}, {Chain, Vec});
6285            Chain = VExtend.getValue(1);
6286          } else
6287            VExtend = DAG.getNode(SystemZISD::VEXTEND, SDLoc(N),
6288                                  MVT::v2f64, Vec);
6289          DCI.AddToWorklist(VExtend.getNode());
6290          SDValue Extract1 =
6291            DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(U), MVT::f64,
6292                        VExtend, DAG.getConstant(1, SDLoc(U), MVT::i32));
6293          DCI.AddToWorklist(Extract1.getNode());
6294          DAG.ReplaceAllUsesOfValueWith(OtherExtend, Extract1);
6295          if (Chain)
6296            DAG.ReplaceAllUsesOfValueWith(OtherExtend.getValue(1), Chain);
6297          SDValue Extract0 =
6298            DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(Op0), MVT::f64,
6299                        VExtend, DAG.getConstant(0, SDLoc(Op0), MVT::i32));
6300          if (Chain)
6301            return DAG.getNode(ISD::MERGE_VALUES, SDLoc(Op0),
6302                               N->getVTList(), Extract0, Chain);
6303          return Extract0;
6304        }
6305      }
6306    }
6307  }
6308  return SDValue();
6309}
6310
6311SDValue SystemZTargetLowering::combineINT_TO_FP(
6312    SDNode *N, DAGCombinerInfo &DCI) const {
6313  if (DCI.Level != BeforeLegalizeTypes)
6314    return SDValue();
6315  unsigned Opcode = N->getOpcode();
6316  EVT OutVT = N->getValueType(0);
6317  SelectionDAG &DAG = DCI.DAG;
6318  SDValue Op = N->getOperand(0);
6319  unsigned OutScalarBits = OutVT.getScalarSizeInBits();
6320  unsigned InScalarBits = Op->getValueType(0).getScalarSizeInBits();
6321
6322  // Insert an extension before type-legalization to avoid scalarization, e.g.:
6323  // v2f64 = uint_to_fp v2i16
6324  // =>
6325  // v2f64 = uint_to_fp (v2i64 zero_extend v2i16)
6326  if (OutVT.isVector() && OutScalarBits > InScalarBits) {
6327    MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(OutVT.getScalarSizeInBits()),
6328                                 OutVT.getVectorNumElements());
6329    unsigned ExtOpcode =
6330      (Opcode == ISD::UINT_TO_FP ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND);
6331    SDValue ExtOp = DAG.getNode(ExtOpcode, SDLoc(N), ExtVT, Op);
6332    return DAG.getNode(Opcode, SDLoc(N), OutVT, ExtOp);
6333  }
6334  return SDValue();
6335}
6336
6337SDValue SystemZTargetLowering::combineBSWAP(
6338    SDNode *N, DAGCombinerInfo &DCI) const {
6339  SelectionDAG &DAG = DCI.DAG;
6340  // Combine BSWAP (LOAD) into LRVH/LRV/LRVG/VLBR
6341  if (ISD::isNON_EXTLoad(N->getOperand(0).getNode()) &&
6342      N->getOperand(0).hasOneUse() &&
6343      canLoadStoreByteSwapped(N->getValueType(0))) {
6344      SDValue Load = N->getOperand(0);
6345      LoadSDNode *LD = cast<LoadSDNode>(Load);
6346
6347      // Create the byte-swapping load.
6348      SDValue Ops[] = {
6349        LD->getChain(),    // Chain
6350        LD->getBasePtr()   // Ptr
6351      };
6352      EVT LoadVT = N->getValueType(0);
6353      if (LoadVT == MVT::i16)
6354        LoadVT = MVT::i32;
6355      SDValue BSLoad =
6356        DAG.getMemIntrinsicNode(SystemZISD::LRV, SDLoc(N),
6357                                DAG.getVTList(LoadVT, MVT::Other),
6358                                Ops, LD->getMemoryVT(), LD->getMemOperand());
6359
6360      // If this is an i16 load, insert the truncate.
6361      SDValue ResVal = BSLoad;
6362      if (N->getValueType(0) == MVT::i16)
6363        ResVal = DAG.getNode(ISD::TRUNCATE, SDLoc(N), MVT::i16, BSLoad);
6364
6365      // First, combine the bswap away.  This makes the value produced by the
6366      // load dead.
6367      DCI.CombineTo(N, ResVal);
6368
6369      // Next, combine the load away, we give it a bogus result value but a real
6370      // chain result.  The result value is dead because the bswap is dead.
6371      DCI.CombineTo(Load.getNode(), ResVal, BSLoad.getValue(1));
6372
6373      // Return N so it doesn't get rechecked!
6374      return SDValue(N, 0);
6375    }
6376
6377  // Look through bitcasts that retain the number of vector elements.
6378  SDValue Op = N->getOperand(0);
6379  if (Op.getOpcode() == ISD::BITCAST &&
6380      Op.getValueType().isVector() &&
6381      Op.getOperand(0).getValueType().isVector() &&
6382      Op.getValueType().getVectorNumElements() ==
6383      Op.getOperand(0).getValueType().getVectorNumElements())
6384    Op = Op.getOperand(0);
6385
6386  // Push BSWAP into a vector insertion if at least one side then simplifies.
6387  if (Op.getOpcode() == ISD::INSERT_VECTOR_ELT && Op.hasOneUse()) {
6388    SDValue Vec = Op.getOperand(0);
6389    SDValue Elt = Op.getOperand(1);
6390    SDValue Idx = Op.getOperand(2);
6391
6392    if (DAG.isConstantIntBuildVectorOrConstantInt(Vec) ||
6393        Vec.getOpcode() == ISD::BSWAP || Vec.isUndef() ||
6394        DAG.isConstantIntBuildVectorOrConstantInt(Elt) ||
6395        Elt.getOpcode() == ISD::BSWAP || Elt.isUndef() ||
6396        (canLoadStoreByteSwapped(N->getValueType(0)) &&
6397         ISD::isNON_EXTLoad(Elt.getNode()) && Elt.hasOneUse())) {
6398      EVT VecVT = N->getValueType(0);
6399      EVT EltVT = N->getValueType(0).getVectorElementType();
6400      if (VecVT != Vec.getValueType()) {
6401        Vec = DAG.getNode(ISD::BITCAST, SDLoc(N), VecVT, Vec);
6402        DCI.AddToWorklist(Vec.getNode());
6403      }
6404      if (EltVT != Elt.getValueType()) {
6405        Elt = DAG.getNode(ISD::BITCAST, SDLoc(N), EltVT, Elt);
6406        DCI.AddToWorklist(Elt.getNode());
6407      }
6408      Vec = DAG.getNode(ISD::BSWAP, SDLoc(N), VecVT, Vec);
6409      DCI.AddToWorklist(Vec.getNode());
6410      Elt = DAG.getNode(ISD::BSWAP, SDLoc(N), EltVT, Elt);
6411      DCI.AddToWorklist(Elt.getNode());
6412      return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(N), VecVT,
6413                         Vec, Elt, Idx);
6414    }
6415  }
6416
6417  // Push BSWAP into a vector shuffle if at least one side then simplifies.
6418  ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(Op);
6419  if (SV && Op.hasOneUse()) {
6420    SDValue Op0 = Op.getOperand(0);
6421    SDValue Op1 = Op.getOperand(1);
6422
6423    if (DAG.isConstantIntBuildVectorOrConstantInt(Op0) ||
6424        Op0.getOpcode() == ISD::BSWAP || Op0.isUndef() ||
6425        DAG.isConstantIntBuildVectorOrConstantInt(Op1) ||
6426        Op1.getOpcode() == ISD::BSWAP || Op1.isUndef()) {
6427      EVT VecVT = N->getValueType(0);
6428      if (VecVT != Op0.getValueType()) {
6429        Op0 = DAG.getNode(ISD::BITCAST, SDLoc(N), VecVT, Op0);
6430        DCI.AddToWorklist(Op0.getNode());
6431      }
6432      if (VecVT != Op1.getValueType()) {
6433        Op1 = DAG.getNode(ISD::BITCAST, SDLoc(N), VecVT, Op1);
6434        DCI.AddToWorklist(Op1.getNode());
6435      }
6436      Op0 = DAG.getNode(ISD::BSWAP, SDLoc(N), VecVT, Op0);
6437      DCI.AddToWorklist(Op0.getNode());
6438      Op1 = DAG.getNode(ISD::BSWAP, SDLoc(N), VecVT, Op1);
6439      DCI.AddToWorklist(Op1.getNode());
6440      return DAG.getVectorShuffle(VecVT, SDLoc(N), Op0, Op1, SV->getMask());
6441    }
6442  }
6443
6444  return SDValue();
6445}
6446
6447static bool combineCCMask(SDValue &CCReg, int &CCValid, int &CCMask) {
6448  // We have a SELECT_CCMASK or BR_CCMASK comparing the condition code
6449  // set by the CCReg instruction using the CCValid / CCMask masks,
6450  // If the CCReg instruction is itself a ICMP testing the condition
6451  // code set by some other instruction, see whether we can directly
6452  // use that condition code.
6453
6454  // Verify that we have an ICMP against some constant.
6455  if (CCValid != SystemZ::CCMASK_ICMP)
6456    return false;
6457  auto *ICmp = CCReg.getNode();
6458  if (ICmp->getOpcode() != SystemZISD::ICMP)
6459    return false;
6460  auto *CompareLHS = ICmp->getOperand(0).getNode();
6461  auto *CompareRHS = dyn_cast<ConstantSDNode>(ICmp->getOperand(1));
6462  if (!CompareRHS)
6463    return false;
6464
6465  // Optimize the case where CompareLHS is a SELECT_CCMASK.
6466  if (CompareLHS->getOpcode() == SystemZISD::SELECT_CCMASK) {
6467    // Verify that we have an appropriate mask for a EQ or NE comparison.
6468    bool Invert = false;
6469    if (CCMask == SystemZ::CCMASK_CMP_NE)
6470      Invert = !Invert;
6471    else if (CCMask != SystemZ::CCMASK_CMP_EQ)
6472      return false;
6473
6474    // Verify that the ICMP compares against one of select values.
6475    auto *TrueVal = dyn_cast<ConstantSDNode>(CompareLHS->getOperand(0));
6476    if (!TrueVal)
6477      return false;
6478    auto *FalseVal = dyn_cast<ConstantSDNode>(CompareLHS->getOperand(1));
6479    if (!FalseVal)
6480      return false;
6481    if (CompareRHS->getZExtValue() == FalseVal->getZExtValue())
6482      Invert = !Invert;
6483    else if (CompareRHS->getZExtValue() != TrueVal->getZExtValue())
6484      return false;
6485
6486    // Compute the effective CC mask for the new branch or select.
6487    auto *NewCCValid = dyn_cast<ConstantSDNode>(CompareLHS->getOperand(2));
6488    auto *NewCCMask = dyn_cast<ConstantSDNode>(CompareLHS->getOperand(3));
6489    if (!NewCCValid || !NewCCMask)
6490      return false;
6491    CCValid = NewCCValid->getZExtValue();
6492    CCMask = NewCCMask->getZExtValue();
6493    if (Invert)
6494      CCMask ^= CCValid;
6495
6496    // Return the updated CCReg link.
6497    CCReg = CompareLHS->getOperand(4);
6498    return true;
6499  }
6500
6501  // Optimize the case where CompareRHS is (SRA (SHL (IPM))).
6502  if (CompareLHS->getOpcode() == ISD::SRA) {
6503    auto *SRACount = dyn_cast<ConstantSDNode>(CompareLHS->getOperand(1));
6504    if (!SRACount || SRACount->getZExtValue() != 30)
6505      return false;
6506    auto *SHL = CompareLHS->getOperand(0).getNode();
6507    if (SHL->getOpcode() != ISD::SHL)
6508      return false;
6509    auto *SHLCount = dyn_cast<ConstantSDNode>(SHL->getOperand(1));
6510    if (!SHLCount || SHLCount->getZExtValue() != 30 - SystemZ::IPM_CC)
6511      return false;
6512    auto *IPM = SHL->getOperand(0).getNode();
6513    if (IPM->getOpcode() != SystemZISD::IPM)
6514      return false;
6515
6516    // Avoid introducing CC spills (because SRA would clobber CC).
6517    if (!CompareLHS->hasOneUse())
6518      return false;
6519    // Verify that the ICMP compares against zero.
6520    if (CompareRHS->getZExtValue() != 0)
6521      return false;
6522
6523    // Compute the effective CC mask for the new branch or select.
6524    CCMask = SystemZ::reverseCCMask(CCMask);
6525
6526    // Return the updated CCReg link.
6527    CCReg = IPM->getOperand(0);
6528    return true;
6529  }
6530
6531  return false;
6532}
6533
6534SDValue SystemZTargetLowering::combineBR_CCMASK(
6535    SDNode *N, DAGCombinerInfo &DCI) const {
6536  SelectionDAG &DAG = DCI.DAG;
6537
6538  // Combine BR_CCMASK (ICMP (SELECT_CCMASK)) into a single BR_CCMASK.
6539  auto *CCValid = dyn_cast<ConstantSDNode>(N->getOperand(1));
6540  auto *CCMask = dyn_cast<ConstantSDNode>(N->getOperand(2));
6541  if (!CCValid || !CCMask)
6542    return SDValue();
6543
6544  int CCValidVal = CCValid->getZExtValue();
6545  int CCMaskVal = CCMask->getZExtValue();
6546  SDValue Chain = N->getOperand(0);
6547  SDValue CCReg = N->getOperand(4);
6548
6549  if (combineCCMask(CCReg, CCValidVal, CCMaskVal))
6550    return DAG.getNode(SystemZISD::BR_CCMASK, SDLoc(N), N->getValueType(0),
6551                       Chain,
6552                       DAG.getTargetConstant(CCValidVal, SDLoc(N), MVT::i32),
6553                       DAG.getTargetConstant(CCMaskVal, SDLoc(N), MVT::i32),
6554                       N->getOperand(3), CCReg);
6555  return SDValue();
6556}
6557
6558SDValue SystemZTargetLowering::combineSELECT_CCMASK(
6559    SDNode *N, DAGCombinerInfo &DCI) const {
6560  SelectionDAG &DAG = DCI.DAG;
6561
6562  // Combine SELECT_CCMASK (ICMP (SELECT_CCMASK)) into a single SELECT_CCMASK.
6563  auto *CCValid = dyn_cast<ConstantSDNode>(N->getOperand(2));
6564  auto *CCMask = dyn_cast<ConstantSDNode>(N->getOperand(3));
6565  if (!CCValid || !CCMask)
6566    return SDValue();
6567
6568  int CCValidVal = CCValid->getZExtValue();
6569  int CCMaskVal = CCMask->getZExtValue();
6570  SDValue CCReg = N->getOperand(4);
6571
6572  if (combineCCMask(CCReg, CCValidVal, CCMaskVal))
6573    return DAG.getNode(SystemZISD::SELECT_CCMASK, SDLoc(N), N->getValueType(0),
6574                       N->getOperand(0), N->getOperand(1),
6575                       DAG.getTargetConstant(CCValidVal, SDLoc(N), MVT::i32),
6576                       DAG.getTargetConstant(CCMaskVal, SDLoc(N), MVT::i32),
6577                       CCReg);
6578  return SDValue();
6579}
6580
6581
6582SDValue SystemZTargetLowering::combineGET_CCMASK(
6583    SDNode *N, DAGCombinerInfo &DCI) const {
6584
6585  // Optimize away GET_CCMASK (SELECT_CCMASK) if the CC masks are compatible
6586  auto *CCValid = dyn_cast<ConstantSDNode>(N->getOperand(1));
6587  auto *CCMask = dyn_cast<ConstantSDNode>(N->getOperand(2));
6588  if (!CCValid || !CCMask)
6589    return SDValue();
6590  int CCValidVal = CCValid->getZExtValue();
6591  int CCMaskVal = CCMask->getZExtValue();
6592
6593  SDValue Select = N->getOperand(0);
6594  if (Select->getOpcode() != SystemZISD::SELECT_CCMASK)
6595    return SDValue();
6596
6597  auto *SelectCCValid = dyn_cast<ConstantSDNode>(Select->getOperand(2));
6598  auto *SelectCCMask = dyn_cast<ConstantSDNode>(Select->getOperand(3));
6599  if (!SelectCCValid || !SelectCCMask)
6600    return SDValue();
6601  int SelectCCValidVal = SelectCCValid->getZExtValue();
6602  int SelectCCMaskVal = SelectCCMask->getZExtValue();
6603
6604  auto *TrueVal = dyn_cast<ConstantSDNode>(Select->getOperand(0));
6605  auto *FalseVal = dyn_cast<ConstantSDNode>(Select->getOperand(1));
6606  if (!TrueVal || !FalseVal)
6607    return SDValue();
6608  if (TrueVal->getZExtValue() != 0 && FalseVal->getZExtValue() == 0)
6609    ;
6610  else if (TrueVal->getZExtValue() == 0 && FalseVal->getZExtValue() != 0)
6611    SelectCCMaskVal ^= SelectCCValidVal;
6612  else
6613    return SDValue();
6614
6615  if (SelectCCValidVal & ~CCValidVal)
6616    return SDValue();
6617  if (SelectCCMaskVal != (CCMaskVal & SelectCCValidVal))
6618    return SDValue();
6619
6620  return Select->getOperand(4);
6621}
6622
6623SDValue SystemZTargetLowering::combineIntDIVREM(
6624    SDNode *N, DAGCombinerInfo &DCI) const {
6625  SelectionDAG &DAG = DCI.DAG;
6626  EVT VT = N->getValueType(0);
6627  // In the case where the divisor is a vector of constants a cheaper
6628  // sequence of instructions can replace the divide. BuildSDIV is called to
6629  // do this during DAG combining, but it only succeeds when it can build a
6630  // multiplication node. The only option for SystemZ is ISD::SMUL_LOHI, and
6631  // since it is not Legal but Custom it can only happen before
6632  // legalization. Therefore we must scalarize this early before Combine
6633  // 1. For widened vectors, this is already the result of type legalization.
6634  if (DCI.Level == BeforeLegalizeTypes && VT.isVector() && isTypeLegal(VT) &&
6635      DAG.isConstantIntBuildVectorOrConstantInt(N->getOperand(1)))
6636    return DAG.UnrollVectorOp(N);
6637  return SDValue();
6638}
6639
6640SDValue SystemZTargetLowering::combineINTRINSIC(
6641    SDNode *N, DAGCombinerInfo &DCI) const {
6642  SelectionDAG &DAG = DCI.DAG;
6643
6644  unsigned Id = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
6645  switch (Id) {
6646  // VECTOR LOAD (RIGHTMOST) WITH LENGTH with a length operand of 15
6647  // or larger is simply a vector load.
6648  case Intrinsic::s390_vll:
6649  case Intrinsic::s390_vlrl:
6650    if (auto *C = dyn_cast<ConstantSDNode>(N->getOperand(2)))
6651      if (C->getZExtValue() >= 15)
6652        return DAG.getLoad(N->getValueType(0), SDLoc(N), N->getOperand(0),
6653                           N->getOperand(3), MachinePointerInfo());
6654    break;
6655  // Likewise for VECTOR STORE (RIGHTMOST) WITH LENGTH.
6656  case Intrinsic::s390_vstl:
6657  case Intrinsic::s390_vstrl:
6658    if (auto *C = dyn_cast<ConstantSDNode>(N->getOperand(3)))
6659      if (C->getZExtValue() >= 15)
6660        return DAG.getStore(N->getOperand(0), SDLoc(N), N->getOperand(2),
6661                            N->getOperand(4), MachinePointerInfo());
6662    break;
6663  }
6664
6665  return SDValue();
6666}
6667
6668SDValue SystemZTargetLowering::unwrapAddress(SDValue N) const {
6669  if (N->getOpcode() == SystemZISD::PCREL_WRAPPER)
6670    return N->getOperand(0);
6671  return N;
6672}
6673
6674SDValue SystemZTargetLowering::PerformDAGCombine(SDNode *N,
6675                                                 DAGCombinerInfo &DCI) const {
6676  switch(N->getOpcode()) {
6677  default: break;
6678  case ISD::ZERO_EXTEND:        return combineZERO_EXTEND(N, DCI);
6679  case ISD::SIGN_EXTEND:        return combineSIGN_EXTEND(N, DCI);
6680  case ISD::SIGN_EXTEND_INREG:  return combineSIGN_EXTEND_INREG(N, DCI);
6681  case SystemZISD::MERGE_HIGH:
6682  case SystemZISD::MERGE_LOW:   return combineMERGE(N, DCI);
6683  case ISD::LOAD:               return combineLOAD(N, DCI);
6684  case ISD::STORE:              return combineSTORE(N, DCI);
6685  case ISD::VECTOR_SHUFFLE:     return combineVECTOR_SHUFFLE(N, DCI);
6686  case ISD::EXTRACT_VECTOR_ELT: return combineEXTRACT_VECTOR_ELT(N, DCI);
6687  case SystemZISD::JOIN_DWORDS: return combineJOIN_DWORDS(N, DCI);
6688  case ISD::STRICT_FP_ROUND:
6689  case ISD::FP_ROUND:           return combineFP_ROUND(N, DCI);
6690  case ISD::STRICT_FP_EXTEND:
6691  case ISD::FP_EXTEND:          return combineFP_EXTEND(N, DCI);
6692  case ISD::SINT_TO_FP:
6693  case ISD::UINT_TO_FP:         return combineINT_TO_FP(N, DCI);
6694  case ISD::BSWAP:              return combineBSWAP(N, DCI);
6695  case SystemZISD::BR_CCMASK:   return combineBR_CCMASK(N, DCI);
6696  case SystemZISD::SELECT_CCMASK: return combineSELECT_CCMASK(N, DCI);
6697  case SystemZISD::GET_CCMASK:  return combineGET_CCMASK(N, DCI);
6698  case ISD::SDIV:
6699  case ISD::UDIV:
6700  case ISD::SREM:
6701  case ISD::UREM:               return combineIntDIVREM(N, DCI);
6702  case ISD::INTRINSIC_W_CHAIN:
6703  case ISD::INTRINSIC_VOID:     return combineINTRINSIC(N, DCI);
6704  }
6705
6706  return SDValue();
6707}
6708
6709// Return the demanded elements for the OpNo source operand of Op. DemandedElts
6710// are for Op.
6711static APInt getDemandedSrcElements(SDValue Op, const APInt &DemandedElts,
6712                                    unsigned OpNo) {
6713  EVT VT = Op.getValueType();
6714  unsigned NumElts = (VT.isVector() ? VT.getVectorNumElements() : 1);
6715  APInt SrcDemE;
6716  unsigned Opcode = Op.getOpcode();
6717  if (Opcode == ISD::INTRINSIC_WO_CHAIN) {
6718    unsigned Id = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
6719    switch (Id) {
6720    case Intrinsic::s390_vpksh:   // PACKS
6721    case Intrinsic::s390_vpksf:
6722    case Intrinsic::s390_vpksg:
6723    case Intrinsic::s390_vpkshs:  // PACKS_CC
6724    case Intrinsic::s390_vpksfs:
6725    case Intrinsic::s390_vpksgs:
6726    case Intrinsic::s390_vpklsh:  // PACKLS
6727    case Intrinsic::s390_vpklsf:
6728    case Intrinsic::s390_vpklsg:
6729    case Intrinsic::s390_vpklshs: // PACKLS_CC
6730    case Intrinsic::s390_vpklsfs:
6731    case Intrinsic::s390_vpklsgs:
6732      // VECTOR PACK truncates the elements of two source vectors into one.
6733      SrcDemE = DemandedElts;
6734      if (OpNo == 2)
6735        SrcDemE.lshrInPlace(NumElts / 2);
6736      SrcDemE = SrcDemE.trunc(NumElts / 2);
6737      break;
6738      // VECTOR UNPACK extends half the elements of the source vector.
6739    case Intrinsic::s390_vuphb:  // VECTOR UNPACK HIGH
6740    case Intrinsic::s390_vuphh:
6741    case Intrinsic::s390_vuphf:
6742    case Intrinsic::s390_vuplhb: // VECTOR UNPACK LOGICAL HIGH
6743    case Intrinsic::s390_vuplhh:
6744    case Intrinsic::s390_vuplhf:
6745      SrcDemE = APInt(NumElts * 2, 0);
6746      SrcDemE.insertBits(DemandedElts, 0);
6747      break;
6748    case Intrinsic::s390_vuplb:  // VECTOR UNPACK LOW
6749    case Intrinsic::s390_vuplhw:
6750    case Intrinsic::s390_vuplf:
6751    case Intrinsic::s390_vupllb: // VECTOR UNPACK LOGICAL LOW
6752    case Intrinsic::s390_vupllh:
6753    case Intrinsic::s390_vupllf:
6754      SrcDemE = APInt(NumElts * 2, 0);
6755      SrcDemE.insertBits(DemandedElts, NumElts);
6756      break;
6757    case Intrinsic::s390_vpdi: {
6758      // VECTOR PERMUTE DWORD IMMEDIATE selects one element from each source.
6759      SrcDemE = APInt(NumElts, 0);
6760      if (!DemandedElts[OpNo - 1])
6761        break;
6762      unsigned Mask = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
6763      unsigned MaskBit = ((OpNo - 1) ? 1 : 4);
6764      // Demand input element 0 or 1, given by the mask bit value.
6765      SrcDemE.setBit((Mask & MaskBit)? 1 : 0);
6766      break;
6767    }
6768    case Intrinsic::s390_vsldb: {
6769      // VECTOR SHIFT LEFT DOUBLE BY BYTE
6770      assert(VT == MVT::v16i8 && "Unexpected type.");
6771      unsigned FirstIdx = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
6772      assert (FirstIdx > 0 && FirstIdx < 16 && "Unused operand.");
6773      unsigned NumSrc0Els = 16 - FirstIdx;
6774      SrcDemE = APInt(NumElts, 0);
6775      if (OpNo == 1) {
6776        APInt DemEls = DemandedElts.trunc(NumSrc0Els);
6777        SrcDemE.insertBits(DemEls, FirstIdx);
6778      } else {
6779        APInt DemEls = DemandedElts.lshr(NumSrc0Els);
6780        SrcDemE.insertBits(DemEls, 0);
6781      }
6782      break;
6783    }
6784    case Intrinsic::s390_vperm:
6785      SrcDemE = APInt(NumElts, 1);
6786      break;
6787    default:
6788      llvm_unreachable("Unhandled intrinsic.");
6789      break;
6790    }
6791  } else {
6792    switch (Opcode) {
6793    case SystemZISD::JOIN_DWORDS:
6794      // Scalar operand.
6795      SrcDemE = APInt(1, 1);
6796      break;
6797    case SystemZISD::SELECT_CCMASK:
6798      SrcDemE = DemandedElts;
6799      break;
6800    default:
6801      llvm_unreachable("Unhandled opcode.");
6802      break;
6803    }
6804  }
6805  return SrcDemE;
6806}
6807
6808static void computeKnownBitsBinOp(const SDValue Op, KnownBits &Known,
6809                                  const APInt &DemandedElts,
6810                                  const SelectionDAG &DAG, unsigned Depth,
6811                                  unsigned OpNo) {
6812  APInt Src0DemE = getDemandedSrcElements(Op, DemandedElts, OpNo);
6813  APInt Src1DemE = getDemandedSrcElements(Op, DemandedElts, OpNo + 1);
6814  KnownBits LHSKnown =
6815      DAG.computeKnownBits(Op.getOperand(OpNo), Src0DemE, Depth + 1);
6816  KnownBits RHSKnown =
6817      DAG.computeKnownBits(Op.getOperand(OpNo + 1), Src1DemE, Depth + 1);
6818  Known.Zero = LHSKnown.Zero & RHSKnown.Zero;
6819  Known.One = LHSKnown.One & RHSKnown.One;
6820}
6821
6822void
6823SystemZTargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
6824                                                     KnownBits &Known,
6825                                                     const APInt &DemandedElts,
6826                                                     const SelectionDAG &DAG,
6827                                                     unsigned Depth) const {
6828  Known.resetAll();
6829
6830  // Intrinsic CC result is returned in the two low bits.
6831  unsigned tmp0, tmp1; // not used
6832  if (Op.getResNo() == 1 && isIntrinsicWithCC(Op, tmp0, tmp1)) {
6833    Known.Zero.setBitsFrom(2);
6834    return;
6835  }
6836  EVT VT = Op.getValueType();
6837  if (Op.getResNo() != 0 || VT == MVT::Untyped)
6838    return;
6839  assert (Known.getBitWidth() == VT.getScalarSizeInBits() &&
6840          "KnownBits does not match VT in bitwidth");
6841  assert ((!VT.isVector() ||
6842           (DemandedElts.getBitWidth() == VT.getVectorNumElements())) &&
6843          "DemandedElts does not match VT number of elements");
6844  unsigned BitWidth = Known.getBitWidth();
6845  unsigned Opcode = Op.getOpcode();
6846  if (Opcode == ISD::INTRINSIC_WO_CHAIN) {
6847    bool IsLogical = false;
6848    unsigned Id = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
6849    switch (Id) {
6850    case Intrinsic::s390_vpksh:   // PACKS
6851    case Intrinsic::s390_vpksf:
6852    case Intrinsic::s390_vpksg:
6853    case Intrinsic::s390_vpkshs:  // PACKS_CC
6854    case Intrinsic::s390_vpksfs:
6855    case Intrinsic::s390_vpksgs:
6856    case Intrinsic::s390_vpklsh:  // PACKLS
6857    case Intrinsic::s390_vpklsf:
6858    case Intrinsic::s390_vpklsg:
6859    case Intrinsic::s390_vpklshs: // PACKLS_CC
6860    case Intrinsic::s390_vpklsfs:
6861    case Intrinsic::s390_vpklsgs:
6862    case Intrinsic::s390_vpdi:
6863    case Intrinsic::s390_vsldb:
6864    case Intrinsic::s390_vperm:
6865      computeKnownBitsBinOp(Op, Known, DemandedElts, DAG, Depth, 1);
6866      break;
6867    case Intrinsic::s390_vuplhb: // VECTOR UNPACK LOGICAL HIGH
6868    case Intrinsic::s390_vuplhh:
6869    case Intrinsic::s390_vuplhf:
6870    case Intrinsic::s390_vupllb: // VECTOR UNPACK LOGICAL LOW
6871    case Intrinsic::s390_vupllh:
6872    case Intrinsic::s390_vupllf:
6873      IsLogical = true;
6874      LLVM_FALLTHROUGH;
6875    case Intrinsic::s390_vuphb:  // VECTOR UNPACK HIGH
6876    case Intrinsic::s390_vuphh:
6877    case Intrinsic::s390_vuphf:
6878    case Intrinsic::s390_vuplb:  // VECTOR UNPACK LOW
6879    case Intrinsic::s390_vuplhw:
6880    case Intrinsic::s390_vuplf: {
6881      SDValue SrcOp = Op.getOperand(1);
6882      APInt SrcDemE = getDemandedSrcElements(Op, DemandedElts, 0);
6883      Known = DAG.computeKnownBits(SrcOp, SrcDemE, Depth + 1);
6884      if (IsLogical) {
6885        Known = Known.zext(BitWidth);
6886      } else
6887        Known = Known.sext(BitWidth);
6888      break;
6889    }
6890    default:
6891      break;
6892    }
6893  } else {
6894    switch (Opcode) {
6895    case SystemZISD::JOIN_DWORDS:
6896    case SystemZISD::SELECT_CCMASK:
6897      computeKnownBitsBinOp(Op, Known, DemandedElts, DAG, Depth, 0);
6898      break;
6899    case SystemZISD::REPLICATE: {
6900      SDValue SrcOp = Op.getOperand(0);
6901      Known = DAG.computeKnownBits(SrcOp, Depth + 1);
6902      if (Known.getBitWidth() < BitWidth && isa<ConstantSDNode>(SrcOp))
6903        Known = Known.sext(BitWidth); // VREPI sign extends the immedate.
6904      break;
6905    }
6906    default:
6907      break;
6908    }
6909  }
6910
6911  // Known has the width of the source operand(s). Adjust if needed to match
6912  // the passed bitwidth.
6913  if (Known.getBitWidth() != BitWidth)
6914    Known = Known.anyextOrTrunc(BitWidth);
6915}
6916
6917static unsigned computeNumSignBitsBinOp(SDValue Op, const APInt &DemandedElts,
6918                                        const SelectionDAG &DAG, unsigned Depth,
6919                                        unsigned OpNo) {
6920  APInt Src0DemE = getDemandedSrcElements(Op, DemandedElts, OpNo);
6921  unsigned LHS = DAG.ComputeNumSignBits(Op.getOperand(OpNo), Src0DemE, Depth + 1);
6922  if (LHS == 1) return 1; // Early out.
6923  APInt Src1DemE = getDemandedSrcElements(Op, DemandedElts, OpNo + 1);
6924  unsigned RHS = DAG.ComputeNumSignBits(Op.getOperand(OpNo + 1), Src1DemE, Depth + 1);
6925  if (RHS == 1) return 1; // Early out.
6926  unsigned Common = std::min(LHS, RHS);
6927  unsigned SrcBitWidth = Op.getOperand(OpNo).getScalarValueSizeInBits();
6928  EVT VT = Op.getValueType();
6929  unsigned VTBits = VT.getScalarSizeInBits();
6930  if (SrcBitWidth > VTBits) { // PACK
6931    unsigned SrcExtraBits = SrcBitWidth - VTBits;
6932    if (Common > SrcExtraBits)
6933      return (Common - SrcExtraBits);
6934    return 1;
6935  }
6936  assert (SrcBitWidth == VTBits && "Expected operands of same bitwidth.");
6937  return Common;
6938}
6939
6940unsigned
6941SystemZTargetLowering::ComputeNumSignBitsForTargetNode(
6942    SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
6943    unsigned Depth) const {
6944  if (Op.getResNo() != 0)
6945    return 1;
6946  unsigned Opcode = Op.getOpcode();
6947  if (Opcode == ISD::INTRINSIC_WO_CHAIN) {
6948    unsigned Id = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
6949    switch (Id) {
6950    case Intrinsic::s390_vpksh:   // PACKS
6951    case Intrinsic::s390_vpksf:
6952    case Intrinsic::s390_vpksg:
6953    case Intrinsic::s390_vpkshs:  // PACKS_CC
6954    case Intrinsic::s390_vpksfs:
6955    case Intrinsic::s390_vpksgs:
6956    case Intrinsic::s390_vpklsh:  // PACKLS
6957    case Intrinsic::s390_vpklsf:
6958    case Intrinsic::s390_vpklsg:
6959    case Intrinsic::s390_vpklshs: // PACKLS_CC
6960    case Intrinsic::s390_vpklsfs:
6961    case Intrinsic::s390_vpklsgs:
6962    case Intrinsic::s390_vpdi:
6963    case Intrinsic::s390_vsldb:
6964    case Intrinsic::s390_vperm:
6965      return computeNumSignBitsBinOp(Op, DemandedElts, DAG, Depth, 1);
6966    case Intrinsic::s390_vuphb:  // VECTOR UNPACK HIGH
6967    case Intrinsic::s390_vuphh:
6968    case Intrinsic::s390_vuphf:
6969    case Intrinsic::s390_vuplb:  // VECTOR UNPACK LOW
6970    case Intrinsic::s390_vuplhw:
6971    case Intrinsic::s390_vuplf: {
6972      SDValue PackedOp = Op.getOperand(1);
6973      APInt SrcDemE = getDemandedSrcElements(Op, DemandedElts, 1);
6974      unsigned Tmp = DAG.ComputeNumSignBits(PackedOp, SrcDemE, Depth + 1);
6975      EVT VT = Op.getValueType();
6976      unsigned VTBits = VT.getScalarSizeInBits();
6977      Tmp += VTBits - PackedOp.getScalarValueSizeInBits();
6978      return Tmp;
6979    }
6980    default:
6981      break;
6982    }
6983  } else {
6984    switch (Opcode) {
6985    case SystemZISD::SELECT_CCMASK:
6986      return computeNumSignBitsBinOp(Op, DemandedElts, DAG, Depth, 0);
6987    default:
6988      break;
6989    }
6990  }
6991
6992  return 1;
6993}
6994
6995unsigned
6996SystemZTargetLowering::getStackProbeSize(MachineFunction &MF) const {
6997  const TargetFrameLowering *TFI = Subtarget.getFrameLowering();
6998  unsigned StackAlign = TFI->getStackAlignment();
6999  assert(StackAlign >=1 && isPowerOf2_32(StackAlign) &&
7000         "Unexpected stack alignment");
7001  // The default stack probe size is 4096 if the function has no
7002  // stack-probe-size attribute.
7003  unsigned StackProbeSize = 4096;
7004  const Function &Fn = MF.getFunction();
7005  if (Fn.hasFnAttribute("stack-probe-size"))
7006    Fn.getFnAttribute("stack-probe-size")
7007        .getValueAsString()
7008        .getAsInteger(0, StackProbeSize);
7009  // Round down to the stack alignment.
7010  StackProbeSize &= ~(StackAlign - 1);
7011  return StackProbeSize ? StackProbeSize : StackAlign;
7012}
7013
7014//===----------------------------------------------------------------------===//
7015// Custom insertion
7016//===----------------------------------------------------------------------===//
7017
7018// Force base value Base into a register before MI.  Return the register.
7019static Register forceReg(MachineInstr &MI, MachineOperand &Base,
7020                         const SystemZInstrInfo *TII) {
7021  if (Base.isReg())
7022    return Base.getReg();
7023
7024  MachineBasicBlock *MBB = MI.getParent();
7025  MachineFunction &MF = *MBB->getParent();
7026  MachineRegisterInfo &MRI = MF.getRegInfo();
7027
7028  Register Reg = MRI.createVirtualRegister(&SystemZ::ADDR64BitRegClass);
7029  BuildMI(*MBB, MI, MI.getDebugLoc(), TII->get(SystemZ::LA), Reg)
7030      .add(Base)
7031      .addImm(0)
7032      .addReg(0);
7033  return Reg;
7034}
7035
7036// The CC operand of MI might be missing a kill marker because there
7037// were multiple uses of CC, and ISel didn't know which to mark.
7038// Figure out whether MI should have had a kill marker.
7039static bool checkCCKill(MachineInstr &MI, MachineBasicBlock *MBB) {
7040  // Scan forward through BB for a use/def of CC.
7041  MachineBasicBlock::iterator miI(std::next(MachineBasicBlock::iterator(MI)));
7042  for (MachineBasicBlock::iterator miE = MBB->end(); miI != miE; ++miI) {
7043    const MachineInstr& mi = *miI;
7044    if (mi.readsRegister(SystemZ::CC))
7045      return false;
7046    if (mi.definesRegister(SystemZ::CC))
7047      break; // Should have kill-flag - update below.
7048  }
7049
7050  // If we hit the end of the block, check whether CC is live into a
7051  // successor.
7052  if (miI == MBB->end()) {
7053    for (auto SI = MBB->succ_begin(), SE = MBB->succ_end(); SI != SE; ++SI)
7054      if ((*SI)->isLiveIn(SystemZ::CC))
7055        return false;
7056  }
7057
7058  return true;
7059}
7060
7061// Return true if it is OK for this Select pseudo-opcode to be cascaded
7062// together with other Select pseudo-opcodes into a single basic-block with
7063// a conditional jump around it.
7064static bool isSelectPseudo(MachineInstr &MI) {
7065  switch (MI.getOpcode()) {
7066  case SystemZ::Select32:
7067  case SystemZ::Select64:
7068  case SystemZ::SelectF32:
7069  case SystemZ::SelectF64:
7070  case SystemZ::SelectF128:
7071  case SystemZ::SelectVR32:
7072  case SystemZ::SelectVR64:
7073  case SystemZ::SelectVR128:
7074    return true;
7075
7076  default:
7077    return false;
7078  }
7079}
7080
7081// Helper function, which inserts PHI functions into SinkMBB:
7082//   %Result(i) = phi [ %FalseValue(i), FalseMBB ], [ %TrueValue(i), TrueMBB ],
7083// where %FalseValue(i) and %TrueValue(i) are taken from Selects.
7084static void createPHIsForSelects(SmallVector<MachineInstr*, 8> &Selects,
7085                                 MachineBasicBlock *TrueMBB,
7086                                 MachineBasicBlock *FalseMBB,
7087                                 MachineBasicBlock *SinkMBB) {
7088  MachineFunction *MF = TrueMBB->getParent();
7089  const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
7090
7091  MachineInstr *FirstMI = Selects.front();
7092  unsigned CCValid = FirstMI->getOperand(3).getImm();
7093  unsigned CCMask = FirstMI->getOperand(4).getImm();
7094
7095  MachineBasicBlock::iterator SinkInsertionPoint = SinkMBB->begin();
7096
7097  // As we are creating the PHIs, we have to be careful if there is more than
7098  // one.  Later Selects may reference the results of earlier Selects, but later
7099  // PHIs have to reference the individual true/false inputs from earlier PHIs.
7100  // That also means that PHI construction must work forward from earlier to
7101  // later, and that the code must maintain a mapping from earlier PHI's
7102  // destination registers, and the registers that went into the PHI.
7103  DenseMap<unsigned, std::pair<unsigned, unsigned>> RegRewriteTable;
7104
7105  for (auto MI : Selects) {
7106    Register DestReg = MI->getOperand(0).getReg();
7107    Register TrueReg = MI->getOperand(1).getReg();
7108    Register FalseReg = MI->getOperand(2).getReg();
7109
7110    // If this Select we are generating is the opposite condition from
7111    // the jump we generated, then we have to swap the operands for the
7112    // PHI that is going to be generated.
7113    if (MI->getOperand(4).getImm() == (CCValid ^ CCMask))
7114      std::swap(TrueReg, FalseReg);
7115
7116    if (RegRewriteTable.find(TrueReg) != RegRewriteTable.end())
7117      TrueReg = RegRewriteTable[TrueReg].first;
7118
7119    if (RegRewriteTable.find(FalseReg) != RegRewriteTable.end())
7120      FalseReg = RegRewriteTable[FalseReg].second;
7121
7122    DebugLoc DL = MI->getDebugLoc();
7123    BuildMI(*SinkMBB, SinkInsertionPoint, DL, TII->get(SystemZ::PHI), DestReg)
7124      .addReg(TrueReg).addMBB(TrueMBB)
7125      .addReg(FalseReg).addMBB(FalseMBB);
7126
7127    // Add this PHI to the rewrite table.
7128    RegRewriteTable[DestReg] = std::make_pair(TrueReg, FalseReg);
7129  }
7130
7131  MF->getProperties().reset(MachineFunctionProperties::Property::NoPHIs);
7132}
7133
7134// Implement EmitInstrWithCustomInserter for pseudo Select* instruction MI.
7135MachineBasicBlock *
7136SystemZTargetLowering::emitSelect(MachineInstr &MI,
7137                                  MachineBasicBlock *MBB) const {
7138  assert(isSelectPseudo(MI) && "Bad call to emitSelect()");
7139  const SystemZInstrInfo *TII =
7140      static_cast<const SystemZInstrInfo *>(Subtarget.getInstrInfo());
7141
7142  unsigned CCValid = MI.getOperand(3).getImm();
7143  unsigned CCMask = MI.getOperand(4).getImm();
7144
7145  // If we have a sequence of Select* pseudo instructions using the
7146  // same condition code value, we want to expand all of them into
7147  // a single pair of basic blocks using the same condition.
7148  SmallVector<MachineInstr*, 8> Selects;
7149  SmallVector<MachineInstr*, 8> DbgValues;
7150  Selects.push_back(&MI);
7151  unsigned Count = 0;
7152  for (MachineBasicBlock::iterator NextMIIt =
7153         std::next(MachineBasicBlock::iterator(MI));
7154       NextMIIt != MBB->end(); ++NextMIIt) {
7155    if (isSelectPseudo(*NextMIIt)) {
7156      assert(NextMIIt->getOperand(3).getImm() == CCValid &&
7157             "Bad CCValid operands since CC was not redefined.");
7158      if (NextMIIt->getOperand(4).getImm() == CCMask ||
7159          NextMIIt->getOperand(4).getImm() == (CCValid ^ CCMask)) {
7160        Selects.push_back(&*NextMIIt);
7161        continue;
7162      }
7163      break;
7164    }
7165    if (NextMIIt->definesRegister(SystemZ::CC) ||
7166        NextMIIt->usesCustomInsertionHook())
7167      break;
7168    bool User = false;
7169    for (auto SelMI : Selects)
7170      if (NextMIIt->readsVirtualRegister(SelMI->getOperand(0).getReg())) {
7171        User = true;
7172        break;
7173      }
7174    if (NextMIIt->isDebugInstr()) {
7175      if (User) {
7176        assert(NextMIIt->isDebugValue() && "Unhandled debug opcode.");
7177        DbgValues.push_back(&*NextMIIt);
7178      }
7179    }
7180    else if (User || ++Count > 20)
7181      break;
7182  }
7183
7184  MachineInstr *LastMI = Selects.back();
7185  bool CCKilled =
7186      (LastMI->killsRegister(SystemZ::CC) || checkCCKill(*LastMI, MBB));
7187  MachineBasicBlock *StartMBB = MBB;
7188  MachineBasicBlock *JoinMBB  = SystemZ::splitBlockAfter(LastMI, MBB);
7189  MachineBasicBlock *FalseMBB = SystemZ::emitBlockAfter(StartMBB);
7190
7191  // Unless CC was killed in the last Select instruction, mark it as
7192  // live-in to both FalseMBB and JoinMBB.
7193  if (!CCKilled) {
7194    FalseMBB->addLiveIn(SystemZ::CC);
7195    JoinMBB->addLiveIn(SystemZ::CC);
7196  }
7197
7198  //  StartMBB:
7199  //   BRC CCMask, JoinMBB
7200  //   # fallthrough to FalseMBB
7201  MBB = StartMBB;
7202  BuildMI(MBB, MI.getDebugLoc(), TII->get(SystemZ::BRC))
7203    .addImm(CCValid).addImm(CCMask).addMBB(JoinMBB);
7204  MBB->addSuccessor(JoinMBB);
7205  MBB->addSuccessor(FalseMBB);
7206
7207  //  FalseMBB:
7208  //   # fallthrough to JoinMBB
7209  MBB = FalseMBB;
7210  MBB->addSuccessor(JoinMBB);
7211
7212  //  JoinMBB:
7213  //   %Result = phi [ %FalseReg, FalseMBB ], [ %TrueReg, StartMBB ]
7214  //  ...
7215  MBB = JoinMBB;
7216  createPHIsForSelects(Selects, StartMBB, FalseMBB, MBB);
7217  for (auto SelMI : Selects)
7218    SelMI->eraseFromParent();
7219
7220  MachineBasicBlock::iterator InsertPos = MBB->getFirstNonPHI();
7221  for (auto DbgMI : DbgValues)
7222    MBB->splice(InsertPos, StartMBB, DbgMI);
7223
7224  return JoinMBB;
7225}
7226
7227// Implement EmitInstrWithCustomInserter for pseudo CondStore* instruction MI.
7228// StoreOpcode is the store to use and Invert says whether the store should
7229// happen when the condition is false rather than true.  If a STORE ON
7230// CONDITION is available, STOCOpcode is its opcode, otherwise it is 0.
7231MachineBasicBlock *SystemZTargetLowering::emitCondStore(MachineInstr &MI,
7232                                                        MachineBasicBlock *MBB,
7233                                                        unsigned StoreOpcode,
7234                                                        unsigned STOCOpcode,
7235                                                        bool Invert) const {
7236  const SystemZInstrInfo *TII =
7237      static_cast<const SystemZInstrInfo *>(Subtarget.getInstrInfo());
7238
7239  Register SrcReg = MI.getOperand(0).getReg();
7240  MachineOperand Base = MI.getOperand(1);
7241  int64_t Disp = MI.getOperand(2).getImm();
7242  Register IndexReg = MI.getOperand(3).getReg();
7243  unsigned CCValid = MI.getOperand(4).getImm();
7244  unsigned CCMask = MI.getOperand(5).getImm();
7245  DebugLoc DL = MI.getDebugLoc();
7246
7247  StoreOpcode = TII->getOpcodeForOffset(StoreOpcode, Disp);
7248
7249  // Use STOCOpcode if possible.  We could use different store patterns in
7250  // order to avoid matching the index register, but the performance trade-offs
7251  // might be more complicated in that case.
7252  if (STOCOpcode && !IndexReg && Subtarget.hasLoadStoreOnCond()) {
7253    if (Invert)
7254      CCMask ^= CCValid;
7255
7256    // ISel pattern matching also adds a load memory operand of the same
7257    // address, so take special care to find the storing memory operand.
7258    MachineMemOperand *MMO = nullptr;
7259    for (auto *I : MI.memoperands())
7260      if (I->isStore()) {
7261          MMO = I;
7262          break;
7263        }
7264
7265    BuildMI(*MBB, MI, DL, TII->get(STOCOpcode))
7266      .addReg(SrcReg)
7267      .add(Base)
7268      .addImm(Disp)
7269      .addImm(CCValid)
7270      .addImm(CCMask)
7271      .addMemOperand(MMO);
7272
7273    MI.eraseFromParent();
7274    return MBB;
7275  }
7276
7277  // Get the condition needed to branch around the store.
7278  if (!Invert)
7279    CCMask ^= CCValid;
7280
7281  MachineBasicBlock *StartMBB = MBB;
7282  MachineBasicBlock *JoinMBB  = SystemZ::splitBlockBefore(MI, MBB);
7283  MachineBasicBlock *FalseMBB = SystemZ::emitBlockAfter(StartMBB);
7284
7285  // Unless CC was killed in the CondStore instruction, mark it as
7286  // live-in to both FalseMBB and JoinMBB.
7287  if (!MI.killsRegister(SystemZ::CC) && !checkCCKill(MI, JoinMBB)) {
7288    FalseMBB->addLiveIn(SystemZ::CC);
7289    JoinMBB->addLiveIn(SystemZ::CC);
7290  }
7291
7292  //  StartMBB:
7293  //   BRC CCMask, JoinMBB
7294  //   # fallthrough to FalseMBB
7295  MBB = StartMBB;
7296  BuildMI(MBB, DL, TII->get(SystemZ::BRC))
7297    .addImm(CCValid).addImm(CCMask).addMBB(JoinMBB);
7298  MBB->addSuccessor(JoinMBB);
7299  MBB->addSuccessor(FalseMBB);
7300
7301  //  FalseMBB:
7302  //   store %SrcReg, %Disp(%Index,%Base)
7303  //   # fallthrough to JoinMBB
7304  MBB = FalseMBB;
7305  BuildMI(MBB, DL, TII->get(StoreOpcode))
7306      .addReg(SrcReg)
7307      .add(Base)
7308      .addImm(Disp)
7309      .addReg(IndexReg);
7310  MBB->addSuccessor(JoinMBB);
7311
7312  MI.eraseFromParent();
7313  return JoinMBB;
7314}
7315
7316// Implement EmitInstrWithCustomInserter for pseudo ATOMIC_LOAD{,W}_*
7317// or ATOMIC_SWAP{,W} instruction MI.  BinOpcode is the instruction that
7318// performs the binary operation elided by "*", or 0 for ATOMIC_SWAP{,W}.
7319// BitSize is the width of the field in bits, or 0 if this is a partword
7320// ATOMIC_LOADW_* or ATOMIC_SWAPW instruction, in which case the bitsize
7321// is one of the operands.  Invert says whether the field should be
7322// inverted after performing BinOpcode (e.g. for NAND).
7323MachineBasicBlock *SystemZTargetLowering::emitAtomicLoadBinary(
7324    MachineInstr &MI, MachineBasicBlock *MBB, unsigned BinOpcode,
7325    unsigned BitSize, bool Invert) const {
7326  MachineFunction &MF = *MBB->getParent();
7327  const SystemZInstrInfo *TII =
7328      static_cast<const SystemZInstrInfo *>(Subtarget.getInstrInfo());
7329  MachineRegisterInfo &MRI = MF.getRegInfo();
7330  bool IsSubWord = (BitSize < 32);
7331
7332  // Extract the operands.  Base can be a register or a frame index.
7333  // Src2 can be a register or immediate.
7334  Register Dest = MI.getOperand(0).getReg();
7335  MachineOperand Base = earlyUseOperand(MI.getOperand(1));
7336  int64_t Disp = MI.getOperand(2).getImm();
7337  MachineOperand Src2 = earlyUseOperand(MI.getOperand(3));
7338  Register BitShift = IsSubWord ? MI.getOperand(4).getReg() : Register();
7339  Register NegBitShift = IsSubWord ? MI.getOperand(5).getReg() : Register();
7340  DebugLoc DL = MI.getDebugLoc();
7341  if (IsSubWord)
7342    BitSize = MI.getOperand(6).getImm();
7343
7344  // Subword operations use 32-bit registers.
7345  const TargetRegisterClass *RC = (BitSize <= 32 ?
7346                                   &SystemZ::GR32BitRegClass :
7347                                   &SystemZ::GR64BitRegClass);
7348  unsigned LOpcode  = BitSize <= 32 ? SystemZ::L  : SystemZ::LG;
7349  unsigned CSOpcode = BitSize <= 32 ? SystemZ::CS : SystemZ::CSG;
7350
7351  // Get the right opcodes for the displacement.
7352  LOpcode  = TII->getOpcodeForOffset(LOpcode,  Disp);
7353  CSOpcode = TII->getOpcodeForOffset(CSOpcode, Disp);
7354  assert(LOpcode && CSOpcode && "Displacement out of range");
7355
7356  // Create virtual registers for temporary results.
7357  Register OrigVal       = MRI.createVirtualRegister(RC);
7358  Register OldVal        = MRI.createVirtualRegister(RC);
7359  Register NewVal        = (BinOpcode || IsSubWord ?
7360                            MRI.createVirtualRegister(RC) : Src2.getReg());
7361  Register RotatedOldVal = (IsSubWord ? MRI.createVirtualRegister(RC) : OldVal);
7362  Register RotatedNewVal = (IsSubWord ? MRI.createVirtualRegister(RC) : NewVal);
7363
7364  // Insert a basic block for the main loop.
7365  MachineBasicBlock *StartMBB = MBB;
7366  MachineBasicBlock *DoneMBB  = SystemZ::splitBlockBefore(MI, MBB);
7367  MachineBasicBlock *LoopMBB  = SystemZ::emitBlockAfter(StartMBB);
7368
7369  //  StartMBB:
7370  //   ...
7371  //   %OrigVal = L Disp(%Base)
7372  //   # fall through to LoopMMB
7373  MBB = StartMBB;
7374  BuildMI(MBB, DL, TII->get(LOpcode), OrigVal).add(Base).addImm(Disp).addReg(0);
7375  MBB->addSuccessor(LoopMBB);
7376
7377  //  LoopMBB:
7378  //   %OldVal        = phi [ %OrigVal, StartMBB ], [ %Dest, LoopMBB ]
7379  //   %RotatedOldVal = RLL %OldVal, 0(%BitShift)
7380  //   %RotatedNewVal = OP %RotatedOldVal, %Src2
7381  //   %NewVal        = RLL %RotatedNewVal, 0(%NegBitShift)
7382  //   %Dest          = CS %OldVal, %NewVal, Disp(%Base)
7383  //   JNE LoopMBB
7384  //   # fall through to DoneMMB
7385  MBB = LoopMBB;
7386  BuildMI(MBB, DL, TII->get(SystemZ::PHI), OldVal)
7387    .addReg(OrigVal).addMBB(StartMBB)
7388    .addReg(Dest).addMBB(LoopMBB);
7389  if (IsSubWord)
7390    BuildMI(MBB, DL, TII->get(SystemZ::RLL), RotatedOldVal)
7391      .addReg(OldVal).addReg(BitShift).addImm(0);
7392  if (Invert) {
7393    // Perform the operation normally and then invert every bit of the field.
7394    Register Tmp = MRI.createVirtualRegister(RC);
7395    BuildMI(MBB, DL, TII->get(BinOpcode), Tmp).addReg(RotatedOldVal).add(Src2);
7396    if (BitSize <= 32)
7397      // XILF with the upper BitSize bits set.
7398      BuildMI(MBB, DL, TII->get(SystemZ::XILF), RotatedNewVal)
7399        .addReg(Tmp).addImm(-1U << (32 - BitSize));
7400    else {
7401      // Use LCGR and add -1 to the result, which is more compact than
7402      // an XILF, XILH pair.
7403      Register Tmp2 = MRI.createVirtualRegister(RC);
7404      BuildMI(MBB, DL, TII->get(SystemZ::LCGR), Tmp2).addReg(Tmp);
7405      BuildMI(MBB, DL, TII->get(SystemZ::AGHI), RotatedNewVal)
7406        .addReg(Tmp2).addImm(-1);
7407    }
7408  } else if (BinOpcode)
7409    // A simply binary operation.
7410    BuildMI(MBB, DL, TII->get(BinOpcode), RotatedNewVal)
7411        .addReg(RotatedOldVal)
7412        .add(Src2);
7413  else if (IsSubWord)
7414    // Use RISBG to rotate Src2 into position and use it to replace the
7415    // field in RotatedOldVal.
7416    BuildMI(MBB, DL, TII->get(SystemZ::RISBG32), RotatedNewVal)
7417      .addReg(RotatedOldVal).addReg(Src2.getReg())
7418      .addImm(32).addImm(31 + BitSize).addImm(32 - BitSize);
7419  if (IsSubWord)
7420    BuildMI(MBB, DL, TII->get(SystemZ::RLL), NewVal)
7421      .addReg(RotatedNewVal).addReg(NegBitShift).addImm(0);
7422  BuildMI(MBB, DL, TII->get(CSOpcode), Dest)
7423      .addReg(OldVal)
7424      .addReg(NewVal)
7425      .add(Base)
7426      .addImm(Disp);
7427  BuildMI(MBB, DL, TII->get(SystemZ::BRC))
7428    .addImm(SystemZ::CCMASK_CS).addImm(SystemZ::CCMASK_CS_NE).addMBB(LoopMBB);
7429  MBB->addSuccessor(LoopMBB);
7430  MBB->addSuccessor(DoneMBB);
7431
7432  MI.eraseFromParent();
7433  return DoneMBB;
7434}
7435
7436// Implement EmitInstrWithCustomInserter for pseudo
7437// ATOMIC_LOAD{,W}_{,U}{MIN,MAX} instruction MI.  CompareOpcode is the
7438// instruction that should be used to compare the current field with the
7439// minimum or maximum value.  KeepOldMask is the BRC condition-code mask
7440// for when the current field should be kept.  BitSize is the width of
7441// the field in bits, or 0 if this is a partword ATOMIC_LOADW_* instruction.
7442MachineBasicBlock *SystemZTargetLowering::emitAtomicLoadMinMax(
7443    MachineInstr &MI, MachineBasicBlock *MBB, unsigned CompareOpcode,
7444    unsigned KeepOldMask, unsigned BitSize) const {
7445  MachineFunction &MF = *MBB->getParent();
7446  const SystemZInstrInfo *TII =
7447      static_cast<const SystemZInstrInfo *>(Subtarget.getInstrInfo());
7448  MachineRegisterInfo &MRI = MF.getRegInfo();
7449  bool IsSubWord = (BitSize < 32);
7450
7451  // Extract the operands.  Base can be a register or a frame index.
7452  Register Dest = MI.getOperand(0).getReg();
7453  MachineOperand Base = earlyUseOperand(MI.getOperand(1));
7454  int64_t Disp = MI.getOperand(2).getImm();
7455  Register Src2 = MI.getOperand(3).getReg();
7456  Register BitShift = (IsSubWord ? MI.getOperand(4).getReg() : Register());
7457  Register NegBitShift = (IsSubWord ? MI.getOperand(5).getReg() : Register());
7458  DebugLoc DL = MI.getDebugLoc();
7459  if (IsSubWord)
7460    BitSize = MI.getOperand(6).getImm();
7461
7462  // Subword operations use 32-bit registers.
7463  const TargetRegisterClass *RC = (BitSize <= 32 ?
7464                                   &SystemZ::GR32BitRegClass :
7465                                   &SystemZ::GR64BitRegClass);
7466  unsigned LOpcode  = BitSize <= 32 ? SystemZ::L  : SystemZ::LG;
7467  unsigned CSOpcode = BitSize <= 32 ? SystemZ::CS : SystemZ::CSG;
7468
7469  // Get the right opcodes for the displacement.
7470  LOpcode  = TII->getOpcodeForOffset(LOpcode,  Disp);
7471  CSOpcode = TII->getOpcodeForOffset(CSOpcode, Disp);
7472  assert(LOpcode && CSOpcode && "Displacement out of range");
7473
7474  // Create virtual registers for temporary results.
7475  Register OrigVal       = MRI.createVirtualRegister(RC);
7476  Register OldVal        = MRI.createVirtualRegister(RC);
7477  Register NewVal        = MRI.createVirtualRegister(RC);
7478  Register RotatedOldVal = (IsSubWord ? MRI.createVirtualRegister(RC) : OldVal);
7479  Register RotatedAltVal = (IsSubWord ? MRI.createVirtualRegister(RC) : Src2);
7480  Register RotatedNewVal = (IsSubWord ? MRI.createVirtualRegister(RC) : NewVal);
7481
7482  // Insert 3 basic blocks for the loop.
7483  MachineBasicBlock *StartMBB  = MBB;
7484  MachineBasicBlock *DoneMBB   = SystemZ::splitBlockBefore(MI, MBB);
7485  MachineBasicBlock *LoopMBB   = SystemZ::emitBlockAfter(StartMBB);
7486  MachineBasicBlock *UseAltMBB = SystemZ::emitBlockAfter(LoopMBB);
7487  MachineBasicBlock *UpdateMBB = SystemZ::emitBlockAfter(UseAltMBB);
7488
7489  //  StartMBB:
7490  //   ...
7491  //   %OrigVal     = L Disp(%Base)
7492  //   # fall through to LoopMMB
7493  MBB = StartMBB;
7494  BuildMI(MBB, DL, TII->get(LOpcode), OrigVal).add(Base).addImm(Disp).addReg(0);
7495  MBB->addSuccessor(LoopMBB);
7496
7497  //  LoopMBB:
7498  //   %OldVal        = phi [ %OrigVal, StartMBB ], [ %Dest, UpdateMBB ]
7499  //   %RotatedOldVal = RLL %OldVal, 0(%BitShift)
7500  //   CompareOpcode %RotatedOldVal, %Src2
7501  //   BRC KeepOldMask, UpdateMBB
7502  MBB = LoopMBB;
7503  BuildMI(MBB, DL, TII->get(SystemZ::PHI), OldVal)
7504    .addReg(OrigVal).addMBB(StartMBB)
7505    .addReg(Dest).addMBB(UpdateMBB);
7506  if (IsSubWord)
7507    BuildMI(MBB, DL, TII->get(SystemZ::RLL), RotatedOldVal)
7508      .addReg(OldVal).addReg(BitShift).addImm(0);
7509  BuildMI(MBB, DL, TII->get(CompareOpcode))
7510    .addReg(RotatedOldVal).addReg(Src2);
7511  BuildMI(MBB, DL, TII->get(SystemZ::BRC))
7512    .addImm(SystemZ::CCMASK_ICMP).addImm(KeepOldMask).addMBB(UpdateMBB);
7513  MBB->addSuccessor(UpdateMBB);
7514  MBB->addSuccessor(UseAltMBB);
7515
7516  //  UseAltMBB:
7517  //   %RotatedAltVal = RISBG %RotatedOldVal, %Src2, 32, 31 + BitSize, 0
7518  //   # fall through to UpdateMMB
7519  MBB = UseAltMBB;
7520  if (IsSubWord)
7521    BuildMI(MBB, DL, TII->get(SystemZ::RISBG32), RotatedAltVal)
7522      .addReg(RotatedOldVal).addReg(Src2)
7523      .addImm(32).addImm(31 + BitSize).addImm(0);
7524  MBB->addSuccessor(UpdateMBB);
7525
7526  //  UpdateMBB:
7527  //   %RotatedNewVal = PHI [ %RotatedOldVal, LoopMBB ],
7528  //                        [ %RotatedAltVal, UseAltMBB ]
7529  //   %NewVal        = RLL %RotatedNewVal, 0(%NegBitShift)
7530  //   %Dest          = CS %OldVal, %NewVal, Disp(%Base)
7531  //   JNE LoopMBB
7532  //   # fall through to DoneMMB
7533  MBB = UpdateMBB;
7534  BuildMI(MBB, DL, TII->get(SystemZ::PHI), RotatedNewVal)
7535    .addReg(RotatedOldVal).addMBB(LoopMBB)
7536    .addReg(RotatedAltVal).addMBB(UseAltMBB);
7537  if (IsSubWord)
7538    BuildMI(MBB, DL, TII->get(SystemZ::RLL), NewVal)
7539      .addReg(RotatedNewVal).addReg(NegBitShift).addImm(0);
7540  BuildMI(MBB, DL, TII->get(CSOpcode), Dest)
7541      .addReg(OldVal)
7542      .addReg(NewVal)
7543      .add(Base)
7544      .addImm(Disp);
7545  BuildMI(MBB, DL, TII->get(SystemZ::BRC))
7546    .addImm(SystemZ::CCMASK_CS).addImm(SystemZ::CCMASK_CS_NE).addMBB(LoopMBB);
7547  MBB->addSuccessor(LoopMBB);
7548  MBB->addSuccessor(DoneMBB);
7549
7550  MI.eraseFromParent();
7551  return DoneMBB;
7552}
7553
7554// Implement EmitInstrWithCustomInserter for pseudo ATOMIC_CMP_SWAPW
7555// instruction MI.
7556MachineBasicBlock *
7557SystemZTargetLowering::emitAtomicCmpSwapW(MachineInstr &MI,
7558                                          MachineBasicBlock *MBB) const {
7559
7560  MachineFunction &MF = *MBB->getParent();
7561  const SystemZInstrInfo *TII =
7562      static_cast<const SystemZInstrInfo *>(Subtarget.getInstrInfo());
7563  MachineRegisterInfo &MRI = MF.getRegInfo();
7564
7565  // Extract the operands.  Base can be a register or a frame index.
7566  Register Dest = MI.getOperand(0).getReg();
7567  MachineOperand Base = earlyUseOperand(MI.getOperand(1));
7568  int64_t Disp = MI.getOperand(2).getImm();
7569  Register OrigCmpVal = MI.getOperand(3).getReg();
7570  Register OrigSwapVal = MI.getOperand(4).getReg();
7571  Register BitShift = MI.getOperand(5).getReg();
7572  Register NegBitShift = MI.getOperand(6).getReg();
7573  int64_t BitSize = MI.getOperand(7).getImm();
7574  DebugLoc DL = MI.getDebugLoc();
7575
7576  const TargetRegisterClass *RC = &SystemZ::GR32BitRegClass;
7577
7578  // Get the right opcodes for the displacement.
7579  unsigned LOpcode  = TII->getOpcodeForOffset(SystemZ::L,  Disp);
7580  unsigned CSOpcode = TII->getOpcodeForOffset(SystemZ::CS, Disp);
7581  assert(LOpcode && CSOpcode && "Displacement out of range");
7582
7583  // Create virtual registers for temporary results.
7584  Register OrigOldVal = MRI.createVirtualRegister(RC);
7585  Register OldVal = MRI.createVirtualRegister(RC);
7586  Register CmpVal = MRI.createVirtualRegister(RC);
7587  Register SwapVal = MRI.createVirtualRegister(RC);
7588  Register StoreVal = MRI.createVirtualRegister(RC);
7589  Register RetryOldVal = MRI.createVirtualRegister(RC);
7590  Register RetryCmpVal = MRI.createVirtualRegister(RC);
7591  Register RetrySwapVal = MRI.createVirtualRegister(RC);
7592
7593  // Insert 2 basic blocks for the loop.
7594  MachineBasicBlock *StartMBB = MBB;
7595  MachineBasicBlock *DoneMBB  = SystemZ::splitBlockBefore(MI, MBB);
7596  MachineBasicBlock *LoopMBB  = SystemZ::emitBlockAfter(StartMBB);
7597  MachineBasicBlock *SetMBB   = SystemZ::emitBlockAfter(LoopMBB);
7598
7599  //  StartMBB:
7600  //   ...
7601  //   %OrigOldVal     = L Disp(%Base)
7602  //   # fall through to LoopMMB
7603  MBB = StartMBB;
7604  BuildMI(MBB, DL, TII->get(LOpcode), OrigOldVal)
7605      .add(Base)
7606      .addImm(Disp)
7607      .addReg(0);
7608  MBB->addSuccessor(LoopMBB);
7609
7610  //  LoopMBB:
7611  //   %OldVal        = phi [ %OrigOldVal, EntryBB ], [ %RetryOldVal, SetMBB ]
7612  //   %CmpVal        = phi [ %OrigCmpVal, EntryBB ], [ %RetryCmpVal, SetMBB ]
7613  //   %SwapVal       = phi [ %OrigSwapVal, EntryBB ], [ %RetrySwapVal, SetMBB ]
7614  //   %Dest          = RLL %OldVal, BitSize(%BitShift)
7615  //                      ^^ The low BitSize bits contain the field
7616  //                         of interest.
7617  //   %RetryCmpVal   = RISBG32 %CmpVal, %Dest, 32, 63-BitSize, 0
7618  //                      ^^ Replace the upper 32-BitSize bits of the
7619  //                         comparison value with those that we loaded,
7620  //                         so that we can use a full word comparison.
7621  //   CR %Dest, %RetryCmpVal
7622  //   JNE DoneMBB
7623  //   # Fall through to SetMBB
7624  MBB = LoopMBB;
7625  BuildMI(MBB, DL, TII->get(SystemZ::PHI), OldVal)
7626    .addReg(OrigOldVal).addMBB(StartMBB)
7627    .addReg(RetryOldVal).addMBB(SetMBB);
7628  BuildMI(MBB, DL, TII->get(SystemZ::PHI), CmpVal)
7629    .addReg(OrigCmpVal).addMBB(StartMBB)
7630    .addReg(RetryCmpVal).addMBB(SetMBB);
7631  BuildMI(MBB, DL, TII->get(SystemZ::PHI), SwapVal)
7632    .addReg(OrigSwapVal).addMBB(StartMBB)
7633    .addReg(RetrySwapVal).addMBB(SetMBB);
7634  BuildMI(MBB, DL, TII->get(SystemZ::RLL), Dest)
7635    .addReg(OldVal).addReg(BitShift).addImm(BitSize);
7636  BuildMI(MBB, DL, TII->get(SystemZ::RISBG32), RetryCmpVal)
7637    .addReg(CmpVal).addReg(Dest).addImm(32).addImm(63 - BitSize).addImm(0);
7638  BuildMI(MBB, DL, TII->get(SystemZ::CR))
7639    .addReg(Dest).addReg(RetryCmpVal);
7640  BuildMI(MBB, DL, TII->get(SystemZ::BRC))
7641    .addImm(SystemZ::CCMASK_ICMP)
7642    .addImm(SystemZ::CCMASK_CMP_NE).addMBB(DoneMBB);
7643  MBB->addSuccessor(DoneMBB);
7644  MBB->addSuccessor(SetMBB);
7645
7646  //  SetMBB:
7647  //   %RetrySwapVal = RISBG32 %SwapVal, %Dest, 32, 63-BitSize, 0
7648  //                      ^^ Replace the upper 32-BitSize bits of the new
7649  //                         value with those that we loaded.
7650  //   %StoreVal    = RLL %RetrySwapVal, -BitSize(%NegBitShift)
7651  //                      ^^ Rotate the new field to its proper position.
7652  //   %RetryOldVal = CS %Dest, %StoreVal, Disp(%Base)
7653  //   JNE LoopMBB
7654  //   # fall through to ExitMMB
7655  MBB = SetMBB;
7656  BuildMI(MBB, DL, TII->get(SystemZ::RISBG32), RetrySwapVal)
7657    .addReg(SwapVal).addReg(Dest).addImm(32).addImm(63 - BitSize).addImm(0);
7658  BuildMI(MBB, DL, TII->get(SystemZ::RLL), StoreVal)
7659    .addReg(RetrySwapVal).addReg(NegBitShift).addImm(-BitSize);
7660  BuildMI(MBB, DL, TII->get(CSOpcode), RetryOldVal)
7661      .addReg(OldVal)
7662      .addReg(StoreVal)
7663      .add(Base)
7664      .addImm(Disp);
7665  BuildMI(MBB, DL, TII->get(SystemZ::BRC))
7666    .addImm(SystemZ::CCMASK_CS).addImm(SystemZ::CCMASK_CS_NE).addMBB(LoopMBB);
7667  MBB->addSuccessor(LoopMBB);
7668  MBB->addSuccessor(DoneMBB);
7669
7670  // If the CC def wasn't dead in the ATOMIC_CMP_SWAPW, mark CC as live-in
7671  // to the block after the loop.  At this point, CC may have been defined
7672  // either by the CR in LoopMBB or by the CS in SetMBB.
7673  if (!MI.registerDefIsDead(SystemZ::CC))
7674    DoneMBB->addLiveIn(SystemZ::CC);
7675
7676  MI.eraseFromParent();
7677  return DoneMBB;
7678}
7679
7680// Emit a move from two GR64s to a GR128.
7681MachineBasicBlock *
7682SystemZTargetLowering::emitPair128(MachineInstr &MI,
7683                                   MachineBasicBlock *MBB) const {
7684  MachineFunction &MF = *MBB->getParent();
7685  const SystemZInstrInfo *TII =
7686      static_cast<const SystemZInstrInfo *>(Subtarget.getInstrInfo());
7687  MachineRegisterInfo &MRI = MF.getRegInfo();
7688  DebugLoc DL = MI.getDebugLoc();
7689
7690  Register Dest = MI.getOperand(0).getReg();
7691  Register Hi = MI.getOperand(1).getReg();
7692  Register Lo = MI.getOperand(2).getReg();
7693  Register Tmp1 = MRI.createVirtualRegister(&SystemZ::GR128BitRegClass);
7694  Register Tmp2 = MRI.createVirtualRegister(&SystemZ::GR128BitRegClass);
7695
7696  BuildMI(*MBB, MI, DL, TII->get(TargetOpcode::IMPLICIT_DEF), Tmp1);
7697  BuildMI(*MBB, MI, DL, TII->get(TargetOpcode::INSERT_SUBREG), Tmp2)
7698    .addReg(Tmp1).addReg(Hi).addImm(SystemZ::subreg_h64);
7699  BuildMI(*MBB, MI, DL, TII->get(TargetOpcode::INSERT_SUBREG), Dest)
7700    .addReg(Tmp2).addReg(Lo).addImm(SystemZ::subreg_l64);
7701
7702  MI.eraseFromParent();
7703  return MBB;
7704}
7705
7706// Emit an extension from a GR64 to a GR128.  ClearEven is true
7707// if the high register of the GR128 value must be cleared or false if
7708// it's "don't care".
7709MachineBasicBlock *SystemZTargetLowering::emitExt128(MachineInstr &MI,
7710                                                     MachineBasicBlock *MBB,
7711                                                     bool ClearEven) const {
7712  MachineFunction &MF = *MBB->getParent();
7713  const SystemZInstrInfo *TII =
7714      static_cast<const SystemZInstrInfo *>(Subtarget.getInstrInfo());
7715  MachineRegisterInfo &MRI = MF.getRegInfo();
7716  DebugLoc DL = MI.getDebugLoc();
7717
7718  Register Dest = MI.getOperand(0).getReg();
7719  Register Src = MI.getOperand(1).getReg();
7720  Register In128 = MRI.createVirtualRegister(&SystemZ::GR128BitRegClass);
7721
7722  BuildMI(*MBB, MI, DL, TII->get(TargetOpcode::IMPLICIT_DEF), In128);
7723  if (ClearEven) {
7724    Register NewIn128 = MRI.createVirtualRegister(&SystemZ::GR128BitRegClass);
7725    Register Zero64 = MRI.createVirtualRegister(&SystemZ::GR64BitRegClass);
7726
7727    BuildMI(*MBB, MI, DL, TII->get(SystemZ::LLILL), Zero64)
7728      .addImm(0);
7729    BuildMI(*MBB, MI, DL, TII->get(TargetOpcode::INSERT_SUBREG), NewIn128)
7730      .addReg(In128).addReg(Zero64).addImm(SystemZ::subreg_h64);
7731    In128 = NewIn128;
7732  }
7733  BuildMI(*MBB, MI, DL, TII->get(TargetOpcode::INSERT_SUBREG), Dest)
7734    .addReg(In128).addReg(Src).addImm(SystemZ::subreg_l64);
7735
7736  MI.eraseFromParent();
7737  return MBB;
7738}
7739
7740MachineBasicBlock *SystemZTargetLowering::emitMemMemWrapper(
7741    MachineInstr &MI, MachineBasicBlock *MBB, unsigned Opcode) const {
7742  MachineFunction &MF = *MBB->getParent();
7743  const SystemZInstrInfo *TII =
7744      static_cast<const SystemZInstrInfo *>(Subtarget.getInstrInfo());
7745  MachineRegisterInfo &MRI = MF.getRegInfo();
7746  DebugLoc DL = MI.getDebugLoc();
7747
7748  MachineOperand DestBase = earlyUseOperand(MI.getOperand(0));
7749  uint64_t DestDisp = MI.getOperand(1).getImm();
7750  MachineOperand SrcBase = earlyUseOperand(MI.getOperand(2));
7751  uint64_t SrcDisp = MI.getOperand(3).getImm();
7752  uint64_t Length = MI.getOperand(4).getImm();
7753
7754  // When generating more than one CLC, all but the last will need to
7755  // branch to the end when a difference is found.
7756  MachineBasicBlock *EndMBB = (Length > 256 && Opcode == SystemZ::CLC ?
7757                               SystemZ::splitBlockAfter(MI, MBB) : nullptr);
7758
7759  // Check for the loop form, in which operand 5 is the trip count.
7760  if (MI.getNumExplicitOperands() > 5) {
7761    bool HaveSingleBase = DestBase.isIdenticalTo(SrcBase);
7762
7763    Register StartCountReg = MI.getOperand(5).getReg();
7764    Register StartSrcReg   = forceReg(MI, SrcBase, TII);
7765    Register StartDestReg  = (HaveSingleBase ? StartSrcReg :
7766                              forceReg(MI, DestBase, TII));
7767
7768    const TargetRegisterClass *RC = &SystemZ::ADDR64BitRegClass;
7769    Register ThisSrcReg  = MRI.createVirtualRegister(RC);
7770    Register ThisDestReg = (HaveSingleBase ? ThisSrcReg :
7771                            MRI.createVirtualRegister(RC));
7772    Register NextSrcReg  = MRI.createVirtualRegister(RC);
7773    Register NextDestReg = (HaveSingleBase ? NextSrcReg :
7774                            MRI.createVirtualRegister(RC));
7775
7776    RC = &SystemZ::GR64BitRegClass;
7777    Register ThisCountReg = MRI.createVirtualRegister(RC);
7778    Register NextCountReg = MRI.createVirtualRegister(RC);
7779
7780    MachineBasicBlock *StartMBB = MBB;
7781    MachineBasicBlock *DoneMBB = SystemZ::splitBlockBefore(MI, MBB);
7782    MachineBasicBlock *LoopMBB = SystemZ::emitBlockAfter(StartMBB);
7783    MachineBasicBlock *NextMBB =
7784        (EndMBB ? SystemZ::emitBlockAfter(LoopMBB) : LoopMBB);
7785
7786    //  StartMBB:
7787    //   # fall through to LoopMMB
7788    MBB->addSuccessor(LoopMBB);
7789
7790    //  LoopMBB:
7791    //   %ThisDestReg = phi [ %StartDestReg, StartMBB ],
7792    //                      [ %NextDestReg, NextMBB ]
7793    //   %ThisSrcReg = phi [ %StartSrcReg, StartMBB ],
7794    //                     [ %NextSrcReg, NextMBB ]
7795    //   %ThisCountReg = phi [ %StartCountReg, StartMBB ],
7796    //                       [ %NextCountReg, NextMBB ]
7797    //   ( PFD 2, 768+DestDisp(%ThisDestReg) )
7798    //   Opcode DestDisp(256,%ThisDestReg), SrcDisp(%ThisSrcReg)
7799    //   ( JLH EndMBB )
7800    //
7801    // The prefetch is used only for MVC.  The JLH is used only for CLC.
7802    MBB = LoopMBB;
7803
7804    BuildMI(MBB, DL, TII->get(SystemZ::PHI), ThisDestReg)
7805      .addReg(StartDestReg).addMBB(StartMBB)
7806      .addReg(NextDestReg).addMBB(NextMBB);
7807    if (!HaveSingleBase)
7808      BuildMI(MBB, DL, TII->get(SystemZ::PHI), ThisSrcReg)
7809        .addReg(StartSrcReg).addMBB(StartMBB)
7810        .addReg(NextSrcReg).addMBB(NextMBB);
7811    BuildMI(MBB, DL, TII->get(SystemZ::PHI), ThisCountReg)
7812      .addReg(StartCountReg).addMBB(StartMBB)
7813      .addReg(NextCountReg).addMBB(NextMBB);
7814    if (Opcode == SystemZ::MVC)
7815      BuildMI(MBB, DL, TII->get(SystemZ::PFD))
7816        .addImm(SystemZ::PFD_WRITE)
7817        .addReg(ThisDestReg).addImm(DestDisp + 768).addReg(0);
7818    BuildMI(MBB, DL, TII->get(Opcode))
7819      .addReg(ThisDestReg).addImm(DestDisp).addImm(256)
7820      .addReg(ThisSrcReg).addImm(SrcDisp);
7821    if (EndMBB) {
7822      BuildMI(MBB, DL, TII->get(SystemZ::BRC))
7823        .addImm(SystemZ::CCMASK_ICMP).addImm(SystemZ::CCMASK_CMP_NE)
7824        .addMBB(EndMBB);
7825      MBB->addSuccessor(EndMBB);
7826      MBB->addSuccessor(NextMBB);
7827    }
7828
7829    // NextMBB:
7830    //   %NextDestReg = LA 256(%ThisDestReg)
7831    //   %NextSrcReg = LA 256(%ThisSrcReg)
7832    //   %NextCountReg = AGHI %ThisCountReg, -1
7833    //   CGHI %NextCountReg, 0
7834    //   JLH LoopMBB
7835    //   # fall through to DoneMMB
7836    //
7837    // The AGHI, CGHI and JLH should be converted to BRCTG by later passes.
7838    MBB = NextMBB;
7839
7840    BuildMI(MBB, DL, TII->get(SystemZ::LA), NextDestReg)
7841      .addReg(ThisDestReg).addImm(256).addReg(0);
7842    if (!HaveSingleBase)
7843      BuildMI(MBB, DL, TII->get(SystemZ::LA), NextSrcReg)
7844        .addReg(ThisSrcReg).addImm(256).addReg(0);
7845    BuildMI(MBB, DL, TII->get(SystemZ::AGHI), NextCountReg)
7846      .addReg(ThisCountReg).addImm(-1);
7847    BuildMI(MBB, DL, TII->get(SystemZ::CGHI))
7848      .addReg(NextCountReg).addImm(0);
7849    BuildMI(MBB, DL, TII->get(SystemZ::BRC))
7850      .addImm(SystemZ::CCMASK_ICMP).addImm(SystemZ::CCMASK_CMP_NE)
7851      .addMBB(LoopMBB);
7852    MBB->addSuccessor(LoopMBB);
7853    MBB->addSuccessor(DoneMBB);
7854
7855    DestBase = MachineOperand::CreateReg(NextDestReg, false);
7856    SrcBase = MachineOperand::CreateReg(NextSrcReg, false);
7857    Length &= 255;
7858    if (EndMBB && !Length)
7859      // If the loop handled the whole CLC range, DoneMBB will be empty with
7860      // CC live-through into EndMBB, so add it as live-in.
7861      DoneMBB->addLiveIn(SystemZ::CC);
7862    MBB = DoneMBB;
7863  }
7864  // Handle any remaining bytes with straight-line code.
7865  while (Length > 0) {
7866    uint64_t ThisLength = std::min(Length, uint64_t(256));
7867    // The previous iteration might have created out-of-range displacements.
7868    // Apply them using LAY if so.
7869    if (!isUInt<12>(DestDisp)) {
7870      Register Reg = MRI.createVirtualRegister(&SystemZ::ADDR64BitRegClass);
7871      BuildMI(*MBB, MI, MI.getDebugLoc(), TII->get(SystemZ::LAY), Reg)
7872          .add(DestBase)
7873          .addImm(DestDisp)
7874          .addReg(0);
7875      DestBase = MachineOperand::CreateReg(Reg, false);
7876      DestDisp = 0;
7877    }
7878    if (!isUInt<12>(SrcDisp)) {
7879      Register Reg = MRI.createVirtualRegister(&SystemZ::ADDR64BitRegClass);
7880      BuildMI(*MBB, MI, MI.getDebugLoc(), TII->get(SystemZ::LAY), Reg)
7881          .add(SrcBase)
7882          .addImm(SrcDisp)
7883          .addReg(0);
7884      SrcBase = MachineOperand::CreateReg(Reg, false);
7885      SrcDisp = 0;
7886    }
7887    BuildMI(*MBB, MI, DL, TII->get(Opcode))
7888        .add(DestBase)
7889        .addImm(DestDisp)
7890        .addImm(ThisLength)
7891        .add(SrcBase)
7892        .addImm(SrcDisp)
7893        .setMemRefs(MI.memoperands());
7894    DestDisp += ThisLength;
7895    SrcDisp += ThisLength;
7896    Length -= ThisLength;
7897    // If there's another CLC to go, branch to the end if a difference
7898    // was found.
7899    if (EndMBB && Length > 0) {
7900      MachineBasicBlock *NextMBB = SystemZ::splitBlockBefore(MI, MBB);
7901      BuildMI(MBB, DL, TII->get(SystemZ::BRC))
7902        .addImm(SystemZ::CCMASK_ICMP).addImm(SystemZ::CCMASK_CMP_NE)
7903        .addMBB(EndMBB);
7904      MBB->addSuccessor(EndMBB);
7905      MBB->addSuccessor(NextMBB);
7906      MBB = NextMBB;
7907    }
7908  }
7909  if (EndMBB) {
7910    MBB->addSuccessor(EndMBB);
7911    MBB = EndMBB;
7912    MBB->addLiveIn(SystemZ::CC);
7913  }
7914
7915  MI.eraseFromParent();
7916  return MBB;
7917}
7918
7919// Decompose string pseudo-instruction MI into a loop that continually performs
7920// Opcode until CC != 3.
7921MachineBasicBlock *SystemZTargetLowering::emitStringWrapper(
7922    MachineInstr &MI, MachineBasicBlock *MBB, unsigned Opcode) const {
7923  MachineFunction &MF = *MBB->getParent();
7924  const SystemZInstrInfo *TII =
7925      static_cast<const SystemZInstrInfo *>(Subtarget.getInstrInfo());
7926  MachineRegisterInfo &MRI = MF.getRegInfo();
7927  DebugLoc DL = MI.getDebugLoc();
7928
7929  uint64_t End1Reg = MI.getOperand(0).getReg();
7930  uint64_t Start1Reg = MI.getOperand(1).getReg();
7931  uint64_t Start2Reg = MI.getOperand(2).getReg();
7932  uint64_t CharReg = MI.getOperand(3).getReg();
7933
7934  const TargetRegisterClass *RC = &SystemZ::GR64BitRegClass;
7935  uint64_t This1Reg = MRI.createVirtualRegister(RC);
7936  uint64_t This2Reg = MRI.createVirtualRegister(RC);
7937  uint64_t End2Reg  = MRI.createVirtualRegister(RC);
7938
7939  MachineBasicBlock *StartMBB = MBB;
7940  MachineBasicBlock *DoneMBB = SystemZ::splitBlockBefore(MI, MBB);
7941  MachineBasicBlock *LoopMBB = SystemZ::emitBlockAfter(StartMBB);
7942
7943  //  StartMBB:
7944  //   # fall through to LoopMMB
7945  MBB->addSuccessor(LoopMBB);
7946
7947  //  LoopMBB:
7948  //   %This1Reg = phi [ %Start1Reg, StartMBB ], [ %End1Reg, LoopMBB ]
7949  //   %This2Reg = phi [ %Start2Reg, StartMBB ], [ %End2Reg, LoopMBB ]
7950  //   R0L = %CharReg
7951  //   %End1Reg, %End2Reg = CLST %This1Reg, %This2Reg -- uses R0L
7952  //   JO LoopMBB
7953  //   # fall through to DoneMMB
7954  //
7955  // The load of R0L can be hoisted by post-RA LICM.
7956  MBB = LoopMBB;
7957
7958  BuildMI(MBB, DL, TII->get(SystemZ::PHI), This1Reg)
7959    .addReg(Start1Reg).addMBB(StartMBB)
7960    .addReg(End1Reg).addMBB(LoopMBB);
7961  BuildMI(MBB, DL, TII->get(SystemZ::PHI), This2Reg)
7962    .addReg(Start2Reg).addMBB(StartMBB)
7963    .addReg(End2Reg).addMBB(LoopMBB);
7964  BuildMI(MBB, DL, TII->get(TargetOpcode::COPY), SystemZ::R0L).addReg(CharReg);
7965  BuildMI(MBB, DL, TII->get(Opcode))
7966    .addReg(End1Reg, RegState::Define).addReg(End2Reg, RegState::Define)
7967    .addReg(This1Reg).addReg(This2Reg);
7968  BuildMI(MBB, DL, TII->get(SystemZ::BRC))
7969    .addImm(SystemZ::CCMASK_ANY).addImm(SystemZ::CCMASK_3).addMBB(LoopMBB);
7970  MBB->addSuccessor(LoopMBB);
7971  MBB->addSuccessor(DoneMBB);
7972
7973  DoneMBB->addLiveIn(SystemZ::CC);
7974
7975  MI.eraseFromParent();
7976  return DoneMBB;
7977}
7978
7979// Update TBEGIN instruction with final opcode and register clobbers.
7980MachineBasicBlock *SystemZTargetLowering::emitTransactionBegin(
7981    MachineInstr &MI, MachineBasicBlock *MBB, unsigned Opcode,
7982    bool NoFloat) const {
7983  MachineFunction &MF = *MBB->getParent();
7984  const TargetFrameLowering *TFI = Subtarget.getFrameLowering();
7985  const SystemZInstrInfo *TII = Subtarget.getInstrInfo();
7986
7987  // Update opcode.
7988  MI.setDesc(TII->get(Opcode));
7989
7990  // We cannot handle a TBEGIN that clobbers the stack or frame pointer.
7991  // Make sure to add the corresponding GRSM bits if they are missing.
7992  uint64_t Control = MI.getOperand(2).getImm();
7993  static const unsigned GPRControlBit[16] = {
7994    0x8000, 0x8000, 0x4000, 0x4000, 0x2000, 0x2000, 0x1000, 0x1000,
7995    0x0800, 0x0800, 0x0400, 0x0400, 0x0200, 0x0200, 0x0100, 0x0100
7996  };
7997  Control |= GPRControlBit[15];
7998  if (TFI->hasFP(MF))
7999    Control |= GPRControlBit[11];
8000  MI.getOperand(2).setImm(Control);
8001
8002  // Add GPR clobbers.
8003  for (int I = 0; I < 16; I++) {
8004    if ((Control & GPRControlBit[I]) == 0) {
8005      unsigned Reg = SystemZMC::GR64Regs[I];
8006      MI.addOperand(MachineOperand::CreateReg(Reg, true, true));
8007    }
8008  }
8009
8010  // Add FPR/VR clobbers.
8011  if (!NoFloat && (Control & 4) != 0) {
8012    if (Subtarget.hasVector()) {
8013      for (int I = 0; I < 32; I++) {
8014        unsigned Reg = SystemZMC::VR128Regs[I];
8015        MI.addOperand(MachineOperand::CreateReg(Reg, true, true));
8016      }
8017    } else {
8018      for (int I = 0; I < 16; I++) {
8019        unsigned Reg = SystemZMC::FP64Regs[I];
8020        MI.addOperand(MachineOperand::CreateReg(Reg, true, true));
8021      }
8022    }
8023  }
8024
8025  return MBB;
8026}
8027
8028MachineBasicBlock *SystemZTargetLowering::emitLoadAndTestCmp0(
8029    MachineInstr &MI, MachineBasicBlock *MBB, unsigned Opcode) const {
8030  MachineFunction &MF = *MBB->getParent();
8031  MachineRegisterInfo *MRI = &MF.getRegInfo();
8032  const SystemZInstrInfo *TII =
8033      static_cast<const SystemZInstrInfo *>(Subtarget.getInstrInfo());
8034  DebugLoc DL = MI.getDebugLoc();
8035
8036  Register SrcReg = MI.getOperand(0).getReg();
8037
8038  // Create new virtual register of the same class as source.
8039  const TargetRegisterClass *RC = MRI->getRegClass(SrcReg);
8040  Register DstReg = MRI->createVirtualRegister(RC);
8041
8042  // Replace pseudo with a normal load-and-test that models the def as
8043  // well.
8044  BuildMI(*MBB, MI, DL, TII->get(Opcode), DstReg)
8045    .addReg(SrcReg)
8046    .setMIFlags(MI.getFlags());
8047  MI.eraseFromParent();
8048
8049  return MBB;
8050}
8051
8052MachineBasicBlock *SystemZTargetLowering::emitProbedAlloca(
8053    MachineInstr &MI, MachineBasicBlock *MBB) const {
8054  MachineFunction &MF = *MBB->getParent();
8055  MachineRegisterInfo *MRI = &MF.getRegInfo();
8056  const SystemZInstrInfo *TII =
8057      static_cast<const SystemZInstrInfo *>(Subtarget.getInstrInfo());
8058  DebugLoc DL = MI.getDebugLoc();
8059  const unsigned ProbeSize = getStackProbeSize(MF);
8060  Register DstReg = MI.getOperand(0).getReg();
8061  Register SizeReg = MI.getOperand(2).getReg();
8062
8063  MachineBasicBlock *StartMBB = MBB;
8064  MachineBasicBlock *DoneMBB  = SystemZ::splitBlockAfter(MI, MBB);
8065  MachineBasicBlock *LoopTestMBB  = SystemZ::emitBlockAfter(StartMBB);
8066  MachineBasicBlock *LoopBodyMBB = SystemZ::emitBlockAfter(LoopTestMBB);
8067  MachineBasicBlock *TailTestMBB = SystemZ::emitBlockAfter(LoopBodyMBB);
8068  MachineBasicBlock *TailMBB = SystemZ::emitBlockAfter(TailTestMBB);
8069
8070  MachineMemOperand *VolLdMMO = MF.getMachineMemOperand(MachinePointerInfo(),
8071    MachineMemOperand::MOVolatile | MachineMemOperand::MOLoad, 8, Align(1));
8072
8073  Register PHIReg = MRI->createVirtualRegister(&SystemZ::ADDR64BitRegClass);
8074  Register IncReg = MRI->createVirtualRegister(&SystemZ::ADDR64BitRegClass);
8075
8076  //  LoopTestMBB
8077  //  BRC TailTestMBB
8078  //  # fallthrough to LoopBodyMBB
8079  StartMBB->addSuccessor(LoopTestMBB);
8080  MBB = LoopTestMBB;
8081  BuildMI(MBB, DL, TII->get(SystemZ::PHI), PHIReg)
8082    .addReg(SizeReg)
8083    .addMBB(StartMBB)
8084    .addReg(IncReg)
8085    .addMBB(LoopBodyMBB);
8086  BuildMI(MBB, DL, TII->get(SystemZ::CLGFI))
8087    .addReg(PHIReg)
8088    .addImm(ProbeSize);
8089  BuildMI(MBB, DL, TII->get(SystemZ::BRC))
8090    .addImm(SystemZ::CCMASK_ICMP).addImm(SystemZ::CCMASK_CMP_LT)
8091    .addMBB(TailTestMBB);
8092  MBB->addSuccessor(LoopBodyMBB);
8093  MBB->addSuccessor(TailTestMBB);
8094
8095  //  LoopBodyMBB: Allocate and probe by means of a volatile compare.
8096  //  J LoopTestMBB
8097  MBB = LoopBodyMBB;
8098  BuildMI(MBB, DL, TII->get(SystemZ::SLGFI), IncReg)
8099    .addReg(PHIReg)
8100    .addImm(ProbeSize);
8101  BuildMI(MBB, DL, TII->get(SystemZ::SLGFI), SystemZ::R15D)
8102    .addReg(SystemZ::R15D)
8103    .addImm(ProbeSize);
8104  BuildMI(MBB, DL, TII->get(SystemZ::CG)).addReg(SystemZ::R15D)
8105    .addReg(SystemZ::R15D).addImm(ProbeSize - 8).addReg(0)
8106    .setMemRefs(VolLdMMO);
8107  BuildMI(MBB, DL, TII->get(SystemZ::J)).addMBB(LoopTestMBB);
8108  MBB->addSuccessor(LoopTestMBB);
8109
8110  //  TailTestMBB
8111  //  BRC DoneMBB
8112  //  # fallthrough to TailMBB
8113  MBB = TailTestMBB;
8114  BuildMI(MBB, DL, TII->get(SystemZ::CGHI))
8115    .addReg(PHIReg)
8116    .addImm(0);
8117  BuildMI(MBB, DL, TII->get(SystemZ::BRC))
8118    .addImm(SystemZ::CCMASK_ICMP).addImm(SystemZ::CCMASK_CMP_EQ)
8119    .addMBB(DoneMBB);
8120  MBB->addSuccessor(TailMBB);
8121  MBB->addSuccessor(DoneMBB);
8122
8123  //  TailMBB
8124  //  # fallthrough to DoneMBB
8125  MBB = TailMBB;
8126  BuildMI(MBB, DL, TII->get(SystemZ::SLGR), SystemZ::R15D)
8127    .addReg(SystemZ::R15D)
8128    .addReg(PHIReg);
8129  BuildMI(MBB, DL, TII->get(SystemZ::CG)).addReg(SystemZ::R15D)
8130    .addReg(SystemZ::R15D).addImm(-8).addReg(PHIReg)
8131    .setMemRefs(VolLdMMO);
8132  MBB->addSuccessor(DoneMBB);
8133
8134  //  DoneMBB
8135  MBB = DoneMBB;
8136  BuildMI(*MBB, MBB->begin(), DL, TII->get(TargetOpcode::COPY), DstReg)
8137    .addReg(SystemZ::R15D);
8138
8139  MI.eraseFromParent();
8140  return DoneMBB;
8141}
8142
8143MachineBasicBlock *SystemZTargetLowering::EmitInstrWithCustomInserter(
8144    MachineInstr &MI, MachineBasicBlock *MBB) const {
8145  switch (MI.getOpcode()) {
8146  case SystemZ::Select32:
8147  case SystemZ::Select64:
8148  case SystemZ::SelectF32:
8149  case SystemZ::SelectF64:
8150  case SystemZ::SelectF128:
8151  case SystemZ::SelectVR32:
8152  case SystemZ::SelectVR64:
8153  case SystemZ::SelectVR128:
8154    return emitSelect(MI, MBB);
8155
8156  case SystemZ::CondStore8Mux:
8157    return emitCondStore(MI, MBB, SystemZ::STCMux, 0, false);
8158  case SystemZ::CondStore8MuxInv:
8159    return emitCondStore(MI, MBB, SystemZ::STCMux, 0, true);
8160  case SystemZ::CondStore16Mux:
8161    return emitCondStore(MI, MBB, SystemZ::STHMux, 0, false);
8162  case SystemZ::CondStore16MuxInv:
8163    return emitCondStore(MI, MBB, SystemZ::STHMux, 0, true);
8164  case SystemZ::CondStore32Mux:
8165    return emitCondStore(MI, MBB, SystemZ::STMux, SystemZ::STOCMux, false);
8166  case SystemZ::CondStore32MuxInv:
8167    return emitCondStore(MI, MBB, SystemZ::STMux, SystemZ::STOCMux, true);
8168  case SystemZ::CondStore8:
8169    return emitCondStore(MI, MBB, SystemZ::STC, 0, false);
8170  case SystemZ::CondStore8Inv:
8171    return emitCondStore(MI, MBB, SystemZ::STC, 0, true);
8172  case SystemZ::CondStore16:
8173    return emitCondStore(MI, MBB, SystemZ::STH, 0, false);
8174  case SystemZ::CondStore16Inv:
8175    return emitCondStore(MI, MBB, SystemZ::STH, 0, true);
8176  case SystemZ::CondStore32:
8177    return emitCondStore(MI, MBB, SystemZ::ST, SystemZ::STOC, false);
8178  case SystemZ::CondStore32Inv:
8179    return emitCondStore(MI, MBB, SystemZ::ST, SystemZ::STOC, true);
8180  case SystemZ::CondStore64:
8181    return emitCondStore(MI, MBB, SystemZ::STG, SystemZ::STOCG, false);
8182  case SystemZ::CondStore64Inv:
8183    return emitCondStore(MI, MBB, SystemZ::STG, SystemZ::STOCG, true);
8184  case SystemZ::CondStoreF32:
8185    return emitCondStore(MI, MBB, SystemZ::STE, 0, false);
8186  case SystemZ::CondStoreF32Inv:
8187    return emitCondStore(MI, MBB, SystemZ::STE, 0, true);
8188  case SystemZ::CondStoreF64:
8189    return emitCondStore(MI, MBB, SystemZ::STD, 0, false);
8190  case SystemZ::CondStoreF64Inv:
8191    return emitCondStore(MI, MBB, SystemZ::STD, 0, true);
8192
8193  case SystemZ::PAIR128:
8194    return emitPair128(MI, MBB);
8195  case SystemZ::AEXT128:
8196    return emitExt128(MI, MBB, false);
8197  case SystemZ::ZEXT128:
8198    return emitExt128(MI, MBB, true);
8199
8200  case SystemZ::ATOMIC_SWAPW:
8201    return emitAtomicLoadBinary(MI, MBB, 0, 0);
8202  case SystemZ::ATOMIC_SWAP_32:
8203    return emitAtomicLoadBinary(MI, MBB, 0, 32);
8204  case SystemZ::ATOMIC_SWAP_64:
8205    return emitAtomicLoadBinary(MI, MBB, 0, 64);
8206
8207  case SystemZ::ATOMIC_LOADW_AR:
8208    return emitAtomicLoadBinary(MI, MBB, SystemZ::AR, 0);
8209  case SystemZ::ATOMIC_LOADW_AFI:
8210    return emitAtomicLoadBinary(MI, MBB, SystemZ::AFI, 0);
8211  case SystemZ::ATOMIC_LOAD_AR:
8212    return emitAtomicLoadBinary(MI, MBB, SystemZ::AR, 32);
8213  case SystemZ::ATOMIC_LOAD_AHI:
8214    return emitAtomicLoadBinary(MI, MBB, SystemZ::AHI, 32);
8215  case SystemZ::ATOMIC_LOAD_AFI:
8216    return emitAtomicLoadBinary(MI, MBB, SystemZ::AFI, 32);
8217  case SystemZ::ATOMIC_LOAD_AGR:
8218    return emitAtomicLoadBinary(MI, MBB, SystemZ::AGR, 64);
8219  case SystemZ::ATOMIC_LOAD_AGHI:
8220    return emitAtomicLoadBinary(MI, MBB, SystemZ::AGHI, 64);
8221  case SystemZ::ATOMIC_LOAD_AGFI:
8222    return emitAtomicLoadBinary(MI, MBB, SystemZ::AGFI, 64);
8223
8224  case SystemZ::ATOMIC_LOADW_SR:
8225    return emitAtomicLoadBinary(MI, MBB, SystemZ::SR, 0);
8226  case SystemZ::ATOMIC_LOAD_SR:
8227    return emitAtomicLoadBinary(MI, MBB, SystemZ::SR, 32);
8228  case SystemZ::ATOMIC_LOAD_SGR:
8229    return emitAtomicLoadBinary(MI, MBB, SystemZ::SGR, 64);
8230
8231  case SystemZ::ATOMIC_LOADW_NR:
8232    return emitAtomicLoadBinary(MI, MBB, SystemZ::NR, 0);
8233  case SystemZ::ATOMIC_LOADW_NILH:
8234    return emitAtomicLoadBinary(MI, MBB, SystemZ::NILH, 0);
8235  case SystemZ::ATOMIC_LOAD_NR:
8236    return emitAtomicLoadBinary(MI, MBB, SystemZ::NR, 32);
8237  case SystemZ::ATOMIC_LOAD_NILL:
8238    return emitAtomicLoadBinary(MI, MBB, SystemZ::NILL, 32);
8239  case SystemZ::ATOMIC_LOAD_NILH:
8240    return emitAtomicLoadBinary(MI, MBB, SystemZ::NILH, 32);
8241  case SystemZ::ATOMIC_LOAD_NILF:
8242    return emitAtomicLoadBinary(MI, MBB, SystemZ::NILF, 32);
8243  case SystemZ::ATOMIC_LOAD_NGR:
8244    return emitAtomicLoadBinary(MI, MBB, SystemZ::NGR, 64);
8245  case SystemZ::ATOMIC_LOAD_NILL64:
8246    return emitAtomicLoadBinary(MI, MBB, SystemZ::NILL64, 64);
8247  case SystemZ::ATOMIC_LOAD_NILH64:
8248    return emitAtomicLoadBinary(MI, MBB, SystemZ::NILH64, 64);
8249  case SystemZ::ATOMIC_LOAD_NIHL64:
8250    return emitAtomicLoadBinary(MI, MBB, SystemZ::NIHL64, 64);
8251  case SystemZ::ATOMIC_LOAD_NIHH64:
8252    return emitAtomicLoadBinary(MI, MBB, SystemZ::NIHH64, 64);
8253  case SystemZ::ATOMIC_LOAD_NILF64:
8254    return emitAtomicLoadBinary(MI, MBB, SystemZ::NILF64, 64);
8255  case SystemZ::ATOMIC_LOAD_NIHF64:
8256    return emitAtomicLoadBinary(MI, MBB, SystemZ::NIHF64, 64);
8257
8258  case SystemZ::ATOMIC_LOADW_OR:
8259    return emitAtomicLoadBinary(MI, MBB, SystemZ::OR, 0);
8260  case SystemZ::ATOMIC_LOADW_OILH:
8261    return emitAtomicLoadBinary(MI, MBB, SystemZ::OILH, 0);
8262  case SystemZ::ATOMIC_LOAD_OR:
8263    return emitAtomicLoadBinary(MI, MBB, SystemZ::OR, 32);
8264  case SystemZ::ATOMIC_LOAD_OILL:
8265    return emitAtomicLoadBinary(MI, MBB, SystemZ::OILL, 32);
8266  case SystemZ::ATOMIC_LOAD_OILH:
8267    return emitAtomicLoadBinary(MI, MBB, SystemZ::OILH, 32);
8268  case SystemZ::ATOMIC_LOAD_OILF:
8269    return emitAtomicLoadBinary(MI, MBB, SystemZ::OILF, 32);
8270  case SystemZ::ATOMIC_LOAD_OGR:
8271    return emitAtomicLoadBinary(MI, MBB, SystemZ::OGR, 64);
8272  case SystemZ::ATOMIC_LOAD_OILL64:
8273    return emitAtomicLoadBinary(MI, MBB, SystemZ::OILL64, 64);
8274  case SystemZ::ATOMIC_LOAD_OILH64:
8275    return emitAtomicLoadBinary(MI, MBB, SystemZ::OILH64, 64);
8276  case SystemZ::ATOMIC_LOAD_OIHL64:
8277    return emitAtomicLoadBinary(MI, MBB, SystemZ::OIHL64, 64);
8278  case SystemZ::ATOMIC_LOAD_OIHH64:
8279    return emitAtomicLoadBinary(MI, MBB, SystemZ::OIHH64, 64);
8280  case SystemZ::ATOMIC_LOAD_OILF64:
8281    return emitAtomicLoadBinary(MI, MBB, SystemZ::OILF64, 64);
8282  case SystemZ::ATOMIC_LOAD_OIHF64:
8283    return emitAtomicLoadBinary(MI, MBB, SystemZ::OIHF64, 64);
8284
8285  case SystemZ::ATOMIC_LOADW_XR:
8286    return emitAtomicLoadBinary(MI, MBB, SystemZ::XR, 0);
8287  case SystemZ::ATOMIC_LOADW_XILF:
8288    return emitAtomicLoadBinary(MI, MBB, SystemZ::XILF, 0);
8289  case SystemZ::ATOMIC_LOAD_XR:
8290    return emitAtomicLoadBinary(MI, MBB, SystemZ::XR, 32);
8291  case SystemZ::ATOMIC_LOAD_XILF:
8292    return emitAtomicLoadBinary(MI, MBB, SystemZ::XILF, 32);
8293  case SystemZ::ATOMIC_LOAD_XGR:
8294    return emitAtomicLoadBinary(MI, MBB, SystemZ::XGR, 64);
8295  case SystemZ::ATOMIC_LOAD_XILF64:
8296    return emitAtomicLoadBinary(MI, MBB, SystemZ::XILF64, 64);
8297  case SystemZ::ATOMIC_LOAD_XIHF64:
8298    return emitAtomicLoadBinary(MI, MBB, SystemZ::XIHF64, 64);
8299
8300  case SystemZ::ATOMIC_LOADW_NRi:
8301    return emitAtomicLoadBinary(MI, MBB, SystemZ::NR, 0, true);
8302  case SystemZ::ATOMIC_LOADW_NILHi:
8303    return emitAtomicLoadBinary(MI, MBB, SystemZ::NILH, 0, true);
8304  case SystemZ::ATOMIC_LOAD_NRi:
8305    return emitAtomicLoadBinary(MI, MBB, SystemZ::NR, 32, true);
8306  case SystemZ::ATOMIC_LOAD_NILLi:
8307    return emitAtomicLoadBinary(MI, MBB, SystemZ::NILL, 32, true);
8308  case SystemZ::ATOMIC_LOAD_NILHi:
8309    return emitAtomicLoadBinary(MI, MBB, SystemZ::NILH, 32, true);
8310  case SystemZ::ATOMIC_LOAD_NILFi:
8311    return emitAtomicLoadBinary(MI, MBB, SystemZ::NILF, 32, true);
8312  case SystemZ::ATOMIC_LOAD_NGRi:
8313    return emitAtomicLoadBinary(MI, MBB, SystemZ::NGR, 64, true);
8314  case SystemZ::ATOMIC_LOAD_NILL64i:
8315    return emitAtomicLoadBinary(MI, MBB, SystemZ::NILL64, 64, true);
8316  case SystemZ::ATOMIC_LOAD_NILH64i:
8317    return emitAtomicLoadBinary(MI, MBB, SystemZ::NILH64, 64, true);
8318  case SystemZ::ATOMIC_LOAD_NIHL64i:
8319    return emitAtomicLoadBinary(MI, MBB, SystemZ::NIHL64, 64, true);
8320  case SystemZ::ATOMIC_LOAD_NIHH64i:
8321    return emitAtomicLoadBinary(MI, MBB, SystemZ::NIHH64, 64, true);
8322  case SystemZ::ATOMIC_LOAD_NILF64i:
8323    return emitAtomicLoadBinary(MI, MBB, SystemZ::NILF64, 64, true);
8324  case SystemZ::ATOMIC_LOAD_NIHF64i:
8325    return emitAtomicLoadBinary(MI, MBB, SystemZ::NIHF64, 64, true);
8326
8327  case SystemZ::ATOMIC_LOADW_MIN:
8328    return emitAtomicLoadMinMax(MI, MBB, SystemZ::CR,
8329                                SystemZ::CCMASK_CMP_LE, 0);
8330  case SystemZ::ATOMIC_LOAD_MIN_32:
8331    return emitAtomicLoadMinMax(MI, MBB, SystemZ::CR,
8332                                SystemZ::CCMASK_CMP_LE, 32);
8333  case SystemZ::ATOMIC_LOAD_MIN_64:
8334    return emitAtomicLoadMinMax(MI, MBB, SystemZ::CGR,
8335                                SystemZ::CCMASK_CMP_LE, 64);
8336
8337  case SystemZ::ATOMIC_LOADW_MAX:
8338    return emitAtomicLoadMinMax(MI, MBB, SystemZ::CR,
8339                                SystemZ::CCMASK_CMP_GE, 0);
8340  case SystemZ::ATOMIC_LOAD_MAX_32:
8341    return emitAtomicLoadMinMax(MI, MBB, SystemZ::CR,
8342                                SystemZ::CCMASK_CMP_GE, 32);
8343  case SystemZ::ATOMIC_LOAD_MAX_64:
8344    return emitAtomicLoadMinMax(MI, MBB, SystemZ::CGR,
8345                                SystemZ::CCMASK_CMP_GE, 64);
8346
8347  case SystemZ::ATOMIC_LOADW_UMIN:
8348    return emitAtomicLoadMinMax(MI, MBB, SystemZ::CLR,
8349                                SystemZ::CCMASK_CMP_LE, 0);
8350  case SystemZ::ATOMIC_LOAD_UMIN_32:
8351    return emitAtomicLoadMinMax(MI, MBB, SystemZ::CLR,
8352                                SystemZ::CCMASK_CMP_LE, 32);
8353  case SystemZ::ATOMIC_LOAD_UMIN_64:
8354    return emitAtomicLoadMinMax(MI, MBB, SystemZ::CLGR,
8355                                SystemZ::CCMASK_CMP_LE, 64);
8356
8357  case SystemZ::ATOMIC_LOADW_UMAX:
8358    return emitAtomicLoadMinMax(MI, MBB, SystemZ::CLR,
8359                                SystemZ::CCMASK_CMP_GE, 0);
8360  case SystemZ::ATOMIC_LOAD_UMAX_32:
8361    return emitAtomicLoadMinMax(MI, MBB, SystemZ::CLR,
8362                                SystemZ::CCMASK_CMP_GE, 32);
8363  case SystemZ::ATOMIC_LOAD_UMAX_64:
8364    return emitAtomicLoadMinMax(MI, MBB, SystemZ::CLGR,
8365                                SystemZ::CCMASK_CMP_GE, 64);
8366
8367  case SystemZ::ATOMIC_CMP_SWAPW:
8368    return emitAtomicCmpSwapW(MI, MBB);
8369  case SystemZ::MVCSequence:
8370  case SystemZ::MVCLoop:
8371    return emitMemMemWrapper(MI, MBB, SystemZ::MVC);
8372  case SystemZ::NCSequence:
8373  case SystemZ::NCLoop:
8374    return emitMemMemWrapper(MI, MBB, SystemZ::NC);
8375  case SystemZ::OCSequence:
8376  case SystemZ::OCLoop:
8377    return emitMemMemWrapper(MI, MBB, SystemZ::OC);
8378  case SystemZ::XCSequence:
8379  case SystemZ::XCLoop:
8380    return emitMemMemWrapper(MI, MBB, SystemZ::XC);
8381  case SystemZ::CLCSequence:
8382  case SystemZ::CLCLoop:
8383    return emitMemMemWrapper(MI, MBB, SystemZ::CLC);
8384  case SystemZ::CLSTLoop:
8385    return emitStringWrapper(MI, MBB, SystemZ::CLST);
8386  case SystemZ::MVSTLoop:
8387    return emitStringWrapper(MI, MBB, SystemZ::MVST);
8388  case SystemZ::SRSTLoop:
8389    return emitStringWrapper(MI, MBB, SystemZ::SRST);
8390  case SystemZ::TBEGIN:
8391    return emitTransactionBegin(MI, MBB, SystemZ::TBEGIN, false);
8392  case SystemZ::TBEGIN_nofloat:
8393    return emitTransactionBegin(MI, MBB, SystemZ::TBEGIN, true);
8394  case SystemZ::TBEGINC:
8395    return emitTransactionBegin(MI, MBB, SystemZ::TBEGINC, true);
8396  case SystemZ::LTEBRCompare_VecPseudo:
8397    return emitLoadAndTestCmp0(MI, MBB, SystemZ::LTEBR);
8398  case SystemZ::LTDBRCompare_VecPseudo:
8399    return emitLoadAndTestCmp0(MI, MBB, SystemZ::LTDBR);
8400  case SystemZ::LTXBRCompare_VecPseudo:
8401    return emitLoadAndTestCmp0(MI, MBB, SystemZ::LTXBR);
8402
8403  case SystemZ::PROBED_ALLOCA:
8404    return emitProbedAlloca(MI, MBB);
8405
8406  case TargetOpcode::STACKMAP:
8407  case TargetOpcode::PATCHPOINT:
8408    return emitPatchPoint(MI, MBB);
8409
8410  default:
8411    llvm_unreachable("Unexpected instr type to insert");
8412  }
8413}
8414
8415// This is only used by the isel schedulers, and is needed only to prevent
8416// compiler from crashing when list-ilp is used.
8417const TargetRegisterClass *
8418SystemZTargetLowering::getRepRegClassFor(MVT VT) const {
8419  if (VT == MVT::Untyped)
8420    return &SystemZ::ADDR128BitRegClass;
8421  return TargetLowering::getRepRegClassFor(VT);
8422}
8423