//===-- X86InstrInfo.cpp - X86 Instruction Information --------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains the X86 implementation of the TargetInstrInfo class.
//
//===----------------------------------------------------------------------===//

#include "X86InstrInfo.h"
#include "X86.h"
#include "X86InstrBuilder.h"
#include "X86InstrFoldTables.h"
#include "X86MachineFunctionInfo.h"
#include "X86Subtarget.h"
#include "X86TargetMachine.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/Sequence.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/LiveVariables.h"
#include "llvm/CodeGen/MachineCombinerPattern.h"
#include "llvm/CodeGen/MachineConstantPool.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/StackMaps.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetOptions.h"
#include <optional>

using namespace llvm;

#define DEBUG_TYPE "x86-instr-info"

#define GET_INSTRINFO_CTOR_DTOR
#include "X86GenInstrInfo.inc"

static cl::opt<bool>
    NoFusing("disable-spill-fusing",
             cl::desc("Disable fusing of spill code into instructions"),
             cl::Hidden);
static cl::opt<bool>
    PrintFailedFusing("print-failed-fuse-candidates",
                      cl::desc("Print instructions that the allocator wants to"
                               " fuse, but the X86 backend currently can't"),
                      cl::Hidden);
static cl::opt<bool>
    ReMatPICStubLoad("remat-pic-stub-load",
                     cl::desc("Re-materialize load from stub in PIC mode"),
                     cl::init(false), cl::Hidden);
static cl::opt<unsigned>
    PartialRegUpdateClearance("partial-reg-update-clearance",
                              cl::desc("Clearance between two register writes "
                                       "for inserting XOR to avoid partial "
                                       "register update"),
                              cl::init(64), cl::Hidden);
static cl::opt<unsigned> UndefRegClearance(
    "undef-reg-clearance",
    cl::desc("How many idle instructions we would like before "
             "certain undef register reads"),
    cl::init(128), cl::Hidden);

// Pin the vtable to this file.
void X86InstrInfo::anchor() {}

X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
    : X86GenInstrInfo((STI.isTarget64BitLP64() ? X86::ADJCALLSTACKDOWN64
                                               : X86::ADJCALLSTACKDOWN32),
                      (STI.isTarget64BitLP64() ? X86::ADJCALLSTACKUP64
                                               : X86::ADJCALLSTACKUP32),
                      X86::CATCHRET, (STI.is64Bit() ? X86::RET64 : X86::RET32)),
      Subtarget(STI), RI(STI.getTargetTriple()) {}

const TargetRegisterClass *
X86InstrInfo::getRegClass(const MCInstrDesc &MCID, unsigned OpNum,
                          const TargetRegisterInfo *TRI,
                          const MachineFunction &MF) const {
  auto *RC = TargetInstrInfo::getRegClass(MCID, OpNum, TRI, MF);
  // If the target does not have EGPR, then R16-R31 will be reserved for all
  // instructions.
  if (!RC || !Subtarget.hasEGPR())
    return RC;

  if (X86II::canUseApxExtendedReg(MCID))
    return RC;

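  // Without an EGPR-aware (APX) encoding the instruction cannot address
  // R16-R31, so hand back the matching _NOREX2 class and keep the extended
  // GPRs out of the allocator's hands for this operand.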
  switch (RC->getID()) {
  default:
    return RC;
  case X86::GR8RegClassID:
    return &X86::GR8_NOREX2RegClass;
  case X86::GR16RegClassID:
    return &X86::GR16_NOREX2RegClass;
  case X86::GR32RegClassID:
    return &X86::GR32_NOREX2RegClass;
  case X86::GR64RegClassID:
    return &X86::GR64_NOREX2RegClass;
  case X86::GR32_NOSPRegClassID:
    return &X86::GR32_NOREX2_NOSPRegClass;
  case X86::GR64_NOSPRegClassID:
    return &X86::GR64_NOREX2_NOSPRegClass;
  }
}

bool X86InstrInfo::isCoalescableExtInstr(const MachineInstr &MI,
                                         Register &SrcReg, Register &DstReg,
                                         unsigned &SubIdx) const {
  switch (MI.getOpcode()) {
  default:
    break;
  case X86::MOVSX16rr8:
  case X86::MOVZX16rr8:
  case X86::MOVSX32rr8:
  case X86::MOVZX32rr8:
  case X86::MOVSX64rr8:
    if (!Subtarget.is64Bit())
      // It's not always legal to reference the low 8-bit subregister of the
      // larger register in 32-bit mode.
      return false;
    [[fallthrough]];
  case X86::MOVSX32rr16:
  case X86::MOVZX32rr16:
  case X86::MOVSX64rr16:
  case X86::MOVSX64rr32: {
    if (MI.getOperand(0).getSubReg() || MI.getOperand(1).getSubReg())
      // Be conservative.
      return false;
    SrcReg = MI.getOperand(1).getReg();
    DstReg = MI.getOperand(0).getReg();
    switch (MI.getOpcode()) {
    default:
      llvm_unreachable("Unreachable!");
    case X86::MOVSX16rr8:
    case X86::MOVZX16rr8:
    case X86::MOVSX32rr8:
    case X86::MOVZX32rr8:
    case X86::MOVSX64rr8:
      SubIdx = X86::sub_8bit;
      break;
    case X86::MOVSX32rr16:
    case X86::MOVZX32rr16:
    case X86::MOVSX64rr16:
      SubIdx = X86::sub_16bit;
      break;
    case X86::MOVSX64rr32:
      SubIdx = X86::sub_32bit;
      break;
    }
    return true;
  }
  }
  return false;
}

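// Conservative allowlist: return true only for opcodes known to execute in
// data-independent time; anything unrecognized is treated as potentially
// variable-time.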
bool X86InstrInfo::isDataInvariant(MachineInstr &MI) {
  if (MI.mayLoad() || MI.mayStore())
    return false;

  // Some target-independent operations that trivially lower to data-invariant
  // instructions.
  if (MI.isCopyLike() || MI.isInsertSubreg())
    return true;

  unsigned Opcode = MI.getOpcode();
  using namespace X86;
  // On x86, imul is believed to be constant time w.r.t. its inputs. However,
  // these instructions set flags and are perhaps the most surprising
  // constant-time operations, so we call them out here separately.
  if (isIMUL(Opcode))
    return true;
  // Bit scanning and counting instructions are somewhat surprising: they scan
  // across bits and do other fairly complex operations like popcnt, but are
  // believed to be constant time on x86. However, these set flags.
  if (isBSF(Opcode) || isBSR(Opcode) || isLZCNT(Opcode) || isPOPCNT(Opcode) ||
      isTZCNT(Opcode))
    return true;
  // Bit manipulation instructions are effectively combinations of basic
  // arithmetic ops, and should still execute in constant time. These also
  // set flags.
  if (isBLCFILL(Opcode) || isBLCI(Opcode) || isBLCIC(Opcode) ||
      isBLCMSK(Opcode) || isBLCS(Opcode) || isBLSFILL(Opcode) ||
      isBLSI(Opcode) || isBLSIC(Opcode) || isBLSMSK(Opcode) || isBLSR(Opcode) ||
      isTZMSK(Opcode))
    return true;
  // Bit extracting and clearing instructions should execute in constant time,
  // and set flags.
  if (isBEXTR(Opcode) || isBZHI(Opcode))
    return true;
  // Shift and rotate.
  if (isROL(Opcode) || isROR(Opcode) || isSAR(Opcode) || isSHL(Opcode) ||
      isSHR(Opcode) || isSHLD(Opcode) || isSHRD(Opcode))
    return true;
  // Basic arithmetic is constant time on the input but does set flags.
  if (isADC(Opcode) || isADD(Opcode) || isAND(Opcode) || isOR(Opcode) ||
      isSBB(Opcode) || isSUB(Opcode) || isXOR(Opcode))
    return true;
  // Arithmetic with just 32-bit and 64-bit variants and no immediates.
  if (isANDN(Opcode))
    return true;
  // Unary arithmetic operations.
  if (isDEC(Opcode) || isINC(Opcode) || isNEG(Opcode))
    return true;
  // Unlike other arithmetic, NOT doesn't set EFLAGS.
  if (isNOT(Opcode))
    return true;
  // Various move instructions used to zero or sign extend things. Note that we
  // intentionally don't support the _NOREX variants as we can't handle that
  // register constraint anyway.
  if (isMOVSX(Opcode) || isMOVZX(Opcode) || isMOVSXD(Opcode) || isMOV(Opcode))
    return true;
  // Arithmetic instructions that are both constant time and don't set flags.
  if (isRORX(Opcode) || isSARX(Opcode) || isSHLX(Opcode) || isSHRX(Opcode))
    return true;
  // LEA doesn't actually access memory, and its arithmetic is constant time.
  if (isLEA(Opcode))
    return true;
  // By default, assume that the instruction is not data invariant.
  return false;
}

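// Same idea as isDataInvariant, but for instructions that fold a load: the
// opcode list below covers loads feeding data-invariant ALU operations,
// conversions, and plain register loads.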
bool X86InstrInfo::isDataInvariantLoad(MachineInstr &MI) {
  switch (MI.getOpcode()) {
  default:
    // By default, assume that the load will immediately leak.
    return false;

  // On x86, imul is believed to be constant time w.r.t. the loaded data.
  // However, these instructions set flags and are perhaps the most surprising
  // constant-time operations, so we call them out here separately.
  case X86::IMUL16rm:
  case X86::IMUL16rmi:
  case X86::IMUL32rm:
  case X86::IMUL32rmi:
  case X86::IMUL64rm:
  case X86::IMUL64rmi32:

  // Bit scanning and counting instructions are somewhat surprising: they scan
  // across bits and do other fairly complex operations like popcnt, but are
  // believed to be constant time on x86. However, these set flags.
  case X86::BSF16rm:
  case X86::BSF32rm:
  case X86::BSF64rm:
  case X86::BSR16rm:
  case X86::BSR32rm:
  case X86::BSR64rm:
  case X86::LZCNT16rm:
  case X86::LZCNT32rm:
  case X86::LZCNT64rm:
  case X86::POPCNT16rm:
  case X86::POPCNT32rm:
  case X86::POPCNT64rm:
  case X86::TZCNT16rm:
  case X86::TZCNT32rm:
  case X86::TZCNT64rm:

  // Bit manipulation instructions are effectively combinations of basic
  // arithmetic ops, and should still execute in constant time. These also
  // set flags.
  case X86::BLCFILL32rm:
  case X86::BLCFILL64rm:
  case X86::BLCI32rm:
  case X86::BLCI64rm:
  case X86::BLCIC32rm:
  case X86::BLCIC64rm:
  case X86::BLCMSK32rm:
  case X86::BLCMSK64rm:
  case X86::BLCS32rm:
  case X86::BLCS64rm:
  case X86::BLSFILL32rm:
  case X86::BLSFILL64rm:
  case X86::BLSI32rm:
  case X86::BLSI64rm:
  case X86::BLSIC32rm:
  case X86::BLSIC64rm:
  case X86::BLSMSK32rm:
  case X86::BLSMSK64rm:
  case X86::BLSR32rm:
  case X86::BLSR64rm:
  case X86::TZMSK32rm:
  case X86::TZMSK64rm:

  // Bit extracting and clearing instructions should execute in constant time,
  // and set flags.
  case X86::BEXTR32rm:
  case X86::BEXTR64rm:
  case X86::BEXTRI32mi:
  case X86::BEXTRI64mi:
  case X86::BZHI32rm:
  case X86::BZHI64rm:

  // Basic arithmetic is constant time on the input but does set flags.
  case X86::ADC8rm:
  case X86::ADC16rm:
  case X86::ADC32rm:
  case X86::ADC64rm:
  case X86::ADD8rm:
  case X86::ADD16rm:
  case X86::ADD32rm:
  case X86::ADD64rm:
  case X86::AND8rm:
  case X86::AND16rm:
  case X86::AND32rm:
  case X86::AND64rm:
  case X86::ANDN32rm:
  case X86::ANDN64rm:
  case X86::OR8rm:
  case X86::OR16rm:
  case X86::OR32rm:
  case X86::OR64rm:
  case X86::SBB8rm:
  case X86::SBB16rm:
  case X86::SBB32rm:
  case X86::SBB64rm:
  case X86::SUB8rm:
  case X86::SUB16rm:
  case X86::SUB32rm:
  case X86::SUB64rm:
  case X86::XOR8rm:
  case X86::XOR16rm:
  case X86::XOR32rm:
  case X86::XOR64rm:

  // Integer multiply w/o affecting flags is still believed to be constant
  // time on x86. Called out separately as this is among the most surprising
  // instructions to exhibit that behavior.
  case X86::MULX32rm:
  case X86::MULX64rm:

  // Arithmetic instructions that are both constant time and don't set flags.
  case X86::RORX32mi:
  case X86::RORX64mi:
  case X86::SARX32rm:
  case X86::SARX64rm:
  case X86::SHLX32rm:
  case X86::SHLX64rm:
  case X86::SHRX32rm:
  case X86::SHRX64rm:

  // Conversions are believed to be constant time and don't set flags.
  case X86::CVTTSD2SI64rm:
  case X86::VCVTTSD2SI64rm:
  case X86::VCVTTSD2SI64Zrm:
  case X86::CVTTSD2SIrm:
  case X86::VCVTTSD2SIrm:
  case X86::VCVTTSD2SIZrm:
  case X86::CVTTSS2SI64rm:
  case X86::VCVTTSS2SI64rm:
  case X86::VCVTTSS2SI64Zrm:
  case X86::CVTTSS2SIrm:
  case X86::VCVTTSS2SIrm:
  case X86::VCVTTSS2SIZrm:
  case X86::CVTSI2SDrm:
  case X86::VCVTSI2SDrm:
  case X86::VCVTSI2SDZrm:
  case X86::CVTSI2SSrm:
  case X86::VCVTSI2SSrm:
  case X86::VCVTSI2SSZrm:
  case X86::CVTSI642SDrm:
  case X86::VCVTSI642SDrm:
  case X86::VCVTSI642SDZrm:
  case X86::CVTSI642SSrm:
  case X86::VCVTSI642SSrm:
  case X86::VCVTSI642SSZrm:
  case X86::CVTSS2SDrm:
  case X86::VCVTSS2SDrm:
  case X86::VCVTSS2SDZrm:
  case X86::CVTSD2SSrm:
  case X86::VCVTSD2SSrm:
  case X86::VCVTSD2SSZrm:
  // AVX512 added unsigned integer conversions.
  case X86::VCVTTSD2USI64Zrm:
  case X86::VCVTTSD2USIZrm:
  case X86::VCVTTSS2USI64Zrm:
  case X86::VCVTTSS2USIZrm:
  case X86::VCVTUSI2SDZrm:
  case X86::VCVTUSI642SDZrm:
  case X86::VCVTUSI2SSZrm:
  case X86::VCVTUSI642SSZrm:

  // Loads to register don't set flags.
  case X86::MOV8rm:
  case X86::MOV8rm_NOREX:
  case X86::MOV16rm:
  case X86::MOV32rm:
  case X86::MOV64rm:
  case X86::MOVSX16rm8:
  case X86::MOVSX32rm16:
  case X86::MOVSX32rm8:
  case X86::MOVSX32rm8_NOREX:
  case X86::MOVSX64rm16:
  case X86::MOVSX64rm32:
  case X86::MOVSX64rm8:
  case X86::MOVZX16rm8:
  case X86::MOVZX32rm16:
  case X86::MOVZX32rm8:
  case X86::MOVZX32rm8_NOREX:
  case X86::MOVZX64rm16:
  case X86::MOVZX64rm8:
    return true;
  }
}

int X86InstrInfo::getSPAdjust(const MachineInstr &MI) const {
  const MachineFunction *MF = MI.getParent()->getParent();
  const TargetFrameLowering *TFI = MF->getSubtarget().getFrameLowering();

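  // Call-frame setup/destroy pseudos carry the adjustment directly: take the
  // aligned frame size, subtract the frame adjustment, and negate the result
  // for the destroy direction.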
  if (isFrameInstr(MI)) {
    int SPAdj = alignTo(getFrameSize(MI), TFI->getStackAlign());
    SPAdj -= getFrameAdjustment(MI);
    if (!isFrameSetup(MI))
      SPAdj = -SPAdj;
    return SPAdj;
  }

  // To know whether a call adjusts the stack, we need information
  // that is bound to the following ADJCALLSTACKUP pseudo.
  // Look for the next ADJCALLSTACKUP that follows the call.
  if (MI.isCall()) {
    const MachineBasicBlock *MBB = MI.getParent();
    auto I = ++MachineBasicBlock::const_iterator(MI);
    for (auto E = MBB->end(); I != E; ++I) {
      if (I->getOpcode() == getCallFrameDestroyOpcode() || I->isCall())
        break;
    }

    // If we could not find a frame destroy opcode, then it has already
    // been simplified, so we don't care.
    if (I->getOpcode() != getCallFrameDestroyOpcode())
      return 0;

    return -(I->getOperand(1).getImm());
  }

  // Currently we handle only the PUSHes we can reasonably expect to see in
  // call sequences.
  switch (MI.getOpcode()) {
  default:
    return 0;
  case X86::PUSH32r:
  case X86::PUSH32rmm:
  case X86::PUSH32rmr:
  case X86::PUSH32i:
    return 4;
  case X86::PUSH64r:
  case X86::PUSH64rmm:
  case X86::PUSH64rmr:
  case X86::PUSH64i32:
    return 8;
  }
}

/// Return true and set FrameIndex if the specified operand and the operands
/// that follow it form a reference to the stack frame.
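/// The operands starting at Op follow the x86 memory-operand layout (base,
/// scale, index, displacement, segment); only a bare frame-index base with
/// scale 1, no index register, and zero displacement is accepted.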
bool X86InstrInfo::isFrameOperand(const MachineInstr &MI, unsigned int Op,
                                  int &FrameIndex) const {
  if (MI.getOperand(Op + X86::AddrBaseReg).isFI() &&
      MI.getOperand(Op + X86::AddrScaleAmt).isImm() &&
      MI.getOperand(Op + X86::AddrIndexReg).isReg() &&
      MI.getOperand(Op + X86::AddrDisp).isImm() &&
      MI.getOperand(Op + X86::AddrScaleAmt).getImm() == 1 &&
      MI.getOperand(Op + X86::AddrIndexReg).getReg() == 0 &&
      MI.getOperand(Op + X86::AddrDisp).getImm() == 0) {
    FrameIndex = MI.getOperand(Op + X86::AddrBaseReg).getIndex();
    return true;
  }
  return false;
}

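// Map opcodes that can reload a register from a stack slot to the number of
// bytes they read; returns false for anything else.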
static bool isFrameLoadOpcode(int Opcode, unsigned &MemBytes) {
  switch (Opcode) {
  default:
    return false;
  case X86::MOV8rm:
  case X86::KMOVBkm:
  case X86::KMOVBkm_EVEX:
    MemBytes = 1;
    return true;
  case X86::MOV16rm:
  case X86::KMOVWkm:
  case X86::KMOVWkm_EVEX:
  case X86::VMOVSHZrm:
  case X86::VMOVSHZrm_alt:
    MemBytes = 2;
    return true;
  case X86::MOV32rm:
  case X86::MOVSSrm:
  case X86::MOVSSrm_alt:
  case X86::VMOVSSrm:
  case X86::VMOVSSrm_alt:
  case X86::VMOVSSZrm:
  case X86::VMOVSSZrm_alt:
  case X86::KMOVDkm:
  case X86::KMOVDkm_EVEX:
    MemBytes = 4;
    return true;
  case X86::MOV64rm:
  case X86::LD_Fp64m:
  case X86::MOVSDrm:
  case X86::MOVSDrm_alt:
  case X86::VMOVSDrm:
  case X86::VMOVSDrm_alt:
  case X86::VMOVSDZrm:
  case X86::VMOVSDZrm_alt:
  case X86::MMX_MOVD64rm:
  case X86::MMX_MOVQ64rm:
  case X86::KMOVQkm:
  case X86::KMOVQkm_EVEX:
    MemBytes = 8;
    return true;
  case X86::MOVAPSrm:
  case X86::MOVUPSrm:
  case X86::MOVAPDrm:
  case X86::MOVUPDrm:
  case X86::MOVDQArm:
  case X86::MOVDQUrm:
  case X86::VMOVAPSrm:
  case X86::VMOVUPSrm:
  case X86::VMOVAPDrm:
  case X86::VMOVUPDrm:
  case X86::VMOVDQArm:
  case X86::VMOVDQUrm:
  case X86::VMOVAPSZ128rm:
  case X86::VMOVUPSZ128rm:
  case X86::VMOVAPSZ128rm_NOVLX:
  case X86::VMOVUPSZ128rm_NOVLX:
  case X86::VMOVAPDZ128rm:
  case X86::VMOVUPDZ128rm:
  case X86::VMOVDQU8Z128rm:
  case X86::VMOVDQU16Z128rm:
  case X86::VMOVDQA32Z128rm:
  case X86::VMOVDQU32Z128rm:
  case X86::VMOVDQA64Z128rm:
  case X86::VMOVDQU64Z128rm:
    MemBytes = 16;
    return true;
  case X86::VMOVAPSYrm:
  case X86::VMOVUPSYrm:
  case X86::VMOVAPDYrm:
  case X86::VMOVUPDYrm:
  case X86::VMOVDQAYrm:
  case X86::VMOVDQUYrm:
  case X86::VMOVAPSZ256rm:
  case X86::VMOVUPSZ256rm:
  case X86::VMOVAPSZ256rm_NOVLX:
  case X86::VMOVUPSZ256rm_NOVLX:
  case X86::VMOVAPDZ256rm:
  case X86::VMOVUPDZ256rm:
  case X86::VMOVDQU8Z256rm:
  case X86::VMOVDQU16Z256rm:
  case X86::VMOVDQA32Z256rm:
  case X86::VMOVDQU32Z256rm:
  case X86::VMOVDQA64Z256rm:
  case X86::VMOVDQU64Z256rm:
    MemBytes = 32;
    return true;
  case X86::VMOVAPSZrm:
  case X86::VMOVUPSZrm:
  case X86::VMOVAPDZrm:
  case X86::VMOVUPDZrm:
  case X86::VMOVDQU8Zrm:
  case X86::VMOVDQU16Zrm:
  case X86::VMOVDQA32Zrm:
  case X86::VMOVDQU32Zrm:
  case X86::VMOVDQA64Zrm:
  case X86::VMOVDQU64Zrm:
    MemBytes = 64;
    return true;
  }
}

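// Counterpart of isFrameLoadOpcode for spills: map store opcodes to the
// number of bytes they write.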
static bool isFrameStoreOpcode(int Opcode, unsigned &MemBytes) {
  switch (Opcode) {
  default:
    return false;
  case X86::MOV8mr:
  case X86::KMOVBmk:
  case X86::KMOVBmk_EVEX:
    MemBytes = 1;
    return true;
  case X86::MOV16mr:
  case X86::KMOVWmk:
  case X86::KMOVWmk_EVEX:
  case X86::VMOVSHZmr:
    MemBytes = 2;
    return true;
  case X86::MOV32mr:
  case X86::MOVSSmr:
  case X86::VMOVSSmr:
  case X86::VMOVSSZmr:
  case X86::KMOVDmk:
  case X86::KMOVDmk_EVEX:
    MemBytes = 4;
    return true;
  case X86::MOV64mr:
  case X86::ST_FpP64m:
  case X86::MOVSDmr:
  case X86::VMOVSDmr:
  case X86::VMOVSDZmr:
  case X86::MMX_MOVD64mr:
  case X86::MMX_MOVQ64mr:
  case X86::MMX_MOVNTQmr:
  case X86::KMOVQmk:
  case X86::KMOVQmk_EVEX:
    MemBytes = 8;
    return true;
  case X86::MOVAPSmr:
  case X86::MOVUPSmr:
  case X86::MOVAPDmr:
  case X86::MOVUPDmr:
  case X86::MOVDQAmr:
  case X86::MOVDQUmr:
  case X86::VMOVAPSmr:
  case X86::VMOVUPSmr:
  case X86::VMOVAPDmr:
  case X86::VMOVUPDmr:
  case X86::VMOVDQAmr:
  case X86::VMOVDQUmr:
  case X86::VMOVUPSZ128mr:
  case X86::VMOVAPSZ128mr:
  case X86::VMOVUPSZ128mr_NOVLX:
  case X86::VMOVAPSZ128mr_NOVLX:
  case X86::VMOVUPDZ128mr:
  case X86::VMOVAPDZ128mr:
  case X86::VMOVDQA32Z128mr:
  case X86::VMOVDQU32Z128mr:
  case X86::VMOVDQA64Z128mr:
  case X86::VMOVDQU64Z128mr:
  case X86::VMOVDQU8Z128mr:
  case X86::VMOVDQU16Z128mr:
    MemBytes = 16;
    return true;
  case X86::VMOVUPSYmr:
  case X86::VMOVAPSYmr:
  case X86::VMOVUPDYmr:
  case X86::VMOVAPDYmr:
  case X86::VMOVDQUYmr:
  case X86::VMOVDQAYmr:
  case X86::VMOVUPSZ256mr:
  case X86::VMOVAPSZ256mr:
  case X86::VMOVUPSZ256mr_NOVLX:
  case X86::VMOVAPSZ256mr_NOVLX:
  case X86::VMOVUPDZ256mr:
  case X86::VMOVAPDZ256mr:
  case X86::VMOVDQU8Z256mr:
  case X86::VMOVDQU16Z256mr:
  case X86::VMOVDQA32Z256mr:
  case X86::VMOVDQU32Z256mr:
  case X86::VMOVDQA64Z256mr:
  case X86::VMOVDQU64Z256mr:
    MemBytes = 32;
    return true;
  case X86::VMOVUPSZmr:
  case X86::VMOVAPSZmr:
  case X86::VMOVUPDZmr:
  case X86::VMOVAPDZmr:
  case X86::VMOVDQU8Zmr:
  case X86::VMOVDQU16Zmr:
  case X86::VMOVDQA32Zmr:
  case X86::VMOVDQU32Zmr:
  case X86::VMOVDQA64Zmr:
  case X86::VMOVDQU64Zmr:
    MemBytes = 64;
    return true;
  }
  return false;
}

unsigned X86InstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
                                           int &FrameIndex) const {
  unsigned Dummy;
  return X86InstrInfo::isLoadFromStackSlot(MI, FrameIndex, Dummy);
}

unsigned X86InstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
                                           int &FrameIndex,
                                           unsigned &MemBytes) const {
  if (isFrameLoadOpcode(MI.getOpcode(), MemBytes))
    if (MI.getOperand(0).getSubReg() == 0 && isFrameOperand(MI, 1, FrameIndex))
      return MI.getOperand(0).getReg();
  return 0;
}

unsigned X86InstrInfo::isLoadFromStackSlotPostFE(const MachineInstr &MI,
                                                 int &FrameIndex) const {
  unsigned Dummy;
  if (isFrameLoadOpcode(MI.getOpcode(), Dummy)) {
    unsigned Reg;
    if ((Reg = isLoadFromStackSlot(MI, FrameIndex)))
      return Reg;
    // Check for post-frame index elimination operations
    SmallVector<const MachineMemOperand *, 1> Accesses;
    if (hasLoadFromStackSlot(MI, Accesses)) {
      FrameIndex =
          cast<FixedStackPseudoSourceValue>(Accesses.front()->getPseudoValue())
              ->getFrameIndex();
      return MI.getOperand(0).getReg();
    }
  }
  return 0;
}

unsigned X86InstrInfo::isStoreToStackSlot(const MachineInstr &MI,
                                          int &FrameIndex) const {
  unsigned Dummy;
  return X86InstrInfo::isStoreToStackSlot(MI, FrameIndex, Dummy);
}

unsigned X86InstrInfo::isStoreToStackSlot(const MachineInstr &MI,
                                          int &FrameIndex,
                                          unsigned &MemBytes) const {
  if (isFrameStoreOpcode(MI.getOpcode(), MemBytes))
    if (MI.getOperand(X86::AddrNumOperands).getSubReg() == 0 &&
        isFrameOperand(MI, 0, FrameIndex))
      return MI.getOperand(X86::AddrNumOperands).getReg();
  return 0;
}

unsigned X86InstrInfo::isStoreToStackSlotPostFE(const MachineInstr &MI,
                                                int &FrameIndex) const {
  unsigned Dummy;
  if (isFrameStoreOpcode(MI.getOpcode(), Dummy)) {
    unsigned Reg;
    if ((Reg = isStoreToStackSlot(MI, FrameIndex)))
      return Reg;
    // Check for post-frame index elimination operations
    SmallVector<const MachineMemOperand *, 1> Accesses;
    if (hasStoreToStackSlot(MI, Accesses)) {
      FrameIndex =
          cast<FixedStackPseudoSourceValue>(Accesses.front()->getPseudoValue())
              ->getFrameIndex();
      return MI.getOperand(X86::AddrNumOperands).getReg();
    }
  }
  return 0;
}

/// Return true if the register is a PIC base, i.e., defined by X86::MOVPC32r.
static bool regIsPICBase(Register BaseReg, const MachineRegisterInfo &MRI) {
  // Don't waste compile time scanning use-def chains of physregs.
  if (!BaseReg.isVirtual())
    return false;
  bool isPICBase = false;
  for (MachineRegisterInfo::def_instr_iterator I = MRI.def_instr_begin(BaseReg),
                                               E = MRI.def_instr_end();
       I != E; ++I) {
    MachineInstr *DefMI = &*I;
    if (DefMI->getOpcode() != X86::MOVPC32r)
      return false;
    assert(!isPICBase && "More than one PIC base?");
    isPICBase = true;
  }
  return isPICBase;
}

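// Constant materializations are always safe to re-emit. Loads are only safe
// when they read invariant, dereferenceable memory addressed off RIP, a
// constant pool, or the PIC base, and LEAs only when they compute a constant
// address (frame index, global, or PIC base plus offset).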
bool X86InstrInfo::isReallyTriviallyReMaterializable(
    const MachineInstr &MI) const {
  switch (MI.getOpcode()) {
  default:
    // This function should only be called for opcodes with the ReMaterializable
    // flag set.
    llvm_unreachable("Unknown rematerializable operation!");
    break;
  case X86::IMPLICIT_DEF:
    // Defer to generic logic.
    break;
  case X86::LOAD_STACK_GUARD:
  case X86::LD_Fp032:
  case X86::LD_Fp064:
  case X86::LD_Fp080:
  case X86::LD_Fp132:
  case X86::LD_Fp164:
  case X86::LD_Fp180:
  case X86::AVX1_SETALLONES:
  case X86::AVX2_SETALLONES:
  case X86::AVX512_128_SET0:
  case X86::AVX512_256_SET0:
  case X86::AVX512_512_SET0:
  case X86::AVX512_512_SETALLONES:
  case X86::AVX512_FsFLD0SD:
  case X86::AVX512_FsFLD0SH:
  case X86::AVX512_FsFLD0SS:
  case X86::AVX512_FsFLD0F128:
  case X86::AVX_SET0:
  case X86::FsFLD0SD:
  case X86::FsFLD0SS:
  case X86::FsFLD0SH:
  case X86::FsFLD0F128:
  case X86::KSET0D:
  case X86::KSET0Q:
  case X86::KSET0W:
  case X86::KSET1D:
  case X86::KSET1Q:
  case X86::KSET1W:
  case X86::MMX_SET0:
  case X86::MOV32ImmSExti8:
  case X86::MOV32r0:
  case X86::MOV32r1:
  case X86::MOV32r_1:
  case X86::MOV32ri64:
  case X86::MOV64ImmSExti8:
  case X86::V_SET0:
  case X86::V_SETALLONES:
  case X86::MOV16ri:
  case X86::MOV32ri:
  case X86::MOV64ri:
  case X86::MOV64ri32:
  case X86::MOV8ri:
  case X86::PTILEZEROV:
    return true;

  case X86::MOV8rm:
  case X86::MOV8rm_NOREX:
  case X86::MOV16rm:
  case X86::MOV32rm:
  case X86::MOV64rm:
  case X86::MOVSSrm:
  case X86::MOVSSrm_alt:
  case X86::MOVSDrm:
  case X86::MOVSDrm_alt:
  case X86::MOVAPSrm:
  case X86::MOVUPSrm:
  case X86::MOVAPDrm:
  case X86::MOVUPDrm:
  case X86::MOVDQArm:
  case X86::MOVDQUrm:
  case X86::VMOVSSrm:
  case X86::VMOVSSrm_alt:
  case X86::VMOVSDrm:
  case X86::VMOVSDrm_alt:
  case X86::VMOVAPSrm:
  case X86::VMOVUPSrm:
  case X86::VMOVAPDrm:
  case X86::VMOVUPDrm:
  case X86::VMOVDQArm:
  case X86::VMOVDQUrm:
  case X86::VMOVAPSYrm:
  case X86::VMOVUPSYrm:
  case X86::VMOVAPDYrm:
  case X86::VMOVUPDYrm:
  case X86::VMOVDQAYrm:
  case X86::VMOVDQUYrm:
  case X86::MMX_MOVD64rm:
  case X86::MMX_MOVQ64rm:
  // AVX-512
  case X86::VMOVSSZrm:
  case X86::VMOVSSZrm_alt:
  case X86::VMOVSDZrm:
  case X86::VMOVSDZrm_alt:
  case X86::VMOVSHZrm:
  case X86::VMOVSHZrm_alt:
  case X86::VMOVAPDZ128rm:
  case X86::VMOVAPDZ256rm:
  case X86::VMOVAPDZrm:
  case X86::VMOVAPSZ128rm:
  case X86::VMOVAPSZ256rm:
  case X86::VMOVAPSZ128rm_NOVLX:
  case X86::VMOVAPSZ256rm_NOVLX:
  case X86::VMOVAPSZrm:
  case X86::VMOVDQA32Z128rm:
  case X86::VMOVDQA32Z256rm:
  case X86::VMOVDQA32Zrm:
  case X86::VMOVDQA64Z128rm:
  case X86::VMOVDQA64Z256rm:
  case X86::VMOVDQA64Zrm:
  case X86::VMOVDQU16Z128rm:
  case X86::VMOVDQU16Z256rm:
  case X86::VMOVDQU16Zrm:
  case X86::VMOVDQU32Z128rm:
  case X86::VMOVDQU32Z256rm:
  case X86::VMOVDQU32Zrm:
  case X86::VMOVDQU64Z128rm:
  case X86::VMOVDQU64Z256rm:
  case X86::VMOVDQU64Zrm:
  case X86::VMOVDQU8Z128rm:
  case X86::VMOVDQU8Z256rm:
  case X86::VMOVDQU8Zrm:
  case X86::VMOVUPDZ128rm:
  case X86::VMOVUPDZ256rm:
  case X86::VMOVUPDZrm:
  case X86::VMOVUPSZ128rm:
  case X86::VMOVUPSZ256rm:
  case X86::VMOVUPSZ128rm_NOVLX:
  case X86::VMOVUPSZ256rm_NOVLX:
  case X86::VMOVUPSZrm: {
    // Loads from constant pools are trivially rematerializable.
    if (MI.getOperand(1 + X86::AddrBaseReg).isReg() &&
        MI.getOperand(1 + X86::AddrScaleAmt).isImm() &&
        MI.getOperand(1 + X86::AddrIndexReg).isReg() &&
        MI.getOperand(1 + X86::AddrIndexReg).getReg() == 0 &&
        MI.isDereferenceableInvariantLoad()) {
      Register BaseReg = MI.getOperand(1 + X86::AddrBaseReg).getReg();
      if (BaseReg == 0 || BaseReg == X86::RIP)
        return true;
      // Allow re-materialization of PIC load.
      if (!(!ReMatPICStubLoad && MI.getOperand(1 + X86::AddrDisp).isGlobal())) {
        const MachineFunction &MF = *MI.getParent()->getParent();
        const MachineRegisterInfo &MRI = MF.getRegInfo();
        if (regIsPICBase(BaseReg, MRI))
          return true;
      }
    }
    break;
  }

  case X86::LEA32r:
  case X86::LEA64r: {
    if (MI.getOperand(1 + X86::AddrScaleAmt).isImm() &&
        MI.getOperand(1 + X86::AddrIndexReg).isReg() &&
        MI.getOperand(1 + X86::AddrIndexReg).getReg() == 0 &&
        !MI.getOperand(1 + X86::AddrDisp).isReg()) {
      // lea fi#, lea GV, etc. are all rematerializable.
      if (!MI.getOperand(1 + X86::AddrBaseReg).isReg())
        return true;
      Register BaseReg = MI.getOperand(1 + X86::AddrBaseReg).getReg();
      if (BaseReg == 0)
        return true;
      // Allow re-materialization of lea PICBase + x.
      const MachineFunction &MF = *MI.getParent()->getParent();
      const MachineRegisterInfo &MRI = MF.getRegInfo();
      if (regIsPICBase(BaseReg, MRI))
        return true;
    }
    break;
  }
  }
  return TargetInstrInfo::isReallyTriviallyReMaterializable(MI);
}

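// Re-emit Orig at the insertion point. The MOV32r0/r1/r_1 pseudos clobber
// EFLAGS, so when EFLAGS are live at the insertion point they are re-emitted
// as a plain MOV32ri instead of being cloned.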
void X86InstrInfo::reMaterialize(MachineBasicBlock &MBB,
                                 MachineBasicBlock::iterator I,
                                 Register DestReg, unsigned SubIdx,
                                 const MachineInstr &Orig,
                                 const TargetRegisterInfo &TRI) const {
  bool ClobbersEFLAGS = Orig.modifiesRegister(X86::EFLAGS, &TRI);
  if (ClobbersEFLAGS && MBB.computeRegisterLiveness(&TRI, X86::EFLAGS, I) !=
                            MachineBasicBlock::LQR_Dead) {
    // The instruction clobbers EFLAGS. Re-materialize as MOV32ri to avoid side
    // effects.
    int Value;
    switch (Orig.getOpcode()) {
    case X86::MOV32r0:
      Value = 0;
      break;
    case X86::MOV32r1:
      Value = 1;
      break;
    case X86::MOV32r_1:
      Value = -1;
      break;
    default:
      llvm_unreachable("Unexpected instruction!");
    }

    const DebugLoc &DL = Orig.getDebugLoc();
    BuildMI(MBB, I, DL, get(X86::MOV32ri))
        .add(Orig.getOperand(0))
        .addImm(Value);
  } else {
    MachineInstr *MI = MBB.getParent()->CloneMachineInstr(&Orig);
    MBB.insert(I, MI);
  }

  MachineInstr &NewMI = *std::prev(I);
  NewMI.substituteRegister(Orig.getOperand(0).getReg(), DestReg, SubIdx, TRI);
}

/// True if MI has a condition code def, e.g. EFLAGS, that is not marked dead.
bool X86InstrInfo::hasLiveCondCodeDef(MachineInstr &MI) const {
  for (const MachineOperand &MO : MI.operands()) {
    if (MO.isReg() && MO.isDef() && MO.getReg() == X86::EFLAGS &&
        !MO.isDead()) {
      return true;
    }
  }
  return false;
}

/// Return the shift count encoded in the given operand, truncated to the
/// number of bits the hardware actually uses.
inline static unsigned getTruncatedShiftCount(const MachineInstr &MI,
                                              unsigned ShiftAmtOperandIdx) {
  // The shift count is six bits with the REX.W prefix and five bits without.
  unsigned ShiftCountMask = (MI.getDesc().TSFlags & X86II::REX_W) ? 63 : 31;
  unsigned Imm = MI.getOperand(ShiftAmtOperandIdx).getImm();
  return Imm & ShiftCountMask;
}

/// Check whether the given shift count can be represented by the scale field
/// of a LEA instruction.
inline static bool isTruncatedShiftCountForLEA(unsigned ShAmt) {
  // Left shift instructions can be transformed into load-effective-address
  // instructions if we can encode them appropriately.
  // A LEA instruction utilizes a SIB byte to encode its scale factor.
  // The SIB.scale field is two bits wide, which means that we can encode any
  // shift amount less than 4.
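  // For example, a left shift by 3 maps to a LEA scale of 8; a shift of 4 or
  // more has no SIB encoding, and a shift of 0 is not worth a LEA.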
  return ShAmt < 4 && ShAmt > 0;
}

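// Look through the SUBREG_TO_REG or COPY feeding a TEST64rr/TEST16rr and try
// to find an AND in the same block that already set EFLAGS with no clobbers
// in between; on success *AndInstr is set and NoSignFlag/ClearsOverflowFlag
// describe which EFLAGS bits the caller may still rely on.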
static bool findRedundantFlagInstr(MachineInstr &CmpInstr,
                                   MachineInstr &CmpValDefInstr,
                                   const MachineRegisterInfo *MRI,
                                   MachineInstr **AndInstr,
                                   const TargetRegisterInfo *TRI,
                                   bool &NoSignFlag, bool &ClearsOverflowFlag) {
  if (!(CmpValDefInstr.getOpcode() == X86::SUBREG_TO_REG &&
        CmpInstr.getOpcode() == X86::TEST64rr) &&
      !(CmpValDefInstr.getOpcode() == X86::COPY &&
        CmpInstr.getOpcode() == X86::TEST16rr))
    return false;

  // CmpInstr is a TEST16rr/TEST64rr instruction, and
  // `X86InstrInfo::analyzeCompare` guarantees that it's analyzable only if two
  // registers are identical.
  assert((CmpInstr.getOperand(0).getReg() == CmpInstr.getOperand(1).getReg()) &&
         "CmpInstr is an analyzable TEST16rr/TEST64rr, and "
         "`X86InstrInfo::analyzeCompare` requires two reg operands are the"
         "same.");

  // Caller (`X86InstrInfo::optimizeCompareInstr`) guarantees that
  // `CmpValDefInstr` defines the value that's used by `CmpInstr`; in this case
  // if `CmpValDefInstr` sets the EFLAGS, it is likely that `CmpInstr` is
  // redundant.
  assert(
      (MRI->getVRegDef(CmpInstr.getOperand(0).getReg()) == &CmpValDefInstr) &&
      "Caller guarantees that TEST64rr is a user of SUBREG_TO_REG or TEST16rr "
      "is a user of COPY sub16bit.");
  MachineInstr *VregDefInstr = nullptr;
  if (CmpInstr.getOpcode() == X86::TEST16rr) {
    if (!CmpValDefInstr.getOperand(1).getReg().isVirtual())
      return false;
    VregDefInstr = MRI->getVRegDef(CmpValDefInstr.getOperand(1).getReg());
    if (!VregDefInstr)
      return false;
    // We can only remove the TEST if the defining instruction is an AND32ri or
    // AND64ri32 whose immediate fits in 16 bits; other 32/64-bit operations
    // would test higher bits that TEST16rr does not look at.
    if (!((VregDefInstr->getOpcode() == X86::AND32ri ||
           VregDefInstr->getOpcode() == X86::AND64ri32) &&
          isUInt<16>(VregDefInstr->getOperand(2).getImm())))
      return false;
  }

  if (CmpInstr.getOpcode() == X86::TEST64rr) {
    // As seen in X86 td files, CmpValDefInstr.getOperand(1).getImm() is
    // typically 0.
    if (CmpValDefInstr.getOperand(1).getImm() != 0)
      return false;

    // As seen in X86 td files, CmpValDefInstr.getOperand(3) is typically
    // sub_32bit or sub_xmm.
    if (CmpValDefInstr.getOperand(3).getImm() != X86::sub_32bit)
      return false;

    VregDefInstr = MRI->getVRegDef(CmpValDefInstr.getOperand(2).getReg());
  }

  assert(VregDefInstr && "Must have a definition (SSA)");

  // Requires `CmpValDefInstr` and `VregDefInstr` are from the same MBB
  // to simplify the subsequent analysis.
  //
  // FIXME: If `VregDefInstr->getParent()` is the only predecessor of
  // `CmpValDefInstr.getParent()`, this could be handled.
  if (VregDefInstr->getParent() != CmpValDefInstr.getParent())
    return false;

  if (X86::isAND(VregDefInstr->getOpcode())) {
    // Get a sequence of instructions like
    //   %reg = and* ...                    // Set EFLAGS
    //   ...                                // EFLAGS not changed
    //   %extended_reg = subreg_to_reg 0, %reg, %subreg.sub_32bit
    //   test64rr %extended_reg, %extended_reg, implicit-def $eflags
    // or
    //   %reg = and32* ...
    //   ...                         // EFLAGS not changed.
    //   %src_reg = copy %reg.sub_16bit:gr32
    //   test16rr %src_reg, %src_reg, implicit-def $eflags
    //
    // If subsequent readers use a subset of bits that don't change
    // after `and*` instructions, it's likely that the test64rr could
    // be optimized away.
    for (const MachineInstr &Instr :
         make_range(std::next(MachineBasicBlock::iterator(VregDefInstr)),
                    MachineBasicBlock::iterator(CmpValDefInstr))) {
      // Bail out if any instruction between 'VregDefInstr' and
      // 'CmpValDefInstr' modifies EFLAGS.
      if (Instr.modifiesRegister(X86::EFLAGS, TRI))
        return false;
    }

    *AndInstr = VregDefInstr;

    // The AND instruction will essentially update SF and clear OF, so
    // NoSignFlag should be false in the sense that SF is modified by `AND`.
    //
    // However, the implementation artificially sets `NoSignFlag` to true
    // to poison the SF bit; that is to say, if SF is looked at later, the
    // optimization (to erase TEST64rr) will be disabled.
    //
    // The reason to poison the SF bit is that its value could differ between
    // the `AND` and the `TEST` operation; the sign bit is not known for `AND`,
    // and is known to be 0 as a result of `TEST64rr`.
    //
    // FIXME: As opposed to poisoning the SF bit directly, consider peeking into
    // the AND instruction and using the static information to guide peephole
    // optimization if possible. For example, it's possible to fold a
    // conditional move into a copy if the relevant EFLAGS bits could be deduced
    // from an immediate operand of the AND operation.
    //
    NoSignFlag = true;
    // ClearsOverflowFlag is true for AND operation (no surprise).
    ClearsOverflowFlag = true;
    return true;
  }
  return false;
}

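// Pick a source register usable by the given LEA opcode: for LEA32r/LEA64r
// this only constrains the register class (optionally forbidding SP); for
// LEA64_32r a 64-bit vreg is created and a COPY into its low 32 bits is
// inserted, with isKill/ImplicitOp reporting how the new operand must be
// flagged.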
bool X86InstrInfo::classifyLEAReg(MachineInstr &MI, const MachineOperand &Src,
                                  unsigned Opc, bool AllowSP, Register &NewSrc,
                                  bool &isKill, MachineOperand &ImplicitOp,
                                  LiveVariables *LV, LiveIntervals *LIS) const {
  MachineFunction &MF = *MI.getParent()->getParent();
  const TargetRegisterClass *RC;
  if (AllowSP) {
    RC = Opc != X86::LEA32r ? &X86::GR64RegClass : &X86::GR32RegClass;
  } else {
    RC = Opc != X86::LEA32r ? &X86::GR64_NOSPRegClass : &X86::GR32_NOSPRegClass;
  }
  Register SrcReg = Src.getReg();
  isKill = MI.killsRegister(SrcReg);

  // For both LEA64 and LEA32 the register already has essentially the right
  // type (32-bit or 64-bit); we may just need to forbid SP.
  if (Opc != X86::LEA64_32r) {
    NewSrc = SrcReg;
    assert(!Src.isUndef() && "Undef op doesn't need optimization");

    if (NewSrc.isVirtual() && !MF.getRegInfo().constrainRegClass(NewSrc, RC))
      return false;

    return true;
  }

  // This is for an LEA64_32r and incoming registers are 32-bit. One way or
  // another we need to add 64-bit registers to the final MI.
  if (SrcReg.isPhysical()) {
    ImplicitOp = Src;
    ImplicitOp.setImplicit();

    NewSrc = getX86SubSuperRegister(SrcReg, 64);
    assert(NewSrc.isValid() && "Invalid Operand");
    assert(!Src.isUndef() && "Undef op doesn't need optimization");
  } else {
    // This is a virtual register of the wrong class; we have to create a
    // temporary 64-bit vreg to feed into the LEA.
    NewSrc = MF.getRegInfo().createVirtualRegister(RC);
    MachineInstr *Copy =
        BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(TargetOpcode::COPY))
            .addReg(NewSrc, RegState::Define | RegState::Undef, X86::sub_32bit)
            .addReg(SrcReg, getKillRegState(isKill));

    // Which is obviously going to be dead after we're done with it.
    isKill = true;

    if (LV)
      LV->replaceKillInstruction(SrcReg, MI, *Copy);

    if (LIS) {
      SlotIndex CopyIdx = LIS->InsertMachineInstrInMaps(*Copy);
      SlotIndex Idx = LIS->getInstructionIndex(MI);
      LiveInterval &LI = LIS->getInterval(SrcReg);
      LiveRange::Segment *S = LI.getSegmentContaining(Idx);
      if (S->end.getBaseIndex() == Idx)
        S->end = CopyIdx.getRegSlot();
    }
  }

  // We've set all the parameters without issue.
  return true;
}

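// Lower an 8/16-bit two-address op to a LEA by widening the inputs with
// IMPLICIT_DEF + COPY into wider vregs, performing the arithmetic with
// LEA64_32r, and copying the low sub-register back into the original
// destination.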
MachineInstr *X86InstrInfo::convertToThreeAddressWithLEA(unsigned MIOpc,
                                                         MachineInstr &MI,
                                                         LiveVariables *LV,
                                                         LiveIntervals *LIS,
                                                         bool Is8BitOp) const {
  // We handle 8-bit adds and various 16-bit opcodes in the switch below.
  MachineBasicBlock &MBB = *MI.getParent();
  MachineRegisterInfo &RegInfo = MBB.getParent()->getRegInfo();
  assert((Is8BitOp ||
          RegInfo.getTargetRegisterInfo()->getRegSizeInBits(
              *RegInfo.getRegClass(MI.getOperand(0).getReg())) == 16) &&
         "Unexpected type for LEA transform");

  // TODO: For a 32-bit target, we need to adjust the LEA variables with
  // something like this:
  //   Opcode = X86::LEA32r;
  //   InRegLEA = RegInfo.createVirtualRegister(&X86::GR32_NOSPRegClass);
  //   OutRegLEA =
  //       Is8BitOp ? RegInfo.createVirtualRegister(&X86::GR32ABCD_RegClass)
  //                : RegInfo.createVirtualRegister(&X86::GR32RegClass);
  if (!Subtarget.is64Bit())
    return nullptr;

  unsigned Opcode = X86::LEA64_32r;
  Register InRegLEA = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass);
  Register OutRegLEA = RegInfo.createVirtualRegister(&X86::GR32RegClass);
  Register InRegLEA2;

  // Build and insert into an implicit UNDEF value. This is OK because
  // we will be shifting and then extracting the lower 8/16 bits.
  // This has the potential to cause a partial register stall, e.g.
  //   movw    (%rbp,%rcx,2), %dx
  //   leal    -65(%rdx), %esi
  // But testing has shown this *does* help performance in 64-bit mode (at
  // least on modern x86 machines).
  MachineBasicBlock::iterator MBBI = MI.getIterator();
  Register Dest = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();
  Register Src2;
  bool IsDead = MI.getOperand(0).isDead();
  bool IsKill = MI.getOperand(1).isKill();
  unsigned SubReg = Is8BitOp ? X86::sub_8bit : X86::sub_16bit;
  assert(!MI.getOperand(1).isUndef() && "Undef op doesn't need optimization");
  MachineInstr *ImpDef =
      BuildMI(MBB, MBBI, MI.getDebugLoc(), get(X86::IMPLICIT_DEF), InRegLEA);
  MachineInstr *InsMI =
      BuildMI(MBB, MBBI, MI.getDebugLoc(), get(TargetOpcode::COPY))
          .addReg(InRegLEA, RegState::Define, SubReg)
          .addReg(Src, getKillRegState(IsKill));
  MachineInstr *ImpDef2 = nullptr;
  MachineInstr *InsMI2 = nullptr;

  MachineInstrBuilder MIB =
      BuildMI(MBB, MBBI, MI.getDebugLoc(), get(Opcode), OutRegLEA);
  switch (MIOpc) {
  default:
    llvm_unreachable("Unreachable!");
  case X86::SHL8ri:
  case X86::SHL16ri: {
    unsigned ShAmt = MI.getOperand(2).getImm();
    MIB.addReg(0)
        .addImm(1LL << ShAmt)
        .addReg(InRegLEA, RegState::Kill)
        .addImm(0)
        .addReg(0);
    break;
  }
  case X86::INC8r:
  case X86::INC16r:
    addRegOffset(MIB, InRegLEA, true, 1);
    break;
  case X86::DEC8r:
  case X86::DEC16r:
    addRegOffset(MIB, InRegLEA, true, -1);
    break;
  case X86::ADD8ri:
  case X86::ADD8ri_DB:
  case X86::ADD16ri:
  case X86::ADD16ri_DB:
    addRegOffset(MIB, InRegLEA, true, MI.getOperand(2).getImm());
    break;
  case X86::ADD8rr:
  case X86::ADD8rr_DB:
  case X86::ADD16rr:
  case X86::ADD16rr_DB: {
    Src2 = MI.getOperand(2).getReg();
    bool IsKill2 = MI.getOperand(2).isKill();
    assert(!MI.getOperand(2).isUndef() && "Undef op doesn't need optimization");
    if (Src == Src2) {
      // ADD8rr/ADD16rr killed %reg1028, %reg1028: only a single insert_subreg
      // is needed.
      addRegReg(MIB, InRegLEA, true, InRegLEA, false);
    } else {
      if (Subtarget.is64Bit())
        InRegLEA2 = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass);
      else
        InRegLEA2 = RegInfo.createVirtualRegister(&X86::GR32_NOSPRegClass);
      // Build and insert into an implicit UNDEF value. This is OK because
      // we will be shifting and then extracting the lower 8/16-bits.
      ImpDef2 = BuildMI(MBB, &*MIB, MI.getDebugLoc(), get(X86::IMPLICIT_DEF),
                        InRegLEA2);
      InsMI2 = BuildMI(MBB, &*MIB, MI.getDebugLoc(), get(TargetOpcode::COPY))
                   .addReg(InRegLEA2, RegState::Define, SubReg)
                   .addReg(Src2, getKillRegState(IsKill2));
      addRegReg(MIB, InRegLEA, true, InRegLEA2, true);
    }
    if (LV && IsKill2 && InsMI2)
      LV->replaceKillInstruction(Src2, MI, *InsMI2);
    break;
  }
  }

  MachineInstr *NewMI = MIB;
  MachineInstr *ExtMI =
      BuildMI(MBB, MBBI, MI.getDebugLoc(), get(TargetOpcode::COPY))
          .addReg(Dest, RegState::Define | getDeadRegState(IsDead))
          .addReg(OutRegLEA, RegState::Kill, SubReg);

  if (LV) {
    // Update live variables.
    LV->getVarInfo(InRegLEA).Kills.push_back(NewMI);
    if (InRegLEA2)
      LV->getVarInfo(InRegLEA2).Kills.push_back(NewMI);
    LV->getVarInfo(OutRegLEA).Kills.push_back(ExtMI);
    if (IsKill)
      LV->replaceKillInstruction(Src, MI, *InsMI);
    if (IsDead)
      LV->replaceKillInstruction(Dest, MI, *ExtMI);
  }

  if (LIS) {
    LIS->InsertMachineInstrInMaps(*ImpDef);
    SlotIndex InsIdx = LIS->InsertMachineInstrInMaps(*InsMI);
    if (ImpDef2)
      LIS->InsertMachineInstrInMaps(*ImpDef2);
    SlotIndex Ins2Idx;
    if (InsMI2)
      Ins2Idx = LIS->InsertMachineInstrInMaps(*InsMI2);
    SlotIndex NewIdx = LIS->ReplaceMachineInstrInMaps(MI, *NewMI);
    SlotIndex ExtIdx = LIS->InsertMachineInstrInMaps(*ExtMI);
    LIS->getInterval(InRegLEA);
    LIS->getInterval(OutRegLEA);
    if (InRegLEA2)
      LIS->getInterval(InRegLEA2);

    // Move the use of Src up to InsMI.
    LiveInterval &SrcLI = LIS->getInterval(Src);
    LiveRange::Segment *SrcSeg = SrcLI.getSegmentContaining(NewIdx);
    if (SrcSeg->end == NewIdx.getRegSlot())
      SrcSeg->end = InsIdx.getRegSlot();

    if (InsMI2) {
      // Move the use of Src2 up to InsMI2.
      LiveInterval &Src2LI = LIS->getInterval(Src2);
      LiveRange::Segment *Src2Seg = Src2LI.getSegmentContaining(NewIdx);
      if (Src2Seg->end == NewIdx.getRegSlot())
        Src2Seg->end = Ins2Idx.getRegSlot();
    }

    // Move the definition of Dest down to ExtMI.
    LiveInterval &DestLI = LIS->getInterval(Dest);
    LiveRange::Segment *DestSeg =
        DestLI.getSegmentContaining(NewIdx.getRegSlot());
    assert(DestSeg->start == NewIdx.getRegSlot() &&
           DestSeg->valno->def == NewIdx.getRegSlot());
    DestSeg->start = ExtIdx.getRegSlot();
    DestSeg->valno->def = ExtIdx.getRegSlot();
  }

  return ExtMI;
}

/// This method must be implemented by targets that
/// set the M_CONVERTIBLE_TO_3_ADDR flag.  When this flag is set, the target
/// may be able to convert a two-address instruction into a true
/// three-address instruction on demand.  This allows the X86 target (for
/// example) to convert ADD and SHL instructions into LEA instructions if they
/// would require register copies due to two-addressness.
///
/// This method returns a null pointer if the transformation cannot be
/// performed, otherwise it returns the new instruction.
///
MachineInstr *X86InstrInfo::convertToThreeAddress(MachineInstr &MI,
                                                  LiveVariables *LV,
                                                  LiveIntervals *LIS) const {
  // The following opcodes also set the condition code register(s). Only
  // convert them to an equivalent LEA if the condition code register defs
  // are dead!
  if (hasLiveCondCodeDef(MI))
    return nullptr;

  MachineFunction &MF = *MI.getParent()->getParent();
  // All input instructions are two-address instructions.  Get the known
  // operands.
  const MachineOperand &Dest = MI.getOperand(0);
  const MachineOperand &Src = MI.getOperand(1);

  // Ideally, operations with undef should be folded before we get here, but we
  // can't guarantee it. Bail out because optimizing undefs is a waste of time.
  // Without this, we have to forward undef state to new register operands to
  // avoid machine verifier errors.
  if (Src.isUndef())
    return nullptr;
  if (MI.getNumOperands() > 2)
    if (MI.getOperand(2).isReg() && MI.getOperand(2).isUndef())
      return nullptr;

  MachineInstr *NewMI = nullptr;
  Register SrcReg, SrcReg2;
  bool Is64Bit = Subtarget.is64Bit();

  bool Is8BitOp = false;
  unsigned NumRegOperands = 2;
  unsigned MIOpc = MI.getOpcode();
  switch (MIOpc) {
  default:
    llvm_unreachable("Unreachable!");
  case X86::SHL64ri: {
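    // A small constant left shift becomes an LEA that uses the SIB scale,
    // e.g. (with dead flags) 'shlq $3, %reg' can be rewritten as
    // 'leaq (,%reg,8), %dst'.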
1418    assert(MI.getNumOperands() >= 3 && "Unknown shift instruction!");
1419    unsigned ShAmt = getTruncatedShiftCount(MI, 2);
1420    if (!isTruncatedShiftCountForLEA(ShAmt))
1421      return nullptr;
1422
1423    // LEA can't handle RSP.
1424    if (Src.getReg().isVirtual() && !MF.getRegInfo().constrainRegClass(
1425                                        Src.getReg(), &X86::GR64_NOSPRegClass))
1426      return nullptr;
1427
1428    NewMI = BuildMI(MF, MI.getDebugLoc(), get(X86::LEA64r))
1429                .add(Dest)
1430                .addReg(0)
1431                .addImm(1LL << ShAmt)
1432                .add(Src)
1433                .addImm(0)
1434                .addReg(0);
1435    break;
1436  }
1437  case X86::SHL32ri: {
1438    assert(MI.getNumOperands() >= 3 && "Unknown shift instruction!");
1439    unsigned ShAmt = getTruncatedShiftCount(MI, 2);
1440    if (!isTruncatedShiftCountForLEA(ShAmt))
1441      return nullptr;
1442
1443    unsigned Opc = Is64Bit ? X86::LEA64_32r : X86::LEA32r;
1444
1445    // LEA can't handle ESP.
1446    bool isKill;
1447    MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
1448    if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/false, SrcReg, isKill,
1449                        ImplicitOp, LV, LIS))
1450      return nullptr;
1451
1452    MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc))
1453                                  .add(Dest)
1454                                  .addReg(0)
1455                                  .addImm(1LL << ShAmt)
1456                                  .addReg(SrcReg, getKillRegState(isKill))
1457                                  .addImm(0)
1458                                  .addReg(0);
1459    if (ImplicitOp.getReg() != 0)
1460      MIB.add(ImplicitOp);
1461    NewMI = MIB;
1462
1463    // Add kills if classifyLEAReg created a new register.
1464    if (LV && SrcReg != Src.getReg())
1465      LV->getVarInfo(SrcReg).Kills.push_back(NewMI);
1466    break;
1467  }
1468  case X86::SHL8ri:
1469    Is8BitOp = true;
1470    [[fallthrough]];
1471  case X86::SHL16ri: {
1472    assert(MI.getNumOperands() >= 3 && "Unknown shift instruction!");
1473    unsigned ShAmt = getTruncatedShiftCount(MI, 2);
1474    if (!isTruncatedShiftCountForLEA(ShAmt))
1475      return nullptr;
1476    return convertToThreeAddressWithLEA(MIOpc, MI, LV, LIS, Is8BitOp);
1477  }
1478  case X86::INC64r:
1479  case X86::INC32r: {
1480    assert(MI.getNumOperands() >= 2 && "Unknown inc instruction!");
1481    unsigned Opc = MIOpc == X86::INC64r
1482                       ? X86::LEA64r
1483                       : (Is64Bit ? X86::LEA64_32r : X86::LEA32r);
1484    bool isKill;
1485    MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
1486    if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/false, SrcReg, isKill,
1487                        ImplicitOp, LV, LIS))
1488      return nullptr;
1489
1490    MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc))
1491                                  .add(Dest)
1492                                  .addReg(SrcReg, getKillRegState(isKill));
1493    if (ImplicitOp.getReg() != 0)
1494      MIB.add(ImplicitOp);
1495
1496    NewMI = addOffset(MIB, 1);
1497
1498    // Add kills if classifyLEAReg created a new register.
1499    if (LV && SrcReg != Src.getReg())
1500      LV->getVarInfo(SrcReg).Kills.push_back(NewMI);
1501    break;
1502  }
1503  case X86::DEC64r:
1504  case X86::DEC32r: {
1505    assert(MI.getNumOperands() >= 2 && "Unknown dec instruction!");
1506    unsigned Opc = MIOpc == X86::DEC64r
1507                       ? X86::LEA64r
1508                       : (Is64Bit ? X86::LEA64_32r : X86::LEA32r);
1509
1510    bool isKill;
1511    MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
1512    if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/false, SrcReg, isKill,
1513                        ImplicitOp, LV, LIS))
1514      return nullptr;
1515
1516    MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc))
1517                                  .add(Dest)
1518                                  .addReg(SrcReg, getKillRegState(isKill));
1519    if (ImplicitOp.getReg() != 0)
1520      MIB.add(ImplicitOp);
1521
1522    NewMI = addOffset(MIB, -1);
1523
1524    // Add kills if classifyLEAReg created a new register.
1525    if (LV && SrcReg != Src.getReg())
1526      LV->getVarInfo(SrcReg).Kills.push_back(NewMI);
1527    break;
1528  }
1529  case X86::DEC8r:
1530  case X86::INC8r:
1531    Is8BitOp = true;
1532    [[fallthrough]];
1533  case X86::DEC16r:
1534  case X86::INC16r:
1535    return convertToThreeAddressWithLEA(MIOpc, MI, LV, LIS, Is8BitOp);
1536  case X86::ADD64rr:
1537  case X86::ADD64rr_DB:
1538  case X86::ADD32rr:
1539  case X86::ADD32rr_DB: {
1540    assert(MI.getNumOperands() >= 3 && "Unknown add instruction!");
1541    unsigned Opc;
1542    if (MIOpc == X86::ADD64rr || MIOpc == X86::ADD64rr_DB)
1543      Opc = X86::LEA64r;
1544    else
1545      Opc = Is64Bit ? X86::LEA64_32r : X86::LEA32r;
1546
1547    const MachineOperand &Src2 = MI.getOperand(2);
1548    bool isKill2;
1549    MachineOperand ImplicitOp2 = MachineOperand::CreateReg(0, false);
1550    if (!classifyLEAReg(MI, Src2, Opc, /*AllowSP=*/false, SrcReg2, isKill2,
1551                        ImplicitOp2, LV, LIS))
1552      return nullptr;
1553
1554    bool isKill;
1555    MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
1556    if (Src.getReg() == Src2.getReg()) {
      // Don't call classifyLEAReg a second time on the same register, in case
      // the first call inserted a COPY from Src2 and marked it as killed.
1559      isKill = isKill2;
1560      SrcReg = SrcReg2;
1561    } else {
1562      if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/true, SrcReg, isKill,
1563                          ImplicitOp, LV, LIS))
1564        return nullptr;
1565    }
1566
1567    MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc)).add(Dest);
1568    if (ImplicitOp.getReg() != 0)
1569      MIB.add(ImplicitOp);
1570    if (ImplicitOp2.getReg() != 0)
1571      MIB.add(ImplicitOp2);
1572
1573    NewMI = addRegReg(MIB, SrcReg, isKill, SrcReg2, isKill2);
1574
1575    // Add kills if classifyLEAReg created a new register.
1576    if (LV) {
1577      if (SrcReg2 != Src2.getReg())
1578        LV->getVarInfo(SrcReg2).Kills.push_back(NewMI);
1579      if (SrcReg != SrcReg2 && SrcReg != Src.getReg())
1580        LV->getVarInfo(SrcReg).Kills.push_back(NewMI);
1581    }
1582    NumRegOperands = 3;
1583    break;
1584  }
1585  case X86::ADD8rr:
1586  case X86::ADD8rr_DB:
1587    Is8BitOp = true;
1588    [[fallthrough]];
1589  case X86::ADD16rr:
1590  case X86::ADD16rr_DB:
1591    return convertToThreeAddressWithLEA(MIOpc, MI, LV, LIS, Is8BitOp);
1592  case X86::ADD64ri32:
1593  case X86::ADD64ri32_DB:
1594    assert(MI.getNumOperands() >= 3 && "Unknown add instruction!");
1595    NewMI = addOffset(
1596        BuildMI(MF, MI.getDebugLoc(), get(X86::LEA64r)).add(Dest).add(Src),
1597        MI.getOperand(2));
1598    break;
1599  case X86::ADD32ri:
1600  case X86::ADD32ri_DB: {
1601    assert(MI.getNumOperands() >= 3 && "Unknown add instruction!");
1602    unsigned Opc = Is64Bit ? X86::LEA64_32r : X86::LEA32r;
1603
1604    bool isKill;
1605    MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
1606    if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/true, SrcReg, isKill,
1607                        ImplicitOp, LV, LIS))
1608      return nullptr;
1609
1610    MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc))
1611                                  .add(Dest)
1612                                  .addReg(SrcReg, getKillRegState(isKill));
1613    if (ImplicitOp.getReg() != 0)
1614      MIB.add(ImplicitOp);
1615
1616    NewMI = addOffset(MIB, MI.getOperand(2));
1617
1618    // Add kills if classifyLEAReg created a new register.
1619    if (LV && SrcReg != Src.getReg())
1620      LV->getVarInfo(SrcReg).Kills.push_back(NewMI);
1621    break;
1622  }
1623  case X86::ADD8ri:
1624  case X86::ADD8ri_DB:
1625    Is8BitOp = true;
1626    [[fallthrough]];
1627  case X86::ADD16ri:
1628  case X86::ADD16ri_DB:
1629    return convertToThreeAddressWithLEA(MIOpc, MI, LV, LIS, Is8BitOp);
1630  case X86::SUB8ri:
1631  case X86::SUB16ri:
    // FIXME: Support these similarly to ADD8ri/ADD16ri*.
1633    return nullptr;
1634  case X86::SUB32ri: {
1635    if (!MI.getOperand(2).isImm())
1636      return nullptr;
1637    int64_t Imm = MI.getOperand(2).getImm();
1638    if (!isInt<32>(-Imm))
1639      return nullptr;
1640
    assert(MI.getNumOperands() >= 3 && "Unknown sub instruction!");
1642    unsigned Opc = Is64Bit ? X86::LEA64_32r : X86::LEA32r;
1643
1644    bool isKill;
1645    MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
1646    if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/true, SrcReg, isKill,
1647                        ImplicitOp, LV, LIS))
1648      return nullptr;
1649
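    // LEA has no subtract form, so add the negated immediate instead; the
    // isInt<32>(-Imm) check above guarantees the negation is representable.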
1650    MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc))
1651                                  .add(Dest)
1652                                  .addReg(SrcReg, getKillRegState(isKill));
1653    if (ImplicitOp.getReg() != 0)
1654      MIB.add(ImplicitOp);
1655
1656    NewMI = addOffset(MIB, -Imm);
1657
1658    // Add kills if classifyLEAReg created a new register.
1659    if (LV && SrcReg != Src.getReg())
1660      LV->getVarInfo(SrcReg).Kills.push_back(NewMI);
1661    break;
1662  }
1663
1664  case X86::SUB64ri32: {
1665    if (!MI.getOperand(2).isImm())
1666      return nullptr;
1667    int64_t Imm = MI.getOperand(2).getImm();
1668    if (!isInt<32>(-Imm))
1669      return nullptr;
1670
1671    assert(MI.getNumOperands() >= 3 && "Unknown sub instruction!");
1672
1673    MachineInstrBuilder MIB =
1674        BuildMI(MF, MI.getDebugLoc(), get(X86::LEA64r)).add(Dest).add(Src);
1675    NewMI = addOffset(MIB, -Imm);
1676    break;
1677  }
1678
1679  case X86::VMOVDQU8Z128rmk:
1680  case X86::VMOVDQU8Z256rmk:
1681  case X86::VMOVDQU8Zrmk:
1682  case X86::VMOVDQU16Z128rmk:
1683  case X86::VMOVDQU16Z256rmk:
1684  case X86::VMOVDQU16Zrmk:
1685  case X86::VMOVDQU32Z128rmk:
1686  case X86::VMOVDQA32Z128rmk:
1687  case X86::VMOVDQU32Z256rmk:
1688  case X86::VMOVDQA32Z256rmk:
1689  case X86::VMOVDQU32Zrmk:
1690  case X86::VMOVDQA32Zrmk:
1691  case X86::VMOVDQU64Z128rmk:
1692  case X86::VMOVDQA64Z128rmk:
1693  case X86::VMOVDQU64Z256rmk:
1694  case X86::VMOVDQA64Z256rmk:
1695  case X86::VMOVDQU64Zrmk:
1696  case X86::VMOVDQA64Zrmk:
1697  case X86::VMOVUPDZ128rmk:
1698  case X86::VMOVAPDZ128rmk:
1699  case X86::VMOVUPDZ256rmk:
1700  case X86::VMOVAPDZ256rmk:
1701  case X86::VMOVUPDZrmk:
1702  case X86::VMOVAPDZrmk:
1703  case X86::VMOVUPSZ128rmk:
1704  case X86::VMOVAPSZ128rmk:
1705  case X86::VMOVUPSZ256rmk:
1706  case X86::VMOVAPSZ256rmk:
1707  case X86::VMOVUPSZrmk:
1708  case X86::VMOVAPSZrmk:
1709  case X86::VBROADCASTSDZ256rmk:
1710  case X86::VBROADCASTSDZrmk:
1711  case X86::VBROADCASTSSZ128rmk:
1712  case X86::VBROADCASTSSZ256rmk:
1713  case X86::VBROADCASTSSZrmk:
1714  case X86::VPBROADCASTDZ128rmk:
1715  case X86::VPBROADCASTDZ256rmk:
1716  case X86::VPBROADCASTDZrmk:
1717  case X86::VPBROADCASTQZ128rmk:
1718  case X86::VPBROADCASTQZ256rmk:
1719  case X86::VPBROADCASTQZrmk: {
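    // A merge-masked vector move selects, per element, between the loaded
    // value and a passthru register that is tied to the destination. The
    // corresponding BLENDM instruction performs the same per-element select
    // but reads the passthru as an ordinary source operand, so its destination
    // is untied and the result is a genuine three-address instruction.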
1720    unsigned Opc;
1721    switch (MIOpc) {
1722    default:
1723      llvm_unreachable("Unreachable!");
1724    case X86::VMOVDQU8Z128rmk:
1725      Opc = X86::VPBLENDMBZ128rmk;
1726      break;
1727    case X86::VMOVDQU8Z256rmk:
1728      Opc = X86::VPBLENDMBZ256rmk;
1729      break;
1730    case X86::VMOVDQU8Zrmk:
1731      Opc = X86::VPBLENDMBZrmk;
1732      break;
1733    case X86::VMOVDQU16Z128rmk:
1734      Opc = X86::VPBLENDMWZ128rmk;
1735      break;
1736    case X86::VMOVDQU16Z256rmk:
1737      Opc = X86::VPBLENDMWZ256rmk;
1738      break;
1739    case X86::VMOVDQU16Zrmk:
1740      Opc = X86::VPBLENDMWZrmk;
1741      break;
1742    case X86::VMOVDQU32Z128rmk:
1743      Opc = X86::VPBLENDMDZ128rmk;
1744      break;
1745    case X86::VMOVDQU32Z256rmk:
1746      Opc = X86::VPBLENDMDZ256rmk;
1747      break;
1748    case X86::VMOVDQU32Zrmk:
1749      Opc = X86::VPBLENDMDZrmk;
1750      break;
1751    case X86::VMOVDQU64Z128rmk:
1752      Opc = X86::VPBLENDMQZ128rmk;
1753      break;
1754    case X86::VMOVDQU64Z256rmk:
1755      Opc = X86::VPBLENDMQZ256rmk;
1756      break;
1757    case X86::VMOVDQU64Zrmk:
1758      Opc = X86::VPBLENDMQZrmk;
1759      break;
1760    case X86::VMOVUPDZ128rmk:
1761      Opc = X86::VBLENDMPDZ128rmk;
1762      break;
1763    case X86::VMOVUPDZ256rmk:
1764      Opc = X86::VBLENDMPDZ256rmk;
1765      break;
1766    case X86::VMOVUPDZrmk:
1767      Opc = X86::VBLENDMPDZrmk;
1768      break;
1769    case X86::VMOVUPSZ128rmk:
1770      Opc = X86::VBLENDMPSZ128rmk;
1771      break;
1772    case X86::VMOVUPSZ256rmk:
1773      Opc = X86::VBLENDMPSZ256rmk;
1774      break;
1775    case X86::VMOVUPSZrmk:
1776      Opc = X86::VBLENDMPSZrmk;
1777      break;
1778    case X86::VMOVDQA32Z128rmk:
1779      Opc = X86::VPBLENDMDZ128rmk;
1780      break;
1781    case X86::VMOVDQA32Z256rmk:
1782      Opc = X86::VPBLENDMDZ256rmk;
1783      break;
1784    case X86::VMOVDQA32Zrmk:
1785      Opc = X86::VPBLENDMDZrmk;
1786      break;
1787    case X86::VMOVDQA64Z128rmk:
1788      Opc = X86::VPBLENDMQZ128rmk;
1789      break;
1790    case X86::VMOVDQA64Z256rmk:
1791      Opc = X86::VPBLENDMQZ256rmk;
1792      break;
1793    case X86::VMOVDQA64Zrmk:
1794      Opc = X86::VPBLENDMQZrmk;
1795      break;
1796    case X86::VMOVAPDZ128rmk:
1797      Opc = X86::VBLENDMPDZ128rmk;
1798      break;
1799    case X86::VMOVAPDZ256rmk:
1800      Opc = X86::VBLENDMPDZ256rmk;
1801      break;
1802    case X86::VMOVAPDZrmk:
1803      Opc = X86::VBLENDMPDZrmk;
1804      break;
1805    case X86::VMOVAPSZ128rmk:
1806      Opc = X86::VBLENDMPSZ128rmk;
1807      break;
1808    case X86::VMOVAPSZ256rmk:
1809      Opc = X86::VBLENDMPSZ256rmk;
1810      break;
1811    case X86::VMOVAPSZrmk:
1812      Opc = X86::VBLENDMPSZrmk;
1813      break;
1814    case X86::VBROADCASTSDZ256rmk:
1815      Opc = X86::VBLENDMPDZ256rmbk;
1816      break;
1817    case X86::VBROADCASTSDZrmk:
1818      Opc = X86::VBLENDMPDZrmbk;
1819      break;
1820    case X86::VBROADCASTSSZ128rmk:
1821      Opc = X86::VBLENDMPSZ128rmbk;
1822      break;
1823    case X86::VBROADCASTSSZ256rmk:
1824      Opc = X86::VBLENDMPSZ256rmbk;
1825      break;
1826    case X86::VBROADCASTSSZrmk:
1827      Opc = X86::VBLENDMPSZrmbk;
1828      break;
1829    case X86::VPBROADCASTDZ128rmk:
1830      Opc = X86::VPBLENDMDZ128rmbk;
1831      break;
1832    case X86::VPBROADCASTDZ256rmk:
1833      Opc = X86::VPBLENDMDZ256rmbk;
1834      break;
1835    case X86::VPBROADCASTDZrmk:
1836      Opc = X86::VPBLENDMDZrmbk;
1837      break;
1838    case X86::VPBROADCASTQZ128rmk:
1839      Opc = X86::VPBLENDMQZ128rmbk;
1840      break;
1841    case X86::VPBROADCASTQZ256rmk:
1842      Opc = X86::VPBLENDMQZ256rmbk;
1843      break;
1844    case X86::VPBROADCASTQZrmk:
1845      Opc = X86::VPBLENDMQZrmbk;
1846      break;
1847    }
1848
1849    NewMI = BuildMI(MF, MI.getDebugLoc(), get(Opc))
1850                .add(Dest)
1851                .add(MI.getOperand(2))
1852                .add(Src)
1853                .add(MI.getOperand(3))
1854                .add(MI.getOperand(4))
1855                .add(MI.getOperand(5))
1856                .add(MI.getOperand(6))
1857                .add(MI.getOperand(7));
1858    NumRegOperands = 4;
1859    break;
1860  }
1861
1862  case X86::VMOVDQU8Z128rrk:
1863  case X86::VMOVDQU8Z256rrk:
1864  case X86::VMOVDQU8Zrrk:
1865  case X86::VMOVDQU16Z128rrk:
1866  case X86::VMOVDQU16Z256rrk:
1867  case X86::VMOVDQU16Zrrk:
1868  case X86::VMOVDQU32Z128rrk:
1869  case X86::VMOVDQA32Z128rrk:
1870  case X86::VMOVDQU32Z256rrk:
1871  case X86::VMOVDQA32Z256rrk:
1872  case X86::VMOVDQU32Zrrk:
1873  case X86::VMOVDQA32Zrrk:
1874  case X86::VMOVDQU64Z128rrk:
1875  case X86::VMOVDQA64Z128rrk:
1876  case X86::VMOVDQU64Z256rrk:
1877  case X86::VMOVDQA64Z256rrk:
1878  case X86::VMOVDQU64Zrrk:
1879  case X86::VMOVDQA64Zrrk:
1880  case X86::VMOVUPDZ128rrk:
1881  case X86::VMOVAPDZ128rrk:
1882  case X86::VMOVUPDZ256rrk:
1883  case X86::VMOVAPDZ256rrk:
1884  case X86::VMOVUPDZrrk:
1885  case X86::VMOVAPDZrrk:
1886  case X86::VMOVUPSZ128rrk:
1887  case X86::VMOVAPSZ128rrk:
1888  case X86::VMOVUPSZ256rrk:
1889  case X86::VMOVAPSZ256rrk:
1890  case X86::VMOVUPSZrrk:
1891  case X86::VMOVAPSZrrk: {
1892    unsigned Opc;
1893    switch (MIOpc) {
1894    default:
1895      llvm_unreachable("Unreachable!");
1896    case X86::VMOVDQU8Z128rrk:
1897      Opc = X86::VPBLENDMBZ128rrk;
1898      break;
1899    case X86::VMOVDQU8Z256rrk:
1900      Opc = X86::VPBLENDMBZ256rrk;
1901      break;
1902    case X86::VMOVDQU8Zrrk:
1903      Opc = X86::VPBLENDMBZrrk;
1904      break;
1905    case X86::VMOVDQU16Z128rrk:
1906      Opc = X86::VPBLENDMWZ128rrk;
1907      break;
1908    case X86::VMOVDQU16Z256rrk:
1909      Opc = X86::VPBLENDMWZ256rrk;
1910      break;
1911    case X86::VMOVDQU16Zrrk:
1912      Opc = X86::VPBLENDMWZrrk;
1913      break;
1914    case X86::VMOVDQU32Z128rrk:
1915      Opc = X86::VPBLENDMDZ128rrk;
1916      break;
1917    case X86::VMOVDQU32Z256rrk:
1918      Opc = X86::VPBLENDMDZ256rrk;
1919      break;
1920    case X86::VMOVDQU32Zrrk:
1921      Opc = X86::VPBLENDMDZrrk;
1922      break;
1923    case X86::VMOVDQU64Z128rrk:
1924      Opc = X86::VPBLENDMQZ128rrk;
1925      break;
1926    case X86::VMOVDQU64Z256rrk:
1927      Opc = X86::VPBLENDMQZ256rrk;
1928      break;
1929    case X86::VMOVDQU64Zrrk:
1930      Opc = X86::VPBLENDMQZrrk;
1931      break;
1932    case X86::VMOVUPDZ128rrk:
1933      Opc = X86::VBLENDMPDZ128rrk;
1934      break;
1935    case X86::VMOVUPDZ256rrk:
1936      Opc = X86::VBLENDMPDZ256rrk;
1937      break;
1938    case X86::VMOVUPDZrrk:
1939      Opc = X86::VBLENDMPDZrrk;
1940      break;
1941    case X86::VMOVUPSZ128rrk:
1942      Opc = X86::VBLENDMPSZ128rrk;
1943      break;
1944    case X86::VMOVUPSZ256rrk:
1945      Opc = X86::VBLENDMPSZ256rrk;
1946      break;
1947    case X86::VMOVUPSZrrk:
1948      Opc = X86::VBLENDMPSZrrk;
1949      break;
1950    case X86::VMOVDQA32Z128rrk:
1951      Opc = X86::VPBLENDMDZ128rrk;
1952      break;
1953    case X86::VMOVDQA32Z256rrk:
1954      Opc = X86::VPBLENDMDZ256rrk;
1955      break;
1956    case X86::VMOVDQA32Zrrk:
1957      Opc = X86::VPBLENDMDZrrk;
1958      break;
1959    case X86::VMOVDQA64Z128rrk:
1960      Opc = X86::VPBLENDMQZ128rrk;
1961      break;
1962    case X86::VMOVDQA64Z256rrk:
1963      Opc = X86::VPBLENDMQZ256rrk;
1964      break;
1965    case X86::VMOVDQA64Zrrk:
1966      Opc = X86::VPBLENDMQZrrk;
1967      break;
1968    case X86::VMOVAPDZ128rrk:
1969      Opc = X86::VBLENDMPDZ128rrk;
1970      break;
1971    case X86::VMOVAPDZ256rrk:
1972      Opc = X86::VBLENDMPDZ256rrk;
1973      break;
1974    case X86::VMOVAPDZrrk:
1975      Opc = X86::VBLENDMPDZrrk;
1976      break;
1977    case X86::VMOVAPSZ128rrk:
1978      Opc = X86::VBLENDMPSZ128rrk;
1979      break;
1980    case X86::VMOVAPSZ256rrk:
1981      Opc = X86::VBLENDMPSZ256rrk;
1982      break;
1983    case X86::VMOVAPSZrrk:
1984      Opc = X86::VBLENDMPSZrrk;
1985      break;
1986    }
1987
1988    NewMI = BuildMI(MF, MI.getDebugLoc(), get(Opc))
1989                .add(Dest)
1990                .add(MI.getOperand(2))
1991                .add(Src)
1992                .add(MI.getOperand(3));
1993    NumRegOperands = 4;
1994    break;
1995  }
1996  }
1997
1998  if (!NewMI)
1999    return nullptr;
2000
2001  if (LV) { // Update live variables
2002    for (unsigned I = 0; I < NumRegOperands; ++I) {
2003      MachineOperand &Op = MI.getOperand(I);
2004      if (Op.isReg() && (Op.isDead() || Op.isKill()))
2005        LV->replaceKillInstruction(Op.getReg(), MI, *NewMI);
2006    }
2007  }
2008
2009  MachineBasicBlock &MBB = *MI.getParent();
2010  MBB.insert(MI.getIterator(), NewMI); // Insert the new inst
2011
2012  if (LIS) {
2013    LIS->ReplaceMachineInstrInMaps(MI, *NewMI);
2014    if (SrcReg)
2015      LIS->getInterval(SrcReg);
2016    if (SrcReg2)
2017      LIS->getInterval(SrcReg2);
2018  }
2019
2020  return NewMI;
2021}
2022
/// This determines which of the three possible cases of a three-source commute
/// the source indexes correspond to, taking into account any mask operands.
/// None of the cases allows commuting a passthru operand; the index pair must
/// match one of the cases below.
2027/// Case 0 - Possible to commute the first and second operands.
2028/// Case 1 - Possible to commute the first and third operands.
2029/// Case 2 - Possible to commute the second and third operands.
2030static unsigned getThreeSrcCommuteCase(uint64_t TSFlags, unsigned SrcOpIdx1,
2031                                       unsigned SrcOpIdx2) {
  // Put the lowest index in SrcOpIdx1 to simplify the checks below.
2033  if (SrcOpIdx1 > SrcOpIdx2)
2034    std::swap(SrcOpIdx1, SrcOpIdx2);
2035
2036  unsigned Op1 = 1, Op2 = 2, Op3 = 3;
2037  if (X86II::isKMasked(TSFlags)) {
2038    Op2++;
2039    Op3++;
2040  }
2041
2042  if (SrcOpIdx1 == Op1 && SrcOpIdx2 == Op2)
2043    return 0;
2044  if (SrcOpIdx1 == Op1 && SrcOpIdx2 == Op3)
2045    return 1;
2046  if (SrcOpIdx1 == Op2 && SrcOpIdx2 == Op3)
2047    return 2;
2048  llvm_unreachable("Unknown three src commute case.");
2049}
2050
2051unsigned X86InstrInfo::getFMA3OpcodeToCommuteOperands(
2052    const MachineInstr &MI, unsigned SrcOpIdx1, unsigned SrcOpIdx2,
2053    const X86InstrFMA3Group &FMA3Group) const {
2054
2055  unsigned Opc = MI.getOpcode();
2056
  // TODO: Commuting the 1st operand of FMA*_Int requires some additional
  // analysis. The commute optimization is legal only if all users of FMA*_Int
  // use only the lowest element of the FMA*_Int instruction. Such analysis is
  // not implemented yet, so commuting operand 1 of the intrinsic forms is
  // simply disallowed (see the assert below). When that analysis becomes
  // available, this is the right place to call it.
2063  assert(!(FMA3Group.isIntrinsic() && (SrcOpIdx1 == 1 || SrcOpIdx2 == 1)) &&
2064         "Intrinsic instructions can't commute operand 1");
2065
2066  // Determine which case this commute is or if it can't be done.
2067  unsigned Case =
2068      getThreeSrcCommuteCase(MI.getDesc().TSFlags, SrcOpIdx1, SrcOpIdx2);
2069  assert(Case < 3 && "Unexpected case number!");
2070
2071  // Define the FMA forms mapping array that helps to map input FMA form
2072  // to output FMA form to preserve the operation semantics after
2073  // commuting the operands.
2074  const unsigned Form132Index = 0;
2075  const unsigned Form213Index = 1;
2076  const unsigned Form231Index = 2;
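  // As a reminder, the digits in the form name describe which source slots
  // feed the multiply and the add:
  //   FMA132 dst, s2, s3: dst = dst * s3 + s2
  //   FMA213 dst, s2, s3: dst = s2 * dst + s3
  //   FMA231 dst, s2, s3: dst = s2 * s3 + dst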
2077  static const unsigned FormMapping[][3] = {
2078      // 0: SrcOpIdx1 == 1 && SrcOpIdx2 == 2;
2079      // FMA132 A, C, b; ==> FMA231 C, A, b;
2080      // FMA213 B, A, c; ==> FMA213 A, B, c;
2081      // FMA231 C, A, b; ==> FMA132 A, C, b;
2082      {Form231Index, Form213Index, Form132Index},
2083      // 1: SrcOpIdx1 == 1 && SrcOpIdx2 == 3;
2084      // FMA132 A, c, B; ==> FMA132 B, c, A;
2085      // FMA213 B, a, C; ==> FMA231 C, a, B;
2086      // FMA231 C, a, B; ==> FMA213 B, a, C;
2087      {Form132Index, Form231Index, Form213Index},
2088      // 2: SrcOpIdx1 == 2 && SrcOpIdx2 == 3;
2089      // FMA132 a, C, B; ==> FMA213 a, B, C;
2090      // FMA213 b, A, C; ==> FMA132 b, C, A;
2091      // FMA231 c, A, B; ==> FMA231 c, B, A;
2092      {Form213Index, Form132Index, Form231Index}};
2093
2094  unsigned FMAForms[3];
2095  FMAForms[0] = FMA3Group.get132Opcode();
2096  FMAForms[1] = FMA3Group.get213Opcode();
2097  FMAForms[2] = FMA3Group.get231Opcode();
2098
2099  // Everything is ready, just adjust the FMA opcode and return it.
2100  for (unsigned FormIndex = 0; FormIndex < 3; FormIndex++)
2101    if (Opc == FMAForms[FormIndex])
2102      return FMAForms[FormMapping[Case][FormIndex]];
2103
2104  llvm_unreachable("Illegal FMA3 format");
2105}
2106
2107static void commuteVPTERNLOG(MachineInstr &MI, unsigned SrcOpIdx1,
2108                             unsigned SrcOpIdx2) {
2109  // Determine which case this commute is or if it can't be done.
2110  unsigned Case =
2111      getThreeSrcCommuteCase(MI.getDesc().TSFlags, SrcOpIdx1, SrcOpIdx2);
2112  assert(Case < 3 && "Unexpected case value!");
2113
2114  // For each case we need to swap two pairs of bits in the final immediate.
2115  static const uint8_t SwapMasks[3][4] = {
2116      {0x04, 0x10, 0x08, 0x20}, // Swap bits 2/4 and 3/5.
2117      {0x02, 0x10, 0x08, 0x40}, // Swap bits 1/4 and 3/6.
2118      {0x02, 0x04, 0x20, 0x40}, // Swap bits 1/2 and 5/6.
2119  };
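  // The VPTERNLOG immediate is a truth table: bit ((a << 2) | (b << 1) | c)
  // holds the result for source bits a (src1), b (src2) and c (src3).
  // Swapping two sources therefore swaps the table entries whose indices
  // differ only in those two inputs. E.g. for Case 0 (swap src1/src2), index
  // 2 (0b010) pairs with 4 (0b100) and 3 (0b011) pairs with 5 (0b101), which
  // is exactly what the 0x04/0x10 and 0x08/0x20 masks above encode.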
2120
2121  uint8_t Imm = MI.getOperand(MI.getNumOperands() - 1).getImm();
2122  // Clear out the bits we are swapping.
2123  uint8_t NewImm = Imm & ~(SwapMasks[Case][0] | SwapMasks[Case][1] |
2124                           SwapMasks[Case][2] | SwapMasks[Case][3]);
2125  // If the immediate had a bit of the pair set, then set the opposite bit.
2126  if (Imm & SwapMasks[Case][0])
2127    NewImm |= SwapMasks[Case][1];
2128  if (Imm & SwapMasks[Case][1])
2129    NewImm |= SwapMasks[Case][0];
2130  if (Imm & SwapMasks[Case][2])
2131    NewImm |= SwapMasks[Case][3];
2132  if (Imm & SwapMasks[Case][3])
2133    NewImm |= SwapMasks[Case][2];
2134  MI.getOperand(MI.getNumOperands() - 1).setImm(NewImm);
2135}
2136
2137// Returns true if this is a VPERMI2 or VPERMT2 instruction that can be
2138// commuted.
2139static bool isCommutableVPERMV3Instruction(unsigned Opcode) {
2140#define VPERM_CASES(Suffix)                                                    \
2141  case X86::VPERMI2##Suffix##Z128rr:                                           \
2142  case X86::VPERMT2##Suffix##Z128rr:                                           \
2143  case X86::VPERMI2##Suffix##Z256rr:                                           \
2144  case X86::VPERMT2##Suffix##Z256rr:                                           \
2145  case X86::VPERMI2##Suffix##Zrr:                                              \
2146  case X86::VPERMT2##Suffix##Zrr:                                              \
2147  case X86::VPERMI2##Suffix##Z128rm:                                           \
2148  case X86::VPERMT2##Suffix##Z128rm:                                           \
2149  case X86::VPERMI2##Suffix##Z256rm:                                           \
2150  case X86::VPERMT2##Suffix##Z256rm:                                           \
2151  case X86::VPERMI2##Suffix##Zrm:                                              \
2152  case X86::VPERMT2##Suffix##Zrm:                                              \
2153  case X86::VPERMI2##Suffix##Z128rrkz:                                         \
2154  case X86::VPERMT2##Suffix##Z128rrkz:                                         \
2155  case X86::VPERMI2##Suffix##Z256rrkz:                                         \
2156  case X86::VPERMT2##Suffix##Z256rrkz:                                         \
2157  case X86::VPERMI2##Suffix##Zrrkz:                                            \
2158  case X86::VPERMT2##Suffix##Zrrkz:                                            \
2159  case X86::VPERMI2##Suffix##Z128rmkz:                                         \
2160  case X86::VPERMT2##Suffix##Z128rmkz:                                         \
2161  case X86::VPERMI2##Suffix##Z256rmkz:                                         \
2162  case X86::VPERMT2##Suffix##Z256rmkz:                                         \
2163  case X86::VPERMI2##Suffix##Zrmkz:                                            \
2164  case X86::VPERMT2##Suffix##Zrmkz:
2165
2166#define VPERM_CASES_BROADCAST(Suffix)                                          \
2167  VPERM_CASES(Suffix)                                                          \
2168  case X86::VPERMI2##Suffix##Z128rmb:                                          \
2169  case X86::VPERMT2##Suffix##Z128rmb:                                          \
2170  case X86::VPERMI2##Suffix##Z256rmb:                                          \
2171  case X86::VPERMT2##Suffix##Z256rmb:                                          \
2172  case X86::VPERMI2##Suffix##Zrmb:                                             \
2173  case X86::VPERMT2##Suffix##Zrmb:                                             \
2174  case X86::VPERMI2##Suffix##Z128rmbkz:                                        \
2175  case X86::VPERMT2##Suffix##Z128rmbkz:                                        \
2176  case X86::VPERMI2##Suffix##Z256rmbkz:                                        \
2177  case X86::VPERMT2##Suffix##Z256rmbkz:                                        \
2178  case X86::VPERMI2##Suffix##Zrmbkz:                                           \
2179  case X86::VPERMT2##Suffix##Zrmbkz:
2180
2181  switch (Opcode) {
2182  default:
2183    return false;
2184    VPERM_CASES(B)
2185    VPERM_CASES_BROADCAST(D)
2186    VPERM_CASES_BROADCAST(PD)
2187    VPERM_CASES_BROADCAST(PS)
2188    VPERM_CASES_BROADCAST(Q)
2189    VPERM_CASES(W)
2190    return true;
2191  }
2192#undef VPERM_CASES_BROADCAST
2193#undef VPERM_CASES
2194}
2195
2196// Returns commuted opcode for VPERMI2 and VPERMT2 instructions by switching
2197// from the I opcode to the T opcode and vice versa.
2198static unsigned getCommutedVPERMV3Opcode(unsigned Opcode) {
2199#define VPERM_CASES(Orig, New)                                                 \
2200  case X86::Orig##Z128rr:                                                      \
2201    return X86::New##Z128rr;                                                   \
2202  case X86::Orig##Z128rrkz:                                                    \
2203    return X86::New##Z128rrkz;                                                 \
2204  case X86::Orig##Z128rm:                                                      \
2205    return X86::New##Z128rm;                                                   \
2206  case X86::Orig##Z128rmkz:                                                    \
2207    return X86::New##Z128rmkz;                                                 \
2208  case X86::Orig##Z256rr:                                                      \
2209    return X86::New##Z256rr;                                                   \
2210  case X86::Orig##Z256rrkz:                                                    \
2211    return X86::New##Z256rrkz;                                                 \
2212  case X86::Orig##Z256rm:                                                      \
2213    return X86::New##Z256rm;                                                   \
2214  case X86::Orig##Z256rmkz:                                                    \
2215    return X86::New##Z256rmkz;                                                 \
2216  case X86::Orig##Zrr:                                                         \
2217    return X86::New##Zrr;                                                      \
2218  case X86::Orig##Zrrkz:                                                       \
2219    return X86::New##Zrrkz;                                                    \
2220  case X86::Orig##Zrm:                                                         \
2221    return X86::New##Zrm;                                                      \
2222  case X86::Orig##Zrmkz:                                                       \
2223    return X86::New##Zrmkz;
2224
2225#define VPERM_CASES_BROADCAST(Orig, New)                                       \
2226  VPERM_CASES(Orig, New)                                                       \
2227  case X86::Orig##Z128rmb:                                                     \
2228    return X86::New##Z128rmb;                                                  \
2229  case X86::Orig##Z128rmbkz:                                                   \
2230    return X86::New##Z128rmbkz;                                                \
2231  case X86::Orig##Z256rmb:                                                     \
2232    return X86::New##Z256rmb;                                                  \
2233  case X86::Orig##Z256rmbkz:                                                   \
2234    return X86::New##Z256rmbkz;                                                \
2235  case X86::Orig##Zrmb:                                                        \
2236    return X86::New##Zrmb;                                                     \
2237  case X86::Orig##Zrmbkz:                                                      \
2238    return X86::New##Zrmbkz;
2239
2240  switch (Opcode) {
2241    VPERM_CASES(VPERMI2B, VPERMT2B)
2242    VPERM_CASES_BROADCAST(VPERMI2D, VPERMT2D)
2243    VPERM_CASES_BROADCAST(VPERMI2PD, VPERMT2PD)
2244    VPERM_CASES_BROADCAST(VPERMI2PS, VPERMT2PS)
2245    VPERM_CASES_BROADCAST(VPERMI2Q, VPERMT2Q)
2246    VPERM_CASES(VPERMI2W, VPERMT2W)
2247    VPERM_CASES(VPERMT2B, VPERMI2B)
2248    VPERM_CASES_BROADCAST(VPERMT2D, VPERMI2D)
2249    VPERM_CASES_BROADCAST(VPERMT2PD, VPERMI2PD)
2250    VPERM_CASES_BROADCAST(VPERMT2PS, VPERMI2PS)
2251    VPERM_CASES_BROADCAST(VPERMT2Q, VPERMI2Q)
2252    VPERM_CASES(VPERMT2W, VPERMI2W)
2253  }
2254
2255  llvm_unreachable("Unreachable!");
2256#undef VPERM_CASES_BROADCAST
2257#undef VPERM_CASES
2258}
2259
2260MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
2261                                                   unsigned OpIdx1,
2262                                                   unsigned OpIdx2) const {
2263  auto CloneIfNew = [&](MachineInstr &MI) {
2264    return std::exchange(NewMI, false)
2265               ? MI.getParent()->getParent()->CloneMachineInstr(&MI)
2266               : &MI;
2267  };
2268  MachineInstr *WorkingMI = nullptr;
2269  unsigned Opc = MI.getOpcode();
2270
2271  switch (Opc) {
2272  // SHLD B, C, I <-> SHRD C, B, (BitWidth - I)
2273  case X86::SHRD16rri8:
2274  case X86::SHLD16rri8:
2275  case X86::SHRD32rri8:
2276  case X86::SHLD32rri8:
2277  case X86::SHRD64rri8:
2278  case X86::SHLD64rri8:
2279  case X86::SHRD16rri8_ND:
2280  case X86::SHLD16rri8_ND:
2281  case X86::SHRD32rri8_ND:
2282  case X86::SHLD32rri8_ND:
2283  case X86::SHRD64rri8_ND:
2284  case X86::SHLD64rri8_ND: {
2285    unsigned Size;
2286    switch (Opc) {
2287    default:
2288      llvm_unreachable("Unreachable!");
2289    case X86::SHRD16rri8:
2290      Size = 16;
2291      Opc = X86::SHLD16rri8;
2292      break;
2293    case X86::SHLD16rri8:
2294      Size = 16;
2295      Opc = X86::SHRD16rri8;
2296      break;
2297    case X86::SHRD32rri8:
2298      Size = 32;
2299      Opc = X86::SHLD32rri8;
2300      break;
2301    case X86::SHLD32rri8:
2302      Size = 32;
2303      Opc = X86::SHRD32rri8;
2304      break;
2305    case X86::SHRD64rri8:
2306      Size = 64;
2307      Opc = X86::SHLD64rri8;
2308      break;
2309    case X86::SHLD64rri8:
2310      Size = 64;
2311      Opc = X86::SHRD64rri8;
2312      break;
2313    case X86::SHRD16rri8_ND:
2314      Size = 16;
2315      Opc = X86::SHLD16rri8_ND;
2316      break;
2317    case X86::SHLD16rri8_ND:
2318      Size = 16;
2319      Opc = X86::SHRD16rri8_ND;
2320      break;
2321    case X86::SHRD32rri8_ND:
2322      Size = 32;
2323      Opc = X86::SHLD32rri8_ND;
2324      break;
2325    case X86::SHLD32rri8_ND:
2326      Size = 32;
2327      Opc = X86::SHRD32rri8_ND;
2328      break;
2329    case X86::SHRD64rri8_ND:
2330      Size = 64;
2331      Opc = X86::SHLD64rri8_ND;
2332      break;
2333    case X86::SHLD64rri8_ND:
2334      Size = 64;
2335      Opc = X86::SHRD64rri8_ND;
2336      break;
2337    }
2338    WorkingMI = CloneIfNew(MI);
2339    WorkingMI->setDesc(get(Opc));
2340    WorkingMI->getOperand(3).setImm(Size - MI.getOperand(3).getImm());
2341    break;
2342  }
2343  case X86::PFSUBrr:
2344  case X86::PFSUBRrr:
2345    // PFSUB  x, y: x = x - y
2346    // PFSUBR x, y: x = y - x
2347    WorkingMI = CloneIfNew(MI);
2348    WorkingMI->setDesc(
2349        get(X86::PFSUBRrr == Opc ? X86::PFSUBrr : X86::PFSUBRrr));
2350    break;
2351  case X86::BLENDPDrri:
2352  case X86::BLENDPSrri:
2353  case X86::VBLENDPDrri:
2354  case X86::VBLENDPSrri:
2355    // If we're optimizing for size, try to use MOVSD/MOVSS.
2356    if (MI.getParent()->getParent()->getFunction().hasOptSize()) {
      unsigned Mask =
          (Opc == X86::BLENDPDrri || Opc == X86::VBLENDPDrri) ? 0x03 : 0x0F;
2358      if ((MI.getOperand(3).getImm() ^ Mask) == 1) {
2359#define FROM_TO(FROM, TO)                                                      \
2360  case X86::FROM:                                                              \
2361    Opc = X86::TO;                                                             \
2362    break;
2363        switch (Opc) {
2364        default:
2365          llvm_unreachable("Unreachable!");
2366        FROM_TO(BLENDPDrri, MOVSDrr)
2367        FROM_TO(BLENDPSrri, MOVSSrr)
2368        FROM_TO(VBLENDPDrri, VMOVSDrr)
2369        FROM_TO(VBLENDPSrri, VMOVSSrr)
2370        }
2371        WorkingMI = CloneIfNew(MI);
2372        WorkingMI->setDesc(get(Opc));
2373        WorkingMI->removeOperand(3);
2374        break;
2375      }
2376#undef FROM_TO
2377    }
2378    [[fallthrough]];
2379  case X86::PBLENDWrri:
2380  case X86::VBLENDPDYrri:
2381  case X86::VBLENDPSYrri:
2382  case X86::VPBLENDDrri:
2383  case X86::VPBLENDWrri:
2384  case X86::VPBLENDDYrri:
2385  case X86::VPBLENDWYrri: {
2386    int8_t Mask;
2387    switch (Opc) {
2388    default:
2389      llvm_unreachable("Unreachable!");
2390    case X86::BLENDPDrri:
2391      Mask = (int8_t)0x03;
2392      break;
2393    case X86::BLENDPSrri:
2394      Mask = (int8_t)0x0F;
2395      break;
2396    case X86::PBLENDWrri:
2397      Mask = (int8_t)0xFF;
2398      break;
2399    case X86::VBLENDPDrri:
2400      Mask = (int8_t)0x03;
2401      break;
2402    case X86::VBLENDPSrri:
2403      Mask = (int8_t)0x0F;
2404      break;
2405    case X86::VBLENDPDYrri:
2406      Mask = (int8_t)0x0F;
2407      break;
2408    case X86::VBLENDPSYrri:
2409      Mask = (int8_t)0xFF;
2410      break;
2411    case X86::VPBLENDDrri:
2412      Mask = (int8_t)0x0F;
2413      break;
2414    case X86::VPBLENDWrri:
2415      Mask = (int8_t)0xFF;
2416      break;
2417    case X86::VPBLENDDYrri:
2418      Mask = (int8_t)0xFF;
2419      break;
2420    case X86::VPBLENDWYrri:
2421      Mask = (int8_t)0xFF;
2422      break;
2423    }
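    // Commuting the two sources of a blend flips which input each lane
    // selects, so the commuted immediate is simply the original immediate
    // XORed with the all-lanes mask chosen above.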
2424    // Only the least significant bits of Imm are used.
2425    // Using int8_t to ensure it will be sign extended to the int64_t that
2426    // setImm takes in order to match isel behavior.
2427    int8_t Imm = MI.getOperand(3).getImm() & Mask;
2428    WorkingMI = CloneIfNew(MI);
2429    WorkingMI->getOperand(3).setImm(Mask ^ Imm);
2430    break;
2431  }
2432  case X86::INSERTPSrr:
2433  case X86::VINSERTPSrr:
2434  case X86::VINSERTPSZrr: {
2435    unsigned Imm = MI.getOperand(MI.getNumOperands() - 1).getImm();
2436    unsigned ZMask = Imm & 15;
2437    unsigned DstIdx = (Imm >> 4) & 3;
2438    unsigned SrcIdx = (Imm >> 6) & 3;
2439
2440    // We can commute insertps if we zero 2 of the elements, the insertion is
2441    // "inline" and we don't override the insertion with a zero.
2442    if (DstIdx == SrcIdx && (ZMask & (1 << DstIdx)) == 0 &&
2443        llvm::popcount(ZMask) == 2) {
2444      unsigned AltIdx = llvm::countr_zero((ZMask | (1 << DstIdx)) ^ 15);
2445      assert(AltIdx < 4 && "Illegal insertion index");
2446      unsigned AltImm = (AltIdx << 6) | (AltIdx << 4) | ZMask;
2447      WorkingMI = CloneIfNew(MI);
2448      WorkingMI->getOperand(MI.getNumOperands() - 1).setImm(AltImm);
2449      break;
2450    }
2451    return nullptr;
2452  }
2453  case X86::MOVSDrr:
2454  case X86::MOVSSrr:
2455  case X86::VMOVSDrr:
2456  case X86::VMOVSSrr: {
2457    // On SSE41 or later we can commute a MOVSS/MOVSD to a BLENDPS/BLENDPD.
2458    if (Subtarget.hasSSE41()) {
2459      unsigned Mask;
2460      switch (Opc) {
2461      default:
2462        llvm_unreachable("Unreachable!");
2463      case X86::MOVSDrr:
2464        Opc = X86::BLENDPDrri;
2465        Mask = 0x02;
2466        break;
2467      case X86::MOVSSrr:
2468        Opc = X86::BLENDPSrri;
2469        Mask = 0x0E;
2470        break;
2471      case X86::VMOVSDrr:
2472        Opc = X86::VBLENDPDrri;
2473        Mask = 0x02;
2474        break;
2475      case X86::VMOVSSrr:
2476        Opc = X86::VBLENDPSrri;
2477        Mask = 0x0E;
2478        break;
2479      }
2480
2481      WorkingMI = CloneIfNew(MI);
2482      WorkingMI->setDesc(get(Opc));
2483      WorkingMI->addOperand(MachineOperand::CreateImm(Mask));
2484      break;
2485    }
2486
2487    WorkingMI = CloneIfNew(MI);
2488    WorkingMI->setDesc(get(X86::SHUFPDrri));
2489    WorkingMI->addOperand(MachineOperand::CreateImm(0x02));
2490    break;
2491  }
2492  case X86::SHUFPDrri: {
2493    // Commute to MOVSD.
2494    assert(MI.getOperand(3).getImm() == 0x02 && "Unexpected immediate!");
2495    WorkingMI = CloneIfNew(MI);
2496    WorkingMI->setDesc(get(X86::MOVSDrr));
2497    WorkingMI->removeOperand(3);
2498    break;
2499  }
2500  case X86::PCLMULQDQrr:
2501  case X86::VPCLMULQDQrr:
2502  case X86::VPCLMULQDQYrr:
2503  case X86::VPCLMULQDQZrr:
2504  case X86::VPCLMULQDQZ128rr:
2505  case X86::VPCLMULQDQZ256rr: {
2506    // SRC1 64bits = Imm[0] ? SRC1[127:64] : SRC1[63:0]
2507    // SRC2 64bits = Imm[4] ? SRC2[127:64] : SRC2[63:0]
2508    unsigned Imm = MI.getOperand(3).getImm();
2509    unsigned Src1Hi = Imm & 0x01;
2510    unsigned Src2Hi = Imm & 0x10;
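    // Swapping the sources exchanges the roles of Imm[0] and Imm[4]; the
    // shift-and-or below moves each bit into the other's position.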
2511    WorkingMI = CloneIfNew(MI);
2512    WorkingMI->getOperand(3).setImm((Src1Hi << 4) | (Src2Hi >> 4));
2513    break;
2514  }
2515  case X86::VPCMPBZ128rri:
2516  case X86::VPCMPUBZ128rri:
2517  case X86::VPCMPBZ256rri:
2518  case X86::VPCMPUBZ256rri:
2519  case X86::VPCMPBZrri:
2520  case X86::VPCMPUBZrri:
2521  case X86::VPCMPDZ128rri:
2522  case X86::VPCMPUDZ128rri:
2523  case X86::VPCMPDZ256rri:
2524  case X86::VPCMPUDZ256rri:
2525  case X86::VPCMPDZrri:
2526  case X86::VPCMPUDZrri:
2527  case X86::VPCMPQZ128rri:
2528  case X86::VPCMPUQZ128rri:
2529  case X86::VPCMPQZ256rri:
2530  case X86::VPCMPUQZ256rri:
2531  case X86::VPCMPQZrri:
2532  case X86::VPCMPUQZrri:
2533  case X86::VPCMPWZ128rri:
2534  case X86::VPCMPUWZ128rri:
2535  case X86::VPCMPWZ256rri:
2536  case X86::VPCMPUWZ256rri:
2537  case X86::VPCMPWZrri:
2538  case X86::VPCMPUWZrri:
2539  case X86::VPCMPBZ128rrik:
2540  case X86::VPCMPUBZ128rrik:
2541  case X86::VPCMPBZ256rrik:
2542  case X86::VPCMPUBZ256rrik:
2543  case X86::VPCMPBZrrik:
2544  case X86::VPCMPUBZrrik:
2545  case X86::VPCMPDZ128rrik:
2546  case X86::VPCMPUDZ128rrik:
2547  case X86::VPCMPDZ256rrik:
2548  case X86::VPCMPUDZ256rrik:
2549  case X86::VPCMPDZrrik:
2550  case X86::VPCMPUDZrrik:
2551  case X86::VPCMPQZ128rrik:
2552  case X86::VPCMPUQZ128rrik:
2553  case X86::VPCMPQZ256rrik:
2554  case X86::VPCMPUQZ256rrik:
2555  case X86::VPCMPQZrrik:
2556  case X86::VPCMPUQZrrik:
2557  case X86::VPCMPWZ128rrik:
2558  case X86::VPCMPUWZ128rrik:
2559  case X86::VPCMPWZ256rrik:
2560  case X86::VPCMPUWZ256rrik:
2561  case X86::VPCMPWZrrik:
2562  case X86::VPCMPUWZrrik:
2563    WorkingMI = CloneIfNew(MI);
2564    // Flip comparison mode immediate (if necessary).
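    // E.g. LT becomes NLE (x < y is the same as y > x) and LE becomes NLT;
    // EQ, NE, FALSE and TRUE are symmetric under an operand swap and are left
    // unchanged.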
2565    WorkingMI->getOperand(MI.getNumOperands() - 1)
2566        .setImm(X86::getSwappedVPCMPImm(
2567            MI.getOperand(MI.getNumOperands() - 1).getImm() & 0x7));
2568    break;
2569  case X86::VPCOMBri:
2570  case X86::VPCOMUBri:
2571  case X86::VPCOMDri:
2572  case X86::VPCOMUDri:
2573  case X86::VPCOMQri:
2574  case X86::VPCOMUQri:
2575  case X86::VPCOMWri:
2576  case X86::VPCOMUWri:
2577    WorkingMI = CloneIfNew(MI);
2578    // Flip comparison mode immediate (if necessary).
2579    WorkingMI->getOperand(3).setImm(
2580        X86::getSwappedVPCOMImm(MI.getOperand(3).getImm() & 0x7));
2581    break;
2582  case X86::VCMPSDZrr:
2583  case X86::VCMPSSZrr:
2584  case X86::VCMPPDZrri:
2585  case X86::VCMPPSZrri:
2586  case X86::VCMPSHZrr:
2587  case X86::VCMPPHZrri:
2588  case X86::VCMPPHZ128rri:
2589  case X86::VCMPPHZ256rri:
2590  case X86::VCMPPDZ128rri:
2591  case X86::VCMPPSZ128rri:
2592  case X86::VCMPPDZ256rri:
2593  case X86::VCMPPSZ256rri:
2594  case X86::VCMPPDZrrik:
2595  case X86::VCMPPSZrrik:
2596  case X86::VCMPPDZ128rrik:
2597  case X86::VCMPPSZ128rrik:
2598  case X86::VCMPPDZ256rrik:
2599  case X86::VCMPPSZ256rrik:
2600    WorkingMI = CloneIfNew(MI);
2601    WorkingMI->getOperand(MI.getNumExplicitOperands() - 1)
2602        .setImm(X86::getSwappedVCMPImm(
2603            MI.getOperand(MI.getNumExplicitOperands() - 1).getImm() & 0x1f));
2604    break;
2605  case X86::VPERM2F128rr:
2606  case X86::VPERM2I128rr:
2607    // Flip permute source immediate.
2608    // Imm & 0x02: lo = if set, select Op1.lo/hi else Op0.lo/hi.
2609    // Imm & 0x20: hi = if set, select Op1.lo/hi else Op0.lo/hi.
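    // E.g. imm 0x31 (lo = Op0.hi, hi = Op1.hi) becomes 0x13 once the sources
    // are swapped, which is exactly 0x31 ^ 0x22.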
2610    WorkingMI = CloneIfNew(MI);
2611    WorkingMI->getOperand(3).setImm((MI.getOperand(3).getImm() & 0xFF) ^ 0x22);
2612    break;
2613  case X86::MOVHLPSrr:
2614  case X86::UNPCKHPDrr:
2615  case X86::VMOVHLPSrr:
2616  case X86::VUNPCKHPDrr:
2617  case X86::VMOVHLPSZrr:
2618  case X86::VUNPCKHPDZ128rr:
2619    assert(Subtarget.hasSSE2() && "Commuting MOVHLP/UNPCKHPD requires SSE2!");
2620
2621    switch (Opc) {
2622    default:
2623      llvm_unreachable("Unreachable!");
2624    case X86::MOVHLPSrr:
2625      Opc = X86::UNPCKHPDrr;
2626      break;
2627    case X86::UNPCKHPDrr:
2628      Opc = X86::MOVHLPSrr;
2629      break;
2630    case X86::VMOVHLPSrr:
2631      Opc = X86::VUNPCKHPDrr;
2632      break;
2633    case X86::VUNPCKHPDrr:
2634      Opc = X86::VMOVHLPSrr;
2635      break;
2636    case X86::VMOVHLPSZrr:
2637      Opc = X86::VUNPCKHPDZ128rr;
2638      break;
2639    case X86::VUNPCKHPDZ128rr:
2640      Opc = X86::VMOVHLPSZrr;
2641      break;
2642    }
2643    WorkingMI = CloneIfNew(MI);
2644    WorkingMI->setDesc(get(Opc));
2645    break;
2646  case X86::CMOV16rr:
2647  case X86::CMOV32rr:
2648  case X86::CMOV64rr: {
2649    WorkingMI = CloneIfNew(MI);
2650    unsigned OpNo = MI.getDesc().getNumOperands() - 1;
2651    X86::CondCode CC = static_cast<X86::CondCode>(MI.getOperand(OpNo).getImm());
2652    WorkingMI->getOperand(OpNo).setImm(X86::GetOppositeBranchCondition(CC));
2653    break;
2654  }
2655  case X86::VPTERNLOGDZrri:
2656  case X86::VPTERNLOGDZrmi:
2657  case X86::VPTERNLOGDZ128rri:
2658  case X86::VPTERNLOGDZ128rmi:
2659  case X86::VPTERNLOGDZ256rri:
2660  case X86::VPTERNLOGDZ256rmi:
2661  case X86::VPTERNLOGQZrri:
2662  case X86::VPTERNLOGQZrmi:
2663  case X86::VPTERNLOGQZ128rri:
2664  case X86::VPTERNLOGQZ128rmi:
2665  case X86::VPTERNLOGQZ256rri:
2666  case X86::VPTERNLOGQZ256rmi:
2667  case X86::VPTERNLOGDZrrik:
2668  case X86::VPTERNLOGDZ128rrik:
2669  case X86::VPTERNLOGDZ256rrik:
2670  case X86::VPTERNLOGQZrrik:
2671  case X86::VPTERNLOGQZ128rrik:
2672  case X86::VPTERNLOGQZ256rrik:
2673  case X86::VPTERNLOGDZrrikz:
2674  case X86::VPTERNLOGDZrmikz:
2675  case X86::VPTERNLOGDZ128rrikz:
2676  case X86::VPTERNLOGDZ128rmikz:
2677  case X86::VPTERNLOGDZ256rrikz:
2678  case X86::VPTERNLOGDZ256rmikz:
2679  case X86::VPTERNLOGQZrrikz:
2680  case X86::VPTERNLOGQZrmikz:
2681  case X86::VPTERNLOGQZ128rrikz:
2682  case X86::VPTERNLOGQZ128rmikz:
2683  case X86::VPTERNLOGQZ256rrikz:
2684  case X86::VPTERNLOGQZ256rmikz:
2685  case X86::VPTERNLOGDZ128rmbi:
2686  case X86::VPTERNLOGDZ256rmbi:
2687  case X86::VPTERNLOGDZrmbi:
2688  case X86::VPTERNLOGQZ128rmbi:
2689  case X86::VPTERNLOGQZ256rmbi:
2690  case X86::VPTERNLOGQZrmbi:
2691  case X86::VPTERNLOGDZ128rmbikz:
2692  case X86::VPTERNLOGDZ256rmbikz:
2693  case X86::VPTERNLOGDZrmbikz:
2694  case X86::VPTERNLOGQZ128rmbikz:
2695  case X86::VPTERNLOGQZ256rmbikz:
2696  case X86::VPTERNLOGQZrmbikz: {
2697    WorkingMI = CloneIfNew(MI);
2698    commuteVPTERNLOG(*WorkingMI, OpIdx1, OpIdx2);
2699    break;
2700  }
2701  default:
2702    if (isCommutableVPERMV3Instruction(Opc)) {
2703      WorkingMI = CloneIfNew(MI);
2704      WorkingMI->setDesc(get(getCommutedVPERMV3Opcode(Opc)));
2705      break;
2706    }
2707
2708    if (auto *FMA3Group = getFMA3Group(Opc, MI.getDesc().TSFlags)) {
2709      WorkingMI = CloneIfNew(MI);
2710      WorkingMI->setDesc(
2711          get(getFMA3OpcodeToCommuteOperands(MI, OpIdx1, OpIdx2, *FMA3Group)));
2712      break;
2713    }
2714  }
2715  return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2);
2716}
2717
2718bool X86InstrInfo::findThreeSrcCommutedOpIndices(const MachineInstr &MI,
2719                                                 unsigned &SrcOpIdx1,
2720                                                 unsigned &SrcOpIdx2,
2721                                                 bool IsIntrinsic) const {
2722  uint64_t TSFlags = MI.getDesc().TSFlags;
2723
2724  unsigned FirstCommutableVecOp = 1;
2725  unsigned LastCommutableVecOp = 3;
2726  unsigned KMaskOp = -1U;
2727  if (X86II::isKMasked(TSFlags)) {
    // For k-zero-masked operations it is OK to commute the first vector
    // operand, unless this is an intrinsic instruction.
2730    // For regular k-masked operations a conservative choice is done as the
2731    // elements of the first vector operand, for which the corresponding bit
2732    // in the k-mask operand is set to 0, are copied to the result of the
2733    // instruction.
2734    // TODO/FIXME: The commute still may be legal if it is known that the
2735    // k-mask operand is set to either all ones or all zeroes.
2736    // It is also Ok to commute the 1st operand if all users of MI use only
2737    // the elements enabled by the k-mask operand. For example,
2738    //   v4 = VFMADD213PSZrk v1, k, v2, v3; // v1[i] = k[i] ? v2[i]*v1[i]+v3[i]
2739    //                                                     : v1[i];
2740    //   VMOVAPSZmrk <mem_addr>, k, v4; // this is the ONLY user of v4 ->
2741    //                                  // Ok, to commute v1 in FMADD213PSZrk.
2742
2743    // The k-mask operand has index = 2 for masked and zero-masked operations.
2744    KMaskOp = 2;
2745
2746    // The operand with index = 1 is used as a source for those elements for
2747    // which the corresponding bit in the k-mask is set to 0.
2748    if (X86II::isKMergeMasked(TSFlags) || IsIntrinsic)
2749      FirstCommutableVecOp = 3;
2750
2751    LastCommutableVecOp++;
2752  } else if (IsIntrinsic) {
2753    // Commuting the first operand of an intrinsic instruction isn't possible
2754    // unless we can prove that only the lowest element of the result is used.
2755    FirstCommutableVecOp = 2;
2756  }
2757
2758  if (isMem(MI, LastCommutableVecOp))
2759    LastCommutableVecOp--;
2760
  // Only operands in the range [FirstCommutableVecOp, LastCommutableVecOp]
  // (excluding the k-mask operand) are commutable. The value
  // 'CommuteAnyOperandIndex' is also valid here; it means that the operand is
  // not specified/fixed.
2764  if (SrcOpIdx1 != CommuteAnyOperandIndex &&
2765      (SrcOpIdx1 < FirstCommutableVecOp || SrcOpIdx1 > LastCommutableVecOp ||
2766       SrcOpIdx1 == KMaskOp))
2767    return false;
2768  if (SrcOpIdx2 != CommuteAnyOperandIndex &&
2769      (SrcOpIdx2 < FirstCommutableVecOp || SrcOpIdx2 > LastCommutableVecOp ||
2770       SrcOpIdx2 == KMaskOp))
2771    return false;
2772
2773  // Look for two different register operands assumed to be commutable
2774  // regardless of the FMA opcode. The FMA opcode is adjusted later.
2775  if (SrcOpIdx1 == CommuteAnyOperandIndex ||
2776      SrcOpIdx2 == CommuteAnyOperandIndex) {
2777    unsigned CommutableOpIdx2 = SrcOpIdx2;
2778
    // At least one of the operands to be commuted is not specified and this
    // method is free to choose appropriate commutable operands.
    if (SrcOpIdx1 == SrcOpIdx2)
      // Neither operand is fixed. By default, set one of the commutable
      // operands to the last register operand of the instruction.
      CommutableOpIdx2 = LastCommutableVecOp;
    else if (SrcOpIdx2 == CommuteAnyOperandIndex)
      // Only one of the operands is not fixed.
      CommutableOpIdx2 = SrcOpIdx1;
2788
2789    // CommutableOpIdx2 is well defined now. Let's choose another commutable
2790    // operand and assign its index to CommutableOpIdx1.
2791    Register Op2Reg = MI.getOperand(CommutableOpIdx2).getReg();
2792
2793    unsigned CommutableOpIdx1;
2794    for (CommutableOpIdx1 = LastCommutableVecOp;
2795         CommutableOpIdx1 >= FirstCommutableVecOp; CommutableOpIdx1--) {
2796      // Just ignore and skip the k-mask operand.
2797      if (CommutableOpIdx1 == KMaskOp)
2798        continue;
2799
      // The commuted operands must have different registers; otherwise the
      // commute transformation does not change anything and is useless.
2803      if (Op2Reg != MI.getOperand(CommutableOpIdx1).getReg())
2804        break;
2805    }
2806
2807    // No appropriate commutable operands were found.
2808    if (CommutableOpIdx1 < FirstCommutableVecOp)
2809      return false;
2810
    // Assign the found pair of commutable indices to SrcOpIdx1 and SrcOpIdx2
2812    // to return those values.
2813    if (!fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, CommutableOpIdx1,
2814                              CommutableOpIdx2))
2815      return false;
2816  }
2817
2818  return true;
2819}
2820
2821bool X86InstrInfo::findCommutedOpIndices(const MachineInstr &MI,
2822                                         unsigned &SrcOpIdx1,
2823                                         unsigned &SrcOpIdx2) const {
2824  const MCInstrDesc &Desc = MI.getDesc();
2825  if (!Desc.isCommutable())
2826    return false;
2827
2828  switch (MI.getOpcode()) {
2829  case X86::CMPSDrr:
2830  case X86::CMPSSrr:
2831  case X86::CMPPDrri:
2832  case X86::CMPPSrri:
2833  case X86::VCMPSDrr:
2834  case X86::VCMPSSrr:
2835  case X86::VCMPPDrri:
2836  case X86::VCMPPSrri:
2837  case X86::VCMPPDYrri:
2838  case X86::VCMPPSYrri:
2839  case X86::VCMPSDZrr:
2840  case X86::VCMPSSZrr:
2841  case X86::VCMPPDZrri:
2842  case X86::VCMPPSZrri:
2843  case X86::VCMPSHZrr:
2844  case X86::VCMPPHZrri:
2845  case X86::VCMPPHZ128rri:
2846  case X86::VCMPPHZ256rri:
2847  case X86::VCMPPDZ128rri:
2848  case X86::VCMPPSZ128rri:
2849  case X86::VCMPPDZ256rri:
2850  case X86::VCMPPSZ256rri:
2851  case X86::VCMPPDZrrik:
2852  case X86::VCMPPSZrrik:
2853  case X86::VCMPPDZ128rrik:
2854  case X86::VCMPPSZ128rrik:
2855  case X86::VCMPPDZ256rrik:
2856  case X86::VCMPPSZ256rrik: {
2857    unsigned OpOffset = X86II::isKMasked(Desc.TSFlags) ? 1 : 0;
2858
2859    // Float comparison can be safely commuted for
2860    // Ordered/Unordered/Equal/NotEqual tests
2861    unsigned Imm = MI.getOperand(3 + OpOffset).getImm() & 0x7;
2862    switch (Imm) {
2863    default:
2864      // EVEX versions can be commuted.
2865      if ((Desc.TSFlags & X86II::EncodingMask) == X86II::EVEX)
2866        break;
2867      return false;
2868    case 0x00: // EQUAL
2869    case 0x03: // UNORDERED
2870    case 0x04: // NOT EQUAL
2871    case 0x07: // ORDERED
2872      break;
2873    }
2874
2875    // The indices of the commutable operands are 1 and 2 (or 2 and 3
2876    // when masked).
2877    // Assign them to the returned operand indices here.
2878    return fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, 1 + OpOffset,
2879                                2 + OpOffset);
2880  }
2881  case X86::MOVSSrr:
    // X86::MOVSDrr is always commutable. MOVSS is only commutable if we can
    // form an SSE4.1 blend. We assume VMOVSSrr/VMOVSDrr are always commutable
    // since AVX implies SSE4.1.
2885    if (Subtarget.hasSSE41())
2886      return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
2887    return false;
2888  case X86::SHUFPDrri:
2889    // We can commute this to MOVSD.
2890    if (MI.getOperand(3).getImm() == 0x02)
2891      return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
2892    return false;
2893  case X86::MOVHLPSrr:
2894  case X86::UNPCKHPDrr:
2895  case X86::VMOVHLPSrr:
2896  case X86::VUNPCKHPDrr:
2897  case X86::VMOVHLPSZrr:
2898  case X86::VUNPCKHPDZ128rr:
2899    if (Subtarget.hasSSE2())
2900      return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
2901    return false;
2902  case X86::VPTERNLOGDZrri:
2903  case X86::VPTERNLOGDZrmi:
2904  case X86::VPTERNLOGDZ128rri:
2905  case X86::VPTERNLOGDZ128rmi:
2906  case X86::VPTERNLOGDZ256rri:
2907  case X86::VPTERNLOGDZ256rmi:
2908  case X86::VPTERNLOGQZrri:
2909  case X86::VPTERNLOGQZrmi:
2910  case X86::VPTERNLOGQZ128rri:
2911  case X86::VPTERNLOGQZ128rmi:
2912  case X86::VPTERNLOGQZ256rri:
2913  case X86::VPTERNLOGQZ256rmi:
2914  case X86::VPTERNLOGDZrrik:
2915  case X86::VPTERNLOGDZ128rrik:
2916  case X86::VPTERNLOGDZ256rrik:
2917  case X86::VPTERNLOGQZrrik:
2918  case X86::VPTERNLOGQZ128rrik:
2919  case X86::VPTERNLOGQZ256rrik:
2920  case X86::VPTERNLOGDZrrikz:
2921  case X86::VPTERNLOGDZrmikz:
2922  case X86::VPTERNLOGDZ128rrikz:
2923  case X86::VPTERNLOGDZ128rmikz:
2924  case X86::VPTERNLOGDZ256rrikz:
2925  case X86::VPTERNLOGDZ256rmikz:
2926  case X86::VPTERNLOGQZrrikz:
2927  case X86::VPTERNLOGQZrmikz:
2928  case X86::VPTERNLOGQZ128rrikz:
2929  case X86::VPTERNLOGQZ128rmikz:
2930  case X86::VPTERNLOGQZ256rrikz:
2931  case X86::VPTERNLOGQZ256rmikz:
2932  case X86::VPTERNLOGDZ128rmbi:
2933  case X86::VPTERNLOGDZ256rmbi:
2934  case X86::VPTERNLOGDZrmbi:
2935  case X86::VPTERNLOGQZ128rmbi:
2936  case X86::VPTERNLOGQZ256rmbi:
2937  case X86::VPTERNLOGQZrmbi:
2938  case X86::VPTERNLOGDZ128rmbikz:
2939  case X86::VPTERNLOGDZ256rmbikz:
2940  case X86::VPTERNLOGDZrmbikz:
2941  case X86::VPTERNLOGQZ128rmbikz:
2942  case X86::VPTERNLOGQZ256rmbikz:
2943  case X86::VPTERNLOGQZrmbikz:
2944    return findThreeSrcCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
2945  case X86::VPDPWSSDYrr:
2946  case X86::VPDPWSSDrr:
2947  case X86::VPDPWSSDSYrr:
2948  case X86::VPDPWSSDSrr:
2949  case X86::VPDPWUUDrr:
2950  case X86::VPDPWUUDYrr:
2951  case X86::VPDPWUUDSrr:
2952  case X86::VPDPWUUDSYrr:
2953  case X86::VPDPBSSDSrr:
2954  case X86::VPDPBSSDSYrr:
2955  case X86::VPDPBSSDrr:
2956  case X86::VPDPBSSDYrr:
2957  case X86::VPDPBUUDSrr:
2958  case X86::VPDPBUUDSYrr:
2959  case X86::VPDPBUUDrr:
2960  case X86::VPDPBUUDYrr:
2961  case X86::VPDPWSSDZ128r:
2962  case X86::VPDPWSSDZ128rk:
2963  case X86::VPDPWSSDZ128rkz:
2964  case X86::VPDPWSSDZ256r:
2965  case X86::VPDPWSSDZ256rk:
2966  case X86::VPDPWSSDZ256rkz:
2967  case X86::VPDPWSSDZr:
2968  case X86::VPDPWSSDZrk:
2969  case X86::VPDPWSSDZrkz:
2970  case X86::VPDPWSSDSZ128r:
2971  case X86::VPDPWSSDSZ128rk:
2972  case X86::VPDPWSSDSZ128rkz:
2973  case X86::VPDPWSSDSZ256r:
2974  case X86::VPDPWSSDSZ256rk:
2975  case X86::VPDPWSSDSZ256rkz:
2976  case X86::VPDPWSSDSZr:
2977  case X86::VPDPWSSDSZrk:
2978  case X86::VPDPWSSDSZrkz:
2979  case X86::VPMADD52HUQrr:
2980  case X86::VPMADD52HUQYrr:
2981  case X86::VPMADD52HUQZ128r:
2982  case X86::VPMADD52HUQZ128rk:
2983  case X86::VPMADD52HUQZ128rkz:
2984  case X86::VPMADD52HUQZ256r:
2985  case X86::VPMADD52HUQZ256rk:
2986  case X86::VPMADD52HUQZ256rkz:
2987  case X86::VPMADD52HUQZr:
2988  case X86::VPMADD52HUQZrk:
2989  case X86::VPMADD52HUQZrkz:
2990  case X86::VPMADD52LUQrr:
2991  case X86::VPMADD52LUQYrr:
2992  case X86::VPMADD52LUQZ128r:
2993  case X86::VPMADD52LUQZ128rk:
2994  case X86::VPMADD52LUQZ128rkz:
2995  case X86::VPMADD52LUQZ256r:
2996  case X86::VPMADD52LUQZ256rk:
2997  case X86::VPMADD52LUQZ256rkz:
2998  case X86::VPMADD52LUQZr:
2999  case X86::VPMADD52LUQZrk:
3000  case X86::VPMADD52LUQZrkz:
3001  case X86::VFMADDCPHZr:
3002  case X86::VFMADDCPHZrk:
3003  case X86::VFMADDCPHZrkz:
3004  case X86::VFMADDCPHZ128r:
3005  case X86::VFMADDCPHZ128rk:
3006  case X86::VFMADDCPHZ128rkz:
3007  case X86::VFMADDCPHZ256r:
3008  case X86::VFMADDCPHZ256rk:
3009  case X86::VFMADDCPHZ256rkz:
3010  case X86::VFMADDCSHZr:
3011  case X86::VFMADDCSHZrk:
3012  case X86::VFMADDCSHZrkz: {
3013    unsigned CommutableOpIdx1 = 2;
3014    unsigned CommutableOpIdx2 = 3;
3015    if (X86II::isKMasked(Desc.TSFlags)) {
3016      // Skip the mask register.
3017      ++CommutableOpIdx1;
3018      ++CommutableOpIdx2;
3019    }
3020    if (!fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, CommutableOpIdx1,
3021                              CommutableOpIdx2))
3022      return false;
3023    if (!MI.getOperand(SrcOpIdx1).isReg() || !MI.getOperand(SrcOpIdx2).isReg())
3024      // No idea.
3025      return false;
3026    return true;
3027  }
3028
3029  default:
3030    const X86InstrFMA3Group *FMA3Group =
3031        getFMA3Group(MI.getOpcode(), MI.getDesc().TSFlags);
3032    if (FMA3Group)
3033      return findThreeSrcCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2,
3034                                           FMA3Group->isIntrinsic());
3035
    // Handle masked instructions since we need to skip over the mask input
    // and the preserved input.
3038    if (X86II::isKMasked(Desc.TSFlags)) {
3039      // First assume that the first input is the mask operand and skip past it.
3040      unsigned CommutableOpIdx1 = Desc.getNumDefs() + 1;
3041      unsigned CommutableOpIdx2 = Desc.getNumDefs() + 2;
      // Check whether there is a tied input. If there isn't one, we only need
      // to skip the mask operand, which we did above.
3044      if ((MI.getDesc().getOperandConstraint(Desc.getNumDefs(),
3045                                             MCOI::TIED_TO) != -1)) {
3046        // If this is zero masking instruction with a tied operand, we need to
3047        // move the first index back to the first input since this must
3048        // be a 3 input instruction and we want the first two non-mask inputs.
3049        // Otherwise this is a 2 input instruction with a preserved input and
3050        // mask, so we need to move the indices to skip one more input.
3051        if (X86II::isKMergeMasked(Desc.TSFlags)) {
3052          ++CommutableOpIdx1;
3053          ++CommutableOpIdx2;
3054        } else {
3055          --CommutableOpIdx1;
3056        }
3057      }
3058
3059      if (!fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, CommutableOpIdx1,
3060                                CommutableOpIdx2))
3061        return false;
3062
3063      if (!MI.getOperand(SrcOpIdx1).isReg() ||
3064          !MI.getOperand(SrcOpIdx2).isReg())
3065        // No idea.
3066        return false;
3067      return true;
3068    }
3069
3070    return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
3071  }
3072  return false;
3073}
3074
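// Return true if the LEA computes a plain base + index sum: scale 1, zero
// displacement and no segment override, e.g.
//   %dst = LEA64r %base, 1, %index, 0, $noreg
// Such an LEA can later be rewritten as an ADD by X86FixupLEAs.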
3075static bool isConvertibleLEA(MachineInstr *MI) {
3076  unsigned Opcode = MI->getOpcode();
3077  if (Opcode != X86::LEA32r && Opcode != X86::LEA64r &&
3078      Opcode != X86::LEA64_32r)
3079    return false;
3080
3081  const MachineOperand &Scale = MI->getOperand(1 + X86::AddrScaleAmt);
3082  const MachineOperand &Disp = MI->getOperand(1 + X86::AddrDisp);
3083  const MachineOperand &Segment = MI->getOperand(1 + X86::AddrSegmentReg);
3084
3085  if (Segment.getReg() != 0 || !Disp.isImm() || Disp.getImm() != 0 ||
3086      Scale.getImm() > 1)
3087    return false;
3088
3089  return true;
3090}
3091
3092bool X86InstrInfo::hasCommutePreference(MachineInstr &MI, bool &Commute) const {
3093  // Currently we're only interested in the following sequence:
3094  //   r3 = lea r1, r2
3095  //   r5 = add r3, r4
3096  // Both r3 and r4 are killed in the add; we hope the add instruction has the
3097  // operand order
3098  //   r5 = add r4, r3
3099  // So later in X86FixupLEAs the lea instruction can be rewritten as add.
3100  unsigned Opcode = MI.getOpcode();
3101  if (Opcode != X86::ADD32rr && Opcode != X86::ADD64rr)
3102    return false;
3103
3104  const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
3105  Register Reg1 = MI.getOperand(1).getReg();
3106  Register Reg2 = MI.getOperand(2).getReg();
3107
3108  // Check if Reg1 comes from LEA in the same MBB.
3109  if (MachineInstr *Inst = MRI.getUniqueVRegDef(Reg1)) {
3110    if (isConvertibleLEA(Inst) && Inst->getParent() == MI.getParent()) {
3111      Commute = true;
3112      return true;
3113    }
3114  }
3115
3116  // Check if Reg2 comes from LEA in the same MBB.
3117  if (MachineInstr *Inst = MRI.getUniqueVRegDef(Reg2)) {
3118    if (isConvertibleLEA(Inst) && Inst->getParent() == MI.getParent()) {
3119      Commute = false;
3120      return true;
3121    }
3122  }
3123
3124  return false;
3125}
3126
3127int X86::getCondSrcNoFromDesc(const MCInstrDesc &MCID) {
3128  unsigned Opcode = MCID.getOpcode();
3129  if (!(X86::isJCC(Opcode) || X86::isSETCC(Opcode) || X86::isCMOVCC(Opcode)))
3130    return -1;
3131  // Assume that the condition code is always the last use operand.
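  // For example, JCC_1 is (<target>, <cond>), SETCCr is (<dst>, <cond>) and
  // CMOV32rr is (<dst>, <src1>, <src2>, <cond>).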
3132  unsigned NumUses = MCID.getNumOperands() - MCID.getNumDefs();
3133  return NumUses - 1;
3134}
3135
3136X86::CondCode X86::getCondFromMI(const MachineInstr &MI) {
3137  const MCInstrDesc &MCID = MI.getDesc();
3138  int CondNo = getCondSrcNoFromDesc(MCID);
3139  if (CondNo < 0)
3140    return X86::COND_INVALID;
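  // getCondSrcNoFromDesc() returns an index into the use operands, so add the
  // number of defs back in to index into the full operand list.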
3141  CondNo += MCID.getNumDefs();
3142  return static_cast<X86::CondCode>(MI.getOperand(CondNo).getImm());
3143}
3144
3145X86::CondCode X86::getCondFromBranch(const MachineInstr &MI) {
3146  return X86::isJCC(MI.getOpcode()) ? X86::getCondFromMI(MI)
3147                                    : X86::COND_INVALID;
3148}
3149
3150X86::CondCode X86::getCondFromSETCC(const MachineInstr &MI) {
3151  return X86::isSETCC(MI.getOpcode()) ? X86::getCondFromMI(MI)
3152                                      : X86::COND_INVALID;
3153}
3154
3155X86::CondCode X86::getCondFromCMov(const MachineInstr &MI) {
3156  return X86::isCMOVCC(MI.getOpcode()) ? X86::getCondFromMI(MI)
3157                                       : X86::COND_INVALID;
3158}
3159
3160/// Return the inverse of the specified condition,
3161/// e.g. turning COND_E to COND_NE.
3162X86::CondCode X86::GetOppositeBranchCondition(X86::CondCode CC) {
3163  switch (CC) {
3164  default:
3165    llvm_unreachable("Illegal condition code!");
3166  case X86::COND_E:
3167    return X86::COND_NE;
3168  case X86::COND_NE:
3169    return X86::COND_E;
3170  case X86::COND_L:
3171    return X86::COND_GE;
3172  case X86::COND_LE:
3173    return X86::COND_G;
3174  case X86::COND_G:
3175    return X86::COND_LE;
3176  case X86::COND_GE:
3177    return X86::COND_L;
3178  case X86::COND_B:
3179    return X86::COND_AE;
3180  case X86::COND_BE:
3181    return X86::COND_A;
3182  case X86::COND_A:
3183    return X86::COND_BE;
3184  case X86::COND_AE:
3185    return X86::COND_B;
3186  case X86::COND_S:
3187    return X86::COND_NS;
3188  case X86::COND_NS:
3189    return X86::COND_S;
3190  case X86::COND_P:
3191    return X86::COND_NP;
3192  case X86::COND_NP:
3193    return X86::COND_P;
3194  case X86::COND_O:
3195    return X86::COND_NO;
3196  case X86::COND_NO:
3197    return X86::COND_O;
3198  case X86::COND_NE_OR_P:
3199    return X86::COND_E_AND_NP;
3200  case X86::COND_E_AND_NP:
3201    return X86::COND_NE_OR_P;
3202  }
3203}
3204
3205/// Assuming the flags are set by MI(a,b), return the condition code if we
3206/// modify the instructions such that flags are set by MI(b,a).
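/// For example, with flags from CMP a, b the relation a < b is tested by
/// COND_L; with flags from CMP b, a the same relation is tested by COND_G.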
3207static X86::CondCode getSwappedCondition(X86::CondCode CC) {
3208  switch (CC) {
3209  default:
3210    return X86::COND_INVALID;
3211  case X86::COND_E:
3212    return X86::COND_E;
3213  case X86::COND_NE:
3214    return X86::COND_NE;
3215  case X86::COND_L:
3216    return X86::COND_G;
3217  case X86::COND_LE:
3218    return X86::COND_GE;
3219  case X86::COND_G:
3220    return X86::COND_L;
3221  case X86::COND_GE:
3222    return X86::COND_LE;
3223  case X86::COND_B:
3224    return X86::COND_A;
3225  case X86::COND_BE:
3226    return X86::COND_AE;
3227  case X86::COND_A:
3228    return X86::COND_B;
3229  case X86::COND_AE:
3230    return X86::COND_BE;
3231  }
3232}
3233
3234std::pair<X86::CondCode, bool>
3235X86::getX86ConditionCode(CmpInst::Predicate Predicate) {
3236  X86::CondCode CC = X86::COND_INVALID;
3237  bool NeedSwap = false;
3238  switch (Predicate) {
3239  default:
3240    break;
3241  // Floating-point Predicates
3242  case CmpInst::FCMP_UEQ:
3243    CC = X86::COND_E;
3244    break;
3245  case CmpInst::FCMP_OLT:
3246    NeedSwap = true;
3247    [[fallthrough]];
3248  case CmpInst::FCMP_OGT:
3249    CC = X86::COND_A;
3250    break;
3251  case CmpInst::FCMP_OLE:
3252    NeedSwap = true;
3253    [[fallthrough]];
3254  case CmpInst::FCMP_OGE:
3255    CC = X86::COND_AE;
3256    break;
3257  case CmpInst::FCMP_UGT:
3258    NeedSwap = true;
3259    [[fallthrough]];
3260  case CmpInst::FCMP_ULT:
3261    CC = X86::COND_B;
3262    break;
3263  case CmpInst::FCMP_UGE:
3264    NeedSwap = true;
3265    [[fallthrough]];
3266  case CmpInst::FCMP_ULE:
3267    CC = X86::COND_BE;
3268    break;
3269  case CmpInst::FCMP_ONE:
3270    CC = X86::COND_NE;
3271    break;
3272  case CmpInst::FCMP_UNO:
3273    CC = X86::COND_P;
3274    break;
3275  case CmpInst::FCMP_ORD:
3276    CC = X86::COND_NP;
3277    break;
3278  case CmpInst::FCMP_OEQ:
3279    [[fallthrough]];
3280  case CmpInst::FCMP_UNE:
3281    CC = X86::COND_INVALID;
3282    break;
3283
3284  // Integer Predicates
3285  case CmpInst::ICMP_EQ:
3286    CC = X86::COND_E;
3287    break;
3288  case CmpInst::ICMP_NE:
3289    CC = X86::COND_NE;
3290    break;
3291  case CmpInst::ICMP_UGT:
3292    CC = X86::COND_A;
3293    break;
3294  case CmpInst::ICMP_UGE:
3295    CC = X86::COND_AE;
3296    break;
3297  case CmpInst::ICMP_ULT:
3298    CC = X86::COND_B;
3299    break;
3300  case CmpInst::ICMP_ULE:
3301    CC = X86::COND_BE;
3302    break;
3303  case CmpInst::ICMP_SGT:
3304    CC = X86::COND_G;
3305    break;
3306  case CmpInst::ICMP_SGE:
3307    CC = X86::COND_GE;
3308    break;
3309  case CmpInst::ICMP_SLT:
3310    CC = X86::COND_L;
3311    break;
3312  case CmpInst::ICMP_SLE:
3313    CC = X86::COND_LE;
3314    break;
3315  }
3316
3317  return std::make_pair(CC, NeedSwap);
3318}
3319
3320/// Return a cmov opcode for the given register size in bytes, and operand type.
3321unsigned X86::getCMovOpcode(unsigned RegBytes, bool HasMemoryOperand) {
3322  switch (RegBytes) {
3323  default:
3324    llvm_unreachable("Illegal register size!");
3325  case 2:
3326    return HasMemoryOperand ? X86::CMOV16rm : X86::CMOV16rr;
3327  case 4:
3328    return HasMemoryOperand ? X86::CMOV32rm : X86::CMOV32rr;
3329  case 8:
3330    return HasMemoryOperand ? X86::CMOV64rm : X86::CMOV64rr;
3331  }
3332}
3333
3334/// Get the VPCMP immediate for the given condition.
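/// The AVX-512 VPCMP immediate encodes: 0 = EQ, 1 = LT, 2 = LE, 3 = FALSE,
/// 4 = NE, 5 = NLT (GE), 6 = NLE (GT), 7 = TRUE.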
3335unsigned X86::getVPCMPImmForCond(ISD::CondCode CC) {
3336  switch (CC) {
3337  default:
3338    llvm_unreachable("Unexpected SETCC condition");
3339  case ISD::SETNE:
3340    return 4;
3341  case ISD::SETEQ:
3342    return 0;
3343  case ISD::SETULT:
3344  case ISD::SETLT:
3345    return 1;
3346  case ISD::SETUGT:
3347  case ISD::SETGT:
3348    return 6;
3349  case ISD::SETUGE:
3350  case ISD::SETGE:
3351    return 5;
3352  case ISD::SETULE:
3353  case ISD::SETLE:
3354    return 2;
3355  }
3356}
3357
3358/// Get the VPCMP immediate if the operands are swapped.
3359unsigned X86::getSwappedVPCMPImm(unsigned Imm) {
3360  switch (Imm) {
3361  default:
3362    llvm_unreachable("Unreachable!");
3363  case 0x01:
3364    Imm = 0x06;
3365    break; // LT  -> NLE
3366  case 0x02:
3367    Imm = 0x05;
3368    break; // LE  -> NLT
3369  case 0x05:
3370    Imm = 0x02;
3371    break; // NLT -> LE
3372  case 0x06:
3373    Imm = 0x01;
3374    break;   // NLE -> LT
3375  case 0x00: // EQ
3376  case 0x03: // FALSE
3377  case 0x04: // NE
3378  case 0x07: // TRUE
3379    break;
3380  }
3381
3382  return Imm;
3383}
3384
3385/// Get the VPCOM immediate if the operands are swapped.
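/// The XOP VPCOM immediate encodes: 0 = LT, 1 = LE, 2 = GT, 3 = GE, 4 = EQ,
/// 5 = NE, 6 = FALSE, 7 = TRUE.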
3386unsigned X86::getSwappedVPCOMImm(unsigned Imm) {
3387  switch (Imm) {
3388  default:
3389    llvm_unreachable("Unreachable!");
3390  case 0x00:
3391    Imm = 0x02;
3392    break; // LT -> GT
3393  case 0x01:
3394    Imm = 0x03;
3395    break; // LE -> GE
3396  case 0x02:
3397    Imm = 0x00;
3398    break; // GT -> LT
3399  case 0x03:
3400    Imm = 0x01;
3401    break;   // GE -> LE
3402  case 0x04: // EQ
3403  case 0x05: // NE
3404  case 0x06: // FALSE
3405  case 0x07: // TRUE
3406    break;
3407  }
3408
3409  return Imm;
3410}
3411
3412/// Get the VCMP immediate if the operands are swapped.
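/// In the 5-bit (V)CMP predicate encoding, predicates with low bits 00 or 11
/// (the EQ/NE/ORD/UNORD/FALSE/TRUE variants) are symmetric under commutation,
/// while the LT/LE style predicates (low bits 01/10) map to their GT/GE
/// counterparts by toggling bits 3:0.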
3413unsigned X86::getSwappedVCMPImm(unsigned Imm) {
3414  // Only need the lower 2 bits to distinguish.
3415  switch (Imm & 0x3) {
3416  default:
3417    llvm_unreachable("Unreachable!");
3418  case 0x00:
3419  case 0x03:
3420    // EQ/NE/TRUE/FALSE/ORD/UNORD don't change immediate when commuted.
3421    break;
3422  case 0x01:
3423  case 0x02:
3424    // Need to toggle bits 3:0. Bit 4 stays the same.
3425    Imm ^= 0xf;
3426    break;
3427  }
3428
3429  return Imm;
3430}
3431
3432/// Return true if Reg is an X87 register.
3433static bool isX87Reg(unsigned Reg) {
3434  return (Reg == X86::FPCW || Reg == X86::FPSW ||
3435          (Reg >= X86::ST0 && Reg <= X86::ST7));
3436}
3437
3438/// Check if the instruction is an X87 instruction.
3439bool X86::isX87Instruction(MachineInstr &MI) {
3440  for (const MachineOperand &MO : MI.operands()) {
3441    if (!MO.isReg())
3442      continue;
3443    if (isX87Reg(MO.getReg()))
3444      return true;
3445  }
3446  return false;
3447}
3448
3449int X86::getFirstAddrOperandIdx(const MachineInstr &MI) {
3450  auto IsMemOp = [](const MCOperandInfo &OpInfo) {
3451    return OpInfo.OperandType == MCOI::OPERAND_MEMORY;
3452  };
3453
3454  const MCInstrDesc &Desc = MI.getDesc();
3455
3456  // Directly invoke the MC-layer routine for real (i.e., non-pseudo)
3457  // instructions (fast case).
3458  if (!X86II::isPseudo(Desc.TSFlags)) {
3459    int MemRefIdx = X86II::getMemoryOperandNo(Desc.TSFlags);
3460    if (MemRefIdx >= 0)
3461      return MemRefIdx + X86II::getOperandBias(Desc);
3462#ifdef EXPENSIVE_CHECKS
3463    assert(none_of(Desc.operands(), IsMemOp) &&
3464           "Got false negative from X86II::getMemoryOperandNo()!");
3465#endif
3466    return -1;
3467  }
3468
3469  // Otherwise, handle pseudo instructions by examining the type of their
3470  // operands (slow case). An instruction cannot have a memory reference if it
3471  // has fewer than AddrNumOperands (= 5) explicit operands.
3472  unsigned NumOps = Desc.getNumOperands();
3473  if (NumOps < X86::AddrNumOperands) {
3474#ifdef EXPENSIVE_CHECKS
3475    assert(none_of(Desc.operands(), IsMemOp) &&
3476           "Expected no operands to have OPERAND_MEMORY type!");
3477#endif
3478    return -1;
3479  }
3480
3481  // The first operand with type OPERAND_MEMORY indicates the start of a memory
3482  // reference. We expect the following AddrNumOperands - 1 operands to also have
3483  // OPERAND_MEMORY type.
3484  for (unsigned I = 0, E = NumOps - X86::AddrNumOperands; I != E; ++I) {
3485    if (IsMemOp(Desc.operands()[I])) {
3486#ifdef EXPENSIVE_CHECKS
3487      assert(std::all_of(Desc.operands().begin() + I,
3488                         Desc.operands().begin() + I + X86::AddrNumOperands,
3489                         IsMemOp) &&
3490             "Expected all five operands in the memory reference to have "
3491             "OPERAND_MEMORY type!");
3492#endif
3493      return I;
3494    }
3495  }
3496
3497  return -1;
3498}
3499
3500const Constant *X86::getConstantFromPool(const MachineInstr &MI,
3501                                         unsigned OpNo) {
3502  assert(MI.getNumOperands() >= (OpNo + X86::AddrNumOperands) &&
3503         "Unexpected number of operands!");
3504
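  // A constant-pool reference typically looks like
  //   $rip, 1, $noreg, %const.N, $noreg
  // i.e. there is no index register and the displacement is a constant-pool
  // index with zero offset.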
3505  const MachineOperand &Index = MI.getOperand(OpNo + X86::AddrIndexReg);
3506  if (!Index.isReg() || Index.getReg() != X86::NoRegister)
3507    return nullptr;
3508
3509  const MachineOperand &Disp = MI.getOperand(OpNo + X86::AddrDisp);
3510  if (!Disp.isCPI() || Disp.getOffset() != 0)
3511    return nullptr;
3512
3513  ArrayRef<MachineConstantPoolEntry> Constants =
3514      MI.getParent()->getParent()->getConstantPool()->getConstants();
3515  const MachineConstantPoolEntry &ConstantEntry = Constants[Disp.getIndex()];
3516
3517  // Bail if this is a machine constant pool entry; we won't be able to dig out
3518  // anything useful.
3519  if (ConstantEntry.isMachineConstantPoolEntry())
3520    return nullptr;
3521
3522  return ConstantEntry.Val.ConstVal;
3523}
3524
3525bool X86InstrInfo::isUnconditionalTailCall(const MachineInstr &MI) const {
3526  switch (MI.getOpcode()) {
3527  case X86::TCRETURNdi:
3528  case X86::TCRETURNri:
3529  case X86::TCRETURNmi:
3530  case X86::TCRETURNdi64:
3531  case X86::TCRETURNri64:
3532  case X86::TCRETURNmi64:
3533    return true;
3534  default:
3535    return false;
3536  }
3537}
3538
3539bool X86InstrInfo::canMakeTailCallConditional(
3540    SmallVectorImpl<MachineOperand> &BranchCond,
3541    const MachineInstr &TailCall) const {
3542
3543  const MachineFunction *MF = TailCall.getMF();
3544
3545  if (MF->getTarget().getCodeModel() == CodeModel::Kernel) {
3546    // The kernel patches thunk calls at runtime; these should never be conditional.
3547    const MachineOperand &Target = TailCall.getOperand(0);
3548    if (Target.isSymbol()) {
3549      StringRef Symbol(Target.getSymbolName());
3550      // This is currently only relevant to the r11/kernel indirect thunk.
3551      if (Symbol.equals("__x86_indirect_thunk_r11"))
3552        return false;
3553    }
3554  }
3555
3556  if (TailCall.getOpcode() != X86::TCRETURNdi &&
3557      TailCall.getOpcode() != X86::TCRETURNdi64) {
3558    // Only direct calls can be done with a conditional branch.
3559    return false;
3560  }
3561
3562  if (Subtarget.isTargetWin64() && MF->hasWinCFI()) {
3563    // Conditional tail calls confuse the Win64 unwinder.
3564    return false;
3565  }
3566
3567  assert(BranchCond.size() == 1);
3568  if (BranchCond[0].getImm() > X86::LAST_VALID_COND) {
3569    // Can't make a conditional tail call with this condition.
3570    return false;
3571  }
3572
3573  const X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
3574  if (X86FI->getTCReturnAddrDelta() != 0 ||
3575      TailCall.getOperand(1).getImm() != 0) {
3576    // A conditional tail call cannot do any stack adjustment.
3577    return false;
3578  }
3579
3580  return true;
3581}
3582
3583void X86InstrInfo::replaceBranchWithTailCall(
3584    MachineBasicBlock &MBB, SmallVectorImpl<MachineOperand> &BranchCond,
3585    const MachineInstr &TailCall) const {
3586  assert(canMakeTailCallConditional(BranchCond, TailCall));
3587
3588  MachineBasicBlock::iterator I = MBB.end();
3589  while (I != MBB.begin()) {
3590    --I;
3591    if (I->isDebugInstr())
3592      continue;
3593    if (!I->isBranch())
3594      assert(0 && "Can't find the branch to replace!");
3595
3596    X86::CondCode CC = X86::getCondFromBranch(*I);
3597    assert(BranchCond.size() == 1);
3598    if (CC != BranchCond[0].getImm())
3599      continue;
3600
3601    break;
3602  }
3603
3604  unsigned Opc = TailCall.getOpcode() == X86::TCRETURNdi ? X86::TCRETURNdicc
3605                                                         : X86::TCRETURNdi64cc;
3606
3607  auto MIB = BuildMI(MBB, I, MBB.findDebugLoc(I), get(Opc));
3608  MIB->addOperand(TailCall.getOperand(0)); // Destination.
3609  MIB.addImm(0);                           // Stack offset (not used).
3610  MIB->addOperand(BranchCond[0]);          // Condition.
3611  MIB.copyImplicitOps(TailCall);           // Regmask and (imp-used) parameters.
3612
3613  // Add implicit uses and defs of all live regs potentially clobbered by the
3614  // call. This way they still appear live across the call.
3615  LivePhysRegs LiveRegs(getRegisterInfo());
3616  LiveRegs.addLiveOuts(MBB);
3617  SmallVector<std::pair<MCPhysReg, const MachineOperand *>, 8> Clobbers;
3618  LiveRegs.stepForward(*MIB, Clobbers);
3619  for (const auto &C : Clobbers) {
3620    MIB.addReg(C.first, RegState::Implicit);
3621    MIB.addReg(C.first, RegState::Implicit | RegState::Define);
3622  }
3623
3624  I->eraseFromParent();
3625}
3626
3627// Given an MBB and its TBB, find the FBB which was a fallthrough MBB (it may
3628// not be a fallthrough MBB now due to layout changes). Return nullptr if the
3629// fallthrough MBB cannot be identified.
3630static MachineBasicBlock *getFallThroughMBB(MachineBasicBlock *MBB,
3631                                            MachineBasicBlock *TBB) {
3632  // Look for non-EHPad successors other than TBB. If we find exactly one, it
3633  // is the fallthrough MBB. If we find zero, then TBB is both the target MBB
3634  // and fallthrough MBB. If we find more than one, we cannot identify the
3635  // fallthrough MBB and should return nullptr.
3636  MachineBasicBlock *FallthroughBB = nullptr;
3637  for (MachineBasicBlock *Succ : MBB->successors()) {
3638    if (Succ->isEHPad() || (Succ == TBB && FallthroughBB))
3639      continue;
3640    // Return a nullptr if we found more than one fallthrough successor.
3641    if (FallthroughBB && FallthroughBB != TBB)
3642      return nullptr;
3643    FallthroughBB = Succ;
3644  }
3645  return FallthroughBB;
3646}
3647
3648bool X86InstrInfo::AnalyzeBranchImpl(
3649    MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB,
3650    SmallVectorImpl<MachineOperand> &Cond,
3651    SmallVectorImpl<MachineInstr *> &CondBranches, bool AllowModify) const {
3652
3653  // Start from the bottom of the block and work up, examining the
3654  // terminator instructions.
3655  MachineBasicBlock::iterator I = MBB.end();
3656  MachineBasicBlock::iterator UnCondBrIter = MBB.end();
3657  while (I != MBB.begin()) {
3658    --I;
3659    if (I->isDebugInstr())
3660      continue;
3661
3662    // Working from the bottom, when we see a non-terminator instruction, we're
3663    // done.
3664    if (!isUnpredicatedTerminator(*I))
3665      break;
3666
3667    // A terminator that isn't a branch can't easily be handled by this
3668    // analysis.
3669    if (!I->isBranch())
3670      return true;
3671
3672    // Handle unconditional branches.
3673    if (I->getOpcode() == X86::JMP_1) {
3674      UnCondBrIter = I;
3675
3676      if (!AllowModify) {
3677        TBB = I->getOperand(0).getMBB();
3678        continue;
3679      }
3680
3681      // If the block has any instructions after a JMP, delete them.
3682      MBB.erase(std::next(I), MBB.end());
3683
3684      Cond.clear();
3685      FBB = nullptr;
3686
3687      // Delete the JMP if it's equivalent to a fall-through.
3688      if (MBB.isLayoutSuccessor(I->getOperand(0).getMBB())) {
3689        TBB = nullptr;
3690        I->eraseFromParent();
3691        I = MBB.end();
3692        UnCondBrIter = MBB.end();
3693        continue;
3694      }
3695
3696      // TBB is used to indicate the unconditional destination.
3697      TBB = I->getOperand(0).getMBB();
3698      continue;
3699    }
3700
3701    // Handle conditional branches.
3702    X86::CondCode BranchCode = X86::getCondFromBranch(*I);
3703    if (BranchCode == X86::COND_INVALID)
3704      return true; // Can't handle indirect branch.
3705
3706    // In practice we should never have an undef EFLAGS operand; if we do,
3707    // abort here as we are not prepared to preserve the flags.
3708    if (I->findRegisterUseOperand(X86::EFLAGS)->isUndef())
3709      return true;
3710
3711    // Working from the bottom, handle the first conditional branch.
3712    if (Cond.empty()) {
3713      FBB = TBB;
3714      TBB = I->getOperand(0).getMBB();
3715      Cond.push_back(MachineOperand::CreateImm(BranchCode));
3716      CondBranches.push_back(&*I);
3717      continue;
3718    }
3719
3720    // Handle subsequent conditional branches. Only handle the case where all
3721    // conditional branches branch to the same destination and their condition
3722    // opcodes fit one of the special multi-branch idioms.
3723    assert(Cond.size() == 1);
3724    assert(TBB);
3725
3726    // If the conditions are the same, we can leave them alone.
3727    X86::CondCode OldBranchCode = (X86::CondCode)Cond[0].getImm();
3728    auto NewTBB = I->getOperand(0).getMBB();
3729    if (OldBranchCode == BranchCode && TBB == NewTBB)
3730      continue;
3731
3732    // If they differ, see if they fit one of the known patterns. Theoretically,
3733    // we could handle more patterns here, but we shouldn't expect to see them
3734    // if instruction selection has done a reasonable job.
3735    if (TBB == NewTBB &&
3736        ((OldBranchCode == X86::COND_P && BranchCode == X86::COND_NE) ||
3737         (OldBranchCode == X86::COND_NE && BranchCode == X86::COND_P))) {
3738      BranchCode = X86::COND_NE_OR_P;
3739    } else if ((OldBranchCode == X86::COND_NP && BranchCode == X86::COND_NE) ||
3740               (OldBranchCode == X86::COND_E && BranchCode == X86::COND_P)) {
3741      if (NewTBB != (FBB ? FBB : getFallThroughMBB(&MBB, TBB)))
3742        return true;
3743
3744      // X86::COND_E_AND_NP usually has two different branch destinations.
3745      //
3746      // JP B1
3747      // JE B2
3748      // JMP B1
3749      // B1:
3750      // B2:
3751      //
3752      // Here this condition branches to B2 only if NP && E. It has another
3753      // equivalent form:
3754      //
3755      // JNE B1
3756      // JNP B2
3757      // JMP B1
3758      // B1:
3759      // B2:
3760      //
3761      // Similarly it branches to B2 only if E && NP. That is why this condition
3762      // is named with COND_E_AND_NP.
3763      BranchCode = X86::COND_E_AND_NP;
3764    } else
3765      return true;
3766
3767    // Update the MachineOperand.
3768    Cond[0].setImm(BranchCode);
3769    CondBranches.push_back(&*I);
3770  }
3771
3772  return false;
3773}
3774
3775bool X86InstrInfo::analyzeBranch(MachineBasicBlock &MBB,
3776                                 MachineBasicBlock *&TBB,
3777                                 MachineBasicBlock *&FBB,
3778                                 SmallVectorImpl<MachineOperand> &Cond,
3779                                 bool AllowModify) const {
3780  SmallVector<MachineInstr *, 4> CondBranches;
3781  return AnalyzeBranchImpl(MBB, TBB, FBB, Cond, CondBranches, AllowModify);
3782}
3783
3784static int getJumpTableIndexFromAddr(const MachineInstr &MI) {
3785  const MCInstrDesc &Desc = MI.getDesc();
3786  int MemRefBegin = X86II::getMemoryOperandNo(Desc.TSFlags);
3787  assert(MemRefBegin >= 0 && "instr should have memory operand");
3788  MemRefBegin += X86II::getOperandBias(Desc);
3789
3790  const MachineOperand &MO = MI.getOperand(MemRefBegin + X86::AddrDisp);
3791  if (!MO.isJTI())
3792    return -1;
3793
3794  return MO.getIndex();
3795}
3796
3797static int getJumpTableIndexFromReg(const MachineRegisterInfo &MRI,
3798                                    Register Reg) {
3799  if (!Reg.isVirtual())
3800    return -1;
3801  MachineInstr *MI = MRI.getUniqueVRegDef(Reg);
3802  if (MI == nullptr)
3803    return -1;
3804  unsigned Opcode = MI->getOpcode();
3805  if (Opcode != X86::LEA64r && Opcode != X86::LEA32r)
3806    return -1;
3807  return getJumpTableIndexFromAddr(*MI);
3808}
3809
3810int X86InstrInfo::getJumpTableIndex(const MachineInstr &MI) const {
3811  unsigned Opcode = MI.getOpcode();
3812  // Switch-jump pattern for non-PIC code looks like:
3813  //   JMP64m $noreg, 8, %X, %jump-table.X, $noreg
3814  if (Opcode == X86::JMP64m || Opcode == X86::JMP32m) {
3815    return getJumpTableIndexFromAddr(MI);
3816  }
3817  // The pattern for PIC code looks like:
3818  //   %0 = LEA64r $rip, 1, $noreg, %jump-table.X
3819  //   %1 = MOVSX64rm32 %0, 4, XX, 0, $noreg
3820  //   %2 = ADD64rr %1, %0
3821  //   JMP64r %2
3822  if (Opcode == X86::JMP64r || Opcode == X86::JMP32r) {
3823    Register Reg = MI.getOperand(0).getReg();
3824    if (!Reg.isVirtual())
3825      return -1;
3826    const MachineFunction &MF = *MI.getParent()->getParent();
3827    const MachineRegisterInfo &MRI = MF.getRegInfo();
3828    MachineInstr *Add = MRI.getUniqueVRegDef(Reg);
3829    if (Add == nullptr)
3830      return -1;
3831    if (Add->getOpcode() != X86::ADD64rr && Add->getOpcode() != X86::ADD32rr)
3832      return -1;
3833    int JTI1 = getJumpTableIndexFromReg(MRI, Add->getOperand(1).getReg());
3834    if (JTI1 >= 0)
3835      return JTI1;
3836    int JTI2 = getJumpTableIndexFromReg(MRI, Add->getOperand(2).getReg());
3837    if (JTI2 >= 0)
3838      return JTI2;
3839  }
3840  return -1;
3841}
3842
3843bool X86InstrInfo::analyzeBranchPredicate(MachineBasicBlock &MBB,
3844                                          MachineBranchPredicate &MBP,
3845                                          bool AllowModify) const {
3846  using namespace std::placeholders;
3847
3848  SmallVector<MachineOperand, 4> Cond;
3849  SmallVector<MachineInstr *, 4> CondBranches;
3850  if (AnalyzeBranchImpl(MBB, MBP.TrueDest, MBP.FalseDest, Cond, CondBranches,
3851                        AllowModify))
3852    return true;
3853
3854  if (Cond.size() != 1)
3855    return true;
3856
3857  assert(MBP.TrueDest && "expected!");
3858
3859  if (!MBP.FalseDest)
3860    MBP.FalseDest = MBB.getNextNode();
3861
3862  const TargetRegisterInfo *TRI = &getRegisterInfo();
3863
3864  MachineInstr *ConditionDef = nullptr;
3865  bool SingleUseCondition = true;
3866
3867  for (MachineInstr &MI : llvm::drop_begin(llvm::reverse(MBB))) {
3868    if (MI.modifiesRegister(X86::EFLAGS, TRI)) {
3869      ConditionDef = &MI;
3870      break;
3871    }
3872
3873    if (MI.readsRegister(X86::EFLAGS, TRI))
3874      SingleUseCondition = false;
3875  }
3876
3877  if (!ConditionDef)
3878    return true;
3879
3880  if (SingleUseCondition) {
3881    for (auto *Succ : MBB.successors())
3882      if (Succ->isLiveIn(X86::EFLAGS))
3883        SingleUseCondition = false;
3884  }
3885
3886  MBP.ConditionDef = ConditionDef;
3887  MBP.SingleUseCondition = SingleUseCondition;
3888
3889  // Currently we only recognize the simple pattern:
3890  //
3891  //   test %reg, %reg
3892  //   je %label
3893  //
3894  const unsigned TestOpcode =
3895      Subtarget.is64Bit() ? X86::TEST64rr : X86::TEST32rr;
3896
3897  if (ConditionDef->getOpcode() == TestOpcode &&
3898      ConditionDef->getNumOperands() == 3 &&
3899      ConditionDef->getOperand(0).isIdenticalTo(ConditionDef->getOperand(1)) &&
3900      (Cond[0].getImm() == X86::COND_NE || Cond[0].getImm() == X86::COND_E)) {
3901    MBP.LHS = ConditionDef->getOperand(0);
3902    MBP.RHS = MachineOperand::CreateImm(0);
3903    MBP.Predicate = Cond[0].getImm() == X86::COND_NE
3904                        ? MachineBranchPredicate::PRED_NE
3905                        : MachineBranchPredicate::PRED_EQ;
3906    return false;
3907  }
3908
3909  return true;
3910}
3911
3912unsigned X86InstrInfo::removeBranch(MachineBasicBlock &MBB,
3913                                    int *BytesRemoved) const {
3914  assert(!BytesRemoved && "code size not handled");
3915
3916  MachineBasicBlock::iterator I = MBB.end();
3917  unsigned Count = 0;
3918
3919  while (I != MBB.begin()) {
3920    --I;
3921    if (I->isDebugInstr())
3922      continue;
3923    if (I->getOpcode() != X86::JMP_1 &&
3924        X86::getCondFromBranch(*I) == X86::COND_INVALID)
3925      break;
3926    // Remove the branch.
3927    I->eraseFromParent();
3928    I = MBB.end();
3929    ++Count;
3930  }
3931
3932  return Count;
3933}
3934
3935unsigned X86InstrInfo::insertBranch(MachineBasicBlock &MBB,
3936                                    MachineBasicBlock *TBB,
3937                                    MachineBasicBlock *FBB,
3938                                    ArrayRef<MachineOperand> Cond,
3939                                    const DebugLoc &DL, int *BytesAdded) const {
3940  // Shouldn't be a fall through.
3941  assert(TBB && "insertBranch must not be told to insert a fallthrough");
3942  assert((Cond.size() == 1 || Cond.size() == 0) &&
3943         "X86 branch conditions have one component!");
3944  assert(!BytesAdded && "code size not handled");
3945
3946  if (Cond.empty()) {
3947    // Unconditional branch?
3948    assert(!FBB && "Unconditional branch with multiple successors!");
3949    BuildMI(&MBB, DL, get(X86::JMP_1)).addMBB(TBB);
3950    return 1;
3951  }
3952
3953  // If FBB is null, it is implied to be a fall-through block.
3954  bool FallThru = FBB == nullptr;
3955
3956  // Conditional branch.
3957  unsigned Count = 0;
3958  X86::CondCode CC = (X86::CondCode)Cond[0].getImm();
3959  switch (CC) {
3960  case X86::COND_NE_OR_P:
3961    // Synthesize NE_OR_P with two branches.
3962    BuildMI(&MBB, DL, get(X86::JCC_1)).addMBB(TBB).addImm(X86::COND_NE);
3963    ++Count;
3964    BuildMI(&MBB, DL, get(X86::JCC_1)).addMBB(TBB).addImm(X86::COND_P);
3965    ++Count;
3966    break;
3967  case X86::COND_E_AND_NP:
3968    // Use the next block of MBB as FBB if it is null.
3969    if (FBB == nullptr) {
3970      FBB = getFallThroughMBB(&MBB, TBB);
3971      assert(FBB && "MBB cannot be the last block in function when the false "
3972                    "body is a fall-through.");
3973    }
3974    // Synthesize COND_E_AND_NP with two branches.
3975    BuildMI(&MBB, DL, get(X86::JCC_1)).addMBB(FBB).addImm(X86::COND_NE);
3976    ++Count;
3977    BuildMI(&MBB, DL, get(X86::JCC_1)).addMBB(TBB).addImm(X86::COND_NP);
3978    ++Count;
3979    break;
3980  default: {
3981    BuildMI(&MBB, DL, get(X86::JCC_1)).addMBB(TBB).addImm(CC);
3982    ++Count;
3983  }
3984  }
3985  if (!FallThru) {
3986    // Two-way Conditional branch. Insert the second branch.
3987    BuildMI(&MBB, DL, get(X86::JMP_1)).addMBB(FBB);
3988    ++Count;
3989  }
3990  return Count;
3991}
3992
3993bool X86InstrInfo::canInsertSelect(const MachineBasicBlock &MBB,
3994                                   ArrayRef<MachineOperand> Cond,
3995                                   Register DstReg, Register TrueReg,
3996                                   Register FalseReg, int &CondCycles,
3997                                   int &TrueCycles, int &FalseCycles) const {
3998  // Not all subtargets have cmov instructions.
3999  if (!Subtarget.canUseCMOV())
4000    return false;
4001  if (Cond.size() != 1)
4002    return false;
4003  // We cannot do the composite conditions, at least not in SSA form.
4004  if ((X86::CondCode)Cond[0].getImm() > X86::LAST_VALID_COND)
4005    return false;
4006
4007  // Check register classes.
4008  const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
4009  const TargetRegisterClass *RC =
4010      RI.getCommonSubClass(MRI.getRegClass(TrueReg), MRI.getRegClass(FalseReg));
4011  if (!RC)
4012    return false;
4013
4014  // We have cmov instructions for 16, 32, and 64 bit general purpose registers.
4015  if (X86::GR16RegClass.hasSubClassEq(RC) ||
4016      X86::GR32RegClass.hasSubClassEq(RC) ||
4017      X86::GR64RegClass.hasSubClassEq(RC)) {
4018    // This latency applies to Pentium M, Merom, Wolfdale, Nehalem, and Sandy
4019    // Bridge. Probably Ivy Bridge as well.
4020    CondCycles = 2;
4021    TrueCycles = 2;
4022    FalseCycles = 2;
4023    return true;
4024  }
4025
4026  // Can't do vectors.
4027  return false;
4028}
4029
4030void X86InstrInfo::insertSelect(MachineBasicBlock &MBB,
4031                                MachineBasicBlock::iterator I,
4032                                const DebugLoc &DL, Register DstReg,
4033                                ArrayRef<MachineOperand> Cond, Register TrueReg,
4034                                Register FalseReg) const {
4035  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
4036  const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo();
4037  const TargetRegisterClass &RC = *MRI.getRegClass(DstReg);
4038  assert(Cond.size() == 1 && "Invalid Cond array");
4039  unsigned Opc = X86::getCMovOpcode(TRI.getRegSizeInBits(RC) / 8,
4040                                    false /*HasMemoryOperand*/);
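  // CMOVcc keeps its first (tied) register source and writes the second one
  // when the condition is true, so FalseReg is passed first and TrueReg
  // second.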
4041  BuildMI(MBB, I, DL, get(Opc), DstReg)
4042      .addReg(FalseReg)
4043      .addReg(TrueReg)
4044      .addImm(Cond[0].getImm());
4045}
4046
4047/// Test if the given register is a physical h register.
4048static bool isHReg(unsigned Reg) {
4049  return X86::GR8_ABCD_HRegClass.contains(Reg);
4050}
4051
4052// Try to copy between VR128/VR64 and GR64 registers.
4053static unsigned CopyToFromAsymmetricReg(unsigned DestReg, unsigned SrcReg,
4054                                        const X86Subtarget &Subtarget) {
4055  bool HasAVX = Subtarget.hasAVX();
4056  bool HasAVX512 = Subtarget.hasAVX512();
4057  bool HasEGPR = Subtarget.hasEGPR();
4058
4059  // SrcReg(MaskReg) -> DestReg(GR64)
4060  // SrcReg(MaskReg) -> DestReg(GR32)
4061
4062  // All KMASK RegClasses hold the same k registers; any of them can be used
4063  // for this containment check.
4064  if (X86::VK16RegClass.contains(SrcReg)) {
4065    if (X86::GR64RegClass.contains(DestReg)) {
4066      assert(Subtarget.hasBWI());
4067      return HasEGPR ? X86::KMOVQrk_EVEX : X86::KMOVQrk;
4068    }
4069    if (X86::GR32RegClass.contains(DestReg))
4070      return Subtarget.hasBWI() ? (HasEGPR ? X86::KMOVDrk_EVEX : X86::KMOVDrk)
4071                                : (HasEGPR ? X86::KMOVWrk_EVEX : X86::KMOVWrk);
4072  }
4073
4074  // SrcReg(GR64) -> DestReg(MaskReg)
4075  // SrcReg(GR32) -> DestReg(MaskReg)
4076
4077  // All KMASK RegClasses hold the same k registers; any of them can be used
4078  // for this containment check.
4079  if (X86::VK16RegClass.contains(DestReg)) {
4080    if (X86::GR64RegClass.contains(SrcReg)) {
4081      assert(Subtarget.hasBWI());
4082      return HasEGPR ? X86::KMOVQkr_EVEX : X86::KMOVQkr;
4083    }
4084    if (X86::GR32RegClass.contains(SrcReg))
4085      return Subtarget.hasBWI() ? (HasEGPR ? X86::KMOVDkr_EVEX : X86::KMOVDkr)
4086                                : (HasEGPR ? X86::KMOVWkr_EVEX : X86::KMOVWkr);
4087  }
4088
4089  // SrcReg(VR128) -> DestReg(GR64)
4090  // SrcReg(VR64)  -> DestReg(GR64)
4091  // SrcReg(GR64)  -> DestReg(VR128)
4092  // SrcReg(GR64)  -> DestReg(VR64)
4093
4094  if (X86::GR64RegClass.contains(DestReg)) {
4095    if (X86::VR128XRegClass.contains(SrcReg))
4096      // Copy from a VR128 register to a GR64 register.
4097      return HasAVX512 ? X86::VMOVPQIto64Zrr
4098             : HasAVX  ? X86::VMOVPQIto64rr
4099                       : X86::MOVPQIto64rr;
4100    if (X86::VR64RegClass.contains(SrcReg))
4101      // Copy from a VR64 register to a GR64 register.
4102      return X86::MMX_MOVD64from64rr;
4103  } else if (X86::GR64RegClass.contains(SrcReg)) {
4104    // Copy from a GR64 register to a VR128 register.
4105    if (X86::VR128XRegClass.contains(DestReg))
4106      return HasAVX512 ? X86::VMOV64toPQIZrr
4107             : HasAVX  ? X86::VMOV64toPQIrr
4108                       : X86::MOV64toPQIrr;
4109    // Copy from a GR64 register to a VR64 register.
4110    if (X86::VR64RegClass.contains(DestReg))
4111      return X86::MMX_MOVD64to64rr;
4112  }
4113
4114  // SrcReg(VR128) -> DestReg(GR32)
4115  // SrcReg(GR32)  -> DestReg(VR128)
4116
4117  if (X86::GR32RegClass.contains(DestReg) &&
4118      X86::VR128XRegClass.contains(SrcReg))
4119    // Copy from a VR128 register to a GR32 register.
4120    return HasAVX512 ? X86::VMOVPDI2DIZrr
4121           : HasAVX  ? X86::VMOVPDI2DIrr
4122                     : X86::MOVPDI2DIrr;
4123
4124  if (X86::VR128XRegClass.contains(DestReg) &&
4125      X86::GR32RegClass.contains(SrcReg))
4126    // Copy from a GR32 register to a VR128 register.
4127    return HasAVX512 ? X86::VMOVDI2PDIZrr
4128           : HasAVX  ? X86::VMOVDI2PDIrr
4129                     : X86::MOVDI2PDIrr;
4130  return 0;
4131}
4132
4133void X86InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
4134                               MachineBasicBlock::iterator MI,
4135                               const DebugLoc &DL, MCRegister DestReg,
4136                               MCRegister SrcReg, bool KillSrc) const {
4137  // First deal with the normal symmetric copies.
4138  bool HasAVX = Subtarget.hasAVX();
4139  bool HasVLX = Subtarget.hasVLX();
4140  bool HasEGPR = Subtarget.hasEGPR();
4141  unsigned Opc = 0;
4142  if (X86::GR64RegClass.contains(DestReg, SrcReg))
4143    Opc = X86::MOV64rr;
4144  else if (X86::GR32RegClass.contains(DestReg, SrcReg))
4145    Opc = X86::MOV32rr;
4146  else if (X86::GR16RegClass.contains(DestReg, SrcReg))
4147    Opc = X86::MOV16rr;
4148  else if (X86::GR8RegClass.contains(DestReg, SrcReg)) {
4149    // Copying to or from a physical H register on x86-64 requires a NOREX
4150    // move.  Otherwise use a normal move.
4151    if ((isHReg(DestReg) || isHReg(SrcReg)) && Subtarget.is64Bit()) {
4152      Opc = X86::MOV8rr_NOREX;
4153      // Both operands must be encodable without an REX prefix.
4154      assert(X86::GR8_NOREXRegClass.contains(SrcReg, DestReg) &&
4155             "8-bit H register can not be copied outside GR8_NOREX");
4156    } else
4157      Opc = X86::MOV8rr;
4158  } else if (X86::VR64RegClass.contains(DestReg, SrcReg))
4159    Opc = X86::MMX_MOVQ64rr;
4160  else if (X86::VR128XRegClass.contains(DestReg, SrcReg)) {
4161    if (HasVLX)
4162      Opc = X86::VMOVAPSZ128rr;
4163    else if (X86::VR128RegClass.contains(DestReg, SrcReg))
4164      Opc = HasAVX ? X86::VMOVAPSrr : X86::MOVAPSrr;
4165    else {
4166      // If this is an extended register and we don't have VLX, we need to use
4167      // a 512-bit move.
4168      Opc = X86::VMOVAPSZrr;
4169      const TargetRegisterInfo *TRI = &getRegisterInfo();
4170      DestReg =
4171          TRI->getMatchingSuperReg(DestReg, X86::sub_xmm, &X86::VR512RegClass);
4172      SrcReg =
4173          TRI->getMatchingSuperReg(SrcReg, X86::sub_xmm, &X86::VR512RegClass);
4174    }
4175  } else if (X86::VR256XRegClass.contains(DestReg, SrcReg)) {
4176    if (HasVLX)
4177      Opc = X86::VMOVAPSZ256rr;
4178    else if (X86::VR256RegClass.contains(DestReg, SrcReg))
4179      Opc = X86::VMOVAPSYrr;
4180    else {
4181      // If this is an extended register and we don't have VLX, we need to use
4182      // a 512-bit move.
4183      Opc = X86::VMOVAPSZrr;
4184      const TargetRegisterInfo *TRI = &getRegisterInfo();
4185      DestReg =
4186          TRI->getMatchingSuperReg(DestReg, X86::sub_ymm, &X86::VR512RegClass);
4187      SrcReg =
4188          TRI->getMatchingSuperReg(SrcReg, X86::sub_ymm, &X86::VR512RegClass);
4189    }
4190  } else if (X86::VR512RegClass.contains(DestReg, SrcReg))
4191    Opc = X86::VMOVAPSZrr;
4192  // All KMASK RegClasses hold the same k registers; any of them can be used
4193  // for this containment check.
4194  else if (X86::VK16RegClass.contains(DestReg, SrcReg))
4195    Opc = Subtarget.hasBWI() ? (HasEGPR ? X86::KMOVQkk_EVEX : X86::KMOVQkk)
4196                             : (HasEGPR ? X86::KMOVWkk_EVEX : X86::KMOVWkk);
4197  if (!Opc)
4198    Opc = CopyToFromAsymmetricReg(DestReg, SrcReg, Subtarget);
4199
4200  if (Opc) {
4201    BuildMI(MBB, MI, DL, get(Opc), DestReg)
4202        .addReg(SrcReg, getKillRegState(KillSrc));
4203    return;
4204  }
4205
4206  if (SrcReg == X86::EFLAGS || DestReg == X86::EFLAGS) {
4207    // FIXME: We use a fatal error here because historically LLVM has tried to
4208    // lower some of these physreg copies and we want to ensure we get
4209    // reasonable bug reports if someone encounters a case no other testing
4210    // found. This path should be removed after the LLVM 7 release.
4211    report_fatal_error("Unable to copy EFLAGS physical register!");
4212  }
4213
4214  LLVM_DEBUG(dbgs() << "Cannot copy " << RI.getName(SrcReg) << " to "
4215                    << RI.getName(DestReg) << '\n');
4216  report_fatal_error("Cannot emit physreg copy instruction");
4217}
4218
4219std::optional<DestSourcePair>
4220X86InstrInfo::isCopyInstrImpl(const MachineInstr &MI) const {
4221  if (MI.isMoveReg()) {
4222    // FIXME: Dirty hack for apparent invariant that doesn't hold when
4223    // subreg_to_reg is coalesced with ordinary copies, such that the bits that
4224    // were asserted as 0 are now undef.
4225    if (MI.getOperand(0).isUndef() && MI.getOperand(0).getSubReg())
4226      return std::nullopt;
4227
4228    return DestSourcePair{MI.getOperand(0), MI.getOperand(1)};
4229  }
4230  return std::nullopt;
4231}
4232
4233static unsigned getLoadStoreOpcodeForFP16(bool Load, const X86Subtarget &STI) {
4234  if (STI.hasFP16())
4235    return Load ? X86::VMOVSHZrm_alt : X86::VMOVSHZmr;
4236  if (Load)
4237    return STI.hasAVX512() ? X86::VMOVSSZrm
4238           : STI.hasAVX()  ? X86::VMOVSSrm
4239                           : X86::MOVSSrm;
4240  else
4241    return STI.hasAVX512() ? X86::VMOVSSZmr
4242           : STI.hasAVX()  ? X86::VMOVSSmr
4243                           : X86::MOVSSmr;
4244}
4245
4246static unsigned getLoadStoreRegOpcode(Register Reg,
4247                                      const TargetRegisterClass *RC,
4248                                      bool IsStackAligned,
4249                                      const X86Subtarget &STI, bool Load) {
4250  bool HasAVX = STI.hasAVX();
4251  bool HasAVX512 = STI.hasAVX512();
4252  bool HasVLX = STI.hasVLX();
4253  bool HasEGPR = STI.hasEGPR();
4254
4255  assert(RC != nullptr && "Invalid target register class");
4256  switch (STI.getRegisterInfo()->getSpillSize(*RC)) {
4257  default:
4258    llvm_unreachable("Unknown spill size");
4259  case 1:
4260    assert(X86::GR8RegClass.hasSubClassEq(RC) && "Unknown 1-byte regclass");
4261    if (STI.is64Bit())
4262      // Copying to or from a physical H register on x86-64 requires a NOREX
4263      // move.  Otherwise use a normal move.
4264      if (isHReg(Reg) || X86::GR8_ABCD_HRegClass.hasSubClassEq(RC))
4265        return Load ? X86::MOV8rm_NOREX : X86::MOV8mr_NOREX;
4266    return Load ? X86::MOV8rm : X86::MOV8mr;
4267  case 2:
4268    if (X86::VK16RegClass.hasSubClassEq(RC))
4269      return Load ? (HasEGPR ? X86::KMOVWkm_EVEX : X86::KMOVWkm)
4270                  : (HasEGPR ? X86::KMOVWmk_EVEX : X86::KMOVWmk);
4271    assert(X86::GR16RegClass.hasSubClassEq(RC) && "Unknown 2-byte regclass");
4272    return Load ? X86::MOV16rm : X86::MOV16mr;
4273  case 4:
4274    if (X86::GR32RegClass.hasSubClassEq(RC))
4275      return Load ? X86::MOV32rm : X86::MOV32mr;
4276    if (X86::FR32XRegClass.hasSubClassEq(RC))
4277      return Load ? (HasAVX512 ? X86::VMOVSSZrm_alt
4278                     : HasAVX  ? X86::VMOVSSrm_alt
4279                               : X86::MOVSSrm_alt)
4280                  : (HasAVX512 ? X86::VMOVSSZmr
4281                     : HasAVX  ? X86::VMOVSSmr
4282                               : X86::MOVSSmr);
4283    if (X86::RFP32RegClass.hasSubClassEq(RC))
4284      return Load ? X86::LD_Fp32m : X86::ST_Fp32m;
4285    if (X86::VK32RegClass.hasSubClassEq(RC)) {
4286      assert(STI.hasBWI() && "KMOVD requires BWI");
4287      return Load ? (HasEGPR ? X86::KMOVDkm_EVEX : X86::KMOVDkm)
4288                  : (HasEGPR ? X86::KMOVDmk_EVEX : X86::KMOVDmk);
4289    }
4290    // All of these mask pair classes have the same spill size, so the same kind
4291    // of kmov instructions can be used with all of them.
4292    if (X86::VK1PAIRRegClass.hasSubClassEq(RC) ||
4293        X86::VK2PAIRRegClass.hasSubClassEq(RC) ||
4294        X86::VK4PAIRRegClass.hasSubClassEq(RC) ||
4295        X86::VK8PAIRRegClass.hasSubClassEq(RC) ||
4296        X86::VK16PAIRRegClass.hasSubClassEq(RC))
4297      return Load ? X86::MASKPAIR16LOAD : X86::MASKPAIR16STORE;
4298    if (X86::FR16RegClass.hasSubClassEq(RC) ||
4299        X86::FR16XRegClass.hasSubClassEq(RC))
4300      return getLoadStoreOpcodeForFP16(Load, STI);
4301    llvm_unreachable("Unknown 4-byte regclass");
4302  case 8:
4303    if (X86::GR64RegClass.hasSubClassEq(RC))
4304      return Load ? X86::MOV64rm : X86::MOV64mr;
4305    if (X86::FR64XRegClass.hasSubClassEq(RC))
4306      return Load ? (HasAVX512 ? X86::VMOVSDZrm_alt
4307                     : HasAVX  ? X86::VMOVSDrm_alt
4308                               : X86::MOVSDrm_alt)
4309                  : (HasAVX512 ? X86::VMOVSDZmr
4310                     : HasAVX  ? X86::VMOVSDmr
4311                               : X86::MOVSDmr);
4312    if (X86::VR64RegClass.hasSubClassEq(RC))
4313      return Load ? X86::MMX_MOVQ64rm : X86::MMX_MOVQ64mr;
4314    if (X86::RFP64RegClass.hasSubClassEq(RC))
4315      return Load ? X86::LD_Fp64m : X86::ST_Fp64m;
4316    if (X86::VK64RegClass.hasSubClassEq(RC)) {
4317      assert(STI.hasBWI() && "KMOVQ requires BWI");
4318      return Load ? (HasEGPR ? X86::KMOVQkm_EVEX : X86::KMOVQkm)
4319                  : (HasEGPR ? X86::KMOVQmk_EVEX : X86::KMOVQmk);
4320    }
4321    llvm_unreachable("Unknown 8-byte regclass");
4322  case 10:
4323    assert(X86::RFP80RegClass.hasSubClassEq(RC) && "Unknown 10-byte regclass");
4324    return Load ? X86::LD_Fp80m : X86::ST_FpP80m;
4325  case 16: {
4326    if (X86::VR128XRegClass.hasSubClassEq(RC)) {
4327      // If stack is realigned we can use aligned stores.
4328      if (IsStackAligned)
4329        return Load ? (HasVLX      ? X86::VMOVAPSZ128rm
4330                       : HasAVX512 ? X86::VMOVAPSZ128rm_NOVLX
4331                       : HasAVX    ? X86::VMOVAPSrm
4332                                   : X86::MOVAPSrm)
4333                    : (HasVLX      ? X86::VMOVAPSZ128mr
4334                       : HasAVX512 ? X86::VMOVAPSZ128mr_NOVLX
4335                       : HasAVX    ? X86::VMOVAPSmr
4336                                   : X86::MOVAPSmr);
4337      else
4338        return Load ? (HasVLX      ? X86::VMOVUPSZ128rm
4339                       : HasAVX512 ? X86::VMOVUPSZ128rm_NOVLX
4340                       : HasAVX    ? X86::VMOVUPSrm
4341                                   : X86::MOVUPSrm)
4342                    : (HasVLX      ? X86::VMOVUPSZ128mr
4343                       : HasAVX512 ? X86::VMOVUPSZ128mr_NOVLX
4344                       : HasAVX    ? X86::VMOVUPSmr
4345                                   : X86::MOVUPSmr);
4346    }
4347    llvm_unreachable("Unknown 16-byte regclass");
4348  }
4349  case 32:
4350    assert(X86::VR256XRegClass.hasSubClassEq(RC) && "Unknown 32-byte regclass");
4351    // If stack is realigned we can use aligned stores.
4352    if (IsStackAligned)
4353      return Load ? (HasVLX      ? X86::VMOVAPSZ256rm
4354                     : HasAVX512 ? X86::VMOVAPSZ256rm_NOVLX
4355                                 : X86::VMOVAPSYrm)
4356                  : (HasVLX      ? X86::VMOVAPSZ256mr
4357                     : HasAVX512 ? X86::VMOVAPSZ256mr_NOVLX
4358                                 : X86::VMOVAPSYmr);
4359    else
4360      return Load ? (HasVLX      ? X86::VMOVUPSZ256rm
4361                     : HasAVX512 ? X86::VMOVUPSZ256rm_NOVLX
4362                                 : X86::VMOVUPSYrm)
4363                  : (HasVLX      ? X86::VMOVUPSZ256mr
4364                     : HasAVX512 ? X86::VMOVUPSZ256mr_NOVLX
4365                                 : X86::VMOVUPSYmr);
4366  case 64:
4367    assert(X86::VR512RegClass.hasSubClassEq(RC) && "Unknown 64-byte regclass");
4368    assert(STI.hasAVX512() && "Using 512-bit register requires AVX512");
4369    if (IsStackAligned)
4370      return Load ? X86::VMOVAPSZrm : X86::VMOVAPSZmr;
4371    else
4372      return Load ? X86::VMOVUPSZrm : X86::VMOVUPSZmr;
4373  case 1024:
4374    assert(X86::TILERegClass.hasSubClassEq(RC) && "Unknown 1024-byte regclass");
4375    assert(STI.hasAMXTILE() && "Using 8*1024-bit register requires AMX-TILE");
4376#define GET_EGPR_IF_ENABLED(OPC) (STI.hasEGPR() ? OPC##_EVEX : OPC)
4377    return Load ? GET_EGPR_IF_ENABLED(X86::TILELOADD)
4378                : GET_EGPR_IF_ENABLED(X86::TILESTORED);
4379#undef GET_EGPR_IF_ENABLED
4380  }
4381}
4382
4383std::optional<ExtAddrMode>
4384X86InstrInfo::getAddrModeFromMemoryOp(const MachineInstr &MemI,
4385                                      const TargetRegisterInfo *TRI) const {
4386  const MCInstrDesc &Desc = MemI.getDesc();
4387  int MemRefBegin = X86II::getMemoryOperandNo(Desc.TSFlags);
4388  if (MemRefBegin < 0)
4389    return std::nullopt;
4390
4391  MemRefBegin += X86II::getOperandBias(Desc);
4392
4393  auto &BaseOp = MemI.getOperand(MemRefBegin + X86::AddrBaseReg);
4394  if (!BaseOp.isReg()) // Can be an MO_FrameIndex
4395    return std::nullopt;
4396
4397  const MachineOperand &DispMO = MemI.getOperand(MemRefBegin + X86::AddrDisp);
4398  // Displacement can be symbolic
4399  if (!DispMO.isImm())
4400    return std::nullopt;
4401
4402  ExtAddrMode AM;
4403  AM.BaseReg = BaseOp.getReg();
4404  AM.ScaledReg = MemI.getOperand(MemRefBegin + X86::AddrIndexReg).getReg();
4405  AM.Scale = MemI.getOperand(MemRefBegin + X86::AddrScaleAmt).getImm();
4406  AM.Displacement = DispMO.getImm();
4407  return AM;
4408}
4409
4410bool X86InstrInfo::verifyInstruction(const MachineInstr &MI,
4411                                     StringRef &ErrInfo) const {
4412  std::optional<ExtAddrMode> AMOrNone = getAddrModeFromMemoryOp(MI, nullptr);
4413  if (!AMOrNone)
4414    return true;
4415
4416  ExtAddrMode AM = *AMOrNone;
4417  assert(AM.Form == ExtAddrMode::Formula::Basic);
4418  if (AM.ScaledReg != X86::NoRegister) {
4419    switch (AM.Scale) {
4420    case 1:
4421    case 2:
4422    case 4:
4423    case 8:
4424      break;
4425    default:
4426      ErrInfo = "Scale factor in address must be 1, 2, 4 or 8";
4427      return false;
4428    }
4429  }
4430  if (!isInt<32>(AM.Displacement)) {
4431    ErrInfo = "Displacement in address must fit into 32-bit signed "
4432              "integer";
4433    return false;
4434  }
4435
4436  return true;
4437}
4438
4439bool X86InstrInfo::getConstValDefinedInReg(const MachineInstr &MI,
4440                                           const Register Reg,
4441                                           int64_t &ImmVal) const {
4442  Register MovReg = Reg;
4443  const MachineInstr *MovMI = &MI;
4444
4445  // Follow use-def for SUBREG_TO_REG to find the real move immediate
4446  // instruction. It is quite common for x86-64.
4447  if (MI.isSubregToReg()) {
4448    // We use the following pattern to set up a 64-bit immediate:
4449    //      %8:gr32 = MOV32r0 implicit-def dead $eflags
4450    //      %6:gr64 = SUBREG_TO_REG 0, killed %8:gr32, %subreg.sub_32bit
4451    if (!MI.getOperand(1).isImm())
4452      return false;
4453    unsigned FillBits = MI.getOperand(1).getImm();
4454    unsigned SubIdx = MI.getOperand(3).getImm();
4455    MovReg = MI.getOperand(2).getReg();
4456    if (SubIdx != X86::sub_32bit || FillBits != 0)
4457      return false;
4458    const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
4459    MovMI = MRI.getUniqueVRegDef(MovReg);
4460    if (!MovMI)
4461      return false;
4462  }
4463
4464  if (MovMI->getOpcode() == X86::MOV32r0 &&
4465      MovMI->getOperand(0).getReg() == MovReg) {
4466    ImmVal = 0;
4467    return true;
4468  }
4469
4470  if (MovMI->getOpcode() != X86::MOV32ri &&
4471      MovMI->getOpcode() != X86::MOV64ri &&
4472      MovMI->getOpcode() != X86::MOV32ri64 && MovMI->getOpcode() != X86::MOV8ri)
4473    return false;
4474  // Mov Src can be a global address.
4475  if (!MovMI->getOperand(1).isImm() || MovMI->getOperand(0).getReg() != MovReg)
4476    return false;
4477  ImmVal = MovMI->getOperand(1).getImm();
4478  return true;
4479}
4480
4481bool X86InstrInfo::preservesZeroValueInReg(
4482    const MachineInstr *MI, const Register NullValueReg,
4483    const TargetRegisterInfo *TRI) const {
4484  if (!MI->modifiesRegister(NullValueReg, TRI))
4485    return true;
4486  switch (MI->getOpcode()) {
4487  // Shifting a null value right/left in place still yields a null, i.e.
4488  // rax = shl rax, X.
4489  case X86::SHR64ri:
4490  case X86::SHR32ri:
4491  case X86::SHL64ri:
4492  case X86::SHL32ri:
4493    assert(MI->getOperand(0).isDef() && MI->getOperand(1).isUse() &&
4494           "expected for shift opcode!");
4495    return MI->getOperand(0).getReg() == NullValueReg &&
4496           MI->getOperand(1).getReg() == NullValueReg;
4497  // Zero extend of a sub-reg of NullValueReg into itself does not change the
4498  // null value.
4499  case X86::MOV32rr:
4500    return llvm::all_of(MI->operands(), [&](const MachineOperand &MO) {
4501      return TRI->isSubRegisterEq(NullValueReg, MO.getReg());
4502    });
4503  default:
4504    return false;
4505  }
4506  llvm_unreachable("Should be handled above!");
4507}
4508
4509bool X86InstrInfo::getMemOperandsWithOffsetWidth(
4510    const MachineInstr &MemOp, SmallVectorImpl<const MachineOperand *> &BaseOps,
4511    int64_t &Offset, bool &OffsetIsScalable, unsigned &Width,
4512    const TargetRegisterInfo *TRI) const {
4513  const MCInstrDesc &Desc = MemOp.getDesc();
4514  int MemRefBegin = X86II::getMemoryOperandNo(Desc.TSFlags);
4515  if (MemRefBegin < 0)
4516    return false;
4517
4518  MemRefBegin += X86II::getOperandBias(Desc);
4519
4520  const MachineOperand *BaseOp =
4521      &MemOp.getOperand(MemRefBegin + X86::AddrBaseReg);
4522  if (!BaseOp->isReg()) // Can be an MO_FrameIndex
4523    return false;
4524
4525  if (MemOp.getOperand(MemRefBegin + X86::AddrScaleAmt).getImm() != 1)
4526    return false;
4527
4528  if (MemOp.getOperand(MemRefBegin + X86::AddrIndexReg).getReg() !=
4529      X86::NoRegister)
4530    return false;
4531
4532  const MachineOperand &DispMO = MemOp.getOperand(MemRefBegin + X86::AddrDisp);
4533
4534  // Displacement can be symbolic
4535  if (!DispMO.isImm())
4536    return false;
4537
4538  Offset = DispMO.getImm();
4539
4540  if (!BaseOp->isReg())
4541    return false;
4542
4543  OffsetIsScalable = false;
4544  // FIXME: Relying on memoperands() may not be the right thing to do here. Check
4545  // with X86 maintainers, and fix it accordingly. For now, it is ok, since
4546  // there is no use of `Width` for X86 back-end at the moment.
4547  Width =
4548      !MemOp.memoperands_empty() ? MemOp.memoperands().front()->getSize() : 0;
4549  BaseOps.push_back(BaseOp);
4550  return true;
4551}
4552
4553static unsigned getStoreRegOpcode(Register SrcReg,
4554                                  const TargetRegisterClass *RC,
4555                                  bool IsStackAligned,
4556                                  const X86Subtarget &STI) {
4557  return getLoadStoreRegOpcode(SrcReg, RC, IsStackAligned, STI, false);
4558}
4559
4560static unsigned getLoadRegOpcode(Register DestReg,
4561                                 const TargetRegisterClass *RC,
4562                                 bool IsStackAligned, const X86Subtarget &STI) {
4563  return getLoadStoreRegOpcode(DestReg, RC, IsStackAligned, STI, true);
4564}
4565
4566static bool isAMXOpcode(unsigned Opc) {
4567  switch (Opc) {
4568  default:
4569    return false;
4570  case X86::TILELOADD:
4571  case X86::TILESTORED:
4572  case X86::TILELOADD_EVEX:
4573  case X86::TILESTORED_EVEX:
4574    return true;
4575  }
4576}
4577
4578void X86InstrInfo::loadStoreTileReg(MachineBasicBlock &MBB,
4579                                    MachineBasicBlock::iterator MI,
4580                                    unsigned Opc, Register Reg, int FrameIdx,
4581                                    bool isKill) const {
4582  switch (Opc) {
4583  default:
4584    llvm_unreachable("Unexpected special opcode!");
4585  case X86::TILESTORED:
4586  case X86::TILESTORED_EVEX: {
4587    // tilestored %tmm, (%sp, %idx)
4588    MachineRegisterInfo &RegInfo = MBB.getParent()->getRegInfo();
4589    Register VirtReg = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass);
4590    BuildMI(MBB, MI, DebugLoc(), get(X86::MOV64ri), VirtReg).addImm(64);
4591    MachineInstr *NewMI =
4592        addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc)), FrameIdx)
4593            .addReg(Reg, getKillRegState(isKill));
4594    MachineOperand &MO = NewMI->getOperand(X86::AddrIndexReg);
4595    MO.setReg(VirtReg);
4596    MO.setIsKill(true);
4597    break;
4598  }
4599  case X86::TILELOADD:
4600  case X86::TILELOADD_EVEX: {
4601    // tileloadd (%sp, %idx), %tmm
4602    MachineRegisterInfo &RegInfo = MBB.getParent()->getRegInfo();
4603    Register VirtReg = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass);
4604    BuildMI(MBB, MI, DebugLoc(), get(X86::MOV64ri), VirtReg).addImm(64);
4605    MachineInstr *NewMI = addFrameReference(
4606        BuildMI(MBB, MI, DebugLoc(), get(Opc), Reg), FrameIdx);
4607    MachineOperand &MO = NewMI->getOperand(1 + X86::AddrIndexReg);
4608    MO.setReg(VirtReg);
4609    MO.setIsKill(true);
4610    break;
4611  }
4612  }
4613}
4614
4615void X86InstrInfo::storeRegToStackSlot(
4616    MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg,
4617    bool isKill, int FrameIdx, const TargetRegisterClass *RC,
4618    const TargetRegisterInfo *TRI, Register VReg) const {
4619  const MachineFunction &MF = *MBB.getParent();
4620  const MachineFrameInfo &MFI = MF.getFrameInfo();
4621  assert(MFI.getObjectSize(FrameIdx) >= TRI->getSpillSize(*RC) &&
4622         "Stack slot too small for store");
4623
4624  unsigned Alignment = std::max<uint32_t>(TRI->getSpillSize(*RC), 16);
4625  bool isAligned =
4626      (Subtarget.getFrameLowering()->getStackAlign() >= Alignment) ||
4627      (RI.canRealignStack(MF) && !MFI.isFixedObjectIndex(FrameIdx));
4628
4629  unsigned Opc = getStoreRegOpcode(SrcReg, RC, isAligned, Subtarget);
4630  if (isAMXOpcode(Opc))
4631    loadStoreTileReg(MBB, MI, Opc, SrcReg, FrameIdx, isKill);
4632  else
4633    addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc)), FrameIdx)
4634        .addReg(SrcReg, getKillRegState(isKill));
4635}
4636
4637void X86InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
4638                                        MachineBasicBlock::iterator MI,
4639                                        Register DestReg, int FrameIdx,
4640                                        const TargetRegisterClass *RC,
4641                                        const TargetRegisterInfo *TRI,
4642                                        Register VReg) const {
4643  const MachineFunction &MF = *MBB.getParent();
4644  const MachineFrameInfo &MFI = MF.getFrameInfo();
4645  assert(MFI.getObjectSize(FrameIdx) >= TRI->getSpillSize(*RC) &&
4646         "Load size exceeds stack slot");
4647  unsigned Alignment = std::max<uint32_t>(TRI->getSpillSize(*RC), 16);
4648  bool isAligned =
4649      (Subtarget.getFrameLowering()->getStackAlign() >= Alignment) ||
4650      (RI.canRealignStack(MF) && !MFI.isFixedObjectIndex(FrameIdx));
4651
4652  unsigned Opc = getLoadRegOpcode(DestReg, RC, isAligned, Subtarget);
4653  if (isAMXOpcode(Opc))
4654    loadStoreTileReg(MBB, MI, Opc, DestReg, FrameIdx);
4655  else
4656    addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc), DestReg),
4657                      FrameIdx);
4658}
4659
4660bool X86InstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg,
4661                                  Register &SrcReg2, int64_t &CmpMask,
4662                                  int64_t &CmpValue) const {
4663  switch (MI.getOpcode()) {
4664  default:
4665    break;
4666  case X86::CMP64ri32:
4667  case X86::CMP32ri:
4668  case X86::CMP16ri:
4669  case X86::CMP8ri:
4670    SrcReg = MI.getOperand(0).getReg();
4671    SrcReg2 = 0;
4672    if (MI.getOperand(1).isImm()) {
4673      CmpMask = ~0;
4674      CmpValue = MI.getOperand(1).getImm();
4675    } else {
4676      CmpMask = CmpValue = 0;
4677    }
4678    return true;
  // A SUB can be used to perform a comparison.
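  // For example, "subl %esi, %edi" sets EFLAGS exactly as "cmpl %esi, %edi"
  // would; only the handling of the result differs.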
4680  case X86::SUB64rm:
4681  case X86::SUB32rm:
4682  case X86::SUB16rm:
4683  case X86::SUB8rm:
4684    SrcReg = MI.getOperand(1).getReg();
4685    SrcReg2 = 0;
4686    CmpMask = 0;
4687    CmpValue = 0;
4688    return true;
4689  case X86::SUB64rr:
4690  case X86::SUB32rr:
4691  case X86::SUB16rr:
4692  case X86::SUB8rr:
4693    SrcReg = MI.getOperand(1).getReg();
4694    SrcReg2 = MI.getOperand(2).getReg();
4695    CmpMask = 0;
4696    CmpValue = 0;
4697    return true;
4698  case X86::SUB64ri32:
4699  case X86::SUB32ri:
4700  case X86::SUB16ri:
4701  case X86::SUB8ri:
4702    SrcReg = MI.getOperand(1).getReg();
4703    SrcReg2 = 0;
4704    if (MI.getOperand(2).isImm()) {
4705      CmpMask = ~0;
4706      CmpValue = MI.getOperand(2).getImm();
4707    } else {
4708      CmpMask = CmpValue = 0;
4709    }
4710    return true;
4711  case X86::CMP64rr:
4712  case X86::CMP32rr:
4713  case X86::CMP16rr:
4714  case X86::CMP8rr:
4715    SrcReg = MI.getOperand(0).getReg();
4716    SrcReg2 = MI.getOperand(1).getReg();
4717    CmpMask = 0;
4718    CmpValue = 0;
4719    return true;
4720  case X86::TEST8rr:
4721  case X86::TEST16rr:
4722  case X86::TEST32rr:
4723  case X86::TEST64rr:
4724    SrcReg = MI.getOperand(0).getReg();
4725    if (MI.getOperand(1).getReg() != SrcReg)
4726      return false;
4727    // Compare against zero.
4728    SrcReg2 = 0;
4729    CmpMask = ~0;
4730    CmpValue = 0;
4731    return true;
4732  }
4733  return false;
4734}
4735
4736bool X86InstrInfo::isRedundantFlagInstr(const MachineInstr &FlagI,
4737                                        Register SrcReg, Register SrcReg2,
4738                                        int64_t ImmMask, int64_t ImmValue,
4739                                        const MachineInstr &OI, bool *IsSwapped,
4740                                        int64_t *ImmDelta) const {
4741  switch (OI.getOpcode()) {
4742  case X86::CMP64rr:
4743  case X86::CMP32rr:
4744  case X86::CMP16rr:
4745  case X86::CMP8rr:
4746  case X86::SUB64rr:
4747  case X86::SUB32rr:
4748  case X86::SUB16rr:
4749  case X86::SUB8rr: {
4750    Register OISrcReg;
4751    Register OISrcReg2;
4752    int64_t OIMask;
4753    int64_t OIValue;
4754    if (!analyzeCompare(OI, OISrcReg, OISrcReg2, OIMask, OIValue) ||
4755        OIMask != ImmMask || OIValue != ImmValue)
4756      return false;
4757    if (SrcReg == OISrcReg && SrcReg2 == OISrcReg2) {
4758      *IsSwapped = false;
4759      return true;
4760    }
4761    if (SrcReg == OISrcReg2 && SrcReg2 == OISrcReg) {
4762      *IsSwapped = true;
4763      return true;
4764    }
4765    return false;
4766  }
4767  case X86::CMP64ri32:
4768  case X86::CMP32ri:
4769  case X86::CMP16ri:
4770  case X86::CMP8ri:
4771  case X86::SUB64ri32:
4772  case X86::SUB32ri:
4773  case X86::SUB16ri:
4774  case X86::SUB8ri:
4775  case X86::TEST64rr:
4776  case X86::TEST32rr:
4777  case X86::TEST16rr:
4778  case X86::TEST8rr: {
4779    if (ImmMask != 0) {
4780      Register OISrcReg;
4781      Register OISrcReg2;
4782      int64_t OIMask;
4783      int64_t OIValue;
4784      if (analyzeCompare(OI, OISrcReg, OISrcReg2, OIMask, OIValue) &&
4785          SrcReg == OISrcReg && ImmMask == OIMask) {
4786        if (OIValue == ImmValue) {
4787          *ImmDelta = 0;
4788          return true;
4789        } else if (static_cast<uint64_t>(ImmValue) ==
4790                   static_cast<uint64_t>(OIValue) - 1) {
4791          *ImmDelta = -1;
4792          return true;
4793        } else if (static_cast<uint64_t>(ImmValue) ==
4794                   static_cast<uint64_t>(OIValue) + 1) {
4795          *ImmDelta = 1;
4796          return true;
4797        } else {
4798          return false;
4799        }
4800      }
4801    }
4802    return FlagI.isIdenticalTo(OI);
4803  }
4804  default:
4805    return false;
4806  }
4807}
4808
4809/// Check whether the definition can be converted
4810/// to remove a comparison against zero.
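/// For example, in
///   %eax = SUB32rr %eax, %ecx, implicit-def $eflags
///   TEST32rr %eax, %eax, implicit-def $eflags
/// the SUB already sets ZF and SF according to its result, so the TEST can
/// usually be removed (the caller verifies that CF and OF are not needed).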
4811inline static bool isDefConvertible(const MachineInstr &MI, bool &NoSignFlag,
4812                                    bool &ClearsOverflowFlag) {
4813  NoSignFlag = false;
4814  ClearsOverflowFlag = false;
4815
  // "ELF Handling for Thread-Local Storage" specifies that the x86-64 GOTTPOFF
  // and i386 GOTNTPOFF/INDNTPOFF relocations can convert an ADD to a LEA
  // during Initial Exec to Local Exec relaxation. In these cases, we must not
  // depend on the EFLAGS modification of the ADD actually happening in the
  // final binary.
4820  if (MI.getOpcode() == X86::ADD64rm || MI.getOpcode() == X86::ADD32rm) {
4821    unsigned Flags = MI.getOperand(5).getTargetFlags();
4822    if (Flags == X86II::MO_GOTTPOFF || Flags == X86II::MO_INDNTPOFF ||
4823        Flags == X86II::MO_GOTNTPOFF)
4824      return false;
4825  }
4826
4827  switch (MI.getOpcode()) {
4828  default:
4829    return false;
4830
4831  // The shift instructions only modify ZF if their shift count is non-zero.
4832  // N.B.: The processor truncates the shift count depending on the encoding.
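  // For example, a shift with an immediate count of zero leaves EFLAGS
  // untouched, so it cannot stand in for a compare against zero.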
4833  case X86::SAR8ri:
4834  case X86::SAR16ri:
4835  case X86::SAR32ri:
4836  case X86::SAR64ri:
4837  case X86::SHR8ri:
4838  case X86::SHR16ri:
4839  case X86::SHR32ri:
4840  case X86::SHR64ri:
4841    return getTruncatedShiftCount(MI, 2) != 0;
4842
  // Left shifts with a count of 1-3 can later be turned into LEA instructions,
  // which do not set flags, but only if their flags aren't used. Avoid making
  // the flags live here so that the LEA transformation stays possible.
4845  case X86::SHL8ri:
4846  case X86::SHL16ri:
4847  case X86::SHL32ri:
4848  case X86::SHL64ri: {
4849    unsigned ShAmt = getTruncatedShiftCount(MI, 2);
4850    if (isTruncatedShiftCountForLEA(ShAmt))
4851      return false;
4852    return ShAmt != 0;
4853  }
4854
4855  case X86::SHRD16rri8:
4856  case X86::SHRD32rri8:
4857  case X86::SHRD64rri8:
4858  case X86::SHLD16rri8:
4859  case X86::SHLD32rri8:
4860  case X86::SHLD64rri8:
4861    return getTruncatedShiftCount(MI, 3) != 0;
4862
4863  case X86::SUB64ri32:
4864  case X86::SUB32ri:
4865  case X86::SUB16ri:
4866  case X86::SUB8ri:
4867  case X86::SUB64rr:
4868  case X86::SUB32rr:
4869  case X86::SUB16rr:
4870  case X86::SUB8rr:
4871  case X86::SUB64rm:
4872  case X86::SUB32rm:
4873  case X86::SUB16rm:
4874  case X86::SUB8rm:
4875  case X86::DEC64r:
4876  case X86::DEC32r:
4877  case X86::DEC16r:
4878  case X86::DEC8r:
4879  case X86::ADD64ri32:
4880  case X86::ADD32ri:
4881  case X86::ADD16ri:
4882  case X86::ADD8ri:
4883  case X86::ADD64rr:
4884  case X86::ADD32rr:
4885  case X86::ADD16rr:
4886  case X86::ADD8rr:
4887  case X86::ADD64rm:
4888  case X86::ADD32rm:
4889  case X86::ADD16rm:
4890  case X86::ADD8rm:
4891  case X86::INC64r:
4892  case X86::INC32r:
4893  case X86::INC16r:
4894  case X86::INC8r:
4895  case X86::ADC64ri32:
4896  case X86::ADC32ri:
4897  case X86::ADC16ri:
4898  case X86::ADC8ri:
4899  case X86::ADC64rr:
4900  case X86::ADC32rr:
4901  case X86::ADC16rr:
4902  case X86::ADC8rr:
4903  case X86::ADC64rm:
4904  case X86::ADC32rm:
4905  case X86::ADC16rm:
4906  case X86::ADC8rm:
4907  case X86::SBB64ri32:
4908  case X86::SBB32ri:
4909  case X86::SBB16ri:
4910  case X86::SBB8ri:
4911  case X86::SBB64rr:
4912  case X86::SBB32rr:
4913  case X86::SBB16rr:
4914  case X86::SBB8rr:
4915  case X86::SBB64rm:
4916  case X86::SBB32rm:
4917  case X86::SBB16rm:
4918  case X86::SBB8rm:
4919  case X86::NEG8r:
4920  case X86::NEG16r:
4921  case X86::NEG32r:
4922  case X86::NEG64r:
4923  case X86::LZCNT16rr:
4924  case X86::LZCNT16rm:
4925  case X86::LZCNT32rr:
4926  case X86::LZCNT32rm:
4927  case X86::LZCNT64rr:
4928  case X86::LZCNT64rm:
4929  case X86::POPCNT16rr:
4930  case X86::POPCNT16rm:
4931  case X86::POPCNT32rr:
4932  case X86::POPCNT32rm:
4933  case X86::POPCNT64rr:
4934  case X86::POPCNT64rm:
4935  case X86::TZCNT16rr:
4936  case X86::TZCNT16rm:
4937  case X86::TZCNT32rr:
4938  case X86::TZCNT32rm:
4939  case X86::TZCNT64rr:
4940  case X86::TZCNT64rm:
4941    return true;
4942  case X86::AND64ri32:
4943  case X86::AND32ri:
4944  case X86::AND16ri:
4945  case X86::AND8ri:
4946  case X86::AND64rr:
4947  case X86::AND32rr:
4948  case X86::AND16rr:
4949  case X86::AND8rr:
4950  case X86::AND64rm:
4951  case X86::AND32rm:
4952  case X86::AND16rm:
4953  case X86::AND8rm:
4954  case X86::XOR64ri32:
4955  case X86::XOR32ri:
4956  case X86::XOR16ri:
4957  case X86::XOR8ri:
4958  case X86::XOR64rr:
4959  case X86::XOR32rr:
4960  case X86::XOR16rr:
4961  case X86::XOR8rr:
4962  case X86::XOR64rm:
4963  case X86::XOR32rm:
4964  case X86::XOR16rm:
4965  case X86::XOR8rm:
4966  case X86::OR64ri32:
4967  case X86::OR32ri:
4968  case X86::OR16ri:
4969  case X86::OR8ri:
4970  case X86::OR64rr:
4971  case X86::OR32rr:
4972  case X86::OR16rr:
4973  case X86::OR8rr:
4974  case X86::OR64rm:
4975  case X86::OR32rm:
4976  case X86::OR16rm:
4977  case X86::OR8rm:
4978  case X86::ANDN32rr:
4979  case X86::ANDN32rm:
4980  case X86::ANDN64rr:
4981  case X86::ANDN64rm:
4982  case X86::BLSI32rr:
4983  case X86::BLSI32rm:
4984  case X86::BLSI64rr:
4985  case X86::BLSI64rm:
4986  case X86::BLSMSK32rr:
4987  case X86::BLSMSK32rm:
4988  case X86::BLSMSK64rr:
4989  case X86::BLSMSK64rm:
4990  case X86::BLSR32rr:
4991  case X86::BLSR32rm:
4992  case X86::BLSR64rr:
4993  case X86::BLSR64rm:
4994  case X86::BLCFILL32rr:
4995  case X86::BLCFILL32rm:
4996  case X86::BLCFILL64rr:
4997  case X86::BLCFILL64rm:
4998  case X86::BLCI32rr:
4999  case X86::BLCI32rm:
5000  case X86::BLCI64rr:
5001  case X86::BLCI64rm:
5002  case X86::BLCIC32rr:
5003  case X86::BLCIC32rm:
5004  case X86::BLCIC64rr:
5005  case X86::BLCIC64rm:
5006  case X86::BLCMSK32rr:
5007  case X86::BLCMSK32rm:
5008  case X86::BLCMSK64rr:
5009  case X86::BLCMSK64rm:
5010  case X86::BLCS32rr:
5011  case X86::BLCS32rm:
5012  case X86::BLCS64rr:
5013  case X86::BLCS64rm:
5014  case X86::BLSFILL32rr:
5015  case X86::BLSFILL32rm:
5016  case X86::BLSFILL64rr:
5017  case X86::BLSFILL64rm:
5018  case X86::BLSIC32rr:
5019  case X86::BLSIC32rm:
5020  case X86::BLSIC64rr:
5021  case X86::BLSIC64rm:
5022  case X86::BZHI32rr:
5023  case X86::BZHI32rm:
5024  case X86::BZHI64rr:
5025  case X86::BZHI64rm:
5026  case X86::T1MSKC32rr:
5027  case X86::T1MSKC32rm:
5028  case X86::T1MSKC64rr:
5029  case X86::T1MSKC64rm:
5030  case X86::TZMSK32rr:
5031  case X86::TZMSK32rm:
5032  case X86::TZMSK64rr:
5033  case X86::TZMSK64rm:
5034    // These instructions clear the overflow flag just like TEST.
5035    // FIXME: These are not the only instructions in this switch that clear the
5036    // overflow flag.
5037    ClearsOverflowFlag = true;
5038    return true;
5039  case X86::BEXTR32rr:
5040  case X86::BEXTR64rr:
5041  case X86::BEXTR32rm:
5042  case X86::BEXTR64rm:
5043  case X86::BEXTRI32ri:
5044  case X86::BEXTRI32mi:
5045  case X86::BEXTRI64ri:
5046  case X86::BEXTRI64mi:
5047    // BEXTR doesn't update the sign flag so we can't use it. It does clear
5048    // the overflow flag, but that's not useful without the sign flag.
5049    NoSignFlag = true;
5050    return true;
5051  }
5052}
5053
5054/// Check whether the use can be converted to remove a comparison against zero.
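/// For example, POPCNT sets ZF exactly when its source is zero, so in
///   %ecx = POPCNT32rr %eax
///   TEST32rr %eax, %eax, implicit-def $eflags
///   JCC_1 ..., COND_E
/// the TEST can be removed and the JCC can keep using COND_E with the flags
/// produced by the POPCNT.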
5055static X86::CondCode isUseDefConvertible(const MachineInstr &MI) {
5056  switch (MI.getOpcode()) {
5057  default:
5058    return X86::COND_INVALID;
5059  case X86::NEG8r:
5060  case X86::NEG16r:
5061  case X86::NEG32r:
5062  case X86::NEG64r:
5063    return X86::COND_AE;
5064  case X86::LZCNT16rr:
5065  case X86::LZCNT32rr:
5066  case X86::LZCNT64rr:
5067    return X86::COND_B;
5068  case X86::POPCNT16rr:
5069  case X86::POPCNT32rr:
5070  case X86::POPCNT64rr:
5071    return X86::COND_E;
5072  case X86::TZCNT16rr:
5073  case X86::TZCNT32rr:
5074  case X86::TZCNT64rr:
5075    return X86::COND_B;
5076  case X86::BSF16rr:
5077  case X86::BSF32rr:
5078  case X86::BSF64rr:
5079  case X86::BSR16rr:
5080  case X86::BSR32rr:
5081  case X86::BSR64rr:
5082    return X86::COND_E;
5083  case X86::BLSI32rr:
5084  case X86::BLSI64rr:
5085    return X86::COND_AE;
5086  case X86::BLSR32rr:
5087  case X86::BLSR64rr:
5088  case X86::BLSMSK32rr:
5089  case X86::BLSMSK64rr:
5090    return X86::COND_B;
5091    // TODO: TBM instructions.
5092  }
5093}
5094
5095/// Check if there exists an earlier instruction that
5096/// operates on the same source operands and sets flags in the same way as
5097/// Compare; remove Compare if possible.
5098bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
5099                                        Register SrcReg2, int64_t CmpMask,
5100                                        int64_t CmpValue,
5101                                        const MachineRegisterInfo *MRI) const {
5102  // Check whether we can replace SUB with CMP.
5103  switch (CmpInstr.getOpcode()) {
5104  default:
5105    break;
5106  case X86::SUB64ri32:
5107  case X86::SUB32ri:
5108  case X86::SUB16ri:
5109  case X86::SUB8ri:
5110  case X86::SUB64rm:
5111  case X86::SUB32rm:
5112  case X86::SUB16rm:
5113  case X86::SUB8rm:
5114  case X86::SUB64rr:
5115  case X86::SUB32rr:
5116  case X86::SUB16rr:
5117  case X86::SUB8rr: {
5118    if (!MRI->use_nodbg_empty(CmpInstr.getOperand(0).getReg()))
5119      return false;
    // The destination register has no uses, so we can replace the SUB with a
    // CMP.
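    // For example:
    //   %2:gr32 = SUB32rr %0, %1, implicit-def $eflags   (%2 unused)
    // becomes
    //   CMP32rr %0, %1, implicit-def $eflags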
5121    unsigned NewOpcode = 0;
5122    switch (CmpInstr.getOpcode()) {
5123    default:
5124      llvm_unreachable("Unreachable!");
5125    case X86::SUB64rm:
5126      NewOpcode = X86::CMP64rm;
5127      break;
5128    case X86::SUB32rm:
5129      NewOpcode = X86::CMP32rm;
5130      break;
5131    case X86::SUB16rm:
5132      NewOpcode = X86::CMP16rm;
5133      break;
5134    case X86::SUB8rm:
5135      NewOpcode = X86::CMP8rm;
5136      break;
5137    case X86::SUB64rr:
5138      NewOpcode = X86::CMP64rr;
5139      break;
5140    case X86::SUB32rr:
5141      NewOpcode = X86::CMP32rr;
5142      break;
5143    case X86::SUB16rr:
5144      NewOpcode = X86::CMP16rr;
5145      break;
5146    case X86::SUB8rr:
5147      NewOpcode = X86::CMP8rr;
5148      break;
5149    case X86::SUB64ri32:
5150      NewOpcode = X86::CMP64ri32;
5151      break;
5152    case X86::SUB32ri:
5153      NewOpcode = X86::CMP32ri;
5154      break;
5155    case X86::SUB16ri:
5156      NewOpcode = X86::CMP16ri;
5157      break;
5158    case X86::SUB8ri:
5159      NewOpcode = X86::CMP8ri;
5160      break;
5161    }
5162    CmpInstr.setDesc(get(NewOpcode));
5163    CmpInstr.removeOperand(0);
5164    // Mutating this instruction invalidates any debug data associated with it.
5165    CmpInstr.dropDebugNumber();
5166    // Fall through to optimize Cmp if Cmp is CMPrr or CMPri.
5167    if (NewOpcode == X86::CMP64rm || NewOpcode == X86::CMP32rm ||
5168        NewOpcode == X86::CMP16rm || NewOpcode == X86::CMP8rm)
5169      return false;
5170  }
5171  }
5172
5173  // The following code tries to remove the comparison by re-using EFLAGS
5174  // from earlier instructions.
5175
5176  bool IsCmpZero = (CmpMask != 0 && CmpValue == 0);
5177
5178  // Transformation currently requires SSA values.
5179  if (SrcReg2.isPhysical())
5180    return false;
5181  MachineInstr *SrcRegDef = MRI->getVRegDef(SrcReg);
5182  assert(SrcRegDef && "Must have a definition (SSA)");
5183
5184  MachineInstr *MI = nullptr;
5185  MachineInstr *Sub = nullptr;
5186  MachineInstr *Movr0Inst = nullptr;
5187  bool NoSignFlag = false;
5188  bool ClearsOverflowFlag = false;
5189  bool ShouldUpdateCC = false;
5190  bool IsSwapped = false;
5191  X86::CondCode NewCC = X86::COND_INVALID;
5192  int64_t ImmDelta = 0;
5193
5194  // Search backward from CmpInstr for the next instruction defining EFLAGS.
5195  const TargetRegisterInfo *TRI = &getRegisterInfo();
5196  MachineBasicBlock &CmpMBB = *CmpInstr.getParent();
5197  MachineBasicBlock::reverse_iterator From =
5198      std::next(MachineBasicBlock::reverse_iterator(CmpInstr));
5199  for (MachineBasicBlock *MBB = &CmpMBB;;) {
5200    for (MachineInstr &Inst : make_range(From, MBB->rend())) {
5201      // Try to use EFLAGS from the instruction defining %SrcReg. Example:
5202      //     %eax = addl ...
5203      //     ...                // EFLAGS not changed
5204      //     testl %eax, %eax   // <-- can be removed
5205      if (&Inst == SrcRegDef) {
5206        if (IsCmpZero &&
5207            isDefConvertible(Inst, NoSignFlag, ClearsOverflowFlag)) {
5208          MI = &Inst;
5209          break;
5210        }
5211
5212        // Look back for the following pattern, in which case the
5213        // test16rr/test64rr instruction could be erased.
5214        //
5215        // Example for test16rr:
5216        //  %reg = and32ri %in_reg, 5
5217        //  ...                         // EFLAGS not changed.
5218        //  %src_reg = copy %reg.sub_16bit:gr32
5219        //  test16rr %src_reg, %src_reg, implicit-def $eflags
5220        // Example for test64rr:
5221        //  %reg = and32ri %in_reg, 5
5222        //  ...                         // EFLAGS not changed.
5223        //  %src_reg = subreg_to_reg 0, %reg, %subreg.sub_index
5224        //  test64rr %src_reg, %src_reg, implicit-def $eflags
5225        MachineInstr *AndInstr = nullptr;
5226        if (IsCmpZero &&
5227            findRedundantFlagInstr(CmpInstr, Inst, MRI, &AndInstr, TRI,
5228                                   NoSignFlag, ClearsOverflowFlag)) {
5229          assert(AndInstr != nullptr && X86::isAND(AndInstr->getOpcode()));
5230          MI = AndInstr;
5231          break;
5232        }
5233        // Cannot find other candidates before definition of SrcReg.
5234        return false;
5235      }
5236
5237      if (Inst.modifiesRegister(X86::EFLAGS, TRI)) {
5238        // Try to use EFLAGS produced by an instruction reading %SrcReg.
5239        // Example:
5240        //      %eax = ...
5241        //      ...
5242        //      popcntl %eax
5243        //      ...                 // EFLAGS not changed
5244        //      testl %eax, %eax    // <-- can be removed
5245        if (IsCmpZero) {
5246          NewCC = isUseDefConvertible(Inst);
5247          if (NewCC != X86::COND_INVALID && Inst.getOperand(1).isReg() &&
5248              Inst.getOperand(1).getReg() == SrcReg) {
5249            ShouldUpdateCC = true;
5250            MI = &Inst;
5251            break;
5252          }
5253        }
5254
5255        // Try to use EFLAGS from an instruction with similar flag results.
5256        // Example:
5257        //     sub x, y  or  cmp x, y
5258        //     ...           // EFLAGS not changed
5259        //     cmp x, y      // <-- can be removed
5260        if (isRedundantFlagInstr(CmpInstr, SrcReg, SrcReg2, CmpMask, CmpValue,
5261                                 Inst, &IsSwapped, &ImmDelta)) {
5262          Sub = &Inst;
5263          break;
5264        }
5265
        // MOV32r0 is implemented with an XOR, which clobbers the condition
        // codes. It is safe to move it up if its definition of EFLAGS is dead
        // and the earlier instructions do not read or write EFLAGS.
5269        if (!Movr0Inst && Inst.getOpcode() == X86::MOV32r0 &&
5270            Inst.registerDefIsDead(X86::EFLAGS, TRI)) {
5271          Movr0Inst = &Inst;
5272          continue;
5273        }
5274
5275        // Cannot do anything for any other EFLAG changes.
5276        return false;
5277      }
5278    }
5279
5280    if (MI || Sub)
5281      break;
5282
    // Reached the beginning of the basic block. Continue in the predecessor if
    // there is exactly one.
5285    if (MBB->pred_size() != 1)
5286      return false;
5287    MBB = *MBB->pred_begin();
5288    From = MBB->rbegin();
5289  }
5290
5291  // Scan forward from the instruction after CmpInstr for uses of EFLAGS.
5292  // It is safe to remove CmpInstr if EFLAGS is redefined or killed.
5293  // If we are done with the basic block, we need to check whether EFLAGS is
5294  // live-out.
5295  bool FlagsMayLiveOut = true;
5296  SmallVector<std::pair<MachineInstr *, X86::CondCode>, 4> OpsToUpdate;
5297  MachineBasicBlock::iterator AfterCmpInstr =
5298      std::next(MachineBasicBlock::iterator(CmpInstr));
5299  for (MachineInstr &Instr : make_range(AfterCmpInstr, CmpMBB.end())) {
5300    bool ModifyEFLAGS = Instr.modifiesRegister(X86::EFLAGS, TRI);
5301    bool UseEFLAGS = Instr.readsRegister(X86::EFLAGS, TRI);
5302    // We should check the usage if this instruction uses and updates EFLAGS.
5303    if (!UseEFLAGS && ModifyEFLAGS) {
5304      // It is safe to remove CmpInstr if EFLAGS is updated again.
5305      FlagsMayLiveOut = false;
5306      break;
5307    }
5308    if (!UseEFLAGS && !ModifyEFLAGS)
5309      continue;
5310
5311    // EFLAGS is used by this instruction.
5312    X86::CondCode OldCC = X86::getCondFromMI(Instr);
5313    if ((MI || IsSwapped || ImmDelta != 0) && OldCC == X86::COND_INVALID)
5314      return false;
5315
5316    X86::CondCode ReplacementCC = X86::COND_INVALID;
5317    if (MI) {
5318      switch (OldCC) {
5319      default:
5320        break;
5321      case X86::COND_A:
5322      case X86::COND_AE:
5323      case X86::COND_B:
5324      case X86::COND_BE:
5325        // CF is used, we can't perform this optimization.
5326        return false;
5327      case X86::COND_G:
5328      case X86::COND_GE:
5329      case X86::COND_L:
5330      case X86::COND_LE:
5331        // If SF is used, but the instruction doesn't update the SF, then we
5332        // can't do the optimization.
5333        if (NoSignFlag)
5334          return false;
5335        [[fallthrough]];
5336      case X86::COND_O:
5337      case X86::COND_NO:
5338        // If OF is used, the instruction needs to clear it like CmpZero does.
5339        if (!ClearsOverflowFlag)
5340          return false;
5341        break;
5342      case X86::COND_S:
5343      case X86::COND_NS:
5344        // If SF is used, but the instruction doesn't update the SF, then we
5345        // can't do the optimization.
5346        if (NoSignFlag)
5347          return false;
5348        break;
5349      }
5350
5351      // If we're updating the condition code check if we have to reverse the
5352      // condition.
5353      if (ShouldUpdateCC)
5354        switch (OldCC) {
5355        default:
5356          return false;
5357        case X86::COND_E:
5358          ReplacementCC = NewCC;
5359          break;
5360        case X86::COND_NE:
5361          ReplacementCC = GetOppositeBranchCondition(NewCC);
5362          break;
5363        }
5364    } else if (IsSwapped) {
5365      // If we have SUB(r1, r2) and CMP(r2, r1), the condition code needs
5366      // to be changed from r2 > r1 to r1 < r2, from r2 < r1 to r1 > r2, etc.
5367      // We swap the condition code and synthesize the new opcode.
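      // For example, if the removed compare was CMP %a, %b (flags reflect
      // a - b) and we reuse the flags of SUB %b, %a (flags reflect b - a), a
      // user of COND_L must be rewritten to COND_G.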
5368      ReplacementCC = getSwappedCondition(OldCC);
5369      if (ReplacementCC == X86::COND_INVALID)
5370        return false;
5371      ShouldUpdateCC = true;
5372    } else if (ImmDelta != 0) {
5373      unsigned BitWidth = TRI->getRegSizeInBits(*MRI->getRegClass(SrcReg));
      // BitWidth is used below to form the signed/unsigned min/max constants
      // for the 8/16/32/64-bit compare being rewritten.
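      // For example, flags from an earlier "cmpl $4, %eax" can replace a later
      // "cmpl $5, %eax" whose user is a JL, by rewriting the JL to JLE
      // (x <s 5 is equivalent to x <=s 4).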
5376      switch (OldCC) {
5377      case X86::COND_L: // x <s (C + 1)  -->  x <=s C
5378        if (ImmDelta != 1 || APInt::getSignedMinValue(BitWidth) == CmpValue)
5379          return false;
5380        ReplacementCC = X86::COND_LE;
5381        break;
5382      case X86::COND_B: // x <u (C + 1)  -->  x <=u C
5383        if (ImmDelta != 1 || CmpValue == 0)
5384          return false;
5385        ReplacementCC = X86::COND_BE;
5386        break;
5387      case X86::COND_GE: // x >=s (C + 1)  -->  x >s C
5388        if (ImmDelta != 1 || APInt::getSignedMinValue(BitWidth) == CmpValue)
5389          return false;
5390        ReplacementCC = X86::COND_G;
5391        break;
5392      case X86::COND_AE: // x >=u (C + 1)  -->  x >u C
5393        if (ImmDelta != 1 || CmpValue == 0)
5394          return false;
5395        ReplacementCC = X86::COND_A;
5396        break;
5397      case X86::COND_G: // x >s (C - 1)  -->  x >=s C
5398        if (ImmDelta != -1 || APInt::getSignedMaxValue(BitWidth) == CmpValue)
5399          return false;
5400        ReplacementCC = X86::COND_GE;
5401        break;
5402      case X86::COND_A: // x >u (C - 1)  -->  x >=u C
5403        if (ImmDelta != -1 || APInt::getMaxValue(BitWidth) == CmpValue)
5404          return false;
5405        ReplacementCC = X86::COND_AE;
5406        break;
5407      case X86::COND_LE: // x <=s (C - 1)  -->  x <s C
5408        if (ImmDelta != -1 || APInt::getSignedMaxValue(BitWidth) == CmpValue)
5409          return false;
5410        ReplacementCC = X86::COND_L;
5411        break;
5412      case X86::COND_BE: // x <=u (C - 1)  -->  x <u C
5413        if (ImmDelta != -1 || APInt::getMaxValue(BitWidth) == CmpValue)
5414          return false;
5415        ReplacementCC = X86::COND_B;
5416        break;
5417      default:
5418        return false;
5419      }
5420      ShouldUpdateCC = true;
5421    }
5422
5423    if (ShouldUpdateCC && ReplacementCC != OldCC) {
5424      // Push the MachineInstr to OpsToUpdate.
5425      // If it is safe to remove CmpInstr, the condition code of these
5426      // instructions will be modified.
5427      OpsToUpdate.push_back(std::make_pair(&Instr, ReplacementCC));
5428    }
5429    if (ModifyEFLAGS || Instr.killsRegister(X86::EFLAGS, TRI)) {
5430      // It is safe to remove CmpInstr if EFLAGS is updated again or killed.
5431      FlagsMayLiveOut = false;
5432      break;
5433    }
5434  }
5435
  // If we have to update users but EFLAGS is live-out, abort, since we cannot
  // easily find all of the users.
5438  if ((MI != nullptr || ShouldUpdateCC) && FlagsMayLiveOut) {
5439    for (MachineBasicBlock *Successor : CmpMBB.successors())
5440      if (Successor->isLiveIn(X86::EFLAGS))
5441        return false;
5442  }
5443
5444  // The instruction to be updated is either Sub or MI.
5445  assert((MI == nullptr || Sub == nullptr) && "Should not have Sub and MI set");
5446  Sub = MI != nullptr ? MI : Sub;
5447  MachineBasicBlock *SubBB = Sub->getParent();
5448  // Move Movr0Inst to the appropriate place before Sub.
5449  if (Movr0Inst) {
5450    // Only move within the same block so we don't accidentally move to a
5451    // block with higher execution frequency.
5452    if (&CmpMBB != SubBB)
5453      return false;
5454    // Look backwards until we find a def that doesn't use the current EFLAGS.
5455    MachineBasicBlock::reverse_iterator InsertI = Sub,
5456                                        InsertE = Sub->getParent()->rend();
5457    for (; InsertI != InsertE; ++InsertI) {
5458      MachineInstr *Instr = &*InsertI;
5459      if (!Instr->readsRegister(X86::EFLAGS, TRI) &&
5460          Instr->modifiesRegister(X86::EFLAGS, TRI)) {
5461        Movr0Inst->getParent()->remove(Movr0Inst);
5462        Instr->getParent()->insert(MachineBasicBlock::iterator(Instr),
5463                                   Movr0Inst);
5464        break;
5465      }
5466    }
5467    if (InsertI == InsertE)
5468      return false;
5469  }
5470
5471  // Make sure Sub instruction defines EFLAGS and mark the def live.
5472  MachineOperand *FlagDef = Sub->findRegisterDefOperand(X86::EFLAGS);
5473  assert(FlagDef && "Unable to locate a def EFLAGS operand");
5474  FlagDef->setIsDead(false);
5475
5476  CmpInstr.eraseFromParent();
5477
5478  // Modify the condition code of instructions in OpsToUpdate.
5479  for (auto &Op : OpsToUpdate) {
5480    Op.first->getOperand(Op.first->getDesc().getNumOperands() - 1)
5481        .setImm(Op.second);
5482  }
  // Add EFLAGS to the block live-ins between CmpMBB and the block of the flags
  // producer.
5484  for (MachineBasicBlock *MBB = &CmpMBB; MBB != SubBB;
5485       MBB = *MBB->pred_begin()) {
5486    assert(MBB->pred_size() == 1 && "Expected exactly one predecessor");
5487    if (!MBB->isLiveIn(X86::EFLAGS))
5488      MBB->addLiveIn(X86::EFLAGS);
5489  }
5490  return true;
5491}
5492
/// Try to remove the load by folding it to a register operand at the use.
/// We fold the load if it defines a virtual register, the virtual register is
/// used exactly once in the same BB, and the instructions in between do not
/// load, store, or have side effects.
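/// For example, a MOV32rm whose result feeds a single ADD32rr in the same
/// block can be folded into an ADD32rm, removing the separate load.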
5497MachineInstr *X86InstrInfo::optimizeLoadInstr(MachineInstr &MI,
5498                                              const MachineRegisterInfo *MRI,
5499                                              Register &FoldAsLoadDefReg,
5500                                              MachineInstr *&DefMI) const {
5501  // Check whether we can move DefMI here.
5502  DefMI = MRI->getVRegDef(FoldAsLoadDefReg);
5503  assert(DefMI);
5504  bool SawStore = false;
5505  if (!DefMI->isSafeToMove(nullptr, SawStore))
5506    return nullptr;
5507
5508  // Collect information about virtual register operands of MI.
5509  SmallVector<unsigned, 1> SrcOperandIds;
5510  for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
5511    MachineOperand &MO = MI.getOperand(i);
5512    if (!MO.isReg())
5513      continue;
5514    Register Reg = MO.getReg();
5515    if (Reg != FoldAsLoadDefReg)
5516      continue;
5517    // Do not fold if we have a subreg use or a def.
5518    if (MO.getSubReg() || MO.isDef())
5519      return nullptr;
5520    SrcOperandIds.push_back(i);
5521  }
5522  if (SrcOperandIds.empty())
5523    return nullptr;
5524
5525  // Check whether we can fold the def into SrcOperandId.
5526  if (MachineInstr *FoldMI = foldMemoryOperand(MI, SrcOperandIds, *DefMI)) {
5527    FoldAsLoadDefReg = 0;
5528    return FoldMI;
5529  }
5530
5531  return nullptr;
5532}
5533
/// Convert an ALUrr opcode to the corresponding ALUri opcode, e.g.
///     ADD32rr  ==>  ADD32ri
5536/// ShiftRotate will be set to true if the Opcode is shift or rotate.
5537/// If the ALUri can be further changed to COPY when the immediate is 0, set
5538/// CanConvert2Copy to true.
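/// For shifts and rotates the conversion is from the CL form to the immediate
/// form, e.g. SHR64rCL ==> SHR64ri, which is only valid when CL is known to
/// hold a constant.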
5539static unsigned ConvertALUrr2ALUri(unsigned Opcode, bool &CanConvert2Copy,
5540                                   bool &ShiftRotate) {
5541  CanConvert2Copy = false;
5542  ShiftRotate = false;
5543  unsigned NewOpcode = 0;
5544  switch (Opcode) {
5545  case X86::ADD64rr:
5546    NewOpcode = X86::ADD64ri32;
5547    CanConvert2Copy = true;
5548    break;
5549  case X86::ADC64rr:
5550    NewOpcode = X86::ADC64ri32;
5551    break;
5552  case X86::SUB64rr:
5553    NewOpcode = X86::SUB64ri32;
5554    CanConvert2Copy = true;
5555    break;
5556  case X86::SBB64rr:
5557    NewOpcode = X86::SBB64ri32;
5558    break;
5559  case X86::AND64rr:
5560    NewOpcode = X86::AND64ri32;
5561    break;
5562  case X86::OR64rr:
5563    NewOpcode = X86::OR64ri32;
5564    CanConvert2Copy = true;
5565    break;
5566  case X86::XOR64rr:
5567    NewOpcode = X86::XOR64ri32;
5568    CanConvert2Copy = true;
5569    break;
5570  case X86::TEST64rr:
5571    NewOpcode = X86::TEST64ri32;
5572    break;
5573  case X86::CMP64rr:
5574    NewOpcode = X86::CMP64ri32;
5575    break;
5576  case X86::SHR64rCL:
5577    NewOpcode = X86::SHR64ri;
5578    ShiftRotate = true;
5579    break;
5580  case X86::SHL64rCL:
5581    NewOpcode = X86::SHL64ri;
5582    ShiftRotate = true;
5583    break;
5584  case X86::SAR64rCL:
5585    NewOpcode = X86::SAR64ri;
5586    ShiftRotate = true;
5587    break;
5588  case X86::ROL64rCL:
5589    NewOpcode = X86::ROL64ri;
5590    ShiftRotate = true;
5591    break;
5592  case X86::ROR64rCL:
5593    NewOpcode = X86::ROR64ri;
5594    ShiftRotate = true;
5595    break;
5596  case X86::RCL64rCL:
5597    NewOpcode = X86::RCL64ri;
5598    ShiftRotate = true;
5599    break;
5600  case X86::RCR64rCL:
5601    NewOpcode = X86::RCR64ri;
5602    ShiftRotate = true;
5603    break;
5604  case X86::ADD32rr:
5605    NewOpcode = X86::ADD32ri;
5606    CanConvert2Copy = true;
5607    break;
5608  case X86::ADC32rr:
5609    NewOpcode = X86::ADC32ri;
5610    break;
5611  case X86::SUB32rr:
5612    NewOpcode = X86::SUB32ri;
5613    CanConvert2Copy = true;
5614    break;
5615  case X86::SBB32rr:
5616    NewOpcode = X86::SBB32ri;
5617    break;
5618  case X86::AND32rr:
5619    NewOpcode = X86::AND32ri;
5620    break;
5621  case X86::OR32rr:
5622    NewOpcode = X86::OR32ri;
5623    CanConvert2Copy = true;
5624    break;
5625  case X86::XOR32rr:
5626    NewOpcode = X86::XOR32ri;
5627    CanConvert2Copy = true;
5628    break;
5629  case X86::TEST32rr:
5630    NewOpcode = X86::TEST32ri;
5631    break;
5632  case X86::CMP32rr:
5633    NewOpcode = X86::CMP32ri;
5634    break;
5635  case X86::SHR32rCL:
5636    NewOpcode = X86::SHR32ri;
5637    ShiftRotate = true;
5638    break;
5639  case X86::SHL32rCL:
5640    NewOpcode = X86::SHL32ri;
5641    ShiftRotate = true;
5642    break;
5643  case X86::SAR32rCL:
5644    NewOpcode = X86::SAR32ri;
5645    ShiftRotate = true;
5646    break;
5647  case X86::ROL32rCL:
5648    NewOpcode = X86::ROL32ri;
5649    ShiftRotate = true;
5650    break;
5651  case X86::ROR32rCL:
5652    NewOpcode = X86::ROR32ri;
5653    ShiftRotate = true;
5654    break;
5655  case X86::RCL32rCL:
5656    NewOpcode = X86::RCL32ri;
5657    ShiftRotate = true;
5658    break;
5659  case X86::RCR32rCL:
5660    NewOpcode = X86::RCR32ri;
5661    ShiftRotate = true;
5662    break;
5663  }
5664  return NewOpcode;
5665}
5666
5667/// Real implementation of FoldImmediate.
5668/// Reg is assigned ImmVal in DefMI, and is used in UseMI.
5669/// If MakeChange is true, this function tries to replace Reg by ImmVal in
5670/// UseMI. If MakeChange is false, just check if folding is possible.
5671/// Return true if folding is successful or possible.
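/// For example, with
///   %1:gr32 = MOV32ri 7
///   %2:gr32 = ADD32rr %0, %1
/// the ADD becomes %2:gr32 = ADD32ri %0, 7, and the MOV is erased once %1 has
/// no remaining uses.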
5672bool X86InstrInfo::FoldImmediateImpl(MachineInstr &UseMI, MachineInstr *DefMI,
5673                                     Register Reg, int64_t ImmVal,
5674                                     MachineRegisterInfo *MRI,
5675                                     bool MakeChange) const {
5676  bool Modified = false;
5677  bool ShiftRotate = false;
5678  // When ImmVal is 0, some instructions can be changed to COPY.
5679  bool CanChangeToCopy = false;
5680  unsigned Opc = UseMI.getOpcode();
5681
  // 64-bit operations accept sign-extended 32-bit immediates.
  // 32-bit operations accept all 32-bit immediates, so we don't need to check
  // them.
5685  const TargetRegisterClass *RC = nullptr;
5686  if (Reg.isVirtual())
5687    RC = MRI->getRegClass(Reg);
5688  if ((Reg.isPhysical() && X86::GR64RegClass.contains(Reg)) ||
5689      (Reg.isVirtual() && X86::GR64RegClass.hasSubClassEq(RC))) {
5690    if (!isInt<32>(ImmVal))
5691      return false;
5692  }
5693
5694  if (UseMI.findRegisterUseOperand(Reg)->getSubReg())
5695    return false;
  // An immediate has a larger code size than a register, so avoid folding the
  // immediate if it has more than one use and we are optimizing for size.
5698  if (UseMI.getMF()->getFunction().hasOptSize() && Reg.isVirtual() &&
5699      !MRI->hasOneNonDBGUse(Reg))
5700    return false;
5701
5702  unsigned NewOpc;
5703  if (Opc == TargetOpcode::COPY) {
5704    Register ToReg = UseMI.getOperand(0).getReg();
5705    const TargetRegisterClass *RC = nullptr;
5706    if (ToReg.isVirtual())
5707      RC = MRI->getRegClass(ToReg);
5708    bool GR32Reg = (ToReg.isVirtual() && X86::GR32RegClass.hasSubClassEq(RC)) ||
5709                   (ToReg.isPhysical() && X86::GR32RegClass.contains(ToReg));
5710    bool GR64Reg = (ToReg.isVirtual() && X86::GR64RegClass.hasSubClassEq(RC)) ||
5711                   (ToReg.isPhysical() && X86::GR64RegClass.contains(ToReg));
5712    bool GR8Reg = (ToReg.isVirtual() && X86::GR8RegClass.hasSubClassEq(RC)) ||
5713                  (ToReg.isPhysical() && X86::GR8RegClass.contains(ToReg));
5714
5715    if (ImmVal == 0) {
5716      // We have MOV32r0 only.
5717      if (!GR32Reg)
5718        return false;
5719    }
5720
5721    if (GR64Reg) {
5722      if (isUInt<32>(ImmVal))
5723        NewOpc = X86::MOV32ri64;
5724      else
5725        NewOpc = X86::MOV64ri;
5726    } else if (GR32Reg) {
5727      NewOpc = X86::MOV32ri;
5728      if (ImmVal == 0) {
5729        // MOV32r0 clobbers EFLAGS.
5730        const TargetRegisterInfo *TRI = &getRegisterInfo();
5731        if (UseMI.getParent()->computeRegisterLiveness(
5732                TRI, X86::EFLAGS, UseMI) != MachineBasicBlock::LQR_Dead)
5733          return false;
5734
        // MOV32r0 differs from the other cases because it doesn't encode the
        // immediate in the instruction, so we modify the instruction directly
        // here.
5737        if (!MakeChange)
5738          return true;
5739        UseMI.setDesc(get(X86::MOV32r0));
5740        UseMI.removeOperand(UseMI.findRegisterUseOperandIdx(Reg));
5741        UseMI.addOperand(MachineOperand::CreateReg(X86::EFLAGS, /*isDef=*/true,
5742                                                   /*isImp=*/true,
5743                                                   /*isKill=*/false,
5744                                                   /*isDead=*/true));
5745        Modified = true;
5746      }
5747    } else if (GR8Reg)
5748      NewOpc = X86::MOV8ri;
5749    else
5750      return false;
5751  } else
5752    NewOpc = ConvertALUrr2ALUri(Opc, CanChangeToCopy, ShiftRotate);
5753
5754  if (!NewOpc)
5755    return false;
5756
  // For SUB instructions, the immediate can only be the second source operand.
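  // (x86 has no reverse-subtract form, so the fold is impossible when Reg is
  // the minuend.)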
5758  if ((NewOpc == X86::SUB64ri32 || NewOpc == X86::SUB32ri ||
5759       NewOpc == X86::SBB64ri32 || NewOpc == X86::SBB32ri) &&
5760      UseMI.findRegisterUseOperandIdx(Reg) != 2)
5761    return false;
5762  // For CMP instructions the immediate can only be at index 1.
5763  if ((NewOpc == X86::CMP64ri32 || NewOpc == X86::CMP32ri) &&
5764      UseMI.findRegisterUseOperandIdx(Reg) != 1)
5765    return false;
5766
5767  if (ShiftRotate) {
5768    unsigned RegIdx = UseMI.findRegisterUseOperandIdx(Reg);
5769    if (RegIdx < 2)
5770      return false;
5771    if (!isInt<8>(ImmVal))
5772      return false;
5773    assert(Reg == X86::CL);
5774
5775    if (!MakeChange)
5776      return true;
5777    UseMI.setDesc(get(NewOpc));
5778    UseMI.removeOperand(RegIdx);
5779    UseMI.addOperand(MachineOperand::CreateImm(ImmVal));
    // Reg is the physical register $cl, so MRI cannot tell us whether DefMI is
    // dead. Let the caller handle it, or let the dead-mi-elimination pass
    // delete the now-dead physical register definition.
5783    return true;
5784  }
5785
5786  if (!MakeChange)
5787    return true;
5788
5789  if (!Modified) {
5790    // Modify the instruction.
5791    if (ImmVal == 0 && CanChangeToCopy &&
5792        UseMI.registerDefIsDead(X86::EFLAGS)) {
5793      //          %100 = add %101, 0
5794      //    ==>
5795      //          %100 = COPY %101
5796      UseMI.setDesc(get(TargetOpcode::COPY));
5797      UseMI.removeOperand(UseMI.findRegisterUseOperandIdx(Reg));
5798      UseMI.removeOperand(UseMI.findRegisterDefOperandIdx(X86::EFLAGS));
5799      UseMI.untieRegOperand(0);
5800      UseMI.clearFlag(MachineInstr::MIFlag::NoSWrap);
5801      UseMI.clearFlag(MachineInstr::MIFlag::NoUWrap);
5802    } else {
5803      unsigned Op1 = 1, Op2 = CommuteAnyOperandIndex;
5804      unsigned ImmOpNum = 2;
5805      if (!UseMI.getOperand(0).isDef()) {
5806        Op1 = 0; // TEST, CMP
5807        ImmOpNum = 1;
5808      }
5809      if (Opc == TargetOpcode::COPY)
5810        ImmOpNum = 1;
5811      if (findCommutedOpIndices(UseMI, Op1, Op2) &&
5812          UseMI.getOperand(Op1).getReg() == Reg)
5813        commuteInstruction(UseMI);
5814
5815      assert(UseMI.getOperand(ImmOpNum).getReg() == Reg);
5816      UseMI.setDesc(get(NewOpc));
5817      UseMI.getOperand(ImmOpNum).ChangeToImmediate(ImmVal);
5818    }
5819  }
5820
5821  if (Reg.isVirtual() && MRI->use_nodbg_empty(Reg))
5822    DefMI->eraseFromBundle();
5823
5824  return true;
5825}
5826
/// FoldImmediate - 'Reg' is known to be defined by a move-immediate
/// instruction; try to fold the immediate into the use instruction.
5829bool X86InstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
5830                                 Register Reg, MachineRegisterInfo *MRI) const {
5831  int64_t ImmVal;
5832  if (!getConstValDefinedInReg(DefMI, Reg, ImmVal))
5833    return false;
5834
5835  return FoldImmediateImpl(UseMI, &DefMI, Reg, ImmVal, MRI, true);
5836}
5837
5838/// Expand a single-def pseudo instruction to a two-addr
5839/// instruction with two undef reads of the register being defined.
5840/// This is used for mapping:
5841///   %xmm4 = V_SET0
5842/// to:
5843///   %xmm4 = PXORrr undef %xmm4, undef %xmm4
5844///
5845static bool Expand2AddrUndef(MachineInstrBuilder &MIB,
5846                             const MCInstrDesc &Desc) {
5847  assert(Desc.getNumOperands() == 3 && "Expected two-addr instruction.");
5848  Register Reg = MIB.getReg(0);
5849  MIB->setDesc(Desc);
5850
5851  // MachineInstr::addOperand() will insert explicit operands before any
5852  // implicit operands.
5853  MIB.addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef);
5854  // But we don't trust that.
5855  assert(MIB.getReg(1) == Reg && MIB.getReg(2) == Reg && "Misplaced operand");
5856  return true;
5857}
5858
5859/// Expand a single-def pseudo instruction to a two-addr
5860/// instruction with two %k0 reads.
5861/// This is used for mapping:
5862///   %k4 = K_SET1
5863/// to:
5864///   %k4 = KXNORrr %k0, %k0
5865static bool Expand2AddrKreg(MachineInstrBuilder &MIB, const MCInstrDesc &Desc,
5866                            Register Reg) {
5867  assert(Desc.getNumOperands() == 3 && "Expected two-addr instruction.");
5868  MIB->setDesc(Desc);
5869  MIB.addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef);
5870  return true;
5871}
5872
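// Expand MOV32r1 / MOV32r_1 as
//   %reg = XOR32rr undef %reg, undef %reg
//   %reg = INC32r %reg        (DEC32r for MOV32r_1)
// which materializes 1 or -1 without encoding a 32-bit immediate.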
5873static bool expandMOV32r1(MachineInstrBuilder &MIB, const TargetInstrInfo &TII,
5874                          bool MinusOne) {
5875  MachineBasicBlock &MBB = *MIB->getParent();
5876  const DebugLoc &DL = MIB->getDebugLoc();
5877  Register Reg = MIB.getReg(0);
5878
5879  // Insert the XOR.
5880  BuildMI(MBB, MIB.getInstr(), DL, TII.get(X86::XOR32rr), Reg)
5881      .addReg(Reg, RegState::Undef)
5882      .addReg(Reg, RegState::Undef);
5883
5884  // Turn the pseudo into an INC or DEC.
5885  MIB->setDesc(TII.get(MinusOne ? X86::DEC32r : X86::INC32r));
5886  MIB.addReg(Reg);
5887
5888  return true;
5889}
5890
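// Expand MOV32ImmSExti8/MOV64ImmSExti8 by materializing the sign-extended
// 8-bit immediate with a push-imm8 / pop-reg pair, which is shorter than a MOV
// with a 32-bit immediate. If the function may use the red zone, fall back to
// a plain MOV instead.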
5891static bool ExpandMOVImmSExti8(MachineInstrBuilder &MIB,
5892                               const TargetInstrInfo &TII,
5893                               const X86Subtarget &Subtarget) {
5894  MachineBasicBlock &MBB = *MIB->getParent();
5895  const DebugLoc &DL = MIB->getDebugLoc();
5896  int64_t Imm = MIB->getOperand(1).getImm();
5897  assert(Imm != 0 && "Using push/pop for 0 is not efficient.");
5898  MachineBasicBlock::iterator I = MIB.getInstr();
5899
5900  int StackAdjustment;
5901
5902  if (Subtarget.is64Bit()) {
5903    assert(MIB->getOpcode() == X86::MOV64ImmSExti8 ||
5904           MIB->getOpcode() == X86::MOV32ImmSExti8);
5905
5906    // Can't use push/pop lowering if the function might write to the red zone.
5907    X86MachineFunctionInfo *X86FI =
5908        MBB.getParent()->getInfo<X86MachineFunctionInfo>();
5909    if (X86FI->getUsesRedZone()) {
5910      MIB->setDesc(TII.get(MIB->getOpcode() == X86::MOV32ImmSExti8
5911                               ? X86::MOV32ri
5912                               : X86::MOV64ri));
5913      return true;
5914    }
5915
5916    // 64-bit mode doesn't have 32-bit push/pop, so use 64-bit operations and
5917    // widen the register if necessary.
5918    StackAdjustment = 8;
5919    BuildMI(MBB, I, DL, TII.get(X86::PUSH64i32)).addImm(Imm);
5920    MIB->setDesc(TII.get(X86::POP64r));
5921    MIB->getOperand(0).setReg(getX86SubSuperRegister(MIB.getReg(0), 64));
5922  } else {
5923    assert(MIB->getOpcode() == X86::MOV32ImmSExti8);
5924    StackAdjustment = 4;
5925    BuildMI(MBB, I, DL, TII.get(X86::PUSH32i)).addImm(Imm);
5926    MIB->setDesc(TII.get(X86::POP32r));
5927  }
5928  MIB->removeOperand(1);
5929  MIB->addImplicitDefUseOperands(*MBB.getParent());
5930
5931  // Build CFI if necessary.
5932  MachineFunction &MF = *MBB.getParent();
5933  const X86FrameLowering *TFL = Subtarget.getFrameLowering();
5934  bool IsWin64Prologue = MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
5935  bool NeedsDwarfCFI = !IsWin64Prologue && MF.needsFrameMoves();
5936  bool EmitCFI = !TFL->hasFP(MF) && NeedsDwarfCFI;
5937  if (EmitCFI) {
5938    TFL->BuildCFI(
5939        MBB, I, DL,
5940        MCCFIInstruction::createAdjustCfaOffset(nullptr, StackAdjustment));
5941    TFL->BuildCFI(
5942        MBB, std::next(I), DL,
5943        MCCFIInstruction::createAdjustCfaOffset(nullptr, -StackAdjustment));
5944  }
5945
5946  return true;
5947}
5948
// LoadStackGuard has so far only been implemented for 64-bit MachO. A
// different code sequence is needed for other targets.
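// The expansion is a RIP-relative GOT load of the guard variable's address
// followed by a load through that pointer, roughly:
//   movq guard@GOTPCREL(%rip), %reg
//   movq (%reg), %reg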
5951static void expandLoadStackGuard(MachineInstrBuilder &MIB,
5952                                 const TargetInstrInfo &TII) {
5953  MachineBasicBlock &MBB = *MIB->getParent();
5954  const DebugLoc &DL = MIB->getDebugLoc();
5955  Register Reg = MIB.getReg(0);
5956  const GlobalValue *GV =
5957      cast<GlobalValue>((*MIB->memoperands_begin())->getValue());
5958  auto Flags = MachineMemOperand::MOLoad |
5959               MachineMemOperand::MODereferenceable |
5960               MachineMemOperand::MOInvariant;
5961  MachineMemOperand *MMO = MBB.getParent()->getMachineMemOperand(
5962      MachinePointerInfo::getGOT(*MBB.getParent()), Flags, 8, Align(8));
5963  MachineBasicBlock::iterator I = MIB.getInstr();
5964
5965  BuildMI(MBB, I, DL, TII.get(X86::MOV64rm), Reg)
5966      .addReg(X86::RIP)
5967      .addImm(1)
5968      .addReg(0)
5969      .addGlobalAddress(GV, 0, X86II::MO_GOTPCREL)
5970      .addReg(0)
5971      .addMemOperand(MMO);
5972  MIB->setDebugLoc(DL);
5973  MIB->setDesc(TII.get(X86::MOV64rm));
5974  MIB.addReg(Reg, RegState::Kill).addImm(1).addReg(0).addImm(0).addReg(0);
5975}
5976
5977static bool expandXorFP(MachineInstrBuilder &MIB, const TargetInstrInfo &TII) {
5978  MachineBasicBlock &MBB = *MIB->getParent();
5979  MachineFunction &MF = *MBB.getParent();
5980  const X86Subtarget &Subtarget = MF.getSubtarget<X86Subtarget>();
5981  const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
5982  unsigned XorOp =
5983      MIB->getOpcode() == X86::XOR64_FP ? X86::XOR64rr : X86::XOR32rr;
5984  MIB->setDesc(TII.get(XorOp));
5985  MIB.addReg(TRI->getFrameRegister(MF), RegState::Undef);
5986  return true;
5987}
5988
// This is used to handle spills for 128/256-bit registers when we have AVX512
// but not VLX. If the spill uses an extended register, we need an instruction
// that loads the lower 128/256 bits but is available with only AVX512F.
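// For example, a VMOVAPSZ128rm into %xmm16 becomes a VBROADCASTF32X4rm into
// %zmm16; the low 128 bits hold the loaded value and the upper bits are
// irrelevant to the 128-bit reload.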
5992static bool expandNOVLXLoad(MachineInstrBuilder &MIB,
5993                            const TargetRegisterInfo *TRI,
5994                            const MCInstrDesc &LoadDesc,
5995                            const MCInstrDesc &BroadcastDesc, unsigned SubIdx) {
5996  Register DestReg = MIB.getReg(0);
5997  // Check if DestReg is XMM16-31 or YMM16-31.
5998  if (TRI->getEncodingValue(DestReg) < 16) {
5999    // We can use a normal VEX encoded load.
6000    MIB->setDesc(LoadDesc);
6001  } else {
6002    // Use a 128/256-bit VBROADCAST instruction.
6003    MIB->setDesc(BroadcastDesc);
6004    // Change the destination to a 512-bit register.
6005    DestReg = TRI->getMatchingSuperReg(DestReg, SubIdx, &X86::VR512RegClass);
6006    MIB->getOperand(0).setReg(DestReg);
6007  }
6008  return true;
6009}
6010
// This is used to handle spills for 128/256-bit registers when we have AVX512
// but not VLX. If the spill uses an extended register, we need an instruction
// that stores the lower 128/256 bits but is available with only AVX512F.
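// For example, a VMOVAPSZ128mr from %xmm16 becomes a VEXTRACTF32x4Zmr of
// element 0 from %zmm16, which stores the same low 128 bits.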
6014static bool expandNOVLXStore(MachineInstrBuilder &MIB,
6015                             const TargetRegisterInfo *TRI,
6016                             const MCInstrDesc &StoreDesc,
6017                             const MCInstrDesc &ExtractDesc, unsigned SubIdx) {
6018  Register SrcReg = MIB.getReg(X86::AddrNumOperands);
  // Check if SrcReg is XMM16-31 or YMM16-31.
6020  if (TRI->getEncodingValue(SrcReg) < 16) {
6021    // We can use a normal VEX encoded store.
6022    MIB->setDesc(StoreDesc);
6023  } else {
6024    // Use a VEXTRACTF instruction.
6025    MIB->setDesc(ExtractDesc);
    // Change the source to a 512-bit register.
6027    SrcReg = TRI->getMatchingSuperReg(SrcReg, SubIdx, &X86::VR512RegClass);
6028    MIB->getOperand(X86::AddrNumOperands).setReg(SrcReg);
6029    MIB.addImm(0x0); // Append immediate to extract from the lower bits.
6030  }
6031
6032  return true;
6033}
6034
6035static bool expandSHXDROT(MachineInstrBuilder &MIB, const MCInstrDesc &Desc) {
6036  MIB->setDesc(Desc);
6037  int64_t ShiftAmt = MIB->getOperand(2).getImm();
6038  // Temporarily remove the immediate so we can add another source register.
6039  MIB->removeOperand(2);
6040  // Add the register. Don't copy the kill flag if there is one.
6041  MIB.addReg(MIB.getReg(1), getUndefRegState(MIB->getOperand(1).isUndef()));
6042  // Add back the immediate.
6043  MIB.addImm(ShiftAmt);
6044  return true;
6045}
6046
6047bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
6048  bool HasAVX = Subtarget.hasAVX();
6049  MachineInstrBuilder MIB(*MI.getParent()->getParent(), MI);
6050  switch (MI.getOpcode()) {
6051  case X86::MOV32r0:
6052    return Expand2AddrUndef(MIB, get(X86::XOR32rr));
6053  case X86::MOV32r1:
6054    return expandMOV32r1(MIB, *this, /*MinusOne=*/false);
6055  case X86::MOV32r_1:
6056    return expandMOV32r1(MIB, *this, /*MinusOne=*/true);
6057  case X86::MOV32ImmSExti8:
6058  case X86::MOV64ImmSExti8:
6059    return ExpandMOVImmSExti8(MIB, *this, Subtarget);
6060  case X86::SETB_C32r:
6061    return Expand2AddrUndef(MIB, get(X86::SBB32rr));
6062  case X86::SETB_C64r:
6063    return Expand2AddrUndef(MIB, get(X86::SBB64rr));
6064  case X86::MMX_SET0:
6065    return Expand2AddrUndef(MIB, get(X86::MMX_PXORrr));
6066  case X86::V_SET0:
6067  case X86::FsFLD0SS:
6068  case X86::FsFLD0SD:
6069  case X86::FsFLD0SH:
6070  case X86::FsFLD0F128:
6071    return Expand2AddrUndef(MIB, get(HasAVX ? X86::VXORPSrr : X86::XORPSrr));
6072  case X86::AVX_SET0: {
6073    assert(HasAVX && "AVX not supported");
6074    const TargetRegisterInfo *TRI = &getRegisterInfo();
6075    Register SrcReg = MIB.getReg(0);
6076    Register XReg = TRI->getSubReg(SrcReg, X86::sub_xmm);
6077    MIB->getOperand(0).setReg(XReg);
6078    Expand2AddrUndef(MIB, get(X86::VXORPSrr));
6079    MIB.addReg(SrcReg, RegState::ImplicitDefine);
6080    return true;
6081  }
6082  case X86::AVX512_128_SET0:
6083  case X86::AVX512_FsFLD0SH:
6084  case X86::AVX512_FsFLD0SS:
6085  case X86::AVX512_FsFLD0SD:
6086  case X86::AVX512_FsFLD0F128: {
6087    bool HasVLX = Subtarget.hasVLX();
6088    Register SrcReg = MIB.getReg(0);
6089    const TargetRegisterInfo *TRI = &getRegisterInfo();
6090    if (HasVLX || TRI->getEncodingValue(SrcReg) < 16)
6091      return Expand2AddrUndef(MIB,
6092                              get(HasVLX ? X86::VPXORDZ128rr : X86::VXORPSrr));
6093    // Extended register without VLX. Use a larger XOR.
6094    SrcReg =
6095        TRI->getMatchingSuperReg(SrcReg, X86::sub_xmm, &X86::VR512RegClass);
6096    MIB->getOperand(0).setReg(SrcReg);
6097    return Expand2AddrUndef(MIB, get(X86::VPXORDZrr));
6098  }
6099  case X86::AVX512_256_SET0:
6100  case X86::AVX512_512_SET0: {
6101    bool HasVLX = Subtarget.hasVLX();
6102    Register SrcReg = MIB.getReg(0);
6103    const TargetRegisterInfo *TRI = &getRegisterInfo();
6104    if (HasVLX || TRI->getEncodingValue(SrcReg) < 16) {
6105      Register XReg = TRI->getSubReg(SrcReg, X86::sub_xmm);
6106      MIB->getOperand(0).setReg(XReg);
6107      Expand2AddrUndef(MIB, get(HasVLX ? X86::VPXORDZ128rr : X86::VXORPSrr));
6108      MIB.addReg(SrcReg, RegState::ImplicitDefine);
6109      return true;
6110    }
6111    if (MI.getOpcode() == X86::AVX512_256_SET0) {
6112      // No VLX so we must reference a zmm.
6113      unsigned ZReg =
6114          TRI->getMatchingSuperReg(SrcReg, X86::sub_ymm, &X86::VR512RegClass);
6115      MIB->getOperand(0).setReg(ZReg);
6116    }
6117    return Expand2AddrUndef(MIB, get(X86::VPXORDZrr));
6118  }
6119  case X86::V_SETALLONES:
6120    return Expand2AddrUndef(MIB,
6121                            get(HasAVX ? X86::VPCMPEQDrr : X86::PCMPEQDrr));
6122  case X86::AVX2_SETALLONES:
6123    return Expand2AddrUndef(MIB, get(X86::VPCMPEQDYrr));
6124  case X86::AVX1_SETALLONES: {
6125    Register Reg = MIB.getReg(0);
6126    // VCMPPSYrri with an immediate 0xf should produce VCMPTRUEPS.
6127    MIB->setDesc(get(X86::VCMPPSYrri));
6128    MIB.addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef).addImm(0xf);
6129    return true;
6130  }
6131  case X86::AVX512_512_SETALLONES: {
6132    Register Reg = MIB.getReg(0);
6133    MIB->setDesc(get(X86::VPTERNLOGDZrri));
6134    // VPTERNLOGD needs 3 register inputs and an immediate.
6135    // 0xff will return 1s for any input.
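    // (The immediate is the truth table of a 3-input boolean function, indexed
    // by the bits of the three sources; 0xff is the constant-true function, so
    // the undef inputs are irrelevant.)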
6136    MIB.addReg(Reg, RegState::Undef)
6137        .addReg(Reg, RegState::Undef)
6138        .addReg(Reg, RegState::Undef)
6139        .addImm(0xff);
6140    return true;
6141  }
6142  case X86::AVX512_512_SEXT_MASK_32:
6143  case X86::AVX512_512_SEXT_MASK_64: {
6144    Register Reg = MIB.getReg(0);
6145    Register MaskReg = MIB.getReg(1);
6146    unsigned MaskState = getRegState(MIB->getOperand(1));
6147    unsigned Opc = (MI.getOpcode() == X86::AVX512_512_SEXT_MASK_64)
6148                       ? X86::VPTERNLOGQZrrikz
6149                       : X86::VPTERNLOGDZrrikz;
6150    MI.removeOperand(1);
6151    MIB->setDesc(get(Opc));
6152    // VPTERNLOG needs 3 register inputs and an immediate.
6153    // 0xff will return 1s for any input.
6154    MIB.addReg(Reg, RegState::Undef)
6155        .addReg(MaskReg, MaskState)
6156        .addReg(Reg, RegState::Undef)
6157        .addReg(Reg, RegState::Undef)
6158        .addImm(0xff);
6159    return true;
6160  }
6161  case X86::VMOVAPSZ128rm_NOVLX:
6162    return expandNOVLXLoad(MIB, &getRegisterInfo(), get(X86::VMOVAPSrm),
6163                           get(X86::VBROADCASTF32X4rm), X86::sub_xmm);
6164  case X86::VMOVUPSZ128rm_NOVLX:
6165    return expandNOVLXLoad(MIB, &getRegisterInfo(), get(X86::VMOVUPSrm),
6166                           get(X86::VBROADCASTF32X4rm), X86::sub_xmm);
6167  case X86::VMOVAPSZ256rm_NOVLX:
6168    return expandNOVLXLoad(MIB, &getRegisterInfo(), get(X86::VMOVAPSYrm),
6169                           get(X86::VBROADCASTF64X4rm), X86::sub_ymm);
6170  case X86::VMOVUPSZ256rm_NOVLX:
6171    return expandNOVLXLoad(MIB, &getRegisterInfo(), get(X86::VMOVUPSYrm),
6172                           get(X86::VBROADCASTF64X4rm), X86::sub_ymm);
6173  case X86::VMOVAPSZ128mr_NOVLX:
6174    return expandNOVLXStore(MIB, &getRegisterInfo(), get(X86::VMOVAPSmr),
6175                            get(X86::VEXTRACTF32x4Zmr), X86::sub_xmm);
6176  case X86::VMOVUPSZ128mr_NOVLX:
6177    return expandNOVLXStore(MIB, &getRegisterInfo(), get(X86::VMOVUPSmr),
6178                            get(X86::VEXTRACTF32x4Zmr), X86::sub_xmm);
6179  case X86::VMOVAPSZ256mr_NOVLX:
6180    return expandNOVLXStore(MIB, &getRegisterInfo(), get(X86::VMOVAPSYmr),
6181                            get(X86::VEXTRACTF64x4Zmr), X86::sub_ymm);
6182  case X86::VMOVUPSZ256mr_NOVLX:
6183    return expandNOVLXStore(MIB, &getRegisterInfo(), get(X86::VMOVUPSYmr),
6184                            get(X86::VEXTRACTF64x4Zmr), X86::sub_ymm);
6185  case X86::MOV32ri64: {
6186    Register Reg = MIB.getReg(0);
6187    Register Reg32 = RI.getSubReg(Reg, X86::sub_32bit);
6188    MI.setDesc(get(X86::MOV32ri));
6189    MIB->getOperand(0).setReg(Reg32);
6190    MIB.addReg(Reg, RegState::ImplicitDefine);
6191    return true;
6192  }
6193
6194  case X86::RDFLAGS32:
6195  case X86::RDFLAGS64: {
6196    unsigned Is64Bit = MI.getOpcode() == X86::RDFLAGS64;
6197    MachineBasicBlock &MBB = *MIB->getParent();
6198
6199    MachineInstr *NewMI = BuildMI(MBB, MI, MIB->getDebugLoc(),
6200                                  get(Is64Bit ? X86::PUSHF64 : X86::PUSHF32))
6201                              .getInstr();
6202
6203    // Permit reads of the EFLAGS and DF registers without them being defined.
6204    // This intrinsic exists to read external processor state in flags, such as
6205    // the trap flag, interrupt flag, and direction flag, none of which are
6206    // modeled by the backend.
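    // The overall expansion is roughly (implicit operands omitted):
    //   $dst = RDFLAGS64  -->  PUSHF64
    //                          $dst = POP64r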
6207    assert(NewMI->getOperand(2).getReg() == X86::EFLAGS &&
6208           "Unexpected register in operand! Should be EFLAGS.");
6209    NewMI->getOperand(2).setIsUndef();
6210    assert(NewMI->getOperand(3).getReg() == X86::DF &&
6211           "Unexpected register in operand! Should be DF.");
6212    NewMI->getOperand(3).setIsUndef();
6213
6214    MIB->setDesc(get(Is64Bit ? X86::POP64r : X86::POP32r));
6215    return true;
6216  }
6217
6218  case X86::WRFLAGS32:
6219  case X86::WRFLAGS64: {
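    // The overall expansion is roughly (implicit operands omitted):
    //   WRFLAGS64 $src  -->  PUSH64r $src
    //                        POPF64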
6220    unsigned Is64Bit = MI.getOpcode() == X86::WRFLAGS64;
6221    MachineBasicBlock &MBB = *MIB->getParent();
6222
6223    BuildMI(MBB, MI, MIB->getDebugLoc(),
6224            get(Is64Bit ? X86::PUSH64r : X86::PUSH32r))
6225        .addReg(MI.getOperand(0).getReg());
6226    BuildMI(MBB, MI, MIB->getDebugLoc(),
6227            get(Is64Bit ? X86::POPF64 : X86::POPF32));
6228    MI.eraseFromParent();
6229    return true;
6230  }
6231
6232  // KNL does not recognize dependency-breaking idioms for mask registers,
6233  // so kxnor %k1, %k1, %k2 has a RAW dependence on %k1.
6234  // Using %k0 as the undef input register is a performance heuristic based
6235  // on the assumption that %k0 is used less frequently than the other mask
6236  // registers, since it is not usable as a write mask.
6237  // FIXME: A more advanced approach would be to choose the best input mask
6238  // register based on context.
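  // Illustrative expansion (destination register chosen arbitrarily):
  //   %k1 = KSET0W   -->   %k1 = KXORWrr %k0, %k0
  //   %k1 = KSET1W   -->   %k1 = KXNORWrr %k0, %k0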
6239  case X86::KSET0W:
6240    return Expand2AddrKreg(MIB, get(X86::KXORWrr), X86::K0);
6241  case X86::KSET0D:
6242    return Expand2AddrKreg(MIB, get(X86::KXORDrr), X86::K0);
6243  case X86::KSET0Q:
6244    return Expand2AddrKreg(MIB, get(X86::KXORQrr), X86::K0);
6245  case X86::KSET1W:
6246    return Expand2AddrKreg(MIB, get(X86::KXNORWrr), X86::K0);
6247  case X86::KSET1D:
6248    return Expand2AddrKreg(MIB, get(X86::KXNORDrr), X86::K0);
6249  case X86::KSET1Q:
6250    return Expand2AddrKreg(MIB, get(X86::KXNORQrr), X86::K0);
6251  case TargetOpcode::LOAD_STACK_GUARD:
6252    expandLoadStackGuard(MIB, *this);
6253    return true;
6254  case X86::XOR64_FP:
6255  case X86::XOR32_FP:
6256    return expandXorFP(MIB, *this);
6257  case X86::SHLDROT32ri:
6258    return expandSHXDROT(MIB, get(X86::SHLD32rri8));
6259  case X86::SHLDROT64ri:
6260    return expandSHXDROT(MIB, get(X86::SHLD64rri8));
6261  case X86::SHRDROT32ri:
6262    return expandSHXDROT(MIB, get(X86::SHRD32rri8));
6263  case X86::SHRDROT64ri:
6264    return expandSHXDROT(MIB, get(X86::SHRD64rri8));
6265  case X86::ADD8rr_DB:
6266    MIB->setDesc(get(X86::OR8rr));
6267    break;
6268  case X86::ADD16rr_DB:
6269    MIB->setDesc(get(X86::OR16rr));
6270    break;
6271  case X86::ADD32rr_DB:
6272    MIB->setDesc(get(X86::OR32rr));
6273    break;
6274  case X86::ADD64rr_DB:
6275    MIB->setDesc(get(X86::OR64rr));
6276    break;
6277  case X86::ADD8ri_DB:
6278    MIB->setDesc(get(X86::OR8ri));
6279    break;
6280  case X86::ADD16ri_DB:
6281    MIB->setDesc(get(X86::OR16ri));
6282    break;
6283  case X86::ADD32ri_DB:
6284    MIB->setDesc(get(X86::OR32ri));
6285    break;
6286  case X86::ADD64ri32_DB:
6287    MIB->setDesc(get(X86::OR64ri32));
6288    break;
6289  }
6290  return false;
6291}
6292
6293/// Return true for all instructions that only update
6294/// the first 32 or 64 bits of the destination register and leave the rest
6295/// unmodified. This can be used to avoid folding loads if the instructions
6296/// only update part of the destination register, and the non-updated part is
6297/// not needed. e.g. cvtss2sd, sqrtss. Unfolding the load from these
6298/// instructions breaks the partial register dependency and it can improve
6299/// performance. e.g.:
6300///
6301///   movss (%rdi), %xmm0
6302///   cvtss2sd %xmm0, %xmm0
6303///
6304/// Instead of
6305///   cvtss2sd (%rdi), %xmm0
6306///
6307/// FIXME: This should be turned into a TSFlags.
6308///
6309static bool hasPartialRegUpdate(unsigned Opcode, const X86Subtarget &Subtarget,
6310                                bool ForLoadFold = false) {
6311  switch (Opcode) {
6312  case X86::CVTSI2SSrr:
6313  case X86::CVTSI2SSrm:
6314  case X86::CVTSI642SSrr:
6315  case X86::CVTSI642SSrm:
6316  case X86::CVTSI2SDrr:
6317  case X86::CVTSI2SDrm:
6318  case X86::CVTSI642SDrr:
6319  case X86::CVTSI642SDrm:
6320    // Load folding won't affect the partial register update since the input is
6321    // a GPR.
6322    return !ForLoadFold;
6323  case X86::CVTSD2SSrr:
6324  case X86::CVTSD2SSrm:
6325  case X86::CVTSS2SDrr:
6326  case X86::CVTSS2SDrm:
6327  case X86::MOVHPDrm:
6328  case X86::MOVHPSrm:
6329  case X86::MOVLPDrm:
6330  case X86::MOVLPSrm:
6331  case X86::RCPSSr:
6332  case X86::RCPSSm:
6333  case X86::RCPSSr_Int:
6334  case X86::RCPSSm_Int:
6335  case X86::ROUNDSDr:
6336  case X86::ROUNDSDm:
6337  case X86::ROUNDSSr:
6338  case X86::ROUNDSSm:
6339  case X86::RSQRTSSr:
6340  case X86::RSQRTSSm:
6341  case X86::RSQRTSSr_Int:
6342  case X86::RSQRTSSm_Int:
6343  case X86::SQRTSSr:
6344  case X86::SQRTSSm:
6345  case X86::SQRTSSr_Int:
6346  case X86::SQRTSSm_Int:
6347  case X86::SQRTSDr:
6348  case X86::SQRTSDm:
6349  case X86::SQRTSDr_Int:
6350  case X86::SQRTSDm_Int:
6351    return true;
6352  case X86::VFCMULCPHZ128rm:
6353  case X86::VFCMULCPHZ128rmb:
6354  case X86::VFCMULCPHZ128rmbkz:
6355  case X86::VFCMULCPHZ128rmkz:
6356  case X86::VFCMULCPHZ128rr:
6357  case X86::VFCMULCPHZ128rrkz:
6358  case X86::VFCMULCPHZ256rm:
6359  case X86::VFCMULCPHZ256rmb:
6360  case X86::VFCMULCPHZ256rmbkz:
6361  case X86::VFCMULCPHZ256rmkz:
6362  case X86::VFCMULCPHZ256rr:
6363  case X86::VFCMULCPHZ256rrkz:
6364  case X86::VFCMULCPHZrm:
6365  case X86::VFCMULCPHZrmb:
6366  case X86::VFCMULCPHZrmbkz:
6367  case X86::VFCMULCPHZrmkz:
6368  case X86::VFCMULCPHZrr:
6369  case X86::VFCMULCPHZrrb:
6370  case X86::VFCMULCPHZrrbkz:
6371  case X86::VFCMULCPHZrrkz:
6372  case X86::VFMULCPHZ128rm:
6373  case X86::VFMULCPHZ128rmb:
6374  case X86::VFMULCPHZ128rmbkz:
6375  case X86::VFMULCPHZ128rmkz:
6376  case X86::VFMULCPHZ128rr:
6377  case X86::VFMULCPHZ128rrkz:
6378  case X86::VFMULCPHZ256rm:
6379  case X86::VFMULCPHZ256rmb:
6380  case X86::VFMULCPHZ256rmbkz:
6381  case X86::VFMULCPHZ256rmkz:
6382  case X86::VFMULCPHZ256rr:
6383  case X86::VFMULCPHZ256rrkz:
6384  case X86::VFMULCPHZrm:
6385  case X86::VFMULCPHZrmb:
6386  case X86::VFMULCPHZrmbkz:
6387  case X86::VFMULCPHZrmkz:
6388  case X86::VFMULCPHZrr:
6389  case X86::VFMULCPHZrrb:
6390  case X86::VFMULCPHZrrbkz:
6391  case X86::VFMULCPHZrrkz:
6392  case X86::VFCMULCSHZrm:
6393  case X86::VFCMULCSHZrmkz:
6394  case X86::VFCMULCSHZrr:
6395  case X86::VFCMULCSHZrrb:
6396  case X86::VFCMULCSHZrrbkz:
6397  case X86::VFCMULCSHZrrkz:
6398  case X86::VFMULCSHZrm:
6399  case X86::VFMULCSHZrmkz:
6400  case X86::VFMULCSHZrr:
6401  case X86::VFMULCSHZrrb:
6402  case X86::VFMULCSHZrrbkz:
6403  case X86::VFMULCSHZrrkz:
6404    return Subtarget.hasMULCFalseDeps();
6405  case X86::VPERMDYrm:
6406  case X86::VPERMDYrr:
6407  case X86::VPERMQYmi:
6408  case X86::VPERMQYri:
6409  case X86::VPERMPSYrm:
6410  case X86::VPERMPSYrr:
6411  case X86::VPERMPDYmi:
6412  case X86::VPERMPDYri:
6413  case X86::VPERMDZ256rm:
6414  case X86::VPERMDZ256rmb:
6415  case X86::VPERMDZ256rmbkz:
6416  case X86::VPERMDZ256rmkz:
6417  case X86::VPERMDZ256rr:
6418  case X86::VPERMDZ256rrkz:
6419  case X86::VPERMDZrm:
6420  case X86::VPERMDZrmb:
6421  case X86::VPERMDZrmbkz:
6422  case X86::VPERMDZrmkz:
6423  case X86::VPERMDZrr:
6424  case X86::VPERMDZrrkz:
6425  case X86::VPERMQZ256mbi:
6426  case X86::VPERMQZ256mbikz:
6427  case X86::VPERMQZ256mi:
6428  case X86::VPERMQZ256mikz:
6429  case X86::VPERMQZ256ri:
6430  case X86::VPERMQZ256rikz:
6431  case X86::VPERMQZ256rm:
6432  case X86::VPERMQZ256rmb:
6433  case X86::VPERMQZ256rmbkz:
6434  case X86::VPERMQZ256rmkz:
6435  case X86::VPERMQZ256rr:
6436  case X86::VPERMQZ256rrkz:
6437  case X86::VPERMQZmbi:
6438  case X86::VPERMQZmbikz:
6439  case X86::VPERMQZmi:
6440  case X86::VPERMQZmikz:
6441  case X86::VPERMQZri:
6442  case X86::VPERMQZrikz:
6443  case X86::VPERMQZrm:
6444  case X86::VPERMQZrmb:
6445  case X86::VPERMQZrmbkz:
6446  case X86::VPERMQZrmkz:
6447  case X86::VPERMQZrr:
6448  case X86::VPERMQZrrkz:
6449  case X86::VPERMPSZ256rm:
6450  case X86::VPERMPSZ256rmb:
6451  case X86::VPERMPSZ256rmbkz:
6452  case X86::VPERMPSZ256rmkz:
6453  case X86::VPERMPSZ256rr:
6454  case X86::VPERMPSZ256rrkz:
6455  case X86::VPERMPSZrm:
6456  case X86::VPERMPSZrmb:
6457  case X86::VPERMPSZrmbkz:
6458  case X86::VPERMPSZrmkz:
6459  case X86::VPERMPSZrr:
6460  case X86::VPERMPSZrrkz:
6461  case X86::VPERMPDZ256mbi:
6462  case X86::VPERMPDZ256mbikz:
6463  case X86::VPERMPDZ256mi:
6464  case X86::VPERMPDZ256mikz:
6465  case X86::VPERMPDZ256ri:
6466  case X86::VPERMPDZ256rikz:
6467  case X86::VPERMPDZ256rm:
6468  case X86::VPERMPDZ256rmb:
6469  case X86::VPERMPDZ256rmbkz:
6470  case X86::VPERMPDZ256rmkz:
6471  case X86::VPERMPDZ256rr:
6472  case X86::VPERMPDZ256rrkz:
6473  case X86::VPERMPDZmbi:
6474  case X86::VPERMPDZmbikz:
6475  case X86::VPERMPDZmi:
6476  case X86::VPERMPDZmikz:
6477  case X86::VPERMPDZri:
6478  case X86::VPERMPDZrikz:
6479  case X86::VPERMPDZrm:
6480  case X86::VPERMPDZrmb:
6481  case X86::VPERMPDZrmbkz:
6482  case X86::VPERMPDZrmkz:
6483  case X86::VPERMPDZrr:
6484  case X86::VPERMPDZrrkz:
6485    return Subtarget.hasPERMFalseDeps();
6486  case X86::VRANGEPDZ128rmbi:
6487  case X86::VRANGEPDZ128rmbikz:
6488  case X86::VRANGEPDZ128rmi:
6489  case X86::VRANGEPDZ128rmikz:
6490  case X86::VRANGEPDZ128rri:
6491  case X86::VRANGEPDZ128rrikz:
6492  case X86::VRANGEPDZ256rmbi:
6493  case X86::VRANGEPDZ256rmbikz:
6494  case X86::VRANGEPDZ256rmi:
6495  case X86::VRANGEPDZ256rmikz:
6496  case X86::VRANGEPDZ256rri:
6497  case X86::VRANGEPDZ256rrikz:
6498  case X86::VRANGEPDZrmbi:
6499  case X86::VRANGEPDZrmbikz:
6500  case X86::VRANGEPDZrmi:
6501  case X86::VRANGEPDZrmikz:
6502  case X86::VRANGEPDZrri:
6503  case X86::VRANGEPDZrrib:
6504  case X86::VRANGEPDZrribkz:
6505  case X86::VRANGEPDZrrikz:
6506  case X86::VRANGEPSZ128rmbi:
6507  case X86::VRANGEPSZ128rmbikz:
6508  case X86::VRANGEPSZ128rmi:
6509  case X86::VRANGEPSZ128rmikz:
6510  case X86::VRANGEPSZ128rri:
6511  case X86::VRANGEPSZ128rrikz:
6512  case X86::VRANGEPSZ256rmbi:
6513  case X86::VRANGEPSZ256rmbikz:
6514  case X86::VRANGEPSZ256rmi:
6515  case X86::VRANGEPSZ256rmikz:
6516  case X86::VRANGEPSZ256rri:
6517  case X86::VRANGEPSZ256rrikz:
6518  case X86::VRANGEPSZrmbi:
6519  case X86::VRANGEPSZrmbikz:
6520  case X86::VRANGEPSZrmi:
6521  case X86::VRANGEPSZrmikz:
6522  case X86::VRANGEPSZrri:
6523  case X86::VRANGEPSZrrib:
6524  case X86::VRANGEPSZrribkz:
6525  case X86::VRANGEPSZrrikz:
6526  case X86::VRANGESDZrmi:
6527  case X86::VRANGESDZrmikz:
6528  case X86::VRANGESDZrri:
6529  case X86::VRANGESDZrrib:
6530  case X86::VRANGESDZrribkz:
6531  case X86::VRANGESDZrrikz:
6532  case X86::VRANGESSZrmi:
6533  case X86::VRANGESSZrmikz:
6534  case X86::VRANGESSZrri:
6535  case X86::VRANGESSZrrib:
6536  case X86::VRANGESSZrribkz:
6537  case X86::VRANGESSZrrikz:
6538    return Subtarget.hasRANGEFalseDeps();
6539  case X86::VGETMANTSSZrmi:
6540  case X86::VGETMANTSSZrmikz:
6541  case X86::VGETMANTSSZrri:
6542  case X86::VGETMANTSSZrrib:
6543  case X86::VGETMANTSSZrribkz:
6544  case X86::VGETMANTSSZrrikz:
6545  case X86::VGETMANTSDZrmi:
6546  case X86::VGETMANTSDZrmikz:
6547  case X86::VGETMANTSDZrri:
6548  case X86::VGETMANTSDZrrib:
6549  case X86::VGETMANTSDZrribkz:
6550  case X86::VGETMANTSDZrrikz:
6551  case X86::VGETMANTSHZrmi:
6552  case X86::VGETMANTSHZrmikz:
6553  case X86::VGETMANTSHZrri:
6554  case X86::VGETMANTSHZrrib:
6555  case X86::VGETMANTSHZrribkz:
6556  case X86::VGETMANTSHZrrikz:
6557  case X86::VGETMANTPSZ128rmbi:
6558  case X86::VGETMANTPSZ128rmbikz:
6559  case X86::VGETMANTPSZ128rmi:
6560  case X86::VGETMANTPSZ128rmikz:
6561  case X86::VGETMANTPSZ256rmbi:
6562  case X86::VGETMANTPSZ256rmbikz:
6563  case X86::VGETMANTPSZ256rmi:
6564  case X86::VGETMANTPSZ256rmikz:
6565  case X86::VGETMANTPSZrmbi:
6566  case X86::VGETMANTPSZrmbikz:
6567  case X86::VGETMANTPSZrmi:
6568  case X86::VGETMANTPSZrmikz:
6569  case X86::VGETMANTPDZ128rmbi:
6570  case X86::VGETMANTPDZ128rmbikz:
6571  case X86::VGETMANTPDZ128rmi:
6572  case X86::VGETMANTPDZ128rmikz:
6573  case X86::VGETMANTPDZ256rmbi:
6574  case X86::VGETMANTPDZ256rmbikz:
6575  case X86::VGETMANTPDZ256rmi:
6576  case X86::VGETMANTPDZ256rmikz:
6577  case X86::VGETMANTPDZrmbi:
6578  case X86::VGETMANTPDZrmbikz:
6579  case X86::VGETMANTPDZrmi:
6580  case X86::VGETMANTPDZrmikz:
6581    return Subtarget.hasGETMANTFalseDeps();
6582  case X86::VPMULLQZ128rm:
6583  case X86::VPMULLQZ128rmb:
6584  case X86::VPMULLQZ128rmbkz:
6585  case X86::VPMULLQZ128rmkz:
6586  case X86::VPMULLQZ128rr:
6587  case X86::VPMULLQZ128rrkz:
6588  case X86::VPMULLQZ256rm:
6589  case X86::VPMULLQZ256rmb:
6590  case X86::VPMULLQZ256rmbkz:
6591  case X86::VPMULLQZ256rmkz:
6592  case X86::VPMULLQZ256rr:
6593  case X86::VPMULLQZ256rrkz:
6594  case X86::VPMULLQZrm:
6595  case X86::VPMULLQZrmb:
6596  case X86::VPMULLQZrmbkz:
6597  case X86::VPMULLQZrmkz:
6598  case X86::VPMULLQZrr:
6599  case X86::VPMULLQZrrkz:
6600    return Subtarget.hasMULLQFalseDeps();
6601  // GPR
6602  case X86::POPCNT32rm:
6603  case X86::POPCNT32rr:
6604  case X86::POPCNT64rm:
6605  case X86::POPCNT64rr:
6606    return Subtarget.hasPOPCNTFalseDeps();
6607  case X86::LZCNT32rm:
6608  case X86::LZCNT32rr:
6609  case X86::LZCNT64rm:
6610  case X86::LZCNT64rr:
6611  case X86::TZCNT32rm:
6612  case X86::TZCNT32rr:
6613  case X86::TZCNT64rm:
6614  case X86::TZCNT64rr:
6615    return Subtarget.hasLZCNTFalseDeps();
6616  }
6617
6618  return false;
6619}
6620
6621/// Inform the BreakFalseDeps pass how many idle
6622/// instructions we would like before a partial register update.
6623unsigned X86InstrInfo::getPartialRegUpdateClearance(
6624    const MachineInstr &MI, unsigned OpNum,
6625    const TargetRegisterInfo *TRI) const {
6626  if (OpNum != 0 || !hasPartialRegUpdate(MI.getOpcode(), Subtarget))
6627    return 0;
6628
6629  // If MI is marked as reading Reg, the partial register update is wanted.
6630  const MachineOperand &MO = MI.getOperand(0);
6631  Register Reg = MO.getReg();
6632  if (Reg.isVirtual()) {
6633    if (MO.readsReg() || MI.readsVirtualRegister(Reg))
6634      return 0;
6635  } else {
6636    if (MI.readsRegister(Reg, TRI))
6637      return 0;
6638  }
6639
6640  // If any instructions in the clearance range are reading Reg, insert a
6641  // dependency breaking instruction, which is inexpensive and is likely to
6642  // be hidden in other instructions' cycles.
6643  return PartialRegUpdateClearance;
6644}
6645
6646// Return true for any instruction that copies the high bits of the first source
6647// operand into the unused high bits of the destination operand.
6648// Also returns true for instructions that have two inputs where one may
6649// be undef and we want it to use the same register as the other input.
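// For example (illustrative), for the SSE form
//   %xmm0 = PUNPCKHBWrr %xmm0(tied), undef %xmm1
// returning true for OpNum == 2 lets BreakFalseDeps rewrite the undef read of
// %xmm1 to %xmm0, avoiding a false dependency on whatever last wrote %xmm1.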
6650static bool hasUndefRegUpdate(unsigned Opcode, unsigned OpNum,
6651                              bool ForLoadFold = false) {
6652  // The OpNum parameter identifies the source operand being queried.
6653  switch (Opcode) {
6654  case X86::MMX_PUNPCKHBWrr:
6655  case X86::MMX_PUNPCKHWDrr:
6656  case X86::MMX_PUNPCKHDQrr:
6657  case X86::MMX_PUNPCKLBWrr:
6658  case X86::MMX_PUNPCKLWDrr:
6659  case X86::MMX_PUNPCKLDQrr:
6660  case X86::MOVHLPSrr:
6661  case X86::PACKSSWBrr:
6662  case X86::PACKUSWBrr:
6663  case X86::PACKSSDWrr:
6664  case X86::PACKUSDWrr:
6665  case X86::PUNPCKHBWrr:
6666  case X86::PUNPCKLBWrr:
6667  case X86::PUNPCKHWDrr:
6668  case X86::PUNPCKLWDrr:
6669  case X86::PUNPCKHDQrr:
6670  case X86::PUNPCKLDQrr:
6671  case X86::PUNPCKHQDQrr:
6672  case X86::PUNPCKLQDQrr:
6673  case X86::SHUFPDrri:
6674  case X86::SHUFPSrri:
6675    // These instructions are sometimes used with an undef first or second
6676    // source. Return true here so BreakFalseDeps will assign this source to the
6677    // same register as the first source to avoid a false dependency.
6678    // Operand 1 of these instructions is tied so they're separate from their
6679    // VEX counterparts.
6680    return OpNum == 2 && !ForLoadFold;
6681
6682  case X86::VMOVLHPSrr:
6683  case X86::VMOVLHPSZrr:
6684  case X86::VPACKSSWBrr:
6685  case X86::VPACKUSWBrr:
6686  case X86::VPACKSSDWrr:
6687  case X86::VPACKUSDWrr:
6688  case X86::VPACKSSWBZ128rr:
6689  case X86::VPACKUSWBZ128rr:
6690  case X86::VPACKSSDWZ128rr:
6691  case X86::VPACKUSDWZ128rr:
6692  case X86::VPERM2F128rr:
6693  case X86::VPERM2I128rr:
6694  case X86::VSHUFF32X4Z256rri:
6695  case X86::VSHUFF32X4Zrri:
6696  case X86::VSHUFF64X2Z256rri:
6697  case X86::VSHUFF64X2Zrri:
6698  case X86::VSHUFI32X4Z256rri:
6699  case X86::VSHUFI32X4Zrri:
6700  case X86::VSHUFI64X2Z256rri:
6701  case X86::VSHUFI64X2Zrri:
6702  case X86::VPUNPCKHBWrr:
6703  case X86::VPUNPCKLBWrr:
6704  case X86::VPUNPCKHBWYrr:
6705  case X86::VPUNPCKLBWYrr:
6706  case X86::VPUNPCKHBWZ128rr:
6707  case X86::VPUNPCKLBWZ128rr:
6708  case X86::VPUNPCKHBWZ256rr:
6709  case X86::VPUNPCKLBWZ256rr:
6710  case X86::VPUNPCKHBWZrr:
6711  case X86::VPUNPCKLBWZrr:
6712  case X86::VPUNPCKHWDrr:
6713  case X86::VPUNPCKLWDrr:
6714  case X86::VPUNPCKHWDYrr:
6715  case X86::VPUNPCKLWDYrr:
6716  case X86::VPUNPCKHWDZ128rr:
6717  case X86::VPUNPCKLWDZ128rr:
6718  case X86::VPUNPCKHWDZ256rr:
6719  case X86::VPUNPCKLWDZ256rr:
6720  case X86::VPUNPCKHWDZrr:
6721  case X86::VPUNPCKLWDZrr:
6722  case X86::VPUNPCKHDQrr:
6723  case X86::VPUNPCKLDQrr:
6724  case X86::VPUNPCKHDQYrr:
6725  case X86::VPUNPCKLDQYrr:
6726  case X86::VPUNPCKHDQZ128rr:
6727  case X86::VPUNPCKLDQZ128rr:
6728  case X86::VPUNPCKHDQZ256rr:
6729  case X86::VPUNPCKLDQZ256rr:
6730  case X86::VPUNPCKHDQZrr:
6731  case X86::VPUNPCKLDQZrr:
6732  case X86::VPUNPCKHQDQrr:
6733  case X86::VPUNPCKLQDQrr:
6734  case X86::VPUNPCKHQDQYrr:
6735  case X86::VPUNPCKLQDQYrr:
6736  case X86::VPUNPCKHQDQZ128rr:
6737  case X86::VPUNPCKLQDQZ128rr:
6738  case X86::VPUNPCKHQDQZ256rr:
6739  case X86::VPUNPCKLQDQZ256rr:
6740  case X86::VPUNPCKHQDQZrr:
6741  case X86::VPUNPCKLQDQZrr:
6742    // These instructions are sometimes used with an undef first or second
6743    // source. Return true here so BreakFalseDeps will assign this source to the
6744    // same register as the first source to avoid a false dependency.
6745    return (OpNum == 1 || OpNum == 2) && !ForLoadFold;
6746
6747  case X86::VCVTSI2SSrr:
6748  case X86::VCVTSI2SSrm:
6749  case X86::VCVTSI2SSrr_Int:
6750  case X86::VCVTSI2SSrm_Int:
6751  case X86::VCVTSI642SSrr:
6752  case X86::VCVTSI642SSrm:
6753  case X86::VCVTSI642SSrr_Int:
6754  case X86::VCVTSI642SSrm_Int:
6755  case X86::VCVTSI2SDrr:
6756  case X86::VCVTSI2SDrm:
6757  case X86::VCVTSI2SDrr_Int:
6758  case X86::VCVTSI2SDrm_Int:
6759  case X86::VCVTSI642SDrr:
6760  case X86::VCVTSI642SDrm:
6761  case X86::VCVTSI642SDrr_Int:
6762  case X86::VCVTSI642SDrm_Int:
6763  // AVX-512
6764  case X86::VCVTSI2SSZrr:
6765  case X86::VCVTSI2SSZrm:
6766  case X86::VCVTSI2SSZrr_Int:
6767  case X86::VCVTSI2SSZrrb_Int:
6768  case X86::VCVTSI2SSZrm_Int:
6769  case X86::VCVTSI642SSZrr:
6770  case X86::VCVTSI642SSZrm:
6771  case X86::VCVTSI642SSZrr_Int:
6772  case X86::VCVTSI642SSZrrb_Int:
6773  case X86::VCVTSI642SSZrm_Int:
6774  case X86::VCVTSI2SDZrr:
6775  case X86::VCVTSI2SDZrm:
6776  case X86::VCVTSI2SDZrr_Int:
6777  case X86::VCVTSI2SDZrm_Int:
6778  case X86::VCVTSI642SDZrr:
6779  case X86::VCVTSI642SDZrm:
6780  case X86::VCVTSI642SDZrr_Int:
6781  case X86::VCVTSI642SDZrrb_Int:
6782  case X86::VCVTSI642SDZrm_Int:
6783  case X86::VCVTUSI2SSZrr:
6784  case X86::VCVTUSI2SSZrm:
6785  case X86::VCVTUSI2SSZrr_Int:
6786  case X86::VCVTUSI2SSZrrb_Int:
6787  case X86::VCVTUSI2SSZrm_Int:
6788  case X86::VCVTUSI642SSZrr:
6789  case X86::VCVTUSI642SSZrm:
6790  case X86::VCVTUSI642SSZrr_Int:
6791  case X86::VCVTUSI642SSZrrb_Int:
6792  case X86::VCVTUSI642SSZrm_Int:
6793  case X86::VCVTUSI2SDZrr:
6794  case X86::VCVTUSI2SDZrm:
6795  case X86::VCVTUSI2SDZrr_Int:
6796  case X86::VCVTUSI2SDZrm_Int:
6797  case X86::VCVTUSI642SDZrr:
6798  case X86::VCVTUSI642SDZrm:
6799  case X86::VCVTUSI642SDZrr_Int:
6800  case X86::VCVTUSI642SDZrrb_Int:
6801  case X86::VCVTUSI642SDZrm_Int:
6802  case X86::VCVTSI2SHZrr:
6803  case X86::VCVTSI2SHZrm:
6804  case X86::VCVTSI2SHZrr_Int:
6805  case X86::VCVTSI2SHZrrb_Int:
6806  case X86::VCVTSI2SHZrm_Int:
6807  case X86::VCVTSI642SHZrr:
6808  case X86::VCVTSI642SHZrm:
6809  case X86::VCVTSI642SHZrr_Int:
6810  case X86::VCVTSI642SHZrrb_Int:
6811  case X86::VCVTSI642SHZrm_Int:
6812  case X86::VCVTUSI2SHZrr:
6813  case X86::VCVTUSI2SHZrm:
6814  case X86::VCVTUSI2SHZrr_Int:
6815  case X86::VCVTUSI2SHZrrb_Int:
6816  case X86::VCVTUSI2SHZrm_Int:
6817  case X86::VCVTUSI642SHZrr:
6818  case X86::VCVTUSI642SHZrm:
6819  case X86::VCVTUSI642SHZrr_Int:
6820  case X86::VCVTUSI642SHZrrb_Int:
6821  case X86::VCVTUSI642SHZrm_Int:
6822    // Load folding won't affect the undef register update since the input is
6823    // a GPR.
6824    return OpNum == 1 && !ForLoadFold;
6825  case X86::VCVTSD2SSrr:
6826  case X86::VCVTSD2SSrm:
6827  case X86::VCVTSD2SSrr_Int:
6828  case X86::VCVTSD2SSrm_Int:
6829  case X86::VCVTSS2SDrr:
6830  case X86::VCVTSS2SDrm:
6831  case X86::VCVTSS2SDrr_Int:
6832  case X86::VCVTSS2SDrm_Int:
6833  case X86::VRCPSSr:
6834  case X86::VRCPSSr_Int:
6835  case X86::VRCPSSm:
6836  case X86::VRCPSSm_Int:
6837  case X86::VROUNDSDr:
6838  case X86::VROUNDSDm:
6839  case X86::VROUNDSDr_Int:
6840  case X86::VROUNDSDm_Int:
6841  case X86::VROUNDSSr:
6842  case X86::VROUNDSSm:
6843  case X86::VROUNDSSr_Int:
6844  case X86::VROUNDSSm_Int:
6845  case X86::VRSQRTSSr:
6846  case X86::VRSQRTSSr_Int:
6847  case X86::VRSQRTSSm:
6848  case X86::VRSQRTSSm_Int:
6849  case X86::VSQRTSSr:
6850  case X86::VSQRTSSr_Int:
6851  case X86::VSQRTSSm:
6852  case X86::VSQRTSSm_Int:
6853  case X86::VSQRTSDr:
6854  case X86::VSQRTSDr_Int:
6855  case X86::VSQRTSDm:
6856  case X86::VSQRTSDm_Int:
6857  // AVX-512
6858  case X86::VCVTSD2SSZrr:
6859  case X86::VCVTSD2SSZrr_Int:
6860  case X86::VCVTSD2SSZrrb_Int:
6861  case X86::VCVTSD2SSZrm:
6862  case X86::VCVTSD2SSZrm_Int:
6863  case X86::VCVTSS2SDZrr:
6864  case X86::VCVTSS2SDZrr_Int:
6865  case X86::VCVTSS2SDZrrb_Int:
6866  case X86::VCVTSS2SDZrm:
6867  case X86::VCVTSS2SDZrm_Int:
6868  case X86::VGETEXPSDZr:
6869  case X86::VGETEXPSDZrb:
6870  case X86::VGETEXPSDZm:
6871  case X86::VGETEXPSSZr:
6872  case X86::VGETEXPSSZrb:
6873  case X86::VGETEXPSSZm:
6874  case X86::VGETMANTSDZrri:
6875  case X86::VGETMANTSDZrrib:
6876  case X86::VGETMANTSDZrmi:
6877  case X86::VGETMANTSSZrri:
6878  case X86::VGETMANTSSZrrib:
6879  case X86::VGETMANTSSZrmi:
6880  case X86::VRNDSCALESDZr:
6881  case X86::VRNDSCALESDZr_Int:
6882  case X86::VRNDSCALESDZrb_Int:
6883  case X86::VRNDSCALESDZm:
6884  case X86::VRNDSCALESDZm_Int:
6885  case X86::VRNDSCALESSZr:
6886  case X86::VRNDSCALESSZr_Int:
6887  case X86::VRNDSCALESSZrb_Int:
6888  case X86::VRNDSCALESSZm:
6889  case X86::VRNDSCALESSZm_Int:
6890  case X86::VRCP14SDZrr:
6891  case X86::VRCP14SDZrm:
6892  case X86::VRCP14SSZrr:
6893  case X86::VRCP14SSZrm:
6894  case X86::VRCPSHZrr:
6895  case X86::VRCPSHZrm:
6896  case X86::VRSQRTSHZrr:
6897  case X86::VRSQRTSHZrm:
6898  case X86::VREDUCESHZrmi:
6899  case X86::VREDUCESHZrri:
6900  case X86::VREDUCESHZrrib:
6901  case X86::VGETEXPSHZr:
6902  case X86::VGETEXPSHZrb:
6903  case X86::VGETEXPSHZm:
6904  case X86::VGETMANTSHZrri:
6905  case X86::VGETMANTSHZrrib:
6906  case X86::VGETMANTSHZrmi:
6907  case X86::VRNDSCALESHZr:
6908  case X86::VRNDSCALESHZr_Int:
6909  case X86::VRNDSCALESHZrb_Int:
6910  case X86::VRNDSCALESHZm:
6911  case X86::VRNDSCALESHZm_Int:
6912  case X86::VSQRTSHZr:
6913  case X86::VSQRTSHZr_Int:
6914  case X86::VSQRTSHZrb_Int:
6915  case X86::VSQRTSHZm:
6916  case X86::VSQRTSHZm_Int:
6917  case X86::VRCP28SDZr:
6918  case X86::VRCP28SDZrb:
6919  case X86::VRCP28SDZm:
6920  case X86::VRCP28SSZr:
6921  case X86::VRCP28SSZrb:
6922  case X86::VRCP28SSZm:
6923  case X86::VREDUCESSZrmi:
6924  case X86::VREDUCESSZrri:
6925  case X86::VREDUCESSZrrib:
6926  case X86::VRSQRT14SDZrr:
6927  case X86::VRSQRT14SDZrm:
6928  case X86::VRSQRT14SSZrr:
6929  case X86::VRSQRT14SSZrm:
6930  case X86::VRSQRT28SDZr:
6931  case X86::VRSQRT28SDZrb:
6932  case X86::VRSQRT28SDZm:
6933  case X86::VRSQRT28SSZr:
6934  case X86::VRSQRT28SSZrb:
6935  case X86::VRSQRT28SSZm:
6936  case X86::VSQRTSSZr:
6937  case X86::VSQRTSSZr_Int:
6938  case X86::VSQRTSSZrb_Int:
6939  case X86::VSQRTSSZm:
6940  case X86::VSQRTSSZm_Int:
6941  case X86::VSQRTSDZr:
6942  case X86::VSQRTSDZr_Int:
6943  case X86::VSQRTSDZrb_Int:
6944  case X86::VSQRTSDZm:
6945  case X86::VSQRTSDZm_Int:
6946  case X86::VCVTSD2SHZrr:
6947  case X86::VCVTSD2SHZrr_Int:
6948  case X86::VCVTSD2SHZrrb_Int:
6949  case X86::VCVTSD2SHZrm:
6950  case X86::VCVTSD2SHZrm_Int:
6951  case X86::VCVTSS2SHZrr:
6952  case X86::VCVTSS2SHZrr_Int:
6953  case X86::VCVTSS2SHZrrb_Int:
6954  case X86::VCVTSS2SHZrm:
6955  case X86::VCVTSS2SHZrm_Int:
6956  case X86::VCVTSH2SDZrr:
6957  case X86::VCVTSH2SDZrr_Int:
6958  case X86::VCVTSH2SDZrrb_Int:
6959  case X86::VCVTSH2SDZrm:
6960  case X86::VCVTSH2SDZrm_Int:
6961  case X86::VCVTSH2SSZrr:
6962  case X86::VCVTSH2SSZrr_Int:
6963  case X86::VCVTSH2SSZrrb_Int:
6964  case X86::VCVTSH2SSZrm:
6965  case X86::VCVTSH2SSZrm_Int:
6966    return OpNum == 1;
6967  case X86::VMOVSSZrrk:
6968  case X86::VMOVSDZrrk:
6969    return OpNum == 3 && !ForLoadFold;
6970  case X86::VMOVSSZrrkz:
6971  case X86::VMOVSDZrrkz:
6972    return OpNum == 2 && !ForLoadFold;
6973  }
6974
6975  return false;
6976}
6977
6978/// Inform the BreakFalseDeps pass how many idle instructions we would like
6979/// before certain undef register reads.
6980///
6981/// This catches the VCVTSI2SD family of instructions:
6982///
6983/// vcvtsi2sdq %rax, undef %xmm0, %xmm14
6984///
6985/// We should be careful *not* to catch VXOR idioms which are presumably
6986/// handled specially in the pipeline:
6987///
6988/// vxorps undef %xmm1, undef %xmm1, %xmm1
6989///
6990/// Like getPartialRegUpdateClearance, this makes a strong assumption that the
6991/// high bits that are passed-through are not live.
6992unsigned
6993X86InstrInfo::getUndefRegClearance(const MachineInstr &MI, unsigned OpNum,
6994                                   const TargetRegisterInfo *TRI) const {
6995  const MachineOperand &MO = MI.getOperand(OpNum);
6996  if (MO.getReg().isPhysical() && hasUndefRegUpdate(MI.getOpcode(), OpNum))
6997    return UndefRegClearance;
6998
6999  return 0;
7000}
7001
7002void X86InstrInfo::breakPartialRegDependency(
7003    MachineInstr &MI, unsigned OpNum, const TargetRegisterInfo *TRI) const {
7004  Register Reg = MI.getOperand(OpNum).getReg();
7005  // If MI kills this register, the false dependence is already broken.
7006  if (MI.killsRegister(Reg, TRI))
7007    return;
7008
7009  if (X86::VR128RegClass.contains(Reg)) {
7010    // These instructions are all floating point domain, so xorps is the best
7011    // choice.
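    // Illustrative result: an "xorps %xmm0, %xmm0" (or vxorps with AVX) is
    // inserted immediately before MI, so MI's partial write to %xmm0 no longer
    // depends on whatever wrote %xmm0 last.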
7012    unsigned Opc = Subtarget.hasAVX() ? X86::VXORPSrr : X86::XORPSrr;
7013    BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(Opc), Reg)
7014        .addReg(Reg, RegState::Undef)
7015        .addReg(Reg, RegState::Undef);
7016    MI.addRegisterKilled(Reg, TRI, true);
7017  } else if (X86::VR256RegClass.contains(Reg)) {
7018    // Use vxorps to clear the full ymm register.
7019    // It wants to read and write the xmm sub-register.
7020    Register XReg = TRI->getSubReg(Reg, X86::sub_xmm);
7021    BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(X86::VXORPSrr), XReg)
7022        .addReg(XReg, RegState::Undef)
7023        .addReg(XReg, RegState::Undef)
7024        .addReg(Reg, RegState::ImplicitDefine);
7025    MI.addRegisterKilled(Reg, TRI, true);
7026  } else if (X86::VR128XRegClass.contains(Reg)) {
7027    // Only handle VLX targets.
7028    if (!Subtarget.hasVLX())
7029      return;
7030    // Since vxorps requires AVX512DQ, vpxord should be the best choice.
7031    BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(X86::VPXORDZ128rr), Reg)
7032        .addReg(Reg, RegState::Undef)
7033        .addReg(Reg, RegState::Undef);
7034    MI.addRegisterKilled(Reg, TRI, true);
7035  } else if (X86::VR256XRegClass.contains(Reg) ||
7036             X86::VR512RegClass.contains(Reg)) {
7037    // Only handle VLX targets.
7038    if (!Subtarget.hasVLX())
7039      return;
7040    // Use vpxord to clear the full ymm/zmm register.
7041    // It wants to read and write the xmm sub-register.
7042    Register XReg = TRI->getSubReg(Reg, X86::sub_xmm);
7043    BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(X86::VPXORDZ128rr), XReg)
7044        .addReg(XReg, RegState::Undef)
7045        .addReg(XReg, RegState::Undef)
7046        .addReg(Reg, RegState::ImplicitDefine);
7047    MI.addRegisterKilled(Reg, TRI, true);
7048  } else if (X86::GR64RegClass.contains(Reg)) {
7049    // Use XOR32rr because it has a shorter encoding and also zeroes the upper
7050    // 32 bits.
7051    Register XReg = TRI->getSubReg(Reg, X86::sub_32bit);
7052    BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(X86::XOR32rr), XReg)
7053        .addReg(XReg, RegState::Undef)
7054        .addReg(XReg, RegState::Undef)
7055        .addReg(Reg, RegState::ImplicitDefine);
7056    MI.addRegisterKilled(Reg, TRI, true);
7057  } else if (X86::GR32RegClass.contains(Reg)) {
7058    BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(X86::XOR32rr), Reg)
7059        .addReg(Reg, RegState::Undef)
7060        .addReg(Reg, RegState::Undef);
7061    MI.addRegisterKilled(Reg, TRI, true);
7062  }
7063}
7064
7065static void addOperands(MachineInstrBuilder &MIB, ArrayRef<MachineOperand> MOs,
7066                        int PtrOffset = 0) {
7067  unsigned NumAddrOps = MOs.size();
7068
7069  if (NumAddrOps < 4) {
7070    // FrameIndex only - add an immediate offset (whether it's zero or not).
7071    for (unsigned i = 0; i != NumAddrOps; ++i)
7072      MIB.add(MOs[i]);
7073    addOffset(MIB, PtrOffset);
7074  } else {
7075    // General Memory Addressing - we need to add any offset to an existing
7076    // offset.
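    // An X86 memory reference is five operands: base, scale, index,
    // displacement and segment; the displacement is operand 3 (X86::AddrDisp),
    // which is where any extra pointer offset must be folded.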
7077    assert(MOs.size() == 5 && "Unexpected memory operand list length");
7078    for (unsigned i = 0; i != NumAddrOps; ++i) {
7079      const MachineOperand &MO = MOs[i];
7080      if (i == 3 && PtrOffset != 0) {
7081        MIB.addDisp(MO, PtrOffset);
7082      } else {
7083        MIB.add(MO);
7084      }
7085    }
7086  }
7087}
7088
7089static void updateOperandRegConstraints(MachineFunction &MF,
7090                                        MachineInstr &NewMI,
7091                                        const TargetInstrInfo &TII) {
7092  MachineRegisterInfo &MRI = MF.getRegInfo();
7093  const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo();
7094
7095  for (int Idx : llvm::seq<int>(0, NewMI.getNumOperands())) {
7096    MachineOperand &MO = NewMI.getOperand(Idx);
7097    // We only need to update constraints on virtual register operands.
7098    if (!MO.isReg())
7099      continue;
7100    Register Reg = MO.getReg();
7101    if (!Reg.isVirtual())
7102      continue;
7103
7104    auto *NewRC = MRI.constrainRegClass(
7105        Reg, TII.getRegClass(NewMI.getDesc(), Idx, &TRI, MF));
7106    if (!NewRC) {
7107      LLVM_DEBUG(
7108          dbgs() << "WARNING: Unable to update register constraint for operand "
7109                 << Idx << " of instruction:\n";
7110          NewMI.dump(); dbgs() << "\n");
7111    }
7112  }
7113}
7114
7115static MachineInstr *FuseTwoAddrInst(MachineFunction &MF, unsigned Opcode,
7116                                     ArrayRef<MachineOperand> MOs,
7117                                     MachineBasicBlock::iterator InsertPt,
7118                                     MachineInstr &MI,
7119                                     const TargetInstrInfo &TII) {
7120  // Create the base instruction with the memory operand as the first part.
7121  // Omit the implicit operands, something BuildMI can't do.
7122  MachineInstr *NewMI =
7123      MF.CreateMachineInstr(TII.get(Opcode), MI.getDebugLoc(), true);
7124  MachineInstrBuilder MIB(MF, NewMI);
7125  addOperands(MIB, MOs);
7126
7127  // Loop over the rest of the register/immediate operands, converting them over.
7128  unsigned NumOps = MI.getDesc().getNumOperands() - 2;
7129  for (unsigned i = 0; i != NumOps; ++i) {
7130    MachineOperand &MO = MI.getOperand(i + 2);
7131    MIB.add(MO);
7132  }
7133  for (const MachineOperand &MO : llvm::drop_begin(MI.operands(), NumOps + 2))
7134    MIB.add(MO);
7135
7136  updateOperandRegConstraints(MF, *NewMI, TII);
7137
7138  MachineBasicBlock *MBB = InsertPt->getParent();
7139  MBB->insert(InsertPt, NewMI);
7140
7141  return MIB;
7142}
7143
7144static MachineInstr *FuseInst(MachineFunction &MF, unsigned Opcode,
7145                              unsigned OpNo, ArrayRef<MachineOperand> MOs,
7146                              MachineBasicBlock::iterator InsertPt,
7147                              MachineInstr &MI, const TargetInstrInfo &TII,
7148                              int PtrOffset = 0) {
7149  // Omit the implicit operands, something BuildMI can't do.
7150  MachineInstr *NewMI =
7151      MF.CreateMachineInstr(TII.get(Opcode), MI.getDebugLoc(), true);
7152  MachineInstrBuilder MIB(MF, NewMI);
7153
7154  for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
7155    MachineOperand &MO = MI.getOperand(i);
7156    if (i == OpNo) {
7157      assert(MO.isReg() && "Expected to fold into reg operand!");
7158      addOperands(MIB, MOs, PtrOffset);
7159    } else {
7160      MIB.add(MO);
7161    }
7162  }
7163
7164  updateOperandRegConstraints(MF, *NewMI, TII);
7165
7166  // Copy the NoFPExcept flag from the instruction we're fusing.
7167  if (MI.getFlag(MachineInstr::MIFlag::NoFPExcept))
7168    NewMI->setFlag(MachineInstr::MIFlag::NoFPExcept);
7169
7170  MachineBasicBlock *MBB = InsertPt->getParent();
7171  MBB->insert(InsertPt, NewMI);
7172
7173  return MIB;
7174}
7175
7176static MachineInstr *MakeM0Inst(const TargetInstrInfo &TII, unsigned Opcode,
7177                                ArrayRef<MachineOperand> MOs,
7178                                MachineBasicBlock::iterator InsertPt,
7179                                MachineInstr &MI) {
7180  MachineInstrBuilder MIB = BuildMI(*InsertPt->getParent(), InsertPt,
7181                                    MI.getDebugLoc(), TII.get(Opcode));
7182  addOperands(MIB, MOs);
7183  return MIB.addImm(0);
7184}
7185
7186MachineInstr *X86InstrInfo::foldMemoryOperandCustom(
7187    MachineFunction &MF, MachineInstr &MI, unsigned OpNum,
7188    ArrayRef<MachineOperand> MOs, MachineBasicBlock::iterator InsertPt,
7189    unsigned Size, Align Alignment) const {
7190  switch (MI.getOpcode()) {
7191  case X86::INSERTPSrr:
7192  case X86::VINSERTPSrr:
7193  case X86::VINSERTPSZrr:
7194    // Attempt to convert a load of the inserted vector into a folded load
7195    // of a single float.
7196    if (OpNum == 2) {
7197      unsigned Imm = MI.getOperand(MI.getNumOperands() - 1).getImm();
7198      unsigned ZMask = Imm & 15;
7199      unsigned DstIdx = (Imm >> 4) & 3;
7200      unsigned SrcIdx = (Imm >> 6) & 3;
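      // insertps immediate layout: bits [7:6] select the source element,
      // bits [5:4] select the destination element, bits [3:0] are the zero
      // mask. Folding the load replaces the source element with a scalar load
      // from SrcIdx * 4 bytes into the original memory operand.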
7201
7202      const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
7203      const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum, &RI, MF);
7204      unsigned RCSize = TRI.getRegSizeInBits(*RC) / 8;
7205      if ((Size == 0 || Size >= 16) && RCSize >= 16 &&
7206          (MI.getOpcode() != X86::INSERTPSrr || Alignment >= Align(4))) {
7207        int PtrOffset = SrcIdx * 4;
7208        unsigned NewImm = (DstIdx << 4) | ZMask;
7209        unsigned NewOpCode =
7210            (MI.getOpcode() == X86::VINSERTPSZrr)  ? X86::VINSERTPSZrm
7211            : (MI.getOpcode() == X86::VINSERTPSrr) ? X86::VINSERTPSrm
7212                                                   : X86::INSERTPSrm;
7213        MachineInstr *NewMI =
7214            FuseInst(MF, NewOpCode, OpNum, MOs, InsertPt, MI, *this, PtrOffset);
7215        NewMI->getOperand(NewMI->getNumOperands() - 1).setImm(NewImm);
7216        return NewMI;
7217      }
7218    }
7219    break;
7220  case X86::MOVHLPSrr:
7221  case X86::VMOVHLPSrr:
7222  case X86::VMOVHLPSZrr:
7223    // Move the upper 64 bits of the second operand to the lower 64 bits.
7224    // To fold the load, adjust the pointer to the upper half and use (V)MOVLPS.
7225    // TODO: In most cases AVX doesn't have an 8-byte alignment requirement.
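    // Illustrative transform: if %xmm1 is reloaded from <mem>,
    //   movhlps %xmm1, %xmm0   -->   movlps 8(<mem>), %xmm0
    // i.e. the folded form reads the upper 64 bits directly from memory.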
7226    if (OpNum == 2) {
7227      const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
7228      const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum, &RI, MF);
7229      unsigned RCSize = TRI.getRegSizeInBits(*RC) / 8;
7230      if ((Size == 0 || Size >= 16) && RCSize >= 16 && Alignment >= Align(8)) {
7231        unsigned NewOpCode =
7232            (MI.getOpcode() == X86::VMOVHLPSZrr)  ? X86::VMOVLPSZ128rm
7233            : (MI.getOpcode() == X86::VMOVHLPSrr) ? X86::VMOVLPSrm
7234                                                  : X86::MOVLPSrm;
7235        MachineInstr *NewMI =
7236            FuseInst(MF, NewOpCode, OpNum, MOs, InsertPt, MI, *this, 8);
7237        return NewMI;
7238      }
7239    }
7240    break;
7241  case X86::UNPCKLPDrr:
7242    // If we won't be able to fold this to the memory form of UNPCKL, use
7243    // MOVHPD instead. Done as custom because we can't have this in the load
7244    // table twice.
7245    if (OpNum == 2) {
7246      const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
7247      const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum, &RI, MF);
7248      unsigned RCSize = TRI.getRegSizeInBits(*RC) / 8;
7249      if ((Size == 0 || Size >= 16) && RCSize >= 16 && Alignment < Align(16)) {
7250        MachineInstr *NewMI =
7251            FuseInst(MF, X86::MOVHPDrm, OpNum, MOs, InsertPt, MI, *this);
7252        return NewMI;
7253      }
7254    }
7255    break;
7256  }
7257
7258  return nullptr;
7259}
7260
7261static bool shouldPreventUndefRegUpdateMemFold(MachineFunction &MF,
7262                                               MachineInstr &MI) {
7263  if (!hasUndefRegUpdate(MI.getOpcode(), 1, /*ForLoadFold*/ true) ||
7264      !MI.getOperand(1).isReg())
7265    return false;
7266
7267  // There are two cases we need to handle depending on where in the pipeline
7268  // the folding attempt is being made:
7269  //  - the register has the undef flag set, or
7270  //  - the register is produced by an IMPLICIT_DEF instruction.
7271
7272  if (MI.getOperand(1).isUndef())
7273    return true;
7274
7275  MachineRegisterInfo &RegInfo = MF.getRegInfo();
7276  MachineInstr *VRegDef = RegInfo.getUniqueVRegDef(MI.getOperand(1).getReg());
7277  return VRegDef && VRegDef->isImplicitDef();
7278}
7279
7280MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
7281    MachineFunction &MF, MachineInstr &MI, unsigned OpNum,
7282    ArrayRef<MachineOperand> MOs, MachineBasicBlock::iterator InsertPt,
7283    unsigned Size, Align Alignment, bool AllowCommute) const {
7284  bool isSlowTwoMemOps = Subtarget.slowTwoMemOps();
7285  bool isTwoAddrFold = false;
7286
7287  // For CPUs that favor the register form of a call or push,
7288  // do not fold loads into calls or pushes, unless optimizing for size
7289  // aggressively.
7290  if (isSlowTwoMemOps && !MF.getFunction().hasMinSize() &&
7291      (MI.getOpcode() == X86::CALL32r || MI.getOpcode() == X86::CALL64r ||
7292       MI.getOpcode() == X86::PUSH16r || MI.getOpcode() == X86::PUSH32r ||
7293       MI.getOpcode() == X86::PUSH64r))
7294    return nullptr;
7295
7296  // Avoid partial and undef register update stalls unless optimizing for size.
7297  if (!MF.getFunction().hasOptSize() &&
7298      (hasPartialRegUpdate(MI.getOpcode(), Subtarget, /*ForLoadFold*/ true) ||
7299       shouldPreventUndefRegUpdateMemFold(MF, MI)))
7300    return nullptr;
7301
7302  unsigned NumOps = MI.getDesc().getNumOperands();
7303  bool isTwoAddr =
7304      NumOps > 1 && MI.getDesc().getOperandConstraint(1, MCOI::TIED_TO) != -1;
7305
7306  // FIXME: AsmPrinter doesn't know how to handle
7307  // X86II::MO_GOT_ABSOLUTE_ADDRESS after folding.
7308  if (MI.getOpcode() == X86::ADD32ri &&
7309      MI.getOperand(2).getTargetFlags() == X86II::MO_GOT_ABSOLUTE_ADDRESS)
7310    return nullptr;
7311
7312  // GOTTPOFF relocation loads can only be folded into add instructions.
7313  // FIXME: Need to exclude other relocations that only support specific
7314  // instructions.
7315  if (MOs.size() == X86::AddrNumOperands &&
7316      MOs[X86::AddrDisp].getTargetFlags() == X86II::MO_GOTTPOFF &&
7317      MI.getOpcode() != X86::ADD64rr)
7318    return nullptr;
7319
7320  // Don't fold loads into indirect calls that need a KCFI check as we'll
7321  // have to unfold these in X86TargetLowering::EmitKCFICheck anyway.
7322  if (MI.isCall() && MI.getCFIType())
7323    return nullptr;
7324
7325  MachineInstr *NewMI = nullptr;
7326
7327  // Attempt to fold any custom cases we have.
7328  if (MachineInstr *CustomMI = foldMemoryOperandCustom(
7329          MF, MI, OpNum, MOs, InsertPt, Size, Alignment))
7330    return CustomMI;
7331
7332  const X86FoldTableEntry *I = nullptr;
7333
7334  // Folding a memory location into the two-address part of a two-address
7335  // instruction is different from folding it in other places.  It requires
7336  // replacing the *two* registers with the memory location.
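  // For example (illustrative, operands abbreviated), when %x is spilled:
  //   %x = ADD32rr %x(tied), %y   -->   ADD32mr <fi>, %y
  // i.e. the folded form both loads and stores the stack slot.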
7337  if (isTwoAddr && NumOps >= 2 && OpNum < 2 && MI.getOperand(0).isReg() &&
7338      MI.getOperand(1).isReg() &&
7339      MI.getOperand(0).getReg() == MI.getOperand(1).getReg()) {
7340    I = lookupTwoAddrFoldTable(MI.getOpcode());
7341    isTwoAddrFold = true;
7342  } else {
7343    if (OpNum == 0) {
7344      if (MI.getOpcode() == X86::MOV32r0) {
7345        NewMI = MakeM0Inst(*this, X86::MOV32mi, MOs, InsertPt, MI);
7346        if (NewMI)
7347          return NewMI;
7348      }
7349    }
7350
7351    I = lookupFoldTable(MI.getOpcode(), OpNum);
7352  }
7353
7354  if (I != nullptr) {
7355    unsigned Opcode = I->DstOp;
7356    bool FoldedLoad =
7357        isTwoAddrFold || (OpNum == 0 && I->Flags & TB_FOLDED_LOAD) || OpNum > 0;
7358    bool FoldedStore =
7359        isTwoAddrFold || (OpNum == 0 && I->Flags & TB_FOLDED_STORE);
7360    if (Alignment <
7361        Align(1ULL << ((I->Flags & TB_ALIGN_MASK) >> TB_ALIGN_SHIFT)))
7362      return nullptr;
7363    bool NarrowToMOV32rm = false;
7364    if (Size) {
7365      const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
7366      const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum, &RI, MF);
7367      unsigned RCSize = TRI.getRegSizeInBits(*RC) / 8;
7368      // Check if it's safe to fold the load. If the size of the object is
7369      // narrower than the load width, then it's not.
7370      // FIXME: Allow scalar intrinsic instructions like ADDSSrm_Int.
7371      if (FoldedLoad && Size < RCSize) {
7372        // If this is a 64-bit load, but the spill slot is only 32 bits, then we
7373        // can do a 32-bit load which is implicitly zero-extended. This likely is
7374        // due to live interval analysis remat'ing a load from stack slot.
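        // E.g. (illustrative) folding a reload of a 4-byte slot that would
        // otherwise need MOV64rm uses MOV32rm instead, relying on the implicit
        // zero-extension of 32-bit register writes on x86-64.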
7375        if (Opcode != X86::MOV64rm || RCSize != 8 || Size != 4)
7376          return nullptr;
7377        if (MI.getOperand(0).getSubReg() || MI.getOperand(1).getSubReg())
7378          return nullptr;
7379        Opcode = X86::MOV32rm;
7380        NarrowToMOV32rm = true;
7381      }
7382      // For stores, make sure the size of the object is equal to the size of
7383      // the store. If the object is larger, the extra bits would be garbage. If
7384      // the object is smaller we might overwrite another object or fault.
7385      if (FoldedStore && Size != RCSize)
7386        return nullptr;
7387    }
7388
7389    if (isTwoAddrFold)
7390      NewMI = FuseTwoAddrInst(MF, Opcode, MOs, InsertPt, MI, *this);
7391    else
7392      NewMI = FuseInst(MF, Opcode, OpNum, MOs, InsertPt, MI, *this);
7393
7394    if (NarrowToMOV32rm) {
7395      // This is the special case where we use a MOV32rm to load a 32-bit value
7396      // and implicitly zero-extend the top bits; change the destination register
7397      // to a 32-bit one.
7398      Register DstReg = NewMI->getOperand(0).getReg();
7399      if (DstReg.isPhysical())
7400        NewMI->getOperand(0).setReg(RI.getSubReg(DstReg, X86::sub_32bit));
7401      else
7402        NewMI->getOperand(0).setSubReg(X86::sub_32bit);
7403    }
7404    return NewMI;
7405  }
7406
7407  // If the instruction and target operand are commutable, commute the
7408  // instruction and try again.
7409  if (AllowCommute) {
7410    unsigned CommuteOpIdx1 = OpNum, CommuteOpIdx2 = CommuteAnyOperandIndex;
7411    if (findCommutedOpIndices(MI, CommuteOpIdx1, CommuteOpIdx2)) {
7412      bool HasDef = MI.getDesc().getNumDefs();
7413      Register Reg0 = HasDef ? MI.getOperand(0).getReg() : Register();
7414      Register Reg1 = MI.getOperand(CommuteOpIdx1).getReg();
7415      Register Reg2 = MI.getOperand(CommuteOpIdx2).getReg();
7416      bool Tied1 =
7417          0 == MI.getDesc().getOperandConstraint(CommuteOpIdx1, MCOI::TIED_TO);
7418      bool Tied2 =
7419          0 == MI.getDesc().getOperandConstraint(CommuteOpIdx2, MCOI::TIED_TO);
7420
7421      // If either of the commutable operands are tied to the destination
7422      // then we can not commute + fold.
7423      if ((HasDef && Reg0 == Reg1 && Tied1) ||
7424          (HasDef && Reg0 == Reg2 && Tied2))
7425        return nullptr;
7426
7427      MachineInstr *CommutedMI =
7428          commuteInstruction(MI, false, CommuteOpIdx1, CommuteOpIdx2);
7429      if (!CommutedMI) {
7430        // Unable to commute.
7431        return nullptr;
7432      }
7433      if (CommutedMI != &MI) {
7434        // New instruction. We can't fold from this.
7435        CommutedMI->eraseFromParent();
7436        return nullptr;
7437      }
7438
7439      // Attempt to fold with the commuted version of the instruction.
7440      NewMI = foldMemoryOperandImpl(MF, MI, CommuteOpIdx2, MOs, InsertPt, Size,
7441                                    Alignment, /*AllowCommute=*/false);
7442      if (NewMI)
7443        return NewMI;
7444
7445      // Folding failed again - undo the commute before returning.
7446      MachineInstr *UncommutedMI =
7447          commuteInstruction(MI, false, CommuteOpIdx1, CommuteOpIdx2);
7448      if (!UncommutedMI) {
7449        // Unable to commute.
7450        return nullptr;
7451      }
7452      if (UncommutedMI != &MI) {
7453        // New instruction. It doesn't need to be kept.
7454        UncommutedMI->eraseFromParent();
7455        return nullptr;
7456      }
7457
7458      // Return here to prevent duplicate fuse failure report.
7459      return nullptr;
7460    }
7461  }
7462
7463  // No fusion
7464  if (PrintFailedFusing && !MI.isCopy())
7465    dbgs() << "We failed to fuse operand " << OpNum << " in " << MI;
7466  return nullptr;
7467}
7468
7469MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
7470    MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
7471    MachineBasicBlock::iterator InsertPt, int FrameIndex, LiveIntervals *LIS,
7472    VirtRegMap *VRM) const {
7473  // Check switch flag
7474  if (NoFusing)
7475    return nullptr;
7476
7477  // Avoid partial and undef register update stalls unless optimizing for size.
7478  if (!MF.getFunction().hasOptSize() &&
7479      (hasPartialRegUpdate(MI.getOpcode(), Subtarget, /*ForLoadFold*/ true) ||
7480       shouldPreventUndefRegUpdateMemFold(MF, MI)))
7481    return nullptr;
7482
7483  // Don't fold subreg spills, or reloads that use a high subreg.
7484  for (auto Op : Ops) {
7485    MachineOperand &MO = MI.getOperand(Op);
7486    auto SubReg = MO.getSubReg();
7487    if (SubReg && (MO.isDef() || SubReg == X86::sub_8bit_hi))
7488      return nullptr;
7489  }
7490
7491  const MachineFrameInfo &MFI = MF.getFrameInfo();
7492  unsigned Size = MFI.getObjectSize(FrameIndex);
7493  Align Alignment = MFI.getObjectAlign(FrameIndex);
7494  // If the function stack isn't realigned we don't want to fold instructions
7495  // that need increased alignment.
7496  if (!RI.hasStackRealignment(MF))
7497    Alignment =
7498        std::min(Alignment, Subtarget.getFrameLowering()->getStackAlign());
7499  if (Ops.size() == 2 && Ops[0] == 0 && Ops[1] == 1) {
7500    unsigned NewOpc = 0;
7501    unsigned RCSize = 0;
7502    switch (MI.getOpcode()) {
7503    default:
7504      return nullptr;
7505    case X86::TEST8rr:
7506      NewOpc = X86::CMP8ri;
7507      RCSize = 1;
7508      break;
7509    case X86::TEST16rr:
7510      NewOpc = X86::CMP16ri;
7511      RCSize = 2;
7512      break;
7513    case X86::TEST32rr:
7514      NewOpc = X86::CMP32ri;
7515      RCSize = 4;
7516      break;
7517    case X86::TEST64rr:
7518      NewOpc = X86::CMP64ri32;
7519      RCSize = 8;
7520      break;
7521    }
7522    // Check if it's safe to fold the load. If the size of the object is
7523    // narrower than the load width, then it's not.
7524    if (Size < RCSize)
7525      return nullptr;
7526    // Change to CMPXXri r, 0 first.
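    // Illustrative overall effect when %x lives in the stack slot:
    //   TEST32rr %x, %x   -->   CMP32mi <fi>, 0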
7527    MI.setDesc(get(NewOpc));
7528    MI.getOperand(1).ChangeToImmediate(0);
7529  } else if (Ops.size() != 1)
7530    return nullptr;
7531
7532  return foldMemoryOperandImpl(MF, MI, Ops[0],
7533                               MachineOperand::CreateFI(FrameIndex), InsertPt,
7534                               Size, Alignment, /*AllowCommute=*/true);
7535}
7536
7537/// Check if \p LoadMI is a partial register load that we can't fold into \p MI
7538/// because the latter uses contents that wouldn't be defined in the folded
7539/// version.  For instance, this transformation isn't legal:
7540///   movss (%rdi), %xmm0
7541///   addps %xmm0, %xmm0
7542/// ->
7543///   addps (%rdi), %xmm0
7544///
7545/// But this one is:
7546///   movss (%rdi), %xmm0
7547///   addss %xmm0, %xmm0
7548/// ->
7549///   addss (%rdi), %xmm0
7550///
7551static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI,
7552                                             const MachineInstr &UserMI,
7553                                             const MachineFunction &MF) {
7554  unsigned Opc = LoadMI.getOpcode();
7555  unsigned UserOpc = UserMI.getOpcode();
7556  const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
7557  const TargetRegisterClass *RC =
7558      MF.getRegInfo().getRegClass(LoadMI.getOperand(0).getReg());
7559  unsigned RegSize = TRI.getRegSizeInBits(*RC);
7560
7561  if ((Opc == X86::MOVSSrm || Opc == X86::VMOVSSrm || Opc == X86::VMOVSSZrm ||
7562       Opc == X86::MOVSSrm_alt || Opc == X86::VMOVSSrm_alt ||
7563       Opc == X86::VMOVSSZrm_alt) &&
7564      RegSize > 32) {
7565    // These instructions only load 32 bits, we can't fold them if the
7566    // destination register is wider than 32 bits (4 bytes), and its user
7567    // instruction isn't scalar (SS).
7568    switch (UserOpc) {
7569    case X86::CVTSS2SDrr_Int:
7570    case X86::VCVTSS2SDrr_Int:
7571    case X86::VCVTSS2SDZrr_Int:
7572    case X86::VCVTSS2SDZrr_Intk:
7573    case X86::VCVTSS2SDZrr_Intkz:
7574    case X86::CVTSS2SIrr_Int:
7575    case X86::CVTSS2SI64rr_Int:
7576    case X86::VCVTSS2SIrr_Int:
7577    case X86::VCVTSS2SI64rr_Int:
7578    case X86::VCVTSS2SIZrr_Int:
7579    case X86::VCVTSS2SI64Zrr_Int:
7580    case X86::CVTTSS2SIrr_Int:
7581    case X86::CVTTSS2SI64rr_Int:
7582    case X86::VCVTTSS2SIrr_Int:
7583    case X86::VCVTTSS2SI64rr_Int:
7584    case X86::VCVTTSS2SIZrr_Int:
7585    case X86::VCVTTSS2SI64Zrr_Int:
7586    case X86::VCVTSS2USIZrr_Int:
7587    case X86::VCVTSS2USI64Zrr_Int:
7588    case X86::VCVTTSS2USIZrr_Int:
7589    case X86::VCVTTSS2USI64Zrr_Int:
7590    case X86::RCPSSr_Int:
7591    case X86::VRCPSSr_Int:
7592    case X86::RSQRTSSr_Int:
7593    case X86::VRSQRTSSr_Int:
7594    case X86::ROUNDSSr_Int:
7595    case X86::VROUNDSSr_Int:
7596    case X86::COMISSrr_Int:
7597    case X86::VCOMISSrr_Int:
7598    case X86::VCOMISSZrr_Int:
7599    case X86::UCOMISSrr_Int:
7600    case X86::VUCOMISSrr_Int:
7601    case X86::VUCOMISSZrr_Int:
7602    case X86::ADDSSrr_Int:
7603    case X86::VADDSSrr_Int:
7604    case X86::VADDSSZrr_Int:
7605    case X86::CMPSSrr_Int:
7606    case X86::VCMPSSrr_Int:
7607    case X86::VCMPSSZrr_Int:
7608    case X86::DIVSSrr_Int:
7609    case X86::VDIVSSrr_Int:
7610    case X86::VDIVSSZrr_Int:
7611    case X86::MAXSSrr_Int:
7612    case X86::VMAXSSrr_Int:
7613    case X86::VMAXSSZrr_Int:
7614    case X86::MINSSrr_Int:
7615    case X86::VMINSSrr_Int:
7616    case X86::VMINSSZrr_Int:
7617    case X86::MULSSrr_Int:
7618    case X86::VMULSSrr_Int:
7619    case X86::VMULSSZrr_Int:
7620    case X86::SQRTSSr_Int:
7621    case X86::VSQRTSSr_Int:
7622    case X86::VSQRTSSZr_Int:
7623    case X86::SUBSSrr_Int:
7624    case X86::VSUBSSrr_Int:
7625    case X86::VSUBSSZrr_Int:
7626    case X86::VADDSSZrr_Intk:
7627    case X86::VADDSSZrr_Intkz:
7628    case X86::VCMPSSZrr_Intk:
7629    case X86::VDIVSSZrr_Intk:
7630    case X86::VDIVSSZrr_Intkz:
7631    case X86::VMAXSSZrr_Intk:
7632    case X86::VMAXSSZrr_Intkz:
7633    case X86::VMINSSZrr_Intk:
7634    case X86::VMINSSZrr_Intkz:
7635    case X86::VMULSSZrr_Intk:
7636    case X86::VMULSSZrr_Intkz:
7637    case X86::VSQRTSSZr_Intk:
7638    case X86::VSQRTSSZr_Intkz:
7639    case X86::VSUBSSZrr_Intk:
7640    case X86::VSUBSSZrr_Intkz:
7641    case X86::VFMADDSS4rr_Int:
7642    case X86::VFNMADDSS4rr_Int:
7643    case X86::VFMSUBSS4rr_Int:
7644    case X86::VFNMSUBSS4rr_Int:
7645    case X86::VFMADD132SSr_Int:
7646    case X86::VFNMADD132SSr_Int:
7647    case X86::VFMADD213SSr_Int:
7648    case X86::VFNMADD213SSr_Int:
7649    case X86::VFMADD231SSr_Int:
7650    case X86::VFNMADD231SSr_Int:
7651    case X86::VFMSUB132SSr_Int:
7652    case X86::VFNMSUB132SSr_Int:
7653    case X86::VFMSUB213SSr_Int:
7654    case X86::VFNMSUB213SSr_Int:
7655    case X86::VFMSUB231SSr_Int:
7656    case X86::VFNMSUB231SSr_Int:
7657    case X86::VFMADD132SSZr_Int:
7658    case X86::VFNMADD132SSZr_Int:
7659    case X86::VFMADD213SSZr_Int:
7660    case X86::VFNMADD213SSZr_Int:
7661    case X86::VFMADD231SSZr_Int:
7662    case X86::VFNMADD231SSZr_Int:
7663    case X86::VFMSUB132SSZr_Int:
7664    case X86::VFNMSUB132SSZr_Int:
7665    case X86::VFMSUB213SSZr_Int:
7666    case X86::VFNMSUB213SSZr_Int:
7667    case X86::VFMSUB231SSZr_Int:
7668    case X86::VFNMSUB231SSZr_Int:
7669    case X86::VFMADD132SSZr_Intk:
7670    case X86::VFNMADD132SSZr_Intk:
7671    case X86::VFMADD213SSZr_Intk:
7672    case X86::VFNMADD213SSZr_Intk:
7673    case X86::VFMADD231SSZr_Intk:
7674    case X86::VFNMADD231SSZr_Intk:
7675    case X86::VFMSUB132SSZr_Intk:
7676    case X86::VFNMSUB132SSZr_Intk:
7677    case X86::VFMSUB213SSZr_Intk:
7678    case X86::VFNMSUB213SSZr_Intk:
7679    case X86::VFMSUB231SSZr_Intk:
7680    case X86::VFNMSUB231SSZr_Intk:
7681    case X86::VFMADD132SSZr_Intkz:
7682    case X86::VFNMADD132SSZr_Intkz:
7683    case X86::VFMADD213SSZr_Intkz:
7684    case X86::VFNMADD213SSZr_Intkz:
7685    case X86::VFMADD231SSZr_Intkz:
7686    case X86::VFNMADD231SSZr_Intkz:
7687    case X86::VFMSUB132SSZr_Intkz:
7688    case X86::VFNMSUB132SSZr_Intkz:
7689    case X86::VFMSUB213SSZr_Intkz:
7690    case X86::VFNMSUB213SSZr_Intkz:
7691    case X86::VFMSUB231SSZr_Intkz:
7692    case X86::VFNMSUB231SSZr_Intkz:
7693    case X86::VFIXUPIMMSSZrri:
7694    case X86::VFIXUPIMMSSZrrik:
7695    case X86::VFIXUPIMMSSZrrikz:
7696    case X86::VFPCLASSSSZrr:
7697    case X86::VFPCLASSSSZrrk:
7698    case X86::VGETEXPSSZr:
7699    case X86::VGETEXPSSZrk:
7700    case X86::VGETEXPSSZrkz:
7701    case X86::VGETMANTSSZrri:
7702    case X86::VGETMANTSSZrrik:
7703    case X86::VGETMANTSSZrrikz:
7704    case X86::VRANGESSZrri:
7705    case X86::VRANGESSZrrik:
7706    case X86::VRANGESSZrrikz:
7707    case X86::VRCP14SSZrr:
7708    case X86::VRCP14SSZrrk:
7709    case X86::VRCP14SSZrrkz:
7710    case X86::VRCP28SSZr:
7711    case X86::VRCP28SSZrk:
7712    case X86::VRCP28SSZrkz:
7713    case X86::VREDUCESSZrri:
7714    case X86::VREDUCESSZrrik:
7715    case X86::VREDUCESSZrrikz:
7716    case X86::VRNDSCALESSZr_Int:
7717    case X86::VRNDSCALESSZr_Intk:
7718    case X86::VRNDSCALESSZr_Intkz:
7719    case X86::VRSQRT14SSZrr:
7720    case X86::VRSQRT14SSZrrk:
7721    case X86::VRSQRT14SSZrrkz:
7722    case X86::VRSQRT28SSZr:
7723    case X86::VRSQRT28SSZrk:
7724    case X86::VRSQRT28SSZrkz:
7725    case X86::VSCALEFSSZrr:
7726    case X86::VSCALEFSSZrrk:
7727    case X86::VSCALEFSSZrrkz:
7728      return false;
7729    default:
7730      return true;
7731    }
7732  }
7733
7734  if ((Opc == X86::MOVSDrm || Opc == X86::VMOVSDrm || Opc == X86::VMOVSDZrm ||
7735       Opc == X86::MOVSDrm_alt || Opc == X86::VMOVSDrm_alt ||
7736       Opc == X86::VMOVSDZrm_alt) &&
7737      RegSize > 64) {
    // These instructions only load 64 bits; we can't fold them if the
    // destination register is wider than 64 bits (8 bytes) and the user
    // instruction isn't scalar (SD).
7741    switch (UserOpc) {
7742    case X86::CVTSD2SSrr_Int:
7743    case X86::VCVTSD2SSrr_Int:
7744    case X86::VCVTSD2SSZrr_Int:
7745    case X86::VCVTSD2SSZrr_Intk:
7746    case X86::VCVTSD2SSZrr_Intkz:
7747    case X86::CVTSD2SIrr_Int:
7748    case X86::CVTSD2SI64rr_Int:
7749    case X86::VCVTSD2SIrr_Int:
7750    case X86::VCVTSD2SI64rr_Int:
7751    case X86::VCVTSD2SIZrr_Int:
7752    case X86::VCVTSD2SI64Zrr_Int:
7753    case X86::CVTTSD2SIrr_Int:
7754    case X86::CVTTSD2SI64rr_Int:
7755    case X86::VCVTTSD2SIrr_Int:
7756    case X86::VCVTTSD2SI64rr_Int:
7757    case X86::VCVTTSD2SIZrr_Int:
7758    case X86::VCVTTSD2SI64Zrr_Int:
7759    case X86::VCVTSD2USIZrr_Int:
7760    case X86::VCVTSD2USI64Zrr_Int:
7761    case X86::VCVTTSD2USIZrr_Int:
7762    case X86::VCVTTSD2USI64Zrr_Int:
7763    case X86::ROUNDSDr_Int:
7764    case X86::VROUNDSDr_Int:
7765    case X86::COMISDrr_Int:
7766    case X86::VCOMISDrr_Int:
7767    case X86::VCOMISDZrr_Int:
7768    case X86::UCOMISDrr_Int:
7769    case X86::VUCOMISDrr_Int:
7770    case X86::VUCOMISDZrr_Int:
7771    case X86::ADDSDrr_Int:
7772    case X86::VADDSDrr_Int:
7773    case X86::VADDSDZrr_Int:
7774    case X86::CMPSDrr_Int:
7775    case X86::VCMPSDrr_Int:
7776    case X86::VCMPSDZrr_Int:
7777    case X86::DIVSDrr_Int:
7778    case X86::VDIVSDrr_Int:
7779    case X86::VDIVSDZrr_Int:
7780    case X86::MAXSDrr_Int:
7781    case X86::VMAXSDrr_Int:
7782    case X86::VMAXSDZrr_Int:
7783    case X86::MINSDrr_Int:
7784    case X86::VMINSDrr_Int:
7785    case X86::VMINSDZrr_Int:
7786    case X86::MULSDrr_Int:
7787    case X86::VMULSDrr_Int:
7788    case X86::VMULSDZrr_Int:
7789    case X86::SQRTSDr_Int:
7790    case X86::VSQRTSDr_Int:
7791    case X86::VSQRTSDZr_Int:
7792    case X86::SUBSDrr_Int:
7793    case X86::VSUBSDrr_Int:
7794    case X86::VSUBSDZrr_Int:
7795    case X86::VADDSDZrr_Intk:
7796    case X86::VADDSDZrr_Intkz:
7797    case X86::VCMPSDZrr_Intk:
7798    case X86::VDIVSDZrr_Intk:
7799    case X86::VDIVSDZrr_Intkz:
7800    case X86::VMAXSDZrr_Intk:
7801    case X86::VMAXSDZrr_Intkz:
7802    case X86::VMINSDZrr_Intk:
7803    case X86::VMINSDZrr_Intkz:
7804    case X86::VMULSDZrr_Intk:
7805    case X86::VMULSDZrr_Intkz:
7806    case X86::VSQRTSDZr_Intk:
7807    case X86::VSQRTSDZr_Intkz:
7808    case X86::VSUBSDZrr_Intk:
7809    case X86::VSUBSDZrr_Intkz:
7810    case X86::VFMADDSD4rr_Int:
7811    case X86::VFNMADDSD4rr_Int:
7812    case X86::VFMSUBSD4rr_Int:
7813    case X86::VFNMSUBSD4rr_Int:
7814    case X86::VFMADD132SDr_Int:
7815    case X86::VFNMADD132SDr_Int:
7816    case X86::VFMADD213SDr_Int:
7817    case X86::VFNMADD213SDr_Int:
7818    case X86::VFMADD231SDr_Int:
7819    case X86::VFNMADD231SDr_Int:
7820    case X86::VFMSUB132SDr_Int:
7821    case X86::VFNMSUB132SDr_Int:
7822    case X86::VFMSUB213SDr_Int:
7823    case X86::VFNMSUB213SDr_Int:
7824    case X86::VFMSUB231SDr_Int:
7825    case X86::VFNMSUB231SDr_Int:
7826    case X86::VFMADD132SDZr_Int:
7827    case X86::VFNMADD132SDZr_Int:
7828    case X86::VFMADD213SDZr_Int:
7829    case X86::VFNMADD213SDZr_Int:
7830    case X86::VFMADD231SDZr_Int:
7831    case X86::VFNMADD231SDZr_Int:
7832    case X86::VFMSUB132SDZr_Int:
7833    case X86::VFNMSUB132SDZr_Int:
7834    case X86::VFMSUB213SDZr_Int:
7835    case X86::VFNMSUB213SDZr_Int:
7836    case X86::VFMSUB231SDZr_Int:
7837    case X86::VFNMSUB231SDZr_Int:
7838    case X86::VFMADD132SDZr_Intk:
7839    case X86::VFNMADD132SDZr_Intk:
7840    case X86::VFMADD213SDZr_Intk:
7841    case X86::VFNMADD213SDZr_Intk:
7842    case X86::VFMADD231SDZr_Intk:
7843    case X86::VFNMADD231SDZr_Intk:
7844    case X86::VFMSUB132SDZr_Intk:
7845    case X86::VFNMSUB132SDZr_Intk:
7846    case X86::VFMSUB213SDZr_Intk:
7847    case X86::VFNMSUB213SDZr_Intk:
7848    case X86::VFMSUB231SDZr_Intk:
7849    case X86::VFNMSUB231SDZr_Intk:
7850    case X86::VFMADD132SDZr_Intkz:
7851    case X86::VFNMADD132SDZr_Intkz:
7852    case X86::VFMADD213SDZr_Intkz:
7853    case X86::VFNMADD213SDZr_Intkz:
7854    case X86::VFMADD231SDZr_Intkz:
7855    case X86::VFNMADD231SDZr_Intkz:
7856    case X86::VFMSUB132SDZr_Intkz:
7857    case X86::VFNMSUB132SDZr_Intkz:
7858    case X86::VFMSUB213SDZr_Intkz:
7859    case X86::VFNMSUB213SDZr_Intkz:
7860    case X86::VFMSUB231SDZr_Intkz:
7861    case X86::VFNMSUB231SDZr_Intkz:
7862    case X86::VFIXUPIMMSDZrri:
7863    case X86::VFIXUPIMMSDZrrik:
7864    case X86::VFIXUPIMMSDZrrikz:
7865    case X86::VFPCLASSSDZrr:
7866    case X86::VFPCLASSSDZrrk:
7867    case X86::VGETEXPSDZr:
7868    case X86::VGETEXPSDZrk:
7869    case X86::VGETEXPSDZrkz:
7870    case X86::VGETMANTSDZrri:
7871    case X86::VGETMANTSDZrrik:
7872    case X86::VGETMANTSDZrrikz:
7873    case X86::VRANGESDZrri:
7874    case X86::VRANGESDZrrik:
7875    case X86::VRANGESDZrrikz:
7876    case X86::VRCP14SDZrr:
7877    case X86::VRCP14SDZrrk:
7878    case X86::VRCP14SDZrrkz:
7879    case X86::VRCP28SDZr:
7880    case X86::VRCP28SDZrk:
7881    case X86::VRCP28SDZrkz:
7882    case X86::VREDUCESDZrri:
7883    case X86::VREDUCESDZrrik:
7884    case X86::VREDUCESDZrrikz:
7885    case X86::VRNDSCALESDZr_Int:
7886    case X86::VRNDSCALESDZr_Intk:
7887    case X86::VRNDSCALESDZr_Intkz:
7888    case X86::VRSQRT14SDZrr:
7889    case X86::VRSQRT14SDZrrk:
7890    case X86::VRSQRT14SDZrrkz:
7891    case X86::VRSQRT28SDZr:
7892    case X86::VRSQRT28SDZrk:
7893    case X86::VRSQRT28SDZrkz:
7894    case X86::VSCALEFSDZrr:
7895    case X86::VSCALEFSDZrrk:
7896    case X86::VSCALEFSDZrrkz:
7897      return false;
7898    default:
7899      return true;
7900    }
7901  }
7902
7903  if ((Opc == X86::VMOVSHZrm || Opc == X86::VMOVSHZrm_alt) && RegSize > 16) {
    // These instructions only load 16 bits; we can't fold them if the
    // destination register is wider than 16 bits (2 bytes) and the user
    // instruction isn't scalar (SH).
7907    switch (UserOpc) {
7908    case X86::VADDSHZrr_Int:
7909    case X86::VCMPSHZrr_Int:
7910    case X86::VDIVSHZrr_Int:
7911    case X86::VMAXSHZrr_Int:
7912    case X86::VMINSHZrr_Int:
7913    case X86::VMULSHZrr_Int:
7914    case X86::VSUBSHZrr_Int:
7915    case X86::VADDSHZrr_Intk:
7916    case X86::VADDSHZrr_Intkz:
7917    case X86::VCMPSHZrr_Intk:
7918    case X86::VDIVSHZrr_Intk:
7919    case X86::VDIVSHZrr_Intkz:
7920    case X86::VMAXSHZrr_Intk:
7921    case X86::VMAXSHZrr_Intkz:
7922    case X86::VMINSHZrr_Intk:
7923    case X86::VMINSHZrr_Intkz:
7924    case X86::VMULSHZrr_Intk:
7925    case X86::VMULSHZrr_Intkz:
7926    case X86::VSUBSHZrr_Intk:
7927    case X86::VSUBSHZrr_Intkz:
7928    case X86::VFMADD132SHZr_Int:
7929    case X86::VFNMADD132SHZr_Int:
7930    case X86::VFMADD213SHZr_Int:
7931    case X86::VFNMADD213SHZr_Int:
7932    case X86::VFMADD231SHZr_Int:
7933    case X86::VFNMADD231SHZr_Int:
7934    case X86::VFMSUB132SHZr_Int:
7935    case X86::VFNMSUB132SHZr_Int:
7936    case X86::VFMSUB213SHZr_Int:
7937    case X86::VFNMSUB213SHZr_Int:
7938    case X86::VFMSUB231SHZr_Int:
7939    case X86::VFNMSUB231SHZr_Int:
7940    case X86::VFMADD132SHZr_Intk:
7941    case X86::VFNMADD132SHZr_Intk:
7942    case X86::VFMADD213SHZr_Intk:
7943    case X86::VFNMADD213SHZr_Intk:
7944    case X86::VFMADD231SHZr_Intk:
7945    case X86::VFNMADD231SHZr_Intk:
7946    case X86::VFMSUB132SHZr_Intk:
7947    case X86::VFNMSUB132SHZr_Intk:
7948    case X86::VFMSUB213SHZr_Intk:
7949    case X86::VFNMSUB213SHZr_Intk:
7950    case X86::VFMSUB231SHZr_Intk:
7951    case X86::VFNMSUB231SHZr_Intk:
7952    case X86::VFMADD132SHZr_Intkz:
7953    case X86::VFNMADD132SHZr_Intkz:
7954    case X86::VFMADD213SHZr_Intkz:
7955    case X86::VFNMADD213SHZr_Intkz:
7956    case X86::VFMADD231SHZr_Intkz:
7957    case X86::VFNMADD231SHZr_Intkz:
7958    case X86::VFMSUB132SHZr_Intkz:
7959    case X86::VFNMSUB132SHZr_Intkz:
7960    case X86::VFMSUB213SHZr_Intkz:
7961    case X86::VFNMSUB213SHZr_Intkz:
7962    case X86::VFMSUB231SHZr_Intkz:
7963    case X86::VFNMSUB231SHZr_Intkz:
7964      return false;
7965    default:
7966      return true;
7967    }
7968  }
7969
7970  return false;
7971}
7972
7973MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
7974    MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
7975    MachineBasicBlock::iterator InsertPt, MachineInstr &LoadMI,
7976    LiveIntervals *LIS) const {
7977
7978  // TODO: Support the case where LoadMI loads a wide register, but MI
7979  // only uses a subreg.
7980  for (auto Op : Ops) {
7981    if (MI.getOperand(Op).getSubReg())
7982      return nullptr;
7983  }
7984
7985  // If loading from a FrameIndex, fold directly from the FrameIndex.
7986  unsigned NumOps = LoadMI.getDesc().getNumOperands();
7987  int FrameIndex;
7988  if (isLoadFromStackSlot(LoadMI, FrameIndex)) {
7989    if (isNonFoldablePartialRegisterLoad(LoadMI, MI, MF))
7990      return nullptr;
7991    return foldMemoryOperandImpl(MF, MI, Ops, InsertPt, FrameIndex, LIS);
7992  }
7993
7994  // Check switch flag
7995  if (NoFusing)
7996    return nullptr;
7997
7998  // Avoid partial and undef register update stalls unless optimizing for size.
7999  if (!MF.getFunction().hasOptSize() &&
8000      (hasPartialRegUpdate(MI.getOpcode(), Subtarget, /*ForLoadFold*/ true) ||
8001       shouldPreventUndefRegUpdateMemFold(MF, MI)))
8002    return nullptr;
8003
8004  // Determine the alignment of the load.
8005  Align Alignment;
8006  if (LoadMI.hasOneMemOperand())
8007    Alignment = (*LoadMI.memoperands_begin())->getAlign();
8008  else
8009    switch (LoadMI.getOpcode()) {
8010    case X86::AVX512_512_SET0:
8011    case X86::AVX512_512_SETALLONES:
8012      Alignment = Align(64);
8013      break;
8014    case X86::AVX2_SETALLONES:
8015    case X86::AVX1_SETALLONES:
8016    case X86::AVX_SET0:
8017    case X86::AVX512_256_SET0:
8018      Alignment = Align(32);
8019      break;
8020    case X86::V_SET0:
8021    case X86::V_SETALLONES:
8022    case X86::AVX512_128_SET0:
8023    case X86::FsFLD0F128:
8024    case X86::AVX512_FsFLD0F128:
8025      Alignment = Align(16);
8026      break;
8027    case X86::MMX_SET0:
8028    case X86::FsFLD0SD:
8029    case X86::AVX512_FsFLD0SD:
8030      Alignment = Align(8);
8031      break;
8032    case X86::FsFLD0SS:
8033    case X86::AVX512_FsFLD0SS:
8034      Alignment = Align(4);
8035      break;
8036    case X86::FsFLD0SH:
8037    case X86::AVX512_FsFLD0SH:
8038      Alignment = Align(2);
8039      break;
8040    default:
8041      return nullptr;
8042    }
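  // If the caller asked to fold both register operands (Ops == {0, 1}), only
  // a TESTrr of a register against itself is handled: rewrite it as a compare
  // against zero first, e.g.
  //   TEST32rr %reg, %reg  -->  CMP32ri %reg, 0
  // so that only a single register operand remains to be replaced by memory.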
8043  if (Ops.size() == 2 && Ops[0] == 0 && Ops[1] == 1) {
8044    unsigned NewOpc = 0;
8045    switch (MI.getOpcode()) {
8046    default:
8047      return nullptr;
8048    case X86::TEST8rr:
8049      NewOpc = X86::CMP8ri;
8050      break;
8051    case X86::TEST16rr:
8052      NewOpc = X86::CMP16ri;
8053      break;
8054    case X86::TEST32rr:
8055      NewOpc = X86::CMP32ri;
8056      break;
8057    case X86::TEST64rr:
8058      NewOpc = X86::CMP64ri32;
8059      break;
8060    }
8061    // Change to CMPXXri r, 0 first.
8062    MI.setDesc(get(NewOpc));
8063    MI.getOperand(1).ChangeToImmediate(0);
8064  } else if (Ops.size() != 1)
8065    return nullptr;
8066
8067  // Make sure the subregisters match.
8068  // Otherwise we risk changing the size of the load.
8069  if (LoadMI.getOperand(0).getSubReg() != MI.getOperand(Ops[0]).getSubReg())
8070    return nullptr;
8071
8072  SmallVector<MachineOperand, X86::AddrNumOperands> MOs;
8073  switch (LoadMI.getOpcode()) {
8074  case X86::MMX_SET0:
8075  case X86::V_SET0:
8076  case X86::V_SETALLONES:
8077  case X86::AVX2_SETALLONES:
8078  case X86::AVX1_SETALLONES:
8079  case X86::AVX_SET0:
8080  case X86::AVX512_128_SET0:
8081  case X86::AVX512_256_SET0:
8082  case X86::AVX512_512_SET0:
8083  case X86::AVX512_512_SETALLONES:
8084  case X86::FsFLD0SH:
8085  case X86::AVX512_FsFLD0SH:
8086  case X86::FsFLD0SD:
8087  case X86::AVX512_FsFLD0SD:
8088  case X86::FsFLD0SS:
8089  case X86::AVX512_FsFLD0SS:
8090  case X86::FsFLD0F128:
8091  case X86::AVX512_FsFLD0F128: {
8092    // Folding a V_SET0 or V_SETALLONES as a load, to ease register pressure.
8093    // Create a constant-pool entry and operands to load from it.
8094
8095    // Large code model can't fold loads this way.
8096    if (MF.getTarget().getCodeModel() == CodeModel::Large)
8097      return nullptr;
8098
8099    // x86-32 PIC requires a PIC base register for constant pools.
8100    unsigned PICBase = 0;
8101    // Since we're using Small or Kernel code model, we can always use
8102    // RIP-relative addressing for a smaller encoding.
8103    if (Subtarget.is64Bit()) {
8104      PICBase = X86::RIP;
8105    } else if (MF.getTarget().isPositionIndependent()) {
8106      // FIXME: PICBase = getGlobalBaseReg(&MF);
8107      // This doesn't work for several reasons.
8108      // 1. GlobalBaseReg may have been spilled.
8109      // 2. It may not be live at MI.
8110      return nullptr;
8111    }
8112
8113    // Create a constant-pool entry.
8114    MachineConstantPool &MCP = *MF.getConstantPool();
8115    Type *Ty;
8116    unsigned Opc = LoadMI.getOpcode();
8117    if (Opc == X86::FsFLD0SS || Opc == X86::AVX512_FsFLD0SS)
8118      Ty = Type::getFloatTy(MF.getFunction().getContext());
8119    else if (Opc == X86::FsFLD0SD || Opc == X86::AVX512_FsFLD0SD)
8120      Ty = Type::getDoubleTy(MF.getFunction().getContext());
8121    else if (Opc == X86::FsFLD0F128 || Opc == X86::AVX512_FsFLD0F128)
8122      Ty = Type::getFP128Ty(MF.getFunction().getContext());
8123    else if (Opc == X86::FsFLD0SH || Opc == X86::AVX512_FsFLD0SH)
8124      Ty = Type::getHalfTy(MF.getFunction().getContext());
8125    else if (Opc == X86::AVX512_512_SET0 || Opc == X86::AVX512_512_SETALLONES)
8126      Ty = FixedVectorType::get(Type::getInt32Ty(MF.getFunction().getContext()),
8127                                16);
8128    else if (Opc == X86::AVX2_SETALLONES || Opc == X86::AVX_SET0 ||
8129             Opc == X86::AVX512_256_SET0 || Opc == X86::AVX1_SETALLONES)
8130      Ty = FixedVectorType::get(Type::getInt32Ty(MF.getFunction().getContext()),
8131                                8);
8132    else if (Opc == X86::MMX_SET0)
8133      Ty = FixedVectorType::get(Type::getInt32Ty(MF.getFunction().getContext()),
8134                                2);
8135    else
8136      Ty = FixedVectorType::get(Type::getInt32Ty(MF.getFunction().getContext()),
8137                                4);
8138
8139    bool IsAllOnes =
8140        (Opc == X86::V_SETALLONES || Opc == X86::AVX2_SETALLONES ||
8141         Opc == X86::AVX512_512_SETALLONES || Opc == X86::AVX1_SETALLONES);
8142    const Constant *C =
8143        IsAllOnes ? Constant::getAllOnesValue(Ty) : Constant::getNullValue(Ty);
8144    unsigned CPI = MCP.getConstantPoolIndex(C, Alignment);
8145
8146    // Create operands to load from the constant pool entry.
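    // The five operands form a standard X86 memory reference: base = PICBase,
    // scale = 1, no index register, displacement = the constant-pool index,
    // and no segment register.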
8147    MOs.push_back(MachineOperand::CreateReg(PICBase, false));
8148    MOs.push_back(MachineOperand::CreateImm(1));
8149    MOs.push_back(MachineOperand::CreateReg(0, false));
8150    MOs.push_back(MachineOperand::CreateCPI(CPI, 0));
8151    MOs.push_back(MachineOperand::CreateReg(0, false));
8152    break;
8153  }
8154  default: {
8155    if (isNonFoldablePartialRegisterLoad(LoadMI, MI, MF))
8156      return nullptr;
8157
8158    // Folding a normal load. Just copy the load's address operands.
8159    MOs.append(LoadMI.operands_begin() + NumOps - X86::AddrNumOperands,
8160               LoadMI.operands_begin() + NumOps);
8161    break;
8162  }
8163  }
8164  return foldMemoryOperandImpl(MF, MI, Ops[0], MOs, InsertPt,
8165                               /*Size=*/0, Alignment, /*AllowCommute=*/true);
8166}
8167
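// Collect the memory operands from \p MMOs that describe a load. A combined
// load/store memory operand (e.g. from a folded load-op-store instruction) is
// cloned with its store flag cleared so it can be attached to the stand-alone
// load produced by unfolding.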
8168static SmallVector<MachineMemOperand *, 2>
8169extractLoadMMOs(ArrayRef<MachineMemOperand *> MMOs, MachineFunction &MF) {
8170  SmallVector<MachineMemOperand *, 2> LoadMMOs;
8171
8172  for (MachineMemOperand *MMO : MMOs) {
8173    if (!MMO->isLoad())
8174      continue;
8175
8176    if (!MMO->isStore()) {
8177      // Reuse the MMO.
8178      LoadMMOs.push_back(MMO);
8179    } else {
8180      // Clone the MMO and unset the store flag.
8181      LoadMMOs.push_back(MF.getMachineMemOperand(
8182          MMO, MMO->getFlags() & ~MachineMemOperand::MOStore));
8183    }
8184  }
8185
8186  return LoadMMOs;
8187}
8188
8189static SmallVector<MachineMemOperand *, 2>
8190extractStoreMMOs(ArrayRef<MachineMemOperand *> MMOs, MachineFunction &MF) {
8191  SmallVector<MachineMemOperand *, 2> StoreMMOs;
8192
8193  for (MachineMemOperand *MMO : MMOs) {
8194    if (!MMO->isStore())
8195      continue;
8196
8197    if (!MMO->isLoad()) {
8198      // Reuse the MMO.
8199      StoreMMOs.push_back(MMO);
8200    } else {
8201      // Clone the MMO and unset the load flag.
8202      StoreMMOs.push_back(MF.getMachineMemOperand(
8203          MMO, MMO->getFlags() & ~MachineMemOperand::MOLoad));
8204    }
8205  }
8206
8207  return StoreMMOs;
8208}
8209
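// Pick the broadcast-from-memory opcode to use when unfolding a folded
// broadcast load, based on the broadcast element type recorded in the
// fold-table flags and the spill size of the destination register class.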
8210static unsigned getBroadcastOpcode(const X86FoldTableEntry *I,
8211                                   const TargetRegisterClass *RC,
8212                                   const X86Subtarget &STI) {
8213  assert(STI.hasAVX512() && "Expected at least AVX512!");
8214  unsigned SpillSize = STI.getRegisterInfo()->getSpillSize(*RC);
8215  assert((SpillSize == 64 || STI.hasVLX()) &&
8216         "Can't broadcast less than 64 bytes without AVX512VL!");
8217
8218  switch (I->Flags & TB_BCAST_MASK) {
8219  default:
8220    llvm_unreachable("Unexpected broadcast type!");
8221  case TB_BCAST_D:
8222    switch (SpillSize) {
8223    default:
8224      llvm_unreachable("Unknown spill size");
8225    case 16:
8226      return X86::VPBROADCASTDZ128rm;
8227    case 32:
8228      return X86::VPBROADCASTDZ256rm;
8229    case 64:
8230      return X86::VPBROADCASTDZrm;
8231    }
8232    break;
8233  case TB_BCAST_Q:
8234    switch (SpillSize) {
8235    default:
8236      llvm_unreachable("Unknown spill size");
8237    case 16:
8238      return X86::VPBROADCASTQZ128rm;
8239    case 32:
8240      return X86::VPBROADCASTQZ256rm;
8241    case 64:
8242      return X86::VPBROADCASTQZrm;
8243    }
8244    break;
8245  case TB_BCAST_SS:
8246    switch (SpillSize) {
8247    default:
8248      llvm_unreachable("Unknown spill size");
8249    case 16:
8250      return X86::VBROADCASTSSZ128rm;
8251    case 32:
8252      return X86::VBROADCASTSSZ256rm;
8253    case 64:
8254      return X86::VBROADCASTSSZrm;
8255    }
8256    break;
8257  case TB_BCAST_SD:
8258    switch (SpillSize) {
8259    default:
8260      llvm_unreachable("Unknown spill size");
8261    case 16:
8262      return X86::VMOVDDUPZ128rm;
8263    case 32:
8264      return X86::VBROADCASTSDZ256rm;
8265    case 64:
8266      return X86::VBROADCASTSDZrm;
8267    }
8268    break;
8269  }
8270}
8271
8272bool X86InstrInfo::unfoldMemoryOperand(
8273    MachineFunction &MF, MachineInstr &MI, unsigned Reg, bool UnfoldLoad,
8274    bool UnfoldStore, SmallVectorImpl<MachineInstr *> &NewMIs) const {
8275  const X86FoldTableEntry *I = lookupUnfoldTable(MI.getOpcode());
8276  if (I == nullptr)
8277    return false;
8278  unsigned Opc = I->DstOp;
8279  unsigned Index = I->Flags & TB_INDEX_MASK;
8280  bool FoldedLoad = I->Flags & TB_FOLDED_LOAD;
8281  bool FoldedStore = I->Flags & TB_FOLDED_STORE;
8282  bool FoldedBCast = I->Flags & TB_FOLDED_BCAST;
8283  if (UnfoldLoad && !FoldedLoad)
8284    return false;
8285  UnfoldLoad &= FoldedLoad;
8286  if (UnfoldStore && !FoldedStore)
8287    return false;
8288  UnfoldStore &= FoldedStore;
8289
8290  const MCInstrDesc &MCID = get(Opc);
8291
8292  const TargetRegisterClass *RC = getRegClass(MCID, Index, &RI, MF);
8293  const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
8294  // TODO: Check if 32-byte or greater accesses are slow too?
8295  if (!MI.hasOneMemOperand() && RC == &X86::VR128RegClass &&
8296      Subtarget.isUnalignedMem16Slow())
8297    // Without memoperands, loadRegFromAddr and storeRegToStackSlot will
8298    // conservatively assume the address is unaligned. That's bad for
8299    // performance.
8300    return false;
8301  SmallVector<MachineOperand, X86::AddrNumOperands> AddrOps;
8302  SmallVector<MachineOperand, 2> BeforeOps;
8303  SmallVector<MachineOperand, 2> AfterOps;
8304  SmallVector<MachineOperand, 4> ImpOps;
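  // Partition MI's operands: the memory reference being unfolded (AddrOps),
  // implicit register operands, and the explicit operands that appear before
  // and after the memory reference.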
8305  for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
8306    MachineOperand &Op = MI.getOperand(i);
8307    if (i >= Index && i < Index + X86::AddrNumOperands)
8308      AddrOps.push_back(Op);
8309    else if (Op.isReg() && Op.isImplicit())
8310      ImpOps.push_back(Op);
8311    else if (i < Index)
8312      BeforeOps.push_back(Op);
8313    else if (i > Index)
8314      AfterOps.push_back(Op);
8315  }
8316
8317  // Emit the load or broadcast instruction.
8318  if (UnfoldLoad) {
8319    auto MMOs = extractLoadMMOs(MI.memoperands(), MF);
8320
8321    unsigned Opc;
8322    if (FoldedBCast) {
8323      Opc = getBroadcastOpcode(I, RC, Subtarget);
8324    } else {
8325      unsigned Alignment = std::max<uint32_t>(TRI.getSpillSize(*RC), 16);
8326      bool isAligned = !MMOs.empty() && MMOs.front()->getAlign() >= Alignment;
8327      Opc = getLoadRegOpcode(Reg, RC, isAligned, Subtarget);
8328    }
8329
8330    DebugLoc DL;
8331    MachineInstrBuilder MIB = BuildMI(MF, DL, get(Opc), Reg);
8332    for (const MachineOperand &AddrOp : AddrOps)
8333      MIB.add(AddrOp);
8334    MIB.setMemRefs(MMOs);
8335    NewMIs.push_back(MIB);
8336
8337    if (UnfoldStore) {
8338      // Address operands cannot be marked isKill.
8339      for (unsigned i = 1; i != 1 + X86::AddrNumOperands; ++i) {
8340        MachineOperand &MO = NewMIs[0]->getOperand(i);
8341        if (MO.isReg())
8342          MO.setIsKill(false);
8343      }
8344    }
8345  }
8346
8347  // Emit the data processing instruction.
8348  MachineInstr *DataMI = MF.CreateMachineInstr(MCID, MI.getDebugLoc(), true);
8349  MachineInstrBuilder MIB(MF, DataMI);
8350
8351  if (FoldedStore)
8352    MIB.addReg(Reg, RegState::Define);
8353  for (MachineOperand &BeforeOp : BeforeOps)
8354    MIB.add(BeforeOp);
8355  if (FoldedLoad)
8356    MIB.addReg(Reg);
8357  for (MachineOperand &AfterOp : AfterOps)
8358    MIB.add(AfterOp);
8359  for (MachineOperand &ImpOp : ImpOps) {
8360    MIB.addReg(ImpOp.getReg(), getDefRegState(ImpOp.isDef()) |
8361                                   RegState::Implicit |
8362                                   getKillRegState(ImpOp.isKill()) |
8363                                   getDeadRegState(ImpOp.isDead()) |
8364                                   getUndefRegState(ImpOp.isUndef()));
8365  }
8366  // Change CMP32ri r, 0 back to TEST32rr r, r, etc.
8367  switch (DataMI->getOpcode()) {
8368  default:
8369    break;
8370  case X86::CMP64ri32:
8371  case X86::CMP32ri:
8372  case X86::CMP16ri:
8373  case X86::CMP8ri: {
8374    MachineOperand &MO0 = DataMI->getOperand(0);
8375    MachineOperand &MO1 = DataMI->getOperand(1);
8376    if (MO1.isImm() && MO1.getImm() == 0) {
8377      unsigned NewOpc;
8378      switch (DataMI->getOpcode()) {
8379      default:
8380        llvm_unreachable("Unreachable!");
8381      case X86::CMP64ri32:
8382        NewOpc = X86::TEST64rr;
8383        break;
8384      case X86::CMP32ri:
8385        NewOpc = X86::TEST32rr;
8386        break;
8387      case X86::CMP16ri:
8388        NewOpc = X86::TEST16rr;
8389        break;
8390      case X86::CMP8ri:
8391        NewOpc = X86::TEST8rr;
8392        break;
8393      }
8394      DataMI->setDesc(get(NewOpc));
8395      MO1.ChangeToRegister(MO0.getReg(), false);
8396    }
8397  }
8398  }
8399  NewMIs.push_back(DataMI);
8400
8401  // Emit the store instruction.
8402  if (UnfoldStore) {
8403    const TargetRegisterClass *DstRC = getRegClass(MCID, 0, &RI, MF);
8404    auto MMOs = extractStoreMMOs(MI.memoperands(), MF);
8405    unsigned Alignment = std::max<uint32_t>(TRI.getSpillSize(*DstRC), 16);
8406    bool isAligned = !MMOs.empty() && MMOs.front()->getAlign() >= Alignment;
8407    unsigned Opc = getStoreRegOpcode(Reg, DstRC, isAligned, Subtarget);
8408    DebugLoc DL;
8409    MachineInstrBuilder MIB = BuildMI(MF, DL, get(Opc));
8410    for (const MachineOperand &AddrOp : AddrOps)
8411      MIB.add(AddrOp);
8412    MIB.addReg(Reg, RegState::Kill);
8413    MIB.setMemRefs(MMOs);
8414    NewMIs.push_back(MIB);
8415  }
8416
8417  return true;
8418}
8419
8420bool X86InstrInfo::unfoldMemoryOperand(
8421    SelectionDAG &DAG, SDNode *N, SmallVectorImpl<SDNode *> &NewNodes) const {
8422  if (!N->isMachineOpcode())
8423    return false;
8424
8425  const X86FoldTableEntry *I = lookupUnfoldTable(N->getMachineOpcode());
8426  if (I == nullptr)
8427    return false;
8428  unsigned Opc = I->DstOp;
8429  unsigned Index = I->Flags & TB_INDEX_MASK;
8430  bool FoldedLoad = I->Flags & TB_FOLDED_LOAD;
8431  bool FoldedStore = I->Flags & TB_FOLDED_STORE;
8432  bool FoldedBCast = I->Flags & TB_FOLDED_BCAST;
8433  const MCInstrDesc &MCID = get(Opc);
8434  MachineFunction &MF = DAG.getMachineFunction();
8435  const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
8436  const TargetRegisterClass *RC = getRegClass(MCID, Index, &RI, MF);
8437  unsigned NumDefs = MCID.NumDefs;
8438  std::vector<SDValue> AddrOps;
8439  std::vector<SDValue> BeforeOps;
8440  std::vector<SDValue> AfterOps;
8441  SDLoc dl(N);
8442  unsigned NumOps = N->getNumOperands();
8443  for (unsigned i = 0; i != NumOps - 1; ++i) {
8444    SDValue Op = N->getOperand(i);
8445    if (i >= Index - NumDefs && i < Index - NumDefs + X86::AddrNumOperands)
8446      AddrOps.push_back(Op);
8447    else if (i < Index - NumDefs)
8448      BeforeOps.push_back(Op);
8449    else if (i > Index - NumDefs)
8450      AfterOps.push_back(Op);
8451  }
8452  SDValue Chain = N->getOperand(NumOps - 1);
8453  AddrOps.push_back(Chain);
8454
8455  // Emit the load instruction.
8456  SDNode *Load = nullptr;
8457  if (FoldedLoad) {
8458    EVT VT = *TRI.legalclasstypes_begin(*RC);
8459    auto MMOs = extractLoadMMOs(cast<MachineSDNode>(N)->memoperands(), MF);
8460    if (MMOs.empty() && RC == &X86::VR128RegClass &&
8461        Subtarget.isUnalignedMem16Slow())
8462      // Do not introduce a slow unaligned load.
8463      return false;
8464    // FIXME: If a VR128 can have size 32, we should be checking if a 32-byte
8465    // memory access is slow above.
8466
8467    unsigned Opc;
8468    if (FoldedBCast) {
8469      Opc = getBroadcastOpcode(I, RC, Subtarget);
8470    } else {
8471      unsigned Alignment = std::max<uint32_t>(TRI.getSpillSize(*RC), 16);
8472      bool isAligned = !MMOs.empty() && MMOs.front()->getAlign() >= Alignment;
8473      Opc = getLoadRegOpcode(0, RC, isAligned, Subtarget);
8474    }
8475
8476    Load = DAG.getMachineNode(Opc, dl, VT, MVT::Other, AddrOps);
8477    NewNodes.push_back(Load);
8478
8479    // Preserve memory reference information.
8480    DAG.setNodeMemRefs(cast<MachineSDNode>(Load), MMOs);
8481  }
8482
8483  // Emit the data processing instruction.
8484  std::vector<EVT> VTs;
8485  const TargetRegisterClass *DstRC = nullptr;
8486  if (MCID.getNumDefs() > 0) {
8487    DstRC = getRegClass(MCID, 0, &RI, MF);
8488    VTs.push_back(*TRI.legalclasstypes_begin(*DstRC));
8489  }
8490  for (unsigned i = 0, e = N->getNumValues(); i != e; ++i) {
8491    EVT VT = N->getValueType(i);
8492    if (VT != MVT::Other && i >= (unsigned)MCID.getNumDefs())
8493      VTs.push_back(VT);
8494  }
8495  if (Load)
8496    BeforeOps.push_back(SDValue(Load, 0));
8497  llvm::append_range(BeforeOps, AfterOps);
8498  // Change CMP32ri r, 0 back to TEST32rr r, r, etc.
8499  switch (Opc) {
8500  default:
8501    break;
8502  case X86::CMP64ri32:
8503  case X86::CMP32ri:
8504  case X86::CMP16ri:
8505  case X86::CMP8ri:
8506    if (isNullConstant(BeforeOps[1])) {
8507      switch (Opc) {
8508      default:
8509        llvm_unreachable("Unreachable!");
8510      case X86::CMP64ri32:
8511        Opc = X86::TEST64rr;
8512        break;
8513      case X86::CMP32ri:
8514        Opc = X86::TEST32rr;
8515        break;
8516      case X86::CMP16ri:
8517        Opc = X86::TEST16rr;
8518        break;
8519      case X86::CMP8ri:
8520        Opc = X86::TEST8rr;
8521        break;
8522      }
8523      BeforeOps[1] = BeforeOps[0];
8524    }
8525  }
8526  SDNode *NewNode = DAG.getMachineNode(Opc, dl, VTs, BeforeOps);
8527  NewNodes.push_back(NewNode);
8528
8529  // Emit the store instruction.
8530  if (FoldedStore) {
8531    AddrOps.pop_back();
8532    AddrOps.push_back(SDValue(NewNode, 0));
8533    AddrOps.push_back(Chain);
8534    auto MMOs = extractStoreMMOs(cast<MachineSDNode>(N)->memoperands(), MF);
8535    if (MMOs.empty() && RC == &X86::VR128RegClass &&
8536        Subtarget.isUnalignedMem16Slow())
8537      // Do not introduce a slow unaligned store.
8538      return false;
8539    // FIXME: If a VR128 can have size 32, we should be checking if a 32-byte
8540    // memory access is slow above.
8541    unsigned Alignment = std::max<uint32_t>(TRI.getSpillSize(*RC), 16);
8542    bool isAligned = !MMOs.empty() && MMOs.front()->getAlign() >= Alignment;
8543    SDNode *Store =
8544        DAG.getMachineNode(getStoreRegOpcode(0, DstRC, isAligned, Subtarget),
8545                           dl, MVT::Other, AddrOps);
8546    NewNodes.push_back(Store);
8547
8548    // Preserve memory reference information.
8549    DAG.setNodeMemRefs(cast<MachineSDNode>(Store), MMOs);
8550  }
8551
8552  return true;
8553}
8554
8555unsigned
8556X86InstrInfo::getOpcodeAfterMemoryUnfold(unsigned Opc, bool UnfoldLoad,
8557                                         bool UnfoldStore,
8558                                         unsigned *LoadRegIndex) const {
8559  const X86FoldTableEntry *I = lookupUnfoldTable(Opc);
8560  if (I == nullptr)
8561    return 0;
8562  bool FoldedLoad = I->Flags & TB_FOLDED_LOAD;
8563  bool FoldedStore = I->Flags & TB_FOLDED_STORE;
8564  if (UnfoldLoad && !FoldedLoad)
8565    return 0;
8566  if (UnfoldStore && !FoldedStore)
8567    return 0;
8568  if (LoadRegIndex)
8569    *LoadRegIndex = I->Flags & TB_INDEX_MASK;
8570  return I->DstOp;
8571}
8572
8573bool X86InstrInfo::areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2,
8574                                           int64_t &Offset1,
8575                                           int64_t &Offset2) const {
8576  if (!Load1->isMachineOpcode() || !Load2->isMachineOpcode())
8577    return false;
8578
8579  auto IsLoadOpcode = [&](unsigned Opcode) {
8580    switch (Opcode) {
8581    default:
8582      return false;
8583    case X86::MOV8rm:
8584    case X86::MOV16rm:
8585    case X86::MOV32rm:
8586    case X86::MOV64rm:
8587    case X86::LD_Fp32m:
8588    case X86::LD_Fp64m:
8589    case X86::LD_Fp80m:
8590    case X86::MOVSSrm:
8591    case X86::MOVSSrm_alt:
8592    case X86::MOVSDrm:
8593    case X86::MOVSDrm_alt:
8594    case X86::MMX_MOVD64rm:
8595    case X86::MMX_MOVQ64rm:
8596    case X86::MOVAPSrm:
8597    case X86::MOVUPSrm:
8598    case X86::MOVAPDrm:
8599    case X86::MOVUPDrm:
8600    case X86::MOVDQArm:
8601    case X86::MOVDQUrm:
8602    // AVX load instructions
8603    case X86::VMOVSSrm:
8604    case X86::VMOVSSrm_alt:
8605    case X86::VMOVSDrm:
8606    case X86::VMOVSDrm_alt:
8607    case X86::VMOVAPSrm:
8608    case X86::VMOVUPSrm:
8609    case X86::VMOVAPDrm:
8610    case X86::VMOVUPDrm:
8611    case X86::VMOVDQArm:
8612    case X86::VMOVDQUrm:
8613    case X86::VMOVAPSYrm:
8614    case X86::VMOVUPSYrm:
8615    case X86::VMOVAPDYrm:
8616    case X86::VMOVUPDYrm:
8617    case X86::VMOVDQAYrm:
8618    case X86::VMOVDQUYrm:
8619    // AVX512 load instructions
8620    case X86::VMOVSSZrm:
8621    case X86::VMOVSSZrm_alt:
8622    case X86::VMOVSDZrm:
8623    case X86::VMOVSDZrm_alt:
8624    case X86::VMOVAPSZ128rm:
8625    case X86::VMOVUPSZ128rm:
8626    case X86::VMOVAPSZ128rm_NOVLX:
8627    case X86::VMOVUPSZ128rm_NOVLX:
8628    case X86::VMOVAPDZ128rm:
8629    case X86::VMOVUPDZ128rm:
8630    case X86::VMOVDQU8Z128rm:
8631    case X86::VMOVDQU16Z128rm:
8632    case X86::VMOVDQA32Z128rm:
8633    case X86::VMOVDQU32Z128rm:
8634    case X86::VMOVDQA64Z128rm:
8635    case X86::VMOVDQU64Z128rm:
8636    case X86::VMOVAPSZ256rm:
8637    case X86::VMOVUPSZ256rm:
8638    case X86::VMOVAPSZ256rm_NOVLX:
8639    case X86::VMOVUPSZ256rm_NOVLX:
8640    case X86::VMOVAPDZ256rm:
8641    case X86::VMOVUPDZ256rm:
8642    case X86::VMOVDQU8Z256rm:
8643    case X86::VMOVDQU16Z256rm:
8644    case X86::VMOVDQA32Z256rm:
8645    case X86::VMOVDQU32Z256rm:
8646    case X86::VMOVDQA64Z256rm:
8647    case X86::VMOVDQU64Z256rm:
8648    case X86::VMOVAPSZrm:
8649    case X86::VMOVUPSZrm:
8650    case X86::VMOVAPDZrm:
8651    case X86::VMOVUPDZrm:
8652    case X86::VMOVDQU8Zrm:
8653    case X86::VMOVDQU16Zrm:
8654    case X86::VMOVDQA32Zrm:
8655    case X86::VMOVDQU32Zrm:
8656    case X86::VMOVDQA64Zrm:
8657    case X86::VMOVDQU64Zrm:
8658    case X86::KMOVBkm:
8659    case X86::KMOVBkm_EVEX:
8660    case X86::KMOVWkm:
8661    case X86::KMOVWkm_EVEX:
8662    case X86::KMOVDkm:
8663    case X86::KMOVDkm_EVEX:
8664    case X86::KMOVQkm:
8665    case X86::KMOVQkm_EVEX:
8666      return true;
8667    }
8668  };
8669
8670  if (!IsLoadOpcode(Load1->getMachineOpcode()) ||
8671      !IsLoadOpcode(Load2->getMachineOpcode()))
8672    return false;
8673
  // Check whether both loads have the same value for a given operand index.
8675  auto HasSameOp = [&](int I) {
8676    return Load1->getOperand(I) == Load2->getOperand(I);
8677  };
8678
8679  // All operands except the displacement should match.
8680  if (!HasSameOp(X86::AddrBaseReg) || !HasSameOp(X86::AddrScaleAmt) ||
8681      !HasSameOp(X86::AddrIndexReg) || !HasSameOp(X86::AddrSegmentReg))
8682    return false;
8683
8684  // Chain Operand must be the same.
8685  if (!HasSameOp(5))
8686    return false;
8687
8688  // Now let's examine if the displacements are constants.
8689  auto Disp1 = dyn_cast<ConstantSDNode>(Load1->getOperand(X86::AddrDisp));
8690  auto Disp2 = dyn_cast<ConstantSDNode>(Load2->getOperand(X86::AddrDisp));
8691  if (!Disp1 || !Disp2)
8692    return false;
8693
8694  Offset1 = Disp1->getSExtValue();
8695  Offset2 = Disp2->getSExtValue();
8696  return true;
8697}
8698
8699bool X86InstrInfo::shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2,
8700                                           int64_t Offset1, int64_t Offset2,
8701                                           unsigned NumLoads) const {
8702  assert(Offset2 > Offset1);
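  // Only cluster loads whose displacements are reasonably close together
  // (within roughly 512 bytes of each other).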
8703  if ((Offset2 - Offset1) / 8 > 64)
8704    return false;
8705
8706  unsigned Opc1 = Load1->getMachineOpcode();
8707  unsigned Opc2 = Load2->getMachineOpcode();
8708  if (Opc1 != Opc2)
8709    return false; // FIXME: overly conservative?
8710
8711  switch (Opc1) {
8712  default:
8713    break;
8714  case X86::LD_Fp32m:
8715  case X86::LD_Fp64m:
8716  case X86::LD_Fp80m:
8717  case X86::MMX_MOVD64rm:
8718  case X86::MMX_MOVQ64rm:
8719    return false;
8720  }
8721
8722  EVT VT = Load1->getValueType(0);
8723  switch (VT.getSimpleVT().SimpleTy) {
8724  default:
8725    // XMM registers. In 64-bit mode we can be a bit more aggressive since we
8726    // have 16 of them to play with.
8727    if (Subtarget.is64Bit()) {
8728      if (NumLoads >= 3)
8729        return false;
8730    } else if (NumLoads) {
8731      return false;
8732    }
8733    break;
8734  case MVT::i8:
8735  case MVT::i16:
8736  case MVT::i32:
8737  case MVT::i64:
8738  case MVT::f32:
8739  case MVT::f64:
8740    if (NumLoads)
8741      return false;
8742    break;
8743  }
8744
8745  return true;
8746}
8747
8748bool X86InstrInfo::isSchedulingBoundary(const MachineInstr &MI,
8749                                        const MachineBasicBlock *MBB,
8750                                        const MachineFunction &MF) const {
8751
  // ENDBR and PLDTILECFGV instructions should not be scheduled around.
8753  unsigned Opcode = MI.getOpcode();
8754  if (Opcode == X86::ENDBR64 || Opcode == X86::ENDBR32 ||
8755      Opcode == X86::PLDTILECFGV)
8756    return true;
8757
8758  return TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF);
8759}
8760
8761bool X86InstrInfo::reverseBranchCondition(
8762    SmallVectorImpl<MachineOperand> &Cond) const {
8763  assert(Cond.size() == 1 && "Invalid X86 branch condition!");
8764  X86::CondCode CC = static_cast<X86::CondCode>(Cond[0].getImm());
8765  Cond[0].setImm(GetOppositeBranchCondition(CC));
8766  return false;
8767}
8768
8769bool X86InstrInfo::isSafeToMoveRegClassDefs(
8770    const TargetRegisterClass *RC) const {
8771  // FIXME: Return false for x87 stack register classes for now. We can't
8772  // allow any loads of these registers before FpGet_ST0_80.
8773  return !(RC == &X86::CCRRegClass || RC == &X86::DFCCRRegClass ||
8774           RC == &X86::RFP32RegClass || RC == &X86::RFP64RegClass ||
8775           RC == &X86::RFP80RegClass);
8776}
8777
/// Return a virtual register initialized with the global base register
/// value. Output instructions required to initialize the register in the
/// function entry block, if necessary.
8781///
8782/// TODO: Eliminate this and move the code to X86MachineFunctionInfo.
8783///
8784unsigned X86InstrInfo::getGlobalBaseReg(MachineFunction *MF) const {
8785  X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
8786  Register GlobalBaseReg = X86FI->getGlobalBaseReg();
8787  if (GlobalBaseReg != 0)
8788    return GlobalBaseReg;
8789
8790  // Create the register. The code to initialize it is inserted
8791  // later, by the CGBR pass (below).
8792  MachineRegisterInfo &RegInfo = MF->getRegInfo();
8793  GlobalBaseReg = RegInfo.createVirtualRegister(
8794      Subtarget.is64Bit() ? &X86::GR64_NOSPRegClass : &X86::GR32_NOSPRegClass);
8795  X86FI->setGlobalBaseReg(GlobalBaseReg);
8796  return GlobalBaseReg;
8797}
8798
8799// FIXME: Some shuffle and unpack instructions have equivalents in different
8800// domains, but they require a bit more work than just switching opcodes.
8801
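// Each row of a replaceable-instructions table lists the equivalent opcode in
// each execution domain. The SSE/AVX tables have three columns (PackedSingle,
// PackedDouble, PackedInt); the AVX-512 tables carry a fourth column so both
// integer forms (the D and Q variants) can be represented.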
8802static const uint16_t *lookup(unsigned opcode, unsigned domain,
8803                              ArrayRef<uint16_t[3]> Table) {
8804  for (const uint16_t(&Row)[3] : Table)
8805    if (Row[domain - 1] == opcode)
8806      return Row;
8807  return nullptr;
8808}
8809
8810static const uint16_t *lookupAVX512(unsigned opcode, unsigned domain,
8811                                    ArrayRef<uint16_t[4]> Table) {
  // If this is the integer domain, make sure to check both integer columns.
8813  for (const uint16_t(&Row)[4] : Table)
8814    if (Row[domain - 1] == opcode || (domain == 3 && Row[3] == opcode))
8815      return Row;
8816  return nullptr;
8817}
8818
8819// Helper to attempt to widen/narrow blend masks.
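// For example, narrowing the 4-bit mask 0b1100 to a width of 2 yields 0b10
// (each pair of old bits is all-ones or all-zeros), narrowing 0b0110 fails,
// and widening the 2-bit mask 0b10 to a width of 4 yields 0b1100.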
8820static bool AdjustBlendMask(unsigned OldMask, unsigned OldWidth,
8821                            unsigned NewWidth, unsigned *pNewMask = nullptr) {
8822  assert(((OldWidth % NewWidth) == 0 || (NewWidth % OldWidth) == 0) &&
8823         "Illegal blend mask scale");
8824  unsigned NewMask = 0;
8825
8826  if ((OldWidth % NewWidth) == 0) {
8827    unsigned Scale = OldWidth / NewWidth;
8828    unsigned SubMask = (1u << Scale) - 1;
8829    for (unsigned i = 0; i != NewWidth; ++i) {
8830      unsigned Sub = (OldMask >> (i * Scale)) & SubMask;
8831      if (Sub == SubMask)
8832        NewMask |= (1u << i);
8833      else if (Sub != 0x0)
8834        return false;
8835    }
8836  } else {
8837    unsigned Scale = NewWidth / OldWidth;
8838    unsigned SubMask = (1u << Scale) - 1;
8839    for (unsigned i = 0; i != OldWidth; ++i) {
8840      if (OldMask & (1 << i)) {
8841        NewMask |= (SubMask << (i * Scale));
8842      }
8843    }
8844  }
8845
8846  if (pNewMask)
8847    *pNewMask = NewMask;
8848  return true;
8849}
8850
8851uint16_t X86InstrInfo::getExecutionDomainCustom(const MachineInstr &MI) const {
8852  unsigned Opcode = MI.getOpcode();
8853  unsigned NumOperands = MI.getDesc().getNumOperands();
8854
8855  auto GetBlendDomains = [&](unsigned ImmWidth, bool Is256) {
8856    uint16_t validDomains = 0;
8857    if (MI.getOperand(NumOperands - 1).isImm()) {
8858      unsigned Imm = MI.getOperand(NumOperands - 1).getImm();
8859      if (AdjustBlendMask(Imm, ImmWidth, Is256 ? 8 : 4))
8860        validDomains |= 0x2; // PackedSingle
8861      if (AdjustBlendMask(Imm, ImmWidth, Is256 ? 4 : 2))
8862        validDomains |= 0x4; // PackedDouble
8863      if (!Is256 || Subtarget.hasAVX2())
8864        validDomains |= 0x8; // PackedInt
8865    }
8866    return validDomains;
8867  };
8868
8869  switch (Opcode) {
8870  case X86::BLENDPDrmi:
8871  case X86::BLENDPDrri:
8872  case X86::VBLENDPDrmi:
8873  case X86::VBLENDPDrri:
8874    return GetBlendDomains(2, false);
8875  case X86::VBLENDPDYrmi:
8876  case X86::VBLENDPDYrri:
8877    return GetBlendDomains(4, true);
8878  case X86::BLENDPSrmi:
8879  case X86::BLENDPSrri:
8880  case X86::VBLENDPSrmi:
8881  case X86::VBLENDPSrri:
8882  case X86::VPBLENDDrmi:
8883  case X86::VPBLENDDrri:
8884    return GetBlendDomains(4, false);
8885  case X86::VBLENDPSYrmi:
8886  case X86::VBLENDPSYrri:
8887  case X86::VPBLENDDYrmi:
8888  case X86::VPBLENDDYrri:
8889    return GetBlendDomains(8, true);
8890  case X86::PBLENDWrmi:
8891  case X86::PBLENDWrri:
8892  case X86::VPBLENDWrmi:
8893  case X86::VPBLENDWrri:
8894  // Treat VPBLENDWY as a 128-bit vector as it repeats the lo/hi masks.
8895  case X86::VPBLENDWYrmi:
8896  case X86::VPBLENDWYrri:
8897    return GetBlendDomains(8, false);
8898  case X86::VPANDDZ128rr:
8899  case X86::VPANDDZ128rm:
8900  case X86::VPANDDZ256rr:
8901  case X86::VPANDDZ256rm:
8902  case X86::VPANDQZ128rr:
8903  case X86::VPANDQZ128rm:
8904  case X86::VPANDQZ256rr:
8905  case X86::VPANDQZ256rm:
8906  case X86::VPANDNDZ128rr:
8907  case X86::VPANDNDZ128rm:
8908  case X86::VPANDNDZ256rr:
8909  case X86::VPANDNDZ256rm:
8910  case X86::VPANDNQZ128rr:
8911  case X86::VPANDNQZ128rm:
8912  case X86::VPANDNQZ256rr:
8913  case X86::VPANDNQZ256rm:
8914  case X86::VPORDZ128rr:
8915  case X86::VPORDZ128rm:
8916  case X86::VPORDZ256rr:
8917  case X86::VPORDZ256rm:
8918  case X86::VPORQZ128rr:
8919  case X86::VPORQZ128rm:
8920  case X86::VPORQZ256rr:
8921  case X86::VPORQZ256rm:
8922  case X86::VPXORDZ128rr:
8923  case X86::VPXORDZ128rm:
8924  case X86::VPXORDZ256rr:
8925  case X86::VPXORDZ256rm:
8926  case X86::VPXORQZ128rr:
8927  case X86::VPXORQZ128rm:
8928  case X86::VPXORQZ256rr:
8929  case X86::VPXORQZ256rm:
    // If we don't have DQI, see if we can still switch from an EVEX integer
8931    // instruction to a VEX floating point instruction.
8932    if (Subtarget.hasDQI())
8933      return 0;
8934
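    // VEX encodings can only address registers XMM0-XMM15/YMM0-YMM15, so the
    // switch is only possible if every register operand has an encoding value
    // below 16.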
8935    if (RI.getEncodingValue(MI.getOperand(0).getReg()) >= 16)
8936      return 0;
8937    if (RI.getEncodingValue(MI.getOperand(1).getReg()) >= 16)
8938      return 0;
8939    // Register forms will have 3 operands. Memory form will have more.
8940    if (NumOperands == 3 &&
8941        RI.getEncodingValue(MI.getOperand(2).getReg()) >= 16)
8942      return 0;
8943
8944    // All domains are valid.
8945    return 0xe;
8946  case X86::MOVHLPSrr:
8947    // We can swap domains when both inputs are the same register.
8948    // FIXME: This doesn't catch all the cases we would like. If the input
    // register isn't KILLed by the instruction, the two-address instruction
8950    // pass puts a COPY on one input. The other input uses the original
8951    // register. This prevents the same physical register from being used by
8952    // both inputs.
8953    if (MI.getOperand(1).getReg() == MI.getOperand(2).getReg() &&
8954        MI.getOperand(0).getSubReg() == 0 &&
8955        MI.getOperand(1).getSubReg() == 0 && MI.getOperand(2).getSubReg() == 0)
8956      return 0x6;
8957    return 0;
8958  case X86::SHUFPDrri:
8959    return 0x6;
8960  }
8961  return 0;
8962}
8963
8964#include "X86ReplaceableInstrs.def"
8965
8966bool X86InstrInfo::setExecutionDomainCustom(MachineInstr &MI,
8967                                            unsigned Domain) const {
8968  assert(Domain > 0 && Domain < 4 && "Invalid execution domain");
8969  uint16_t dom = (MI.getDesc().TSFlags >> X86II::SSEDomainShift) & 3;
8970  assert(dom && "Not an SSE instruction");
8971
8972  unsigned Opcode = MI.getOpcode();
8973  unsigned NumOperands = MI.getDesc().getNumOperands();
8974
8975  auto SetBlendDomain = [&](unsigned ImmWidth, bool Is256) {
8976    if (MI.getOperand(NumOperands - 1).isImm()) {
8977      unsigned Imm = MI.getOperand(NumOperands - 1).getImm() & 255;
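      // VPBLENDWY only encodes an 8-bit immediate that is applied to both
      // 128-bit halves, so replicate it into a 16-bit mask before rescaling.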
8978      Imm = (ImmWidth == 16 ? ((Imm << 8) | Imm) : Imm);
8979      unsigned NewImm = Imm;
8980
8981      const uint16_t *table = lookup(Opcode, dom, ReplaceableBlendInstrs);
8982      if (!table)
8983        table = lookup(Opcode, dom, ReplaceableBlendAVX2Instrs);
8984
8985      if (Domain == 1) { // PackedSingle
8986        AdjustBlendMask(Imm, ImmWidth, Is256 ? 8 : 4, &NewImm);
8987      } else if (Domain == 2) { // PackedDouble
8988        AdjustBlendMask(Imm, ImmWidth, Is256 ? 4 : 2, &NewImm);
8989      } else if (Domain == 3) { // PackedInt
8990        if (Subtarget.hasAVX2()) {
8991          // If we are already VPBLENDW use that, else use VPBLENDD.
8992          if ((ImmWidth / (Is256 ? 2 : 1)) != 8) {
8993            table = lookup(Opcode, dom, ReplaceableBlendAVX2Instrs);
8994            AdjustBlendMask(Imm, ImmWidth, Is256 ? 8 : 4, &NewImm);
8995          }
8996        } else {
8997          assert(!Is256 && "128-bit vector expected");
8998          AdjustBlendMask(Imm, ImmWidth, 8, &NewImm);
8999        }
9000      }
9001
9002      assert(table && table[Domain - 1] && "Unknown domain op");
9003      MI.setDesc(get(table[Domain - 1]));
9004      MI.getOperand(NumOperands - 1).setImm(NewImm & 255);
9005    }
9006    return true;
9007  };
9008
9009  switch (Opcode) {
9010  case X86::BLENDPDrmi:
9011  case X86::BLENDPDrri:
9012  case X86::VBLENDPDrmi:
9013  case X86::VBLENDPDrri:
9014    return SetBlendDomain(2, false);
9015  case X86::VBLENDPDYrmi:
9016  case X86::VBLENDPDYrri:
9017    return SetBlendDomain(4, true);
9018  case X86::BLENDPSrmi:
9019  case X86::BLENDPSrri:
9020  case X86::VBLENDPSrmi:
9021  case X86::VBLENDPSrri:
9022  case X86::VPBLENDDrmi:
9023  case X86::VPBLENDDrri:
9024    return SetBlendDomain(4, false);
9025  case X86::VBLENDPSYrmi:
9026  case X86::VBLENDPSYrri:
9027  case X86::VPBLENDDYrmi:
9028  case X86::VPBLENDDYrri:
9029    return SetBlendDomain(8, true);
9030  case X86::PBLENDWrmi:
9031  case X86::PBLENDWrri:
9032  case X86::VPBLENDWrmi:
9033  case X86::VPBLENDWrri:
9034    return SetBlendDomain(8, false);
9035  case X86::VPBLENDWYrmi:
9036  case X86::VPBLENDWYrri:
9037    return SetBlendDomain(16, true);
9038  case X86::VPANDDZ128rr:
9039  case X86::VPANDDZ128rm:
9040  case X86::VPANDDZ256rr:
9041  case X86::VPANDDZ256rm:
9042  case X86::VPANDQZ128rr:
9043  case X86::VPANDQZ128rm:
9044  case X86::VPANDQZ256rr:
9045  case X86::VPANDQZ256rm:
9046  case X86::VPANDNDZ128rr:
9047  case X86::VPANDNDZ128rm:
9048  case X86::VPANDNDZ256rr:
9049  case X86::VPANDNDZ256rm:
9050  case X86::VPANDNQZ128rr:
9051  case X86::VPANDNQZ128rm:
9052  case X86::VPANDNQZ256rr:
9053  case X86::VPANDNQZ256rm:
9054  case X86::VPORDZ128rr:
9055  case X86::VPORDZ128rm:
9056  case X86::VPORDZ256rr:
9057  case X86::VPORDZ256rm:
9058  case X86::VPORQZ128rr:
9059  case X86::VPORQZ128rm:
9060  case X86::VPORQZ256rr:
9061  case X86::VPORQZ256rm:
9062  case X86::VPXORDZ128rr:
9063  case X86::VPXORDZ128rm:
9064  case X86::VPXORDZ256rr:
9065  case X86::VPXORDZ256rm:
9066  case X86::VPXORQZ128rr:
9067  case X86::VPXORQZ128rm:
9068  case X86::VPXORQZ256rr:
9069  case X86::VPXORQZ256rm: {
9070    // Without DQI, convert EVEX instructions to VEX instructions.
9071    if (Subtarget.hasDQI())
9072      return false;
9073
9074    const uint16_t *table =
9075        lookupAVX512(MI.getOpcode(), dom, ReplaceableCustomAVX512LogicInstrs);
9076    assert(table && "Instruction not found in table?");
    // Don't change integer Q instructions to D instructions, and
    // use D instructions if we started with a PS instruction.
9079    if (Domain == 3 && (dom == 1 || table[3] == MI.getOpcode()))
9080      Domain = 4;
9081    MI.setDesc(get(table[Domain - 1]));
9082    return true;
9083  }
9084  case X86::UNPCKHPDrr:
9085  case X86::MOVHLPSrr:
9086    // We just need to commute the instruction which will switch the domains.
9087    if (Domain != dom && Domain != 3 &&
9088        MI.getOperand(1).getReg() == MI.getOperand(2).getReg() &&
9089        MI.getOperand(0).getSubReg() == 0 &&
9090        MI.getOperand(1).getSubReg() == 0 &&
9091        MI.getOperand(2).getSubReg() == 0) {
9092      commuteInstruction(MI, false);
9093      return true;
9094    }
9095    // We must always return true for MOVHLPSrr.
9096    if (Opcode == X86::MOVHLPSrr)
9097      return true;
9098    break;
9099  case X86::SHUFPDrri: {
9100    if (Domain == 1) {
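      // Rewrite the SHUFPD immediate as an equivalent SHUFPS immediate: 0x44
      // takes the low f64 (f32 elements 0,1) from each source; bit 0 of the
      // old immediate switches the first source to its high f64 (|= 0x0a) and
      // bit 1 does the same for the second source (|= 0xa0).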
9101      unsigned Imm = MI.getOperand(3).getImm();
9102      unsigned NewImm = 0x44;
9103      if (Imm & 1)
9104        NewImm |= 0x0a;
9105      if (Imm & 2)
9106        NewImm |= 0xa0;
9107      MI.getOperand(3).setImm(NewImm);
9108      MI.setDesc(get(X86::SHUFPSrri));
9109    }
9110    return true;
9111  }
9112  }
9113  return false;
9114}
9115
9116std::pair<uint16_t, uint16_t>
9117X86InstrInfo::getExecutionDomain(const MachineInstr &MI) const {
9118  uint16_t domain = (MI.getDesc().TSFlags >> X86II::SSEDomainShift) & 3;
9119  unsigned opcode = MI.getOpcode();
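  // Return (current domain, bitmask of domains the instruction can switch
  // to), where bit N corresponds to domain N: 1 = PackedSingle,
  // 2 = PackedDouble, 3 = PackedInt. For example, 0xe allows all three and
  // 0x6 allows only the two floating-point domains.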
9120  uint16_t validDomains = 0;
9121  if (domain) {
9122    // Attempt to match for custom instructions.
9123    validDomains = getExecutionDomainCustom(MI);
9124    if (validDomains)
9125      return std::make_pair(domain, validDomains);
9126
9127    if (lookup(opcode, domain, ReplaceableInstrs)) {
9128      validDomains = 0xe;
9129    } else if (lookup(opcode, domain, ReplaceableInstrsAVX2)) {
9130      validDomains = Subtarget.hasAVX2() ? 0xe : 0x6;
9131    } else if (lookup(opcode, domain, ReplaceableInstrsFP)) {
9132      validDomains = 0x6;
9133    } else if (lookup(opcode, domain, ReplaceableInstrsAVX2InsertExtract)) {
      // Insert/extract instructions should only affect the domain if AVX2
      // is enabled.
9136      if (!Subtarget.hasAVX2())
9137        return std::make_pair(0, 0);
9138      validDomains = 0xe;
9139    } else if (lookupAVX512(opcode, domain, ReplaceableInstrsAVX512)) {
9140      validDomains = 0xe;
9141    } else if (Subtarget.hasDQI() &&
9142               lookupAVX512(opcode, domain, ReplaceableInstrsAVX512DQ)) {
9143      validDomains = 0xe;
9144    } else if (Subtarget.hasDQI()) {
9145      if (const uint16_t *table =
9146              lookupAVX512(opcode, domain, ReplaceableInstrsAVX512DQMasked)) {
9147        if (domain == 1 || (domain == 3 && table[3] == opcode))
9148          validDomains = 0xa;
9149        else
9150          validDomains = 0xc;
9151      }
9152    }
9153  }
9154  return std::make_pair(domain, validDomains);
9155}
9156
9157void X86InstrInfo::setExecutionDomain(MachineInstr &MI, unsigned Domain) const {
9158  assert(Domain > 0 && Domain < 4 && "Invalid execution domain");
9159  uint16_t dom = (MI.getDesc().TSFlags >> X86II::SSEDomainShift) & 3;
9160  assert(dom && "Not an SSE instruction");
9161
9162  // Attempt to match for custom instructions.
9163  if (setExecutionDomainCustom(MI, Domain))
9164    return;
9165
9166  const uint16_t *table = lookup(MI.getOpcode(), dom, ReplaceableInstrs);
  if (!table) { // try the AVX2 table
9168    assert((Subtarget.hasAVX2() || Domain < 3) &&
9169           "256-bit vector operations only available in AVX2");
9170    table = lookup(MI.getOpcode(), dom, ReplaceableInstrsAVX2);
9171  }
9172  if (!table) { // try the FP table
9173    table = lookup(MI.getOpcode(), dom, ReplaceableInstrsFP);
9174    assert((!table || Domain < 3) &&
9175           "Can only select PackedSingle or PackedDouble");
9176  }
  if (!table) { // try the AVX2 insert/extract table
9178    assert(Subtarget.hasAVX2() &&
9179           "256-bit insert/extract only available in AVX2");
9180    table = lookup(MI.getOpcode(), dom, ReplaceableInstrsAVX2InsertExtract);
9181  }
9182  if (!table) { // try the AVX512 table
9183    assert(Subtarget.hasAVX512() && "Requires AVX-512");
9184    table = lookupAVX512(MI.getOpcode(), dom, ReplaceableInstrsAVX512);
9185    // Don't change integer Q instructions to D instructions.
9186    if (table && Domain == 3 && table[3] == MI.getOpcode())
9187      Domain = 4;
9188  }
9189  if (!table) { // try the AVX512DQ table
9190    assert((Subtarget.hasDQI() || Domain >= 3) && "Requires AVX-512DQ");
9191    table = lookupAVX512(MI.getOpcode(), dom, ReplaceableInstrsAVX512DQ);
9192    // Don't change integer Q instructions to D instructions and
9193    // use D instructions if we started with a PS instruction.
9194    if (table && Domain == 3 && (dom == 1 || table[3] == MI.getOpcode()))
9195      Domain = 4;
9196  }
9197  if (!table) { // try the AVX512DQMasked table
9198    assert((Subtarget.hasDQI() || Domain >= 3) && "Requires AVX-512DQ");
9199    table = lookupAVX512(MI.getOpcode(), dom, ReplaceableInstrsAVX512DQMasked);
9200    if (table && Domain == 3 && (dom == 1 || table[3] == MI.getOpcode()))
9201      Domain = 4;
9202  }
9203  assert(table && "Cannot change domain");
9204  MI.setDesc(get(table[Domain - 1]));
9205}
9206
9207void X86InstrInfo::insertNoop(MachineBasicBlock &MBB,
9208                              MachineBasicBlock::iterator MI) const {
9209  DebugLoc DL;
9210  BuildMI(MBB, MI, DL, get(X86::NOOP));
9211}
9212
9213/// Return the target noop instruction: the single-byte NOP.
9214MCInst X86InstrInfo::getNop() const {
9215  MCInst Nop;
9216  Nop.setOpcode(X86::NOOP);
9217  return Nop;
9218}
9219
9220bool X86InstrInfo::isHighLatencyDef(int opc) const {
9221  switch (opc) {
9222  default:
9223    return false;
9224  case X86::DIVPDrm:
9225  case X86::DIVPDrr:
9226  case X86::DIVPSrm:
9227  case X86::DIVPSrr:
9228  case X86::DIVSDrm:
9229  case X86::DIVSDrm_Int:
9230  case X86::DIVSDrr:
9231  case X86::DIVSDrr_Int:
9232  case X86::DIVSSrm:
9233  case X86::DIVSSrm_Int:
9234  case X86::DIVSSrr:
9235  case X86::DIVSSrr_Int:
9236  case X86::SQRTPDm:
9237  case X86::SQRTPDr:
9238  case X86::SQRTPSm:
9239  case X86::SQRTPSr:
9240  case X86::SQRTSDm:
9241  case X86::SQRTSDm_Int:
9242  case X86::SQRTSDr:
9243  case X86::SQRTSDr_Int:
9244  case X86::SQRTSSm:
9245  case X86::SQRTSSm_Int:
9246  case X86::SQRTSSr:
9247  case X86::SQRTSSr_Int:
9248  // AVX instructions with high latency
9249  case X86::VDIVPDrm:
9250  case X86::VDIVPDrr:
9251  case X86::VDIVPDYrm:
9252  case X86::VDIVPDYrr:
9253  case X86::VDIVPSrm:
9254  case X86::VDIVPSrr:
9255  case X86::VDIVPSYrm:
9256  case X86::VDIVPSYrr:
9257  case X86::VDIVSDrm:
9258  case X86::VDIVSDrm_Int:
9259  case X86::VDIVSDrr:
9260  case X86::VDIVSDrr_Int:
9261  case X86::VDIVSSrm:
9262  case X86::VDIVSSrm_Int:
9263  case X86::VDIVSSrr:
9264  case X86::VDIVSSrr_Int:
9265  case X86::VSQRTPDm:
9266  case X86::VSQRTPDr:
9267  case X86::VSQRTPDYm:
9268  case X86::VSQRTPDYr:
9269  case X86::VSQRTPSm:
9270  case X86::VSQRTPSr:
9271  case X86::VSQRTPSYm:
9272  case X86::VSQRTPSYr:
9273  case X86::VSQRTSDm:
9274  case X86::VSQRTSDm_Int:
9275  case X86::VSQRTSDr:
9276  case X86::VSQRTSDr_Int:
9277  case X86::VSQRTSSm:
9278  case X86::VSQRTSSm_Int:
9279  case X86::VSQRTSSr:
9280  case X86::VSQRTSSr_Int:
9281  // AVX512 instructions with high latency
9282  case X86::VDIVPDZ128rm:
9283  case X86::VDIVPDZ128rmb:
9284  case X86::VDIVPDZ128rmbk:
9285  case X86::VDIVPDZ128rmbkz:
9286  case X86::VDIVPDZ128rmk:
9287  case X86::VDIVPDZ128rmkz:
9288  case X86::VDIVPDZ128rr:
9289  case X86::VDIVPDZ128rrk:
9290  case X86::VDIVPDZ128rrkz:
9291  case X86::VDIVPDZ256rm:
9292  case X86::VDIVPDZ256rmb:
9293  case X86::VDIVPDZ256rmbk:
9294  case X86::VDIVPDZ256rmbkz:
9295  case X86::VDIVPDZ256rmk:
9296  case X86::VDIVPDZ256rmkz:
9297  case X86::VDIVPDZ256rr:
9298  case X86::VDIVPDZ256rrk:
9299  case X86::VDIVPDZ256rrkz:
9300  case X86::VDIVPDZrrb:
9301  case X86::VDIVPDZrrbk:
9302  case X86::VDIVPDZrrbkz:
9303  case X86::VDIVPDZrm:
9304  case X86::VDIVPDZrmb:
9305  case X86::VDIVPDZrmbk:
9306  case X86::VDIVPDZrmbkz:
9307  case X86::VDIVPDZrmk:
9308  case X86::VDIVPDZrmkz:
9309  case X86::VDIVPDZrr:
9310  case X86::VDIVPDZrrk:
9311  case X86::VDIVPDZrrkz:
9312  case X86::VDIVPSZ128rm:
9313  case X86::VDIVPSZ128rmb:
9314  case X86::VDIVPSZ128rmbk:
9315  case X86::VDIVPSZ128rmbkz:
9316  case X86::VDIVPSZ128rmk:
9317  case X86::VDIVPSZ128rmkz:
9318  case X86::VDIVPSZ128rr:
9319  case X86::VDIVPSZ128rrk:
9320  case X86::VDIVPSZ128rrkz:
9321  case X86::VDIVPSZ256rm:
9322  case X86::VDIVPSZ256rmb:
9323  case X86::VDIVPSZ256rmbk:
9324  case X86::VDIVPSZ256rmbkz:
9325  case X86::VDIVPSZ256rmk:
9326  case X86::VDIVPSZ256rmkz:
9327  case X86::VDIVPSZ256rr:
9328  case X86::VDIVPSZ256rrk:
9329  case X86::VDIVPSZ256rrkz:
9330  case X86::VDIVPSZrrb:
9331  case X86::VDIVPSZrrbk:
9332  case X86::VDIVPSZrrbkz:
9333  case X86::VDIVPSZrm:
9334  case X86::VDIVPSZrmb:
9335  case X86::VDIVPSZrmbk:
9336  case X86::VDIVPSZrmbkz:
9337  case X86::VDIVPSZrmk:
9338  case X86::VDIVPSZrmkz:
9339  case X86::VDIVPSZrr:
9340  case X86::VDIVPSZrrk:
9341  case X86::VDIVPSZrrkz:
9342  case X86::VDIVSDZrm:
9343  case X86::VDIVSDZrr:
9344  case X86::VDIVSDZrm_Int:
9345  case X86::VDIVSDZrm_Intk:
9346  case X86::VDIVSDZrm_Intkz:
9347  case X86::VDIVSDZrr_Int:
9348  case X86::VDIVSDZrr_Intk:
9349  case X86::VDIVSDZrr_Intkz:
9350  case X86::VDIVSDZrrb_Int:
9351  case X86::VDIVSDZrrb_Intk:
9352  case X86::VDIVSDZrrb_Intkz:
9353  case X86::VDIVSSZrm:
9354  case X86::VDIVSSZrr:
9355  case X86::VDIVSSZrm_Int:
9356  case X86::VDIVSSZrm_Intk:
9357  case X86::VDIVSSZrm_Intkz:
9358  case X86::VDIVSSZrr_Int:
9359  case X86::VDIVSSZrr_Intk:
9360  case X86::VDIVSSZrr_Intkz:
9361  case X86::VDIVSSZrrb_Int:
9362  case X86::VDIVSSZrrb_Intk:
9363  case X86::VDIVSSZrrb_Intkz:
9364  case X86::VSQRTPDZ128m:
9365  case X86::VSQRTPDZ128mb:
9366  case X86::VSQRTPDZ128mbk:
9367  case X86::VSQRTPDZ128mbkz:
9368  case X86::VSQRTPDZ128mk:
9369  case X86::VSQRTPDZ128mkz:
9370  case X86::VSQRTPDZ128r:
9371  case X86::VSQRTPDZ128rk:
9372  case X86::VSQRTPDZ128rkz:
9373  case X86::VSQRTPDZ256m:
9374  case X86::VSQRTPDZ256mb:
9375  case X86::VSQRTPDZ256mbk:
9376  case X86::VSQRTPDZ256mbkz:
9377  case X86::VSQRTPDZ256mk:
9378  case X86::VSQRTPDZ256mkz:
9379  case X86::VSQRTPDZ256r:
9380  case X86::VSQRTPDZ256rk:
9381  case X86::VSQRTPDZ256rkz:
9382  case X86::VSQRTPDZm:
9383  case X86::VSQRTPDZmb:
9384  case X86::VSQRTPDZmbk:
9385  case X86::VSQRTPDZmbkz:
9386  case X86::VSQRTPDZmk:
9387  case X86::VSQRTPDZmkz:
9388  case X86::VSQRTPDZr:
9389  case X86::VSQRTPDZrb:
9390  case X86::VSQRTPDZrbk:
9391  case X86::VSQRTPDZrbkz:
9392  case X86::VSQRTPDZrk:
9393  case X86::VSQRTPDZrkz:
9394  case X86::VSQRTPSZ128m:
9395  case X86::VSQRTPSZ128mb:
9396  case X86::VSQRTPSZ128mbk:
9397  case X86::VSQRTPSZ128mbkz:
9398  case X86::VSQRTPSZ128mk:
9399  case X86::VSQRTPSZ128mkz:
9400  case X86::VSQRTPSZ128r:
9401  case X86::VSQRTPSZ128rk:
9402  case X86::VSQRTPSZ128rkz:
9403  case X86::VSQRTPSZ256m:
9404  case X86::VSQRTPSZ256mb:
9405  case X86::VSQRTPSZ256mbk:
9406  case X86::VSQRTPSZ256mbkz:
9407  case X86::VSQRTPSZ256mk:
9408  case X86::VSQRTPSZ256mkz:
9409  case X86::VSQRTPSZ256r:
9410  case X86::VSQRTPSZ256rk:
9411  case X86::VSQRTPSZ256rkz:
9412  case X86::VSQRTPSZm:
9413  case X86::VSQRTPSZmb:
9414  case X86::VSQRTPSZmbk:
9415  case X86::VSQRTPSZmbkz:
9416  case X86::VSQRTPSZmk:
9417  case X86::VSQRTPSZmkz:
9418  case X86::VSQRTPSZr:
9419  case X86::VSQRTPSZrb:
9420  case X86::VSQRTPSZrbk:
9421  case X86::VSQRTPSZrbkz:
9422  case X86::VSQRTPSZrk:
9423  case X86::VSQRTPSZrkz:
9424  case X86::VSQRTSDZm:
9425  case X86::VSQRTSDZm_Int:
9426  case X86::VSQRTSDZm_Intk:
9427  case X86::VSQRTSDZm_Intkz:
9428  case X86::VSQRTSDZr:
9429  case X86::VSQRTSDZr_Int:
9430  case X86::VSQRTSDZr_Intk:
9431  case X86::VSQRTSDZr_Intkz:
9432  case X86::VSQRTSDZrb_Int:
9433  case X86::VSQRTSDZrb_Intk:
9434  case X86::VSQRTSDZrb_Intkz:
9435  case X86::VSQRTSSZm:
9436  case X86::VSQRTSSZm_Int:
9437  case X86::VSQRTSSZm_Intk:
9438  case X86::VSQRTSSZm_Intkz:
9439  case X86::VSQRTSSZr:
9440  case X86::VSQRTSSZr_Int:
9441  case X86::VSQRTSSZr_Intk:
9442  case X86::VSQRTSSZr_Intkz:
9443  case X86::VSQRTSSZrb_Int:
9444  case X86::VSQRTSSZrb_Intk:
9445  case X86::VSQRTSSZrb_Intkz:
9446
9447  case X86::VGATHERDPDYrm:
9448  case X86::VGATHERDPDZ128rm:
9449  case X86::VGATHERDPDZ256rm:
9450  case X86::VGATHERDPDZrm:
9451  case X86::VGATHERDPDrm:
9452  case X86::VGATHERDPSYrm:
9453  case X86::VGATHERDPSZ128rm:
9454  case X86::VGATHERDPSZ256rm:
9455  case X86::VGATHERDPSZrm:
9456  case X86::VGATHERDPSrm:
9457  case X86::VGATHERPF0DPDm:
9458  case X86::VGATHERPF0DPSm:
9459  case X86::VGATHERPF0QPDm:
9460  case X86::VGATHERPF0QPSm:
9461  case X86::VGATHERPF1DPDm:
9462  case X86::VGATHERPF1DPSm:
9463  case X86::VGATHERPF1QPDm:
9464  case X86::VGATHERPF1QPSm:
9465  case X86::VGATHERQPDYrm:
9466  case X86::VGATHERQPDZ128rm:
9467  case X86::VGATHERQPDZ256rm:
9468  case X86::VGATHERQPDZrm:
9469  case X86::VGATHERQPDrm:
9470  case X86::VGATHERQPSYrm:
9471  case X86::VGATHERQPSZ128rm:
9472  case X86::VGATHERQPSZ256rm:
9473  case X86::VGATHERQPSZrm:
9474  case X86::VGATHERQPSrm:
9475  case X86::VPGATHERDDYrm:
9476  case X86::VPGATHERDDZ128rm:
9477  case X86::VPGATHERDDZ256rm:
9478  case X86::VPGATHERDDZrm:
9479  case X86::VPGATHERDDrm:
9480  case X86::VPGATHERDQYrm:
9481  case X86::VPGATHERDQZ128rm:
9482  case X86::VPGATHERDQZ256rm:
9483  case X86::VPGATHERDQZrm:
9484  case X86::VPGATHERDQrm:
9485  case X86::VPGATHERQDYrm:
9486  case X86::VPGATHERQDZ128rm:
9487  case X86::VPGATHERQDZ256rm:
9488  case X86::VPGATHERQDZrm:
9489  case X86::VPGATHERQDrm:
9490  case X86::VPGATHERQQYrm:
9491  case X86::VPGATHERQQZ128rm:
9492  case X86::VPGATHERQQZ256rm:
9493  case X86::VPGATHERQQZrm:
9494  case X86::VPGATHERQQrm:
9495  case X86::VSCATTERDPDZ128mr:
9496  case X86::VSCATTERDPDZ256mr:
9497  case X86::VSCATTERDPDZmr:
9498  case X86::VSCATTERDPSZ128mr:
9499  case X86::VSCATTERDPSZ256mr:
9500  case X86::VSCATTERDPSZmr:
9501  case X86::VSCATTERPF0DPDm:
9502  case X86::VSCATTERPF0DPSm:
9503  case X86::VSCATTERPF0QPDm:
9504  case X86::VSCATTERPF0QPSm:
9505  case X86::VSCATTERPF1DPDm:
9506  case X86::VSCATTERPF1DPSm:
9507  case X86::VSCATTERPF1QPDm:
9508  case X86::VSCATTERPF1QPSm:
9509  case X86::VSCATTERQPDZ128mr:
9510  case X86::VSCATTERQPDZ256mr:
9511  case X86::VSCATTERQPDZmr:
9512  case X86::VSCATTERQPSZ128mr:
9513  case X86::VSCATTERQPSZ256mr:
9514  case X86::VSCATTERQPSZmr:
9515  case X86::VPSCATTERDDZ128mr:
9516  case X86::VPSCATTERDDZ256mr:
9517  case X86::VPSCATTERDDZmr:
9518  case X86::VPSCATTERDQZ128mr:
9519  case X86::VPSCATTERDQZ256mr:
9520  case X86::VPSCATTERDQZmr:
9521  case X86::VPSCATTERQDZ128mr:
9522  case X86::VPSCATTERQDZ256mr:
9523  case X86::VPSCATTERQDZmr:
9524  case X86::VPSCATTERQQZ128mr:
9525  case X86::VPSCATTERQQZ256mr:
9526  case X86::VPSCATTERQQZmr:
9527    return true;
9528  }
9529}
9530
9531bool X86InstrInfo::hasHighOperandLatency(const TargetSchedModel &SchedModel,
9532                                         const MachineRegisterInfo *MRI,
9533                                         const MachineInstr &DefMI,
9534                                         unsigned DefIdx,
9535                                         const MachineInstr &UseMI,
9536                                         unsigned UseIdx) const {
9537  return isHighLatencyDef(DefMI.getOpcode());
9538}
9539
9540bool X86InstrInfo::hasReassociableOperands(const MachineInstr &Inst,
9541                                           const MachineBasicBlock *MBB) const {
9542  assert(Inst.getNumExplicitOperands() == 3 && Inst.getNumExplicitDefs() == 1 &&
9543         Inst.getNumDefs() <= 2 && "Reassociation needs binary operators");
9544
9545  // Integer binary math/logic instructions have a third source operand:
9546  // the EFLAGS register. That operand must be both defined here and never
9547  // used; i.e., it must be dead. If the EFLAGS operand is live, then we
9548  // cannot change anything because rearranging the operands could affect other
9549  // instructions that depend on the exact status flags (zero, sign, etc.)
9550  // that are set by using these particular operands with this operation.
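  // For example, ADD32rr always defines EFLAGS implicitly; if a later
  // instruction were to read those flags, regrouping the additions would
  // change the observed flag values, so reassociation is only done when the
  // EFLAGS def is dead.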
9551  const MachineOperand *FlagDef = Inst.findRegisterDefOperand(X86::EFLAGS);
9552  assert((Inst.getNumDefs() == 1 || FlagDef) && "Implicit def isn't flags?");
9553  if (FlagDef && !FlagDef->isDead())
9554    return false;
9555
9556  return TargetInstrInfo::hasReassociableOperands(Inst, MBB);
9557}
9558
9559// TODO: There are many more machine instruction opcodes to match:
9560//       1. Other data types (integer, vectors)
9561//       2. Other math / logic operations (xor, or)
9562//       3. Other forms of the same operation (intrinsics and other variants)
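// Opcodes reported here as associative and commutative feed the generic
// reassociation in TargetInstrInfo: the machine combiner may, for example,
// rewrite ((a + b) + c) + d into (a + b) + (c + d) to shorten the critical
// dependency chain.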
9563bool X86InstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst,
9564                                               bool Invert) const {
9565  if (Invert)
9566    return false;
9567  switch (Inst.getOpcode()) {
9568  case X86::ADD8rr:
9569  case X86::ADD16rr:
9570  case X86::ADD32rr:
9571  case X86::ADD64rr:
9572  case X86::AND8rr:
9573  case X86::AND16rr:
9574  case X86::AND32rr:
9575  case X86::AND64rr:
9576  case X86::OR8rr:
9577  case X86::OR16rr:
9578  case X86::OR32rr:
9579  case X86::OR64rr:
9580  case X86::XOR8rr:
9581  case X86::XOR16rr:
9582  case X86::XOR32rr:
9583  case X86::XOR64rr:
9584  case X86::IMUL16rr:
9585  case X86::IMUL32rr:
9586  case X86::IMUL64rr:
9587  case X86::PANDrr:
9588  case X86::PORrr:
9589  case X86::PXORrr:
9590  case X86::ANDPDrr:
9591  case X86::ANDPSrr:
9592  case X86::ORPDrr:
9593  case X86::ORPSrr:
9594  case X86::XORPDrr:
9595  case X86::XORPSrr:
9596  case X86::PADDBrr:
9597  case X86::PADDWrr:
9598  case X86::PADDDrr:
9599  case X86::PADDQrr:
9600  case X86::PMULLWrr:
9601  case X86::PMULLDrr:
9602  case X86::PMAXSBrr:
9603  case X86::PMAXSDrr:
9604  case X86::PMAXSWrr:
9605  case X86::PMAXUBrr:
9606  case X86::PMAXUDrr:
9607  case X86::PMAXUWrr:
9608  case X86::PMINSBrr:
9609  case X86::PMINSDrr:
9610  case X86::PMINSWrr:
9611  case X86::PMINUBrr:
9612  case X86::PMINUDrr:
9613  case X86::PMINUWrr:
9614  case X86::VPANDrr:
9615  case X86::VPANDYrr:
9616  case X86::VPANDDZ128rr:
9617  case X86::VPANDDZ256rr:
9618  case X86::VPANDDZrr:
9619  case X86::VPANDQZ128rr:
9620  case X86::VPANDQZ256rr:
9621  case X86::VPANDQZrr:
9622  case X86::VPORrr:
9623  case X86::VPORYrr:
9624  case X86::VPORDZ128rr:
9625  case X86::VPORDZ256rr:
9626  case X86::VPORDZrr:
9627  case X86::VPORQZ128rr:
9628  case X86::VPORQZ256rr:
9629  case X86::VPORQZrr:
9630  case X86::VPXORrr:
9631  case X86::VPXORYrr:
9632  case X86::VPXORDZ128rr:
9633  case X86::VPXORDZ256rr:
9634  case X86::VPXORDZrr:
9635  case X86::VPXORQZ128rr:
9636  case X86::VPXORQZ256rr:
9637  case X86::VPXORQZrr:
9638  case X86::VANDPDrr:
9639  case X86::VANDPSrr:
9640  case X86::VANDPDYrr:
9641  case X86::VANDPSYrr:
9642  case X86::VANDPDZ128rr:
9643  case X86::VANDPSZ128rr:
9644  case X86::VANDPDZ256rr:
9645  case X86::VANDPSZ256rr:
9646  case X86::VANDPDZrr:
9647  case X86::VANDPSZrr:
9648  case X86::VORPDrr:
9649  case X86::VORPSrr:
9650  case X86::VORPDYrr:
9651  case X86::VORPSYrr:
9652  case X86::VORPDZ128rr:
9653  case X86::VORPSZ128rr:
9654  case X86::VORPDZ256rr:
9655  case X86::VORPSZ256rr:
9656  case X86::VORPDZrr:
9657  case X86::VORPSZrr:
9658  case X86::VXORPDrr:
9659  case X86::VXORPSrr:
9660  case X86::VXORPDYrr:
9661  case X86::VXORPSYrr:
9662  case X86::VXORPDZ128rr:
9663  case X86::VXORPSZ128rr:
9664  case X86::VXORPDZ256rr:
9665  case X86::VXORPSZ256rr:
9666  case X86::VXORPDZrr:
9667  case X86::VXORPSZrr:
9668  case X86::KADDBrr:
9669  case X86::KADDWrr:
9670  case X86::KADDDrr:
9671  case X86::KADDQrr:
9672  case X86::KANDBrr:
9673  case X86::KANDWrr:
9674  case X86::KANDDrr:
9675  case X86::KANDQrr:
9676  case X86::KORBrr:
9677  case X86::KORWrr:
9678  case X86::KORDrr:
9679  case X86::KORQrr:
9680  case X86::KXORBrr:
9681  case X86::KXORWrr:
9682  case X86::KXORDrr:
9683  case X86::KXORQrr:
9684  case X86::VPADDBrr:
9685  case X86::VPADDWrr:
9686  case X86::VPADDDrr:
9687  case X86::VPADDQrr:
9688  case X86::VPADDBYrr:
9689  case X86::VPADDWYrr:
9690  case X86::VPADDDYrr:
9691  case X86::VPADDQYrr:
9692  case X86::VPADDBZ128rr:
9693  case X86::VPADDWZ128rr:
9694  case X86::VPADDDZ128rr:
9695  case X86::VPADDQZ128rr:
9696  case X86::VPADDBZ256rr:
9697  case X86::VPADDWZ256rr:
9698  case X86::VPADDDZ256rr:
9699  case X86::VPADDQZ256rr:
9700  case X86::VPADDBZrr:
9701  case X86::VPADDWZrr:
9702  case X86::VPADDDZrr:
9703  case X86::VPADDQZrr:
9704  case X86::VPMULLWrr:
9705  case X86::VPMULLWYrr:
9706  case X86::VPMULLWZ128rr:
9707  case X86::VPMULLWZ256rr:
9708  case X86::VPMULLWZrr:
9709  case X86::VPMULLDrr:
9710  case X86::VPMULLDYrr:
9711  case X86::VPMULLDZ128rr:
9712  case X86::VPMULLDZ256rr:
9713  case X86::VPMULLDZrr:
9714  case X86::VPMULLQZ128rr:
9715  case X86::VPMULLQZ256rr:
9716  case X86::VPMULLQZrr:
9717  case X86::VPMAXSBrr:
9718  case X86::VPMAXSBYrr:
9719  case X86::VPMAXSBZ128rr:
9720  case X86::VPMAXSBZ256rr:
9721  case X86::VPMAXSBZrr:
9722  case X86::VPMAXSDrr:
9723  case X86::VPMAXSDYrr:
9724  case X86::VPMAXSDZ128rr:
9725  case X86::VPMAXSDZ256rr:
9726  case X86::VPMAXSDZrr:
9727  case X86::VPMAXSQZ128rr:
9728  case X86::VPMAXSQZ256rr:
9729  case X86::VPMAXSQZrr:
9730  case X86::VPMAXSWrr:
9731  case X86::VPMAXSWYrr:
9732  case X86::VPMAXSWZ128rr:
9733  case X86::VPMAXSWZ256rr:
9734  case X86::VPMAXSWZrr:
9735  case X86::VPMAXUBrr:
9736  case X86::VPMAXUBYrr:
9737  case X86::VPMAXUBZ128rr:
9738  case X86::VPMAXUBZ256rr:
9739  case X86::VPMAXUBZrr:
9740  case X86::VPMAXUDrr:
9741  case X86::VPMAXUDYrr:
9742  case X86::VPMAXUDZ128rr:
9743  case X86::VPMAXUDZ256rr:
9744  case X86::VPMAXUDZrr:
9745  case X86::VPMAXUQZ128rr:
9746  case X86::VPMAXUQZ256rr:
9747  case X86::VPMAXUQZrr:
9748  case X86::VPMAXUWrr:
9749  case X86::VPMAXUWYrr:
9750  case X86::VPMAXUWZ128rr:
9751  case X86::VPMAXUWZ256rr:
9752  case X86::VPMAXUWZrr:
9753  case X86::VPMINSBrr:
9754  case X86::VPMINSBYrr:
9755  case X86::VPMINSBZ128rr:
9756  case X86::VPMINSBZ256rr:
9757  case X86::VPMINSBZrr:
9758  case X86::VPMINSDrr:
9759  case X86::VPMINSDYrr:
9760  case X86::VPMINSDZ128rr:
9761  case X86::VPMINSDZ256rr:
9762  case X86::VPMINSDZrr:
9763  case X86::VPMINSQZ128rr:
9764  case X86::VPMINSQZ256rr:
9765  case X86::VPMINSQZrr:
9766  case X86::VPMINSWrr:
9767  case X86::VPMINSWYrr:
9768  case X86::VPMINSWZ128rr:
9769  case X86::VPMINSWZ256rr:
9770  case X86::VPMINSWZrr:
9771  case X86::VPMINUBrr:
9772  case X86::VPMINUBYrr:
9773  case X86::VPMINUBZ128rr:
9774  case X86::VPMINUBZ256rr:
9775  case X86::VPMINUBZrr:
9776  case X86::VPMINUDrr:
9777  case X86::VPMINUDYrr:
9778  case X86::VPMINUDZ128rr:
9779  case X86::VPMINUDZ256rr:
9780  case X86::VPMINUDZrr:
9781  case X86::VPMINUQZ128rr:
9782  case X86::VPMINUQZ256rr:
9783  case X86::VPMINUQZrr:
9784  case X86::VPMINUWrr:
9785  case X86::VPMINUWYrr:
9786  case X86::VPMINUWZ128rr:
9787  case X86::VPMINUWZ256rr:
9788  case X86::VPMINUWZrr:
9789  // Normal min/max instructions are not commutative because of NaN and signed
9790  // zero semantics, but these are. Thus, there's no need to check for global
9791  // relaxed math; the instructions themselves have the properties we need.
9792  case X86::MAXCPDrr:
9793  case X86::MAXCPSrr:
9794  case X86::MAXCSDrr:
9795  case X86::MAXCSSrr:
9796  case X86::MINCPDrr:
9797  case X86::MINCPSrr:
9798  case X86::MINCSDrr:
9799  case X86::MINCSSrr:
9800  case X86::VMAXCPDrr:
9801  case X86::VMAXCPSrr:
9802  case X86::VMAXCPDYrr:
9803  case X86::VMAXCPSYrr:
9804  case X86::VMAXCPDZ128rr:
9805  case X86::VMAXCPSZ128rr:
9806  case X86::VMAXCPDZ256rr:
9807  case X86::VMAXCPSZ256rr:
9808  case X86::VMAXCPDZrr:
9809  case X86::VMAXCPSZrr:
9810  case X86::VMAXCSDrr:
9811  case X86::VMAXCSSrr:
9812  case X86::VMAXCSDZrr:
9813  case X86::VMAXCSSZrr:
9814  case X86::VMINCPDrr:
9815  case X86::VMINCPSrr:
9816  case X86::VMINCPDYrr:
9817  case X86::VMINCPSYrr:
9818  case X86::VMINCPDZ128rr:
9819  case X86::VMINCPSZ128rr:
9820  case X86::VMINCPDZ256rr:
9821  case X86::VMINCPSZ256rr:
9822  case X86::VMINCPDZrr:
9823  case X86::VMINCPSZrr:
9824  case X86::VMINCSDrr:
9825  case X86::VMINCSSrr:
9826  case X86::VMINCSDZrr:
9827  case X86::VMINCSSZrr:
9828  case X86::VMAXCPHZ128rr:
9829  case X86::VMAXCPHZ256rr:
9830  case X86::VMAXCPHZrr:
9831  case X86::VMAXCSHZrr:
9832  case X86::VMINCPHZ128rr:
9833  case X86::VMINCPHZ256rr:
9834  case X86::VMINCPHZrr:
9835  case X86::VMINCSHZrr:
9836    return true;
9837  case X86::ADDPDrr:
9838  case X86::ADDPSrr:
9839  case X86::ADDSDrr:
9840  case X86::ADDSSrr:
9841  case X86::MULPDrr:
9842  case X86::MULPSrr:
9843  case X86::MULSDrr:
9844  case X86::MULSSrr:
9845  case X86::VADDPDrr:
9846  case X86::VADDPSrr:
9847  case X86::VADDPDYrr:
9848  case X86::VADDPSYrr:
9849  case X86::VADDPDZ128rr:
9850  case X86::VADDPSZ128rr:
9851  case X86::VADDPDZ256rr:
9852  case X86::VADDPSZ256rr:
9853  case X86::VADDPDZrr:
9854  case X86::VADDPSZrr:
9855  case X86::VADDSDrr:
9856  case X86::VADDSSrr:
9857  case X86::VADDSDZrr:
9858  case X86::VADDSSZrr:
9859  case X86::VMULPDrr:
9860  case X86::VMULPSrr:
9861  case X86::VMULPDYrr:
9862  case X86::VMULPSYrr:
9863  case X86::VMULPDZ128rr:
9864  case X86::VMULPSZ128rr:
9865  case X86::VMULPDZ256rr:
9866  case X86::VMULPSZ256rr:
9867  case X86::VMULPDZrr:
9868  case X86::VMULPSZrr:
9869  case X86::VMULSDrr:
9870  case X86::VMULSSrr:
9871  case X86::VMULSDZrr:
9872  case X86::VMULSSZrr:
9873  case X86::VADDPHZ128rr:
9874  case X86::VADDPHZ256rr:
9875  case X86::VADDPHZrr:
9876  case X86::VADDSHZrr:
9877  case X86::VMULPHZ128rr:
9878  case X86::VMULPHZ256rr:
9879  case X86::VMULPHZrr:
9880  case X86::VMULSHZrr:
9881    return Inst.getFlag(MachineInstr::MIFlag::FmReassoc) &&
9882           Inst.getFlag(MachineInstr::MIFlag::FmNsz);
9883  default:
9884    return false;
9885  }
9886}
9887
9888/// If \p DescribedReg overlaps with the MOVrr instruction's destination
9889/// register then, if possible, describe the value in terms of the source
9890/// register.
9891static std::optional<ParamLoadedValue>
9892describeMOVrrLoadedValue(const MachineInstr &MI, Register DescribedReg,
9893                         const TargetRegisterInfo *TRI) {
9894  Register DestReg = MI.getOperand(0).getReg();
9895  Register SrcReg = MI.getOperand(1).getReg();
9896
9897  auto Expr = DIExpression::get(MI.getMF()->getFunction().getContext(), {});
9898
9899  // If the described register is the destination, just return the source.
9900  if (DestReg == DescribedReg)
9901    return ParamLoadedValue(MachineOperand::CreateReg(SrcReg, false), Expr);
9902
9903  // If the described register is a sub-register of the destination register,
9904  // then pick out the source register's corresponding sub-register.
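  // For example, describing $ebx against '$rbx = MOV64rr $rdx' yields $edx.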
9905  if (unsigned SubRegIdx = TRI->getSubRegIndex(DestReg, DescribedReg)) {
9906    Register SrcSubReg = TRI->getSubReg(SrcReg, SubRegIdx);
9907    return ParamLoadedValue(MachineOperand::CreateReg(SrcSubReg, false), Expr);
9908  }
9909
9910  // The remaining case to consider is when the described register is a
9911  // super-register of the destination register. MOV8rr and MOV16rr do not
9912  // write to any of the other bytes in the register, meaning that we'd have to
9913  // describe the value using a combination of the source register and the
9914  // non-overlapping bits in the described register, which is not currently
9915  // possible.
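  // For example, describing $rax against '$ax = MOV16rr $bx' would require
  // combining $bx with bits 16-63 of $rax, so those opcodes bail out below.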
9916  if (MI.getOpcode() == X86::MOV8rr || MI.getOpcode() == X86::MOV16rr ||
9917      !TRI->isSuperRegister(DestReg, DescribedReg))
9918    return std::nullopt;
9919
9920  assert(MI.getOpcode() == X86::MOV32rr && "Unexpected super-register case");
9921  return ParamLoadedValue(MachineOperand::CreateReg(SrcReg, false), Expr);
9922}
9923
9924std::optional<ParamLoadedValue>
9925X86InstrInfo::describeLoadedValue(const MachineInstr &MI, Register Reg) const {
9926  const MachineOperand *Op = nullptr;
9927  DIExpression *Expr = nullptr;
9928
9929  const TargetRegisterInfo *TRI = &getRegisterInfo();
9930
9931  switch (MI.getOpcode()) {
9932  case X86::LEA32r:
9933  case X86::LEA64r:
9934  case X86::LEA64_32r: {
9935    // We may need to describe a 64-bit parameter with a 32-bit LEA.
9936    if (!TRI->isSuperRegisterEq(MI.getOperand(0).getReg(), Reg))
9937      return std::nullopt;
9938
9939    // Operand 4 could be a global address. For now we do not support
9940    // that situation.
9941    if (!MI.getOperand(4).isImm() || !MI.getOperand(2).isImm())
9942      return std::nullopt;
9943
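    // Illustrative example (register choice is arbitrary): for
    //   $rdi = LEA64r $rbx, 2, $rcx, 16, $noreg
    // the parameter value is described as $rbx + $rcx * 2 + 16, built below
    // from a DW_OP_breg for the index register, a multiply by the scale, a
    // DW_OP_plus, and the displacement appended as a final offset.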
9944    const MachineOperand &Op1 = MI.getOperand(1);
9945    const MachineOperand &Op2 = MI.getOperand(3);
9946    assert(Op2.isReg() &&
9947           (Op2.getReg() == X86::NoRegister || Op2.getReg().isPhysical()));
9948
9949    // Omit situations like:
9950    // %rsi = lea %rsi, 4, ...
9951    if ((Op1.isReg() && Op1.getReg() == MI.getOperand(0).getReg()) ||
9952        Op2.getReg() == MI.getOperand(0).getReg())
9953      return std::nullopt;
9954    else if ((Op1.isReg() && Op1.getReg() != X86::NoRegister &&
9955              TRI->regsOverlap(Op1.getReg(), MI.getOperand(0).getReg())) ||
9956             (Op2.getReg() != X86::NoRegister &&
9957              TRI->regsOverlap(Op2.getReg(), MI.getOperand(0).getReg())))
9958      return std::nullopt;
9959
9960    int64_t Coef = MI.getOperand(2).getImm();
9961    int64_t Offset = MI.getOperand(4).getImm();
9962    SmallVector<uint64_t, 8> Ops;
9963
9964    if ((Op1.isReg() && Op1.getReg() != X86::NoRegister)) {
9965      Op = &Op1;
9966    } else if (Op1.isFI())
9967      Op = &Op1;
9968
9969    if (Op && Op->isReg() && Op->getReg() == Op2.getReg() && Coef > 0) {
9970      Ops.push_back(dwarf::DW_OP_constu);
9971      Ops.push_back(Coef + 1);
9972      Ops.push_back(dwarf::DW_OP_mul);
9973    } else {
9974      if (Op && Op2.getReg() != X86::NoRegister) {
9975        int dwarfReg = TRI->getDwarfRegNum(Op2.getReg(), false);
9976        if (dwarfReg < 0)
9977          return std::nullopt;
9978        else if (dwarfReg < 32) {
9979          Ops.push_back(dwarf::DW_OP_breg0 + dwarfReg);
9980          Ops.push_back(0);
9981        } else {
9982          Ops.push_back(dwarf::DW_OP_bregx);
9983          Ops.push_back(dwarfReg);
9984          Ops.push_back(0);
9985        }
9986      } else if (!Op) {
9987        assert(Op2.getReg() != X86::NoRegister);
9988        Op = &Op2;
9989      }
9990
9991      if (Coef > 1) {
9992        assert(Op2.getReg() != X86::NoRegister);
9993        Ops.push_back(dwarf::DW_OP_constu);
9994        Ops.push_back(Coef);
9995        Ops.push_back(dwarf::DW_OP_mul);
9996      }
9997
9998      if (((Op1.isReg() && Op1.getReg() != X86::NoRegister) || Op1.isFI()) &&
9999          Op2.getReg() != X86::NoRegister) {
10000        Ops.push_back(dwarf::DW_OP_plus);
10001      }
10002    }
10003
10004    DIExpression::appendOffset(Ops, Offset);
10005    Expr = DIExpression::get(MI.getMF()->getFunction().getContext(), Ops);
10006
10007    return ParamLoadedValue(*Op, Expr);
10008  }
10009  case X86::MOV8ri:
10010  case X86::MOV16ri:
10011    // TODO: Handle MOV8ri and MOV16ri.
10012    return std::nullopt;
10013  case X86::MOV32ri:
10014  case X86::MOV64ri:
10015  case X86::MOV64ri32:
10016    // MOV32ri may be used for producing zero-extended 32-bit immediates in
10017    // 64-bit parameters, so we need to consider super-registers.
10018    if (!TRI->isSuperRegisterEq(MI.getOperand(0).getReg(), Reg))
10019      return std::nullopt;
10020    return ParamLoadedValue(MI.getOperand(1), Expr);
10021  case X86::MOV8rr:
10022  case X86::MOV16rr:
10023  case X86::MOV32rr:
10024  case X86::MOV64rr:
10025    return describeMOVrrLoadedValue(MI, Reg, TRI);
10026  case X86::XOR32rr: {
10027    // 64-bit parameters are zero-materialized using XOR32rr, so also consider
10028    // super-registers.
10029    if (!TRI->isSuperRegisterEq(MI.getOperand(0).getReg(), Reg))
10030      return std::nullopt;
10031    if (MI.getOperand(1).getReg() == MI.getOperand(2).getReg())
10032      return ParamLoadedValue(MachineOperand::CreateImm(0), Expr);
10033    return std::nullopt;
10034  }
10035  case X86::MOVSX64rr32: {
10036    // We may need to describe the lower 32 bits of the MOVSX; for example, in
10037    // cases like this:
10038    //
10039    //  $ebx = [...]
10040    //  $rdi = MOVSX64rr32 $ebx
10041    //  $esi = MOV32rr $edi
10042    if (!TRI->isSubRegisterEq(MI.getOperand(0).getReg(), Reg))
10043      return std::nullopt;
10044
10045    Expr = DIExpression::get(MI.getMF()->getFunction().getContext(), {});
10046
10047    // If the described register is the destination register, we need to
10048    // sign-extend the source register from 32 bits. The other case we handle
10049    // is when the described register is the 32-bit sub-register of the
10050    // destination register, in which case we just need to return the source
10051    // register.
10052    if (Reg == MI.getOperand(0).getReg())
10053      Expr = DIExpression::appendExt(Expr, 32, 64, true);
10054    else
10055      assert(X86MCRegisterClasses[X86::GR32RegClassID].contains(Reg) &&
10056             "Unhandled sub-register case for MOVSX64rr32");
10057
10058    return ParamLoadedValue(MI.getOperand(1), Expr);
10059  }
10060  default:
10061    assert(!MI.isMoveImmediate() && "Unexpected MoveImm instruction");
10062    return TargetInstrInfo::describeLoadedValue(MI, Reg);
10063  }
10064}
10065
10066/// This is an architecture-specific helper function of reassociateOps.
10067/// Set special operand attributes for new instructions after reassociation.
10068void X86InstrInfo::setSpecialOperandAttr(MachineInstr &OldMI1,
10069                                         MachineInstr &OldMI2,
10070                                         MachineInstr &NewMI1,
10071                                         MachineInstr &NewMI2) const {
10072  // Integer instructions may define an implicit EFLAGS dest register operand.
10073  MachineOperand *OldFlagDef1 = OldMI1.findRegisterDefOperand(X86::EFLAGS);
10074  MachineOperand *OldFlagDef2 = OldMI2.findRegisterDefOperand(X86::EFLAGS);
10075
10076  assert(!OldFlagDef1 == !OldFlagDef2 &&
10077         "Unexpected instruction type for reassociation");
10078
10079  if (!OldFlagDef1 || !OldFlagDef2)
10080    return;
10081
10082  assert(OldFlagDef1->isDead() && OldFlagDef2->isDead() &&
10083         "Must have dead EFLAGS operand in reassociable instruction");
10084
10085  MachineOperand *NewFlagDef1 = NewMI1.findRegisterDefOperand(X86::EFLAGS);
10086  MachineOperand *NewFlagDef2 = NewMI2.findRegisterDefOperand(X86::EFLAGS);
10087
10088  assert(NewFlagDef1 && NewFlagDef2 &&
10089         "Unexpected operand in reassociable instruction");
10090
10091  // Mark the new EFLAGS operands as dead to be helpful to subsequent iterations
10092  // of this pass or other passes. The EFLAGS operands must be dead in these new
10093  // instructions because the EFLAGS operands in the original instructions must
10094  // be dead in order for reassociation to occur.
10095  NewFlagDef1->setIsDead();
10096  NewFlagDef2->setIsDead();
10097}
10098
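// X86 machine operand target flags (X86II::MO_*) are plain enumerated values
// rather than a direct/bitmask pair, so the whole value is returned as the
// "direct" component and the bitmask half is always zero.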
10099std::pair<unsigned, unsigned>
10100X86InstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const {
10101  return std::make_pair(TF, 0u);
10102}
10103
10104ArrayRef<std::pair<unsigned, const char *>>
10105X86InstrInfo::getSerializableDirectMachineOperandTargetFlags() const {
10106  using namespace X86II;
10107  static const std::pair<unsigned, const char *> TargetFlags[] = {
10108      {MO_GOT_ABSOLUTE_ADDRESS, "x86-got-absolute-address"},
10109      {MO_PIC_BASE_OFFSET, "x86-pic-base-offset"},
10110      {MO_GOT, "x86-got"},
10111      {MO_GOTOFF, "x86-gotoff"},
10112      {MO_GOTPCREL, "x86-gotpcrel"},
10113      {MO_GOTPCREL_NORELAX, "x86-gotpcrel-norelax"},
10114      {MO_PLT, "x86-plt"},
10115      {MO_TLSGD, "x86-tlsgd"},
10116      {MO_TLSLD, "x86-tlsld"},
10117      {MO_TLSLDM, "x86-tlsldm"},
10118      {MO_GOTTPOFF, "x86-gottpoff"},
10119      {MO_INDNTPOFF, "x86-indntpoff"},
10120      {MO_TPOFF, "x86-tpoff"},
10121      {MO_DTPOFF, "x86-dtpoff"},
10122      {MO_NTPOFF, "x86-ntpoff"},
10123      {MO_GOTNTPOFF, "x86-gotntpoff"},
10124      {MO_DLLIMPORT, "x86-dllimport"},
10125      {MO_DARWIN_NONLAZY, "x86-darwin-nonlazy"},
10126      {MO_DARWIN_NONLAZY_PIC_BASE, "x86-darwin-nonlazy-pic-base"},
10127      {MO_TLVP, "x86-tlvp"},
10128      {MO_TLVP_PIC_BASE, "x86-tlvp-pic-base"},
10129      {MO_SECREL, "x86-secrel"},
10130      {MO_COFFSTUB, "x86-coffstub"}};
10131  return ArrayRef(TargetFlags);
10132}
10133
10134namespace {
10135/// Create Global Base Reg pass. This initializes the PIC
10136/// global base register for x86-32.
10137struct CGBR : public MachineFunctionPass {
10138  static char ID;
10139  CGBR() : MachineFunctionPass(ID) {}
10140
10141  bool runOnMachineFunction(MachineFunction &MF) override {
10142    const X86TargetMachine *TM =
10143        static_cast<const X86TargetMachine *>(&MF.getTarget());
10144    const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();
10145
10146    // Only emit a global base reg in PIC mode.
10147    if (!TM->isPositionIndependent())
10148      return false;
10149
10150    X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
10151    Register GlobalBaseReg = X86FI->getGlobalBaseReg();
10152
10153    // If we didn't need a GlobalBaseReg, don't insert code.
10154    if (GlobalBaseReg == 0)
10155      return false;
10156
10157    // Insert the code that sets GlobalBaseReg into the first MBB of the function.
10158    MachineBasicBlock &FirstMBB = MF.front();
10159    MachineBasicBlock::iterator MBBI = FirstMBB.begin();
10160    DebugLoc DL = FirstMBB.findDebugLoc(MBBI);
10161    MachineRegisterInfo &RegInfo = MF.getRegInfo();
10162    const X86InstrInfo *TII = STI.getInstrInfo();
10163
10164    Register PC;
10165    if (STI.isPICStyleGOT())
10166      PC = RegInfo.createVirtualRegister(&X86::GR32RegClass);
10167    else
10168      PC = GlobalBaseReg;
10169
10170    if (STI.is64Bit()) {
10171      if (TM->getCodeModel() == CodeModel::Large) {
10172        // In the large code model, we are aiming for this code, though the
10173        // register allocation may vary:
10174        //   leaq .LN$pb(%rip), %rax
10175        //   movq $_GLOBAL_OFFSET_TABLE_ - .LN$pb, %rcx
10176        //   addq %rcx, %rax
10177        // RAX now holds address of _GLOBAL_OFFSET_TABLE_.
10178        Register PBReg = RegInfo.createVirtualRegister(&X86::GR64RegClass);
10179        Register GOTReg = RegInfo.createVirtualRegister(&X86::GR64RegClass);
10180        BuildMI(FirstMBB, MBBI, DL, TII->get(X86::LEA64r), PBReg)
10181            .addReg(X86::RIP)
10182            .addImm(0)
10183            .addReg(0)
10184            .addSym(MF.getPICBaseSymbol())
10185            .addReg(0);
10186        std::prev(MBBI)->setPreInstrSymbol(MF, MF.getPICBaseSymbol());
10187        BuildMI(FirstMBB, MBBI, DL, TII->get(X86::MOV64ri), GOTReg)
10188            .addExternalSymbol("_GLOBAL_OFFSET_TABLE_",
10189                               X86II::MO_PIC_BASE_OFFSET);
10190        BuildMI(FirstMBB, MBBI, DL, TII->get(X86::ADD64rr), PC)
10191            .addReg(PBReg, RegState::Kill)
10192            .addReg(GOTReg, RegState::Kill);
10193      } else {
10194        // In other code models, use a RIP-relative LEA to materialize the
10195        // GOT.
10196        BuildMI(FirstMBB, MBBI, DL, TII->get(X86::LEA64r), PC)
10197            .addReg(X86::RIP)
10198            .addImm(0)
10199            .addReg(0)
10200            .addExternalSymbol("_GLOBAL_OFFSET_TABLE_")
10201            .addReg(0);
10202      }
10203    } else {
10204      // The operand of MovePCtoStack is completely ignored by the asm printer.
10205      // It's only used in JIT code emission as a displacement to the PC.
10206      BuildMI(FirstMBB, MBBI, DL, TII->get(X86::MOVPC32r), PC).addImm(0);
10207
10208      // If we're using vanilla 'GOT' PIC style, we should use relative
10209      // addressing not to the PC, but to the _GLOBAL_OFFSET_TABLE_ external symbol.
10210      if (STI.isPICStyleGOT()) {
10211        // Generate addl $__GLOBAL_OFFSET_TABLE_ + [.-piclabel],
10212        // %some_register
10213        BuildMI(FirstMBB, MBBI, DL, TII->get(X86::ADD32ri), GlobalBaseReg)
10214            .addReg(PC)
10215            .addExternalSymbol("_GLOBAL_OFFSET_TABLE_",
10216                               X86II::MO_GOT_ABSOLUTE_ADDRESS);
10217      }
10218    }
10219
10220    return true;
10221  }
10222
10223  StringRef getPassName() const override {
10224    return "X86 PIC Global Base Reg Initialization";
10225  }
10226
10227  void getAnalysisUsage(AnalysisUsage &AU) const override {
10228    AU.setPreservesCFG();
10229    MachineFunctionPass::getAnalysisUsage(AU);
10230  }
10231};
10232} // namespace
10233
10234char CGBR::ID = 0;
10235FunctionPass *llvm::createX86GlobalBaseRegPass() { return new CGBR(); }
10236
10237namespace {
10238struct LDTLSCleanup : public MachineFunctionPass {
10239  static char ID;
10240  LDTLSCleanup() : MachineFunctionPass(ID) {}
10241
10242  bool runOnMachineFunction(MachineFunction &MF) override {
10243    if (skipFunction(MF.getFunction()))
10244      return false;
10245
10246    X86MachineFunctionInfo *MFI = MF.getInfo<X86MachineFunctionInfo>();
10247    if (MFI->getNumLocalDynamicTLSAccesses() < 2) {
10248      // No point folding accesses if there aren't at least two.
10249      return false;
10250    }
10251
10252    MachineDominatorTree *DT = &getAnalysis<MachineDominatorTree>();
10253    return VisitNode(DT->getRootNode(), 0);
10254  }
10255
10256  // Visit the dominator subtree rooted at Node in pre-order.
10257  // If TLSBaseAddrReg is non-null, then use that to replace any
10258  // TLS_base_addr instructions. Otherwise, create the register
10259  // when the first such instruction is seen, and then use it
10260  // as we encounter more instructions.
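  // For example, in a function with two TLS_base_addr64 calls, the first call
  // is kept, its result in $rax is copied into a fresh virtual register, and
  // the dominated second call is replaced by a copy from that register back
  // into $rax.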
10261  bool VisitNode(MachineDomTreeNode *Node, unsigned TLSBaseAddrReg) {
10262    MachineBasicBlock *BB = Node->getBlock();
10263    bool Changed = false;
10264
10265    // Traverse the current block.
10266    for (MachineBasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;
10267         ++I) {
10268      switch (I->getOpcode()) {
10269      case X86::TLS_base_addr32:
10270      case X86::TLS_base_addr64:
10271        if (TLSBaseAddrReg)
10272          I = ReplaceTLSBaseAddrCall(*I, TLSBaseAddrReg);
10273        else
10274          I = SetRegister(*I, &TLSBaseAddrReg);
10275        Changed = true;
10276        break;
10277      default:
10278        break;
10279      }
10280    }
10281
10282    // Visit the children of this block in the dominator tree.
10283    for (auto &I : *Node) {
10284      Changed |= VisitNode(I, TLSBaseAddrReg);
10285    }
10286
10287    return Changed;
10288  }
10289
10290  // Replace the TLS_base_addr instruction I with a copy from
10291  // TLSBaseAddrReg, returning the new instruction.
10292  MachineInstr *ReplaceTLSBaseAddrCall(MachineInstr &I,
10293                                       unsigned TLSBaseAddrReg) {
10294    MachineFunction *MF = I.getParent()->getParent();
10295    const X86Subtarget &STI = MF->getSubtarget<X86Subtarget>();
10296    const bool is64Bit = STI.is64Bit();
10297    const X86InstrInfo *TII = STI.getInstrInfo();
10298
10299    // Insert a Copy from TLSBaseAddrReg to RAX/EAX.
10300    MachineInstr *Copy =
10301        BuildMI(*I.getParent(), I, I.getDebugLoc(),
10302                TII->get(TargetOpcode::COPY), is64Bit ? X86::RAX : X86::EAX)
10303            .addReg(TLSBaseAddrReg);
10304
10305    // Erase the TLS_base_addr instruction.
10306    I.eraseFromParent();
10307
10308    return Copy;
10309  }
10310
10311  // Create a virtual register in *TLSBaseAddrReg, and populate it by
10312  // inserting a copy instruction after I. Returns the new instruction.
10313  MachineInstr *SetRegister(MachineInstr &I, unsigned *TLSBaseAddrReg) {
10314    MachineFunction *MF = I.getParent()->getParent();
10315    const X86Subtarget &STI = MF->getSubtarget<X86Subtarget>();
10316    const bool is64Bit = STI.is64Bit();
10317    const X86InstrInfo *TII = STI.getInstrInfo();
10318
10319    // Create a virtual register for the TLS base address.
10320    MachineRegisterInfo &RegInfo = MF->getRegInfo();
10321    *TLSBaseAddrReg = RegInfo.createVirtualRegister(
10322        is64Bit ? &X86::GR64RegClass : &X86::GR32RegClass);
10323
10324    // Insert a copy from RAX/EAX to TLSBaseAddrReg.
10325    MachineInstr *Next = I.getNextNode();
10326    MachineInstr *Copy = BuildMI(*I.getParent(), Next, I.getDebugLoc(),
10327                                 TII->get(TargetOpcode::COPY), *TLSBaseAddrReg)
10328                             .addReg(is64Bit ? X86::RAX : X86::EAX);
10329
10330    return Copy;
10331  }
10332
10333  StringRef getPassName() const override {
10334    return "Local Dynamic TLS Access Clean-up";
10335  }
10336
10337  void getAnalysisUsage(AnalysisUsage &AU) const override {
10338    AU.setPreservesCFG();
10339    AU.addRequired<MachineDominatorTree>();
10340    MachineFunctionPass::getAnalysisUsage(AU);
10341  }
10342};
10343} // namespace
10344
10345char LDTLSCleanup::ID = 0;
10346FunctionPass *llvm::createCleanupLocalDynamicTLSPass() {
10347  return new LDTLSCleanup();
10348}
10349
10350/// Constants defining how certain sequences should be outlined.
10351///
10352/// \p MachineOutlinerDefault implies that the function is called with a call
10353/// instruction, and a return must be emitted for the outlined function frame.
10354///
10355/// That is,
10356///
10357/// I1                                 OUTLINED_FUNCTION:
10358/// I2 --> call OUTLINED_FUNCTION       I1
10359/// I3                                  I2
10360///                                     I3
10361///                                     ret
10362///
10363/// * Call construction overhead: 1 (call instruction)
10364/// * Frame construction overhead: 1 (return instruction)
10365///
10366/// \p MachineOutlinerTailCall implies that the function is being tail called.
10367/// A jump is emitted instead of a call, and the return is already present in
10368/// the outlined sequence. That is,
10369///
10370/// I1                                 OUTLINED_FUNCTION:
10371/// I2 --> jmp OUTLINED_FUNCTION       I1
10372/// ret                                I2
10373///                                    ret
10374///
10375/// * Call construction overhead: 1 (jump instruction)
10376/// * Frame construction overhead: 0 (don't need to return)
10377///
10378enum MachineOutlinerClass { MachineOutlinerDefault, MachineOutlinerTailCall };
10379
10380std::optional<outliner::OutlinedFunction>
10381X86InstrInfo::getOutliningCandidateInfo(
10382    std::vector<outliner::Candidate> &RepeatedSequenceLocs) const {
10383  unsigned SequenceSize = 0;
10384  for (auto &MI : RepeatedSequenceLocs[0]) {
10385    // FIXME: x86 doesn't implement getInstSizeInBytes, so
10386    // we can't tell the cost.  Just assume each instruction
10387    // is one byte.
10388    if (MI.isDebugInstr() || MI.isKill())
10389      continue;
10390    SequenceSize += 1;
10391  }
10392
10393  // Check whether CFI instructions are present, and if they are,
10394  // count the number of CFI instructions in the candidate sequence.
10395  unsigned CFICount = 0;
10396  for (auto &I : RepeatedSequenceLocs[0]) {
10397    if (I.isCFIInstruction())
10398      CFICount++;
10399  }
10400
10401  // For each candidate, compare the number of CFI instructions found to the
10402  // number of CFI instructions in the parent function. We must check this
10403  // because if we outline one of the CFI instructions in a function, we have to
10404  // outline them all for correctness. If we do not, the address offsets will be
10405  // incorrect between the two sections of the program.
10406  for (outliner::Candidate &C : RepeatedSequenceLocs) {
10407    std::vector<MCCFIInstruction> CFIInstructions =
10408        C.getMF()->getFrameInstructions();
10409
10410    if (CFICount > 0 && CFICount != CFIInstructions.size())
10411      return std::nullopt;
10412  }
10413
10414  // FIXME: Use real size in bytes for call and ret instructions.
10415  if (RepeatedSequenceLocs[0].back().isTerminator()) {
10416    for (outliner::Candidate &C : RepeatedSequenceLocs)
10417      C.setCallInfo(MachineOutlinerTailCall, 1);
10418
10419    return outliner::OutlinedFunction(RepeatedSequenceLocs, SequenceSize,
10420                                      0, // Number of bytes to emit frame.
10421                                      MachineOutlinerTailCall // Type of frame.
10422    );
10423  }
10424
10425  if (CFICount > 0)
10426    return std::nullopt;
10427
10428  for (outliner::Candidate &C : RepeatedSequenceLocs)
10429    C.setCallInfo(MachineOutlinerDefault, 1);
10430
10431  return outliner::OutlinedFunction(RepeatedSequenceLocs, SequenceSize, 1,
10432                                    MachineOutlinerDefault);
10433}
10434
10435bool X86InstrInfo::isFunctionSafeToOutlineFrom(
10436    MachineFunction &MF, bool OutlineFromLinkOnceODRs) const {
10437  const Function &F = MF.getFunction();
10438
10439  // Does the function use a red zone? If it does, then we can't risk messing
10440  // with the stack.
10441  if (Subtarget.getFrameLowering()->has128ByteRedZone(MF)) {
10442    // It could have a red zone. If it does, then we don't want to touch it.
10443    const X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
10444    if (!X86FI || X86FI->getUsesRedZone())
10445      return false;
10446  }
10447
10448  // If we *don't* want to outline from things that could potentially be deduped,
10449  // then return false.
10450  if (!OutlineFromLinkOnceODRs && F.hasLinkOnceODRLinkage())
10451    return false;
10452
10453  // This function is viable for outlining, so return true.
10454  return true;
10455}
10456
10457outliner::InstrType
10458X86InstrInfo::getOutliningTypeImpl(MachineBasicBlock::iterator &MIT,
10459                                   unsigned Flags) const {
10460  MachineInstr &MI = *MIT;
10461
10462  // Is this a terminator for a basic block?
10463  if (MI.isTerminator())
10464    // TargetInstrInfo::getOutliningType has already filtered out anything
10465    // that would break this, so we can allow it here.
10466    return outliner::InstrType::Legal;
10467
10468  // Don't outline anything that modifies or reads from the stack pointer.
10469  //
10470  // FIXME: There are instructions which are being manually built without
10471  // explicit uses/defs so we also have to check the MCInstrDesc. We should be
10472  // able to remove the extra checks once those are fixed up. For example,
10473  // sometimes we might get something like %rax = POP64r 1. This won't be
10474  // caught by modifiesRegister or readsRegister even though the instruction
10475  // really ought to be formed so that modifiesRegister/readsRegister would
10476  // catch it.
10477  if (MI.modifiesRegister(X86::RSP, &RI) || MI.readsRegister(X86::RSP, &RI) ||
10478      MI.getDesc().hasImplicitUseOfPhysReg(X86::RSP) ||
10479      MI.getDesc().hasImplicitDefOfPhysReg(X86::RSP))
10480    return outliner::InstrType::Illegal;
10481
10482  // Outlined calls change the instruction pointer, so don't read from it.
10483  if (MI.readsRegister(X86::RIP, &RI) ||
10484      MI.getDesc().hasImplicitUseOfPhysReg(X86::RIP) ||
10485      MI.getDesc().hasImplicitDefOfPhysReg(X86::RIP))
10486    return outliner::InstrType::Illegal;
10487
10488  // Don't outline CFI instructions.
10489  if (MI.isCFIInstruction())
10490    return outliner::InstrType::Illegal;
10491
10492  return outliner::InstrType::Legal;
10493}
10494
10495void X86InstrInfo::buildOutlinedFrame(
10496    MachineBasicBlock &MBB, MachineFunction &MF,
10497    const outliner::OutlinedFunction &OF) const {
10498  // If we're a tail call, we already have a return, so don't do anything.
10499  if (OF.FrameConstructionID == MachineOutlinerTailCall)
10500    return;
10501
10502  // We're a normal call, so our sequence doesn't have a return instruction.
10503  // Add it in.
10504  MachineInstr *retq = BuildMI(MF, DebugLoc(), get(X86::RET64));
10505  MBB.insert(MBB.end(), retq);
10506}
10507
10508MachineBasicBlock::iterator X86InstrInfo::insertOutlinedCall(
10509    Module &M, MachineBasicBlock &MBB, MachineBasicBlock::iterator &It,
10510    MachineFunction &MF, outliner::Candidate &C) const {
10511  // Is it a tail call?
10512  if (C.CallConstructionID == MachineOutlinerTailCall) {
10513    // Yes, just insert a JMP.
10514    It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(X86::TAILJMPd64))
10515                            .addGlobalAddress(M.getNamedValue(MF.getName())));
10516  } else {
10517    // No, insert a call.
10518    It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(X86::CALL64pcrel32))
10519                            .addGlobalAddress(M.getNamedValue(MF.getName())));
10520  }
10521
10522  return It;
10523}
10524
10525void X86InstrInfo::buildClearRegister(Register Reg, MachineBasicBlock &MBB,
10526                                      MachineBasicBlock::iterator Iter,
10527                                      DebugLoc &DL,
10528                                      bool AllowSideEffects) const {
10529  const MachineFunction &MF = *MBB.getParent();
10530  const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>();
10531  const TargetRegisterInfo &TRI = getRegisterInfo();
10532
10533  if (ST.hasMMX() && X86::VR64RegClass.contains(Reg))
10534    // FIXME: Should we ignore MMX registers?
10535    return;
10536
10537  if (TRI.isGeneralPurposeRegister(MF, Reg)) {
10538    // Convert register to the 32-bit version. Both 'movl' and 'xorl' clear the
10539    // upper bits of a 64-bit register automagically.
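    // For example, clearing $rcx emits 'xorl %ecx, %ecx' (or 'movl $0, %ecx'
    // when EFLAGS must be preserved); the 32-bit write implicitly zeroes the
    // upper half of the 64-bit register.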
10540    Reg = getX86SubSuperRegister(Reg, 32);
10541
10542    if (!AllowSideEffects)
10543      // XOR affects flags, so use a MOV instead.
10544      BuildMI(MBB, Iter, DL, get(X86::MOV32ri), Reg).addImm(0);
10545    else
10546      BuildMI(MBB, Iter, DL, get(X86::XOR32rr), Reg)
10547          .addReg(Reg, RegState::Undef)
10548          .addReg(Reg, RegState::Undef);
10549  } else if (X86::VR128RegClass.contains(Reg)) {
10550    // XMM#
10551    if (!ST.hasSSE1())
10552      return;
10553
10554    // PXOR is safe to use because it doesn't affect flags.
10555    BuildMI(MBB, Iter, DL, get(X86::PXORrr), Reg)
10556        .addReg(Reg, RegState::Undef)
10557        .addReg(Reg, RegState::Undef);
10558  } else if (X86::VR256RegClass.contains(Reg)) {
10559    // YMM#
10560    if (!ST.hasAVX())
10561      return;
10562
10563    // VPXOR is safe to use because it doesn't affect flags.
10564    BuildMI(MBB, Iter, DL, get(X86::VPXORrr), Reg)
10565        .addReg(Reg, RegState::Undef)
10566        .addReg(Reg, RegState::Undef);
10567  } else if (X86::VR512RegClass.contains(Reg)) {
10568    // ZMM#
10569    if (!ST.hasAVX512())
10570      return;
10571
10572    // VPXORY is safe to use because it doesn't affect flags.
10573    BuildMI(MBB, Iter, DL, get(X86::VPXORYrr), Reg)
10574        .addReg(Reg, RegState::Undef)
10575        .addReg(Reg, RegState::Undef);
10576  } else if (X86::VK1RegClass.contains(Reg) || X86::VK2RegClass.contains(Reg) ||
10577             X86::VK4RegClass.contains(Reg) || X86::VK8RegClass.contains(Reg) ||
10578             X86::VK16RegClass.contains(Reg)) {
10579    if (!ST.hasVLX())
10580      return;
10581
10582    // KXOR is safe to use because it doesn't affect flags.
10583    unsigned Op = ST.hasBWI() ? X86::KXORQrr : X86::KXORWrr;
10584    BuildMI(MBB, Iter, DL, get(Op), Reg)
10585        .addReg(Reg, RegState::Undef)
10586        .addReg(Reg, RegState::Undef);
10587  }
10588}
10589
10590bool X86InstrInfo::getMachineCombinerPatterns(
10591    MachineInstr &Root, SmallVectorImpl<MachineCombinerPattern> &Patterns,
10592    bool DoRegPressureReduce) const {
10593  unsigned Opc = Root.getOpcode();
10594  switch (Opc) {
10595  default:
10596    return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns,
10597                                                       DoRegPressureReduce);
10598  case X86::VPDPWSSDrr:
10599  case X86::VPDPWSSDrm:
10600  case X86::VPDPWSSDYrr:
10601  case X86::VPDPWSSDYrm: {
10602    Patterns.push_back(MachineCombinerPattern::DPWSSD);
10603    return true;
10604  }
10605  case X86::VPDPWSSDZ128r:
10606  case X86::VPDPWSSDZ128m:
10607  case X86::VPDPWSSDZ256r:
10608  case X86::VPDPWSSDZ256m:
10609  case X86::VPDPWSSDZr:
10610  case X86::VPDPWSSDZm: {
10611    if (Subtarget.hasBWI())
10612      Patterns.push_back(MachineCombinerPattern::DPWSSD);
10613    return true;
10614  }
10615  }
10616}
10617
10618static void
10619genAlternativeDpCodeSequence(MachineInstr &Root, const TargetInstrInfo &TII,
10620                             SmallVectorImpl<MachineInstr *> &InsInstrs,
10621                             SmallVectorImpl<MachineInstr *> &DelInstrs,
10622                             DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) {
10623  MachineFunction *MF = Root.getMF();
10624  MachineRegisterInfo &RegInfo = MF->getRegInfo();
10625
10626  unsigned Opc = Root.getOpcode();
10627  unsigned AddOpc = 0;
10628  unsigned MaddOpc = 0;
10629  switch (Opc) {
10630  default:
10631    assert(false && "It should not reach here");
10632    break;
10633  // vpdpwssd xmm2,xmm3,xmm1
10634  // -->
10635  // vpmaddwd xmm3,xmm3,xmm1
10636  // vpaddd xmm2,xmm2,xmm3
10637  case X86::VPDPWSSDrr:
10638    MaddOpc = X86::VPMADDWDrr;
10639    AddOpc = X86::VPADDDrr;
10640    break;
10641  case X86::VPDPWSSDrm:
10642    MaddOpc = X86::VPMADDWDrm;
10643    AddOpc = X86::VPADDDrr;
10644    break;
10645  case X86::VPDPWSSDZ128r:
10646    MaddOpc = X86::VPMADDWDZ128rr;
10647    AddOpc = X86::VPADDDZ128rr;
10648    break;
10649  case X86::VPDPWSSDZ128m:
10650    MaddOpc = X86::VPMADDWDZ128rm;
10651    AddOpc = X86::VPADDDZ128rr;
10652    break;
10653  // vpdpwssd ymm2,ymm3,ymm1
10654  // -->
10655  // vpmaddwd ymm3,ymm3,ymm1
10656  // vpaddd ymm2,ymm2,ymm3
10657  case X86::VPDPWSSDYrr:
10658    MaddOpc = X86::VPMADDWDYrr;
10659    AddOpc = X86::VPADDDYrr;
10660    break;
10661  case X86::VPDPWSSDYrm:
10662    MaddOpc = X86::VPMADDWDYrm;
10663    AddOpc = X86::VPADDDYrr;
10664    break;
10665  case X86::VPDPWSSDZ256r:
10666    MaddOpc = X86::VPMADDWDZ256rr;
10667    AddOpc = X86::VPADDDZ256rr;
10668    break;
10669  case X86::VPDPWSSDZ256m:
10670    MaddOpc = X86::VPMADDWDZ256rm;
10671    AddOpc = X86::VPADDDZ256rr;
10672    break;
10673  // vpdpwssd zmm2,zmm3,zmm1
10674  // -->
10675  // vpmaddwd zmm3,zmm3,zmm1
10676  // vpaddd zmm2,zmm2,zmm3
10677  case X86::VPDPWSSDZr:
10678    MaddOpc = X86::VPMADDWDZrr;
10679    AddOpc = X86::VPADDDZrr;
10680    break;
10681  case X86::VPDPWSSDZm:
10682    MaddOpc = X86::VPMADDWDZrm;
10683    AddOpc = X86::VPADDDZrr;
10684    break;
10685  }
10686  // Create vpmaddwd.
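  // VPDPWSSD computes dst = acc + pmaddwd(src1, src2) with the accumulator
  // tied to dst. Dropping the tied accumulator operand leaves exactly the two
  // multiply sources VPMADDWD expects; its result goes into a fresh virtual
  // register and is added back to the accumulator by the VPADDD built below.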
10687  const TargetRegisterClass *RC =
10688      RegInfo.getRegClass(Root.getOperand(0).getReg());
10689  Register NewReg = RegInfo.createVirtualRegister(RC);
10690  MachineInstr *Madd = Root.getMF()->CloneMachineInstr(&Root);
10691  Madd->setDesc(TII.get(MaddOpc));
10692  Madd->untieRegOperand(1);
10693  Madd->removeOperand(1);
10694  Madd->getOperand(0).setReg(NewReg);
10695  InstrIdxForVirtReg.insert(std::make_pair(NewReg, 0));
10696  // Create vpaddd.
10697  Register DstReg = Root.getOperand(0).getReg();
10698  bool IsKill = Root.getOperand(1).isKill();
10699  MachineInstr *Add =
10700      BuildMI(*MF, MIMetadata(Root), TII.get(AddOpc), DstReg)
10701          .addReg(Root.getOperand(1).getReg(), getKillRegState(IsKill))
10702          .addReg(Madd->getOperand(0).getReg(), getKillRegState(true));
10703  InsInstrs.push_back(Madd);
10704  InsInstrs.push_back(Add);
10705  DelInstrs.push_back(&Root);
10706}
10707
10708void X86InstrInfo::genAlternativeCodeSequence(
10709    MachineInstr &Root, MachineCombinerPattern Pattern,
10710    SmallVectorImpl<MachineInstr *> &InsInstrs,
10711    SmallVectorImpl<MachineInstr *> &DelInstrs,
10712    DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) const {
10713  switch (Pattern) {
10714  default:
10715    // Reassociate instructions.
10716    TargetInstrInfo::genAlternativeCodeSequence(Root, Pattern, InsInstrs,
10717                                                DelInstrs, InstrIdxForVirtReg);
10718    return;
10719  case MachineCombinerPattern::DPWSSD:
10720    genAlternativeDpCodeSequence(Root, *this, InsInstrs, DelInstrs,
10721                                 InstrIdxForVirtReg);
10722    return;
10723  }
10724}
10725
10726// See also: X86DAGToDAGISel::SelectInlineAsmMemoryOperand().
10727void X86InstrInfo::getFrameIndexOperands(SmallVectorImpl<MachineOperand> &Ops,
10728                                         int FI) const {
10729  X86AddressMode M;
10730  M.BaseType = X86AddressMode::FrameIndexBase;
10731  M.Base.FrameIndex = FI;
10732  M.getFullAddress(Ops);
10733}
10734
10735#define GET_INSTRINFO_HELPERS
10736#include "X86GenInstrInfo.inc"
10737