// assembler_x86.hpp revision 10964:33f10a35ce20
/*
 * Copyright (c) 1997, 2016, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#ifndef CPU_X86_VM_ASSEMBLER_X86_HPP
#define CPU_X86_VM_ASSEMBLER_X86_HPP

#include "asm/register.hpp"
#include "vm_version_x86.hpp"

class BiasedLockingCounters;

// Contains all the definitions needed for x86 assembly code generation.

// Calling convention
class Argument VALUE_OBJ_CLASS_SPEC {
 public:
  enum {
#ifdef _LP64
#ifdef _WIN64
    n_int_register_parameters_c   = 4, // rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...)
    n_float_register_parameters_c = 4,  // xmm0 - xmm3 (c_farg0, c_farg1, ... )
#else
    n_int_register_parameters_c   = 6, // rdi, rsi, rdx, rcx, r8, r9 (c_rarg0, c_rarg1, ...)
    n_float_register_parameters_c = 8,  // xmm0 - xmm7 (c_farg0, c_farg1, ... )
#endif // _WIN64
    n_int_register_parameters_j   = 6, // j_rarg0, j_rarg1, ...
    n_float_register_parameters_j = 8  // j_farg0, j_farg1, ...
#else
    n_register_parameters = 0   // 0 registers used to pass arguments
#endif // _LP64
  };
};


#ifdef _LP64
// Symbolically name the register arguments used by the c calling convention.
// Windows is different from linux/solaris. So much for standards...

#ifdef _WIN64

REGISTER_DECLARATION(Register, c_rarg0, rcx);
REGISTER_DECLARATION(Register, c_rarg1, rdx);
REGISTER_DECLARATION(Register, c_rarg2, r8);
REGISTER_DECLARATION(Register, c_rarg3, r9);

REGISTER_DECLARATION(XMMRegister, c_farg0, xmm0);
REGISTER_DECLARATION(XMMRegister, c_farg1, xmm1);
REGISTER_DECLARATION(XMMRegister, c_farg2, xmm2);
REGISTER_DECLARATION(XMMRegister, c_farg3, xmm3);

#else

REGISTER_DECLARATION(Register, c_rarg0, rdi);
REGISTER_DECLARATION(Register, c_rarg1, rsi);
REGISTER_DECLARATION(Register, c_rarg2, rdx);
REGISTER_DECLARATION(Register, c_rarg3, rcx);
REGISTER_DECLARATION(Register, c_rarg4, r8);
REGISTER_DECLARATION(Register, c_rarg5, r9);

REGISTER_DECLARATION(XMMRegister, c_farg0, xmm0);
REGISTER_DECLARATION(XMMRegister, c_farg1, xmm1);
REGISTER_DECLARATION(XMMRegister, c_farg2, xmm2);
REGISTER_DECLARATION(XMMRegister, c_farg3, xmm3);
REGISTER_DECLARATION(XMMRegister, c_farg4, xmm4);
REGISTER_DECLARATION(XMMRegister, c_farg5, xmm5);
REGISTER_DECLARATION(XMMRegister, c_farg6, xmm6);
REGISTER_DECLARATION(XMMRegister, c_farg7, xmm7);

#endif // _WIN64
// Symbolically name the register arguments used by the Java calling convention.
// We have control over the convention for java so we can do what we please.
// What pleases us is to offset the java calling convention so that when
// we call a suitable jni method the arguments are already lined up and we
// don't have to do any shuffling. A suitable jni method is non-static and
// takes a small number of arguments (two fewer args on windows).
//
//        |-------------------------------------------------------|
//        | c_rarg0   c_rarg1  c_rarg2 c_rarg3 c_rarg4 c_rarg5    |
//        |-------------------------------------------------------|
//        | rcx       rdx      r8      r9      rdi*    rsi*       | windows (* not a c_rarg)
//        | rdi       rsi      rdx     rcx     r8      r9         | solaris/linux
//        |-------------------------------------------------------|
//        | j_rarg5   j_rarg0  j_rarg1 j_rarg2 j_rarg3 j_rarg4    |
//        |-------------------------------------------------------|
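//
// For example (reading the table above): on solaris/linux j_rarg0 is
// c_rarg1 (rsi). So for a non-static native method, whose c_rarg0 holds
// the JNIEnv* and whose c_rarg1 holds the receiver, Java arguments
// arriving in j_rarg0, j_rarg1, ... are already in the right C registers
// and need no shuffling.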

REGISTER_DECLARATION(Register, j_rarg0, c_rarg1);
REGISTER_DECLARATION(Register, j_rarg1, c_rarg2);
REGISTER_DECLARATION(Register, j_rarg2, c_rarg3);
// Windows runs out of register args here
#ifdef _WIN64
REGISTER_DECLARATION(Register, j_rarg3, rdi);
REGISTER_DECLARATION(Register, j_rarg4, rsi);
#else
REGISTER_DECLARATION(Register, j_rarg3, c_rarg4);
REGISTER_DECLARATION(Register, j_rarg4, c_rarg5);
#endif /* _WIN64 */
REGISTER_DECLARATION(Register, j_rarg5, c_rarg0);

REGISTER_DECLARATION(XMMRegister, j_farg0, xmm0);
REGISTER_DECLARATION(XMMRegister, j_farg1, xmm1);
REGISTER_DECLARATION(XMMRegister, j_farg2, xmm2);
REGISTER_DECLARATION(XMMRegister, j_farg3, xmm3);
REGISTER_DECLARATION(XMMRegister, j_farg4, xmm4);
REGISTER_DECLARATION(XMMRegister, j_farg5, xmm5);
REGISTER_DECLARATION(XMMRegister, j_farg6, xmm6);
REGISTER_DECLARATION(XMMRegister, j_farg7, xmm7);

REGISTER_DECLARATION(Register, rscratch1, r10);  // volatile
REGISTER_DECLARATION(Register, rscratch2, r11);  // volatile

REGISTER_DECLARATION(Register, r12_heapbase, r12); // callee-saved
REGISTER_DECLARATION(Register, r15_thread, r15); // callee-saved

#else
// rscratch1 will appear in 32bit code that is dead but of course must compile.
// Using noreg ensures that if the dead code is incorrectly live and executed
// it will cause an assertion failure.
#define rscratch1 noreg
#define rscratch2 noreg

#endif // _LP64

// JSR 292
// On x86, the SP does not have to be saved when invoking method handle intrinsics
// or compiled lambda forms. We indicate that by setting rbp_mh_SP_save to noreg.
REGISTER_DECLARATION(Register, rbp_mh_SP_save, noreg);

// Address is an abstraction used to represent a memory location
// using any of the amd64 addressing modes with one object.
//
// Note: A register location is represented via a Register, not
//       via an address for efficiency & simplicity reasons.

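// A minimal usage sketch (illustrative only), assuming the usual
// '#define __ _masm->' shorthand seen in HotSpot code generators:
//
//   Address elem(rbx, rcx, Address::times_8, 16);  // [rbx + rcx*8 + 16]
//   __ movptr(rax, elem);                          // load from that location
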
class ArrayAddress;

class Address VALUE_OBJ_CLASS_SPEC {
 public:
  enum ScaleFactor {
    no_scale = -1,
    times_1  =  0,
    times_2  =  1,
    times_4  =  2,
    times_8  =  3,
    times_ptr = LP64_ONLY(times_8) NOT_LP64(times_4)
  };
  static ScaleFactor times(int size) {
    assert(size >= 1 && size <= 8 && is_power_of_2(size), "bad scale size");
    if (size == 8)  return times_8;
    if (size == 4)  return times_4;
    if (size == 2)  return times_2;
    return times_1;
  }
  static int scale_size(ScaleFactor scale) {
    assert(scale != no_scale, "");
    assert(((1 << (int)times_1) == 1 &&
            (1 << (int)times_2) == 2 &&
            (1 << (int)times_4) == 4 &&
            (1 << (int)times_8) == 8), "");
    return (1 << (int)scale);
  }

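  // Worked example: the ScaleFactor encodes the log2 of the element size,
  // so times(8) yields times_8 (encoding 3) and
  // scale_size(times_8) == 1 << 3 == 8.
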
 private:
  Register         _base;
  Register         _index;
  ScaleFactor      _scale;
  int              _disp;
  RelocationHolder _rspec;

  // Easily misused constructors make them private
  // %%% can we make these go away?
  NOT_LP64(Address(address loc, RelocationHolder spec);)
  Address(int disp, address loc, relocInfo::relocType rtype);
  Address(int disp, address loc, RelocationHolder spec);

 public:

  int disp() { return _disp; }
  // creation
  Address()
    : _base(noreg),
      _index(noreg),
      _scale(no_scale),
      _disp(0) {
  }

  // No default displacement otherwise Register can be implicitly
  // converted to 0(Register) which is quite a different animal.

  Address(Register base, int disp)
    : _base(base),
      _index(noreg),
      _scale(no_scale),
      _disp(disp) {
  }

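  // As a consequence a zero displacement must be written out explicitly,
  // e.g. Address(rbx, 0) for [rbx]; a bare Register will not convert.
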
  Address(Register base, Register index, ScaleFactor scale, int disp = 0)
    : _base (base),
      _index(index),
      _scale(scale),
      _disp (disp) {
    assert(!index->is_valid() == (scale == Address::no_scale),
           "inconsistent address");
  }

  Address(Register base, RegisterOrConstant index, ScaleFactor scale = times_1, int disp = 0)
    : _base (base),
      _index(index.register_or_noreg()),
      _scale(scale),
      _disp (disp + (index.constant_or_zero() * scale_size(scale))) {
    if (!index.is_register())  scale = Address::no_scale;
    assert(!_index->is_valid() == (scale == Address::no_scale),
           "inconsistent address");
  }

  Address plus_disp(int disp) const {
    Address a = (*this);
    a._disp += disp;
    return a;
  }
  Address plus_disp(RegisterOrConstant disp, ScaleFactor scale = times_1) const {
    Address a = (*this);
    a._disp += disp.constant_or_zero() * scale_size(scale);
    if (disp.is_register()) {
      assert(!a.index()->is_valid(), "competing indexes");
      a._index = disp.as_register();
      a._scale = scale;
    }
    return a;
  }
  bool is_same_address(Address a) const {
    // disregard _rspec
    return _base == a._base && _disp == a._disp && _index == a._index && _scale == a._scale;
  }

  // The following overloads are used in connection with the
  // ByteSize type (see sizes.hpp).  They simplify the use of
  // ByteSize'd arguments in assembly code. Note that their equivalents
  // for the optimized build are the member functions with int disp
  // arguments above, since ByteSize is mapped to an int type in that case.
  //
  // Note: DO NOT introduce similar overloaded functions for WordSize
  // arguments, as in the optimized mode both ByteSize and WordSize
  // are mapped to the same type and thus the compiler cannot make a
  // distinction anymore (=> compiler errors).

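  // Illustrative sketch: given a runtime offset function returning a ByteSize
  // (the name 'SomeClass::foo_offset()' here is hypothetical),
  //   Address(rdx, SomeClass::foo_offset())
  // picks the ByteSize overload below in debug builds; in product builds
  // ByteSize is just an int, so the int-disp constructor above is used.
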
#ifdef ASSERT
  Address(Register base, ByteSize disp)
    : _base(base),
      _index(noreg),
      _scale(no_scale),
      _disp(in_bytes(disp)) {
  }

  Address(Register base, Register index, ScaleFactor scale, ByteSize disp)
    : _base(base),
      _index(index),
      _scale(scale),
      _disp(in_bytes(disp)) {
    assert(!index->is_valid() == (scale == Address::no_scale),
           "inconsistent address");
  }

  Address(Register base, RegisterOrConstant index, ScaleFactor scale, ByteSize disp)
    : _base (base),
      _index(index.register_or_noreg()),
      _scale(scale),
      _disp (in_bytes(disp) + (index.constant_or_zero() * scale_size(scale))) {
    if (!index.is_register())  scale = Address::no_scale;
    assert(!_index->is_valid() == (scale == Address::no_scale),
           "inconsistent address");
  }

#endif // ASSERT

  // accessors
  bool        uses(Register reg) const { return _base == reg || _index == reg; }
  Register    base()             const { return _base;  }
  Register    index()            const { return _index; }
  ScaleFactor scale()            const { return _scale; }
  int         disp()             const { return _disp;  }

  // Convert the raw encoding form into the form expected by the constructor for
  // Address.  An index of 4 (rsp) corresponds to having no index, so convert
  // that to noreg for the Address constructor.
  static Address make_raw(int base, int index, int scale, int disp, relocInfo::relocType disp_reloc);

  static Address make_array(ArrayAddress);

 private:
  bool base_needs_rex() const {
    return _base != noreg && _base->encoding() >= 8;
  }

  bool index_needs_rex() const {
    return _index != noreg && _index->encoding() >= 8;
  }

  relocInfo::relocType reloc() const { return _rspec.type(); }

  friend class Assembler;
  friend class MacroAssembler;
  friend class LIR_Assembler; // base/index/scale/disp
};

//
// AddressLiteral has been split out from Address because operands of this type
// need to be treated specially on 32bit vs. 64bit platforms. By splitting it out
// the few instructions that need to deal with address literals are unique and the
// MacroAssembler does not have to implement every instruction in the Assembler
// in order to search for address literals that may need special handling depending
// on the instruction and the platform. A small step on the way to merging i486/amd64
// directories.
//
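// Usage sketch (illustrative; 'some_table' is a hypothetical target): the
// convenience subclasses below select the reloc type, e.g.
//   ExternalAddress table((address)some_table);
// and the MacroAssembler then reaches the literal rip-relatively when near,
// or via a scratch register when far.
//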
class AddressLiteral VALUE_OBJ_CLASS_SPEC {
  friend class ArrayAddress;
  RelocationHolder _rspec;
  // Typically when we use AddressLiterals we want to use their rval.
  // However in some situations we want the lval (effective address) of
  // the item. We provide a special factory for making those lvals.
  bool _is_lval;

  // If the target is far we'll need to load the ea of this to
  // a register to reach it. Otherwise if near we can do rip
  // relative addressing.

  address          _target;

 protected:
  // creation
  AddressLiteral()
    : _is_lval(false),
      _target(NULL)
  {}

 public:


  AddressLiteral(address target, relocInfo::relocType rtype);

  AddressLiteral(address target, RelocationHolder const& rspec)
    : _rspec(rspec),
      _is_lval(false),
      _target(target)
  {}

  AddressLiteral addr() {
    AddressLiteral ret = *this;
    ret._is_lval = true;
    return ret;
  }


 private:

  address target() { return _target; }
  bool is_lval() { return _is_lval; }

  relocInfo::relocType reloc() const { return _rspec.type(); }
  const RelocationHolder& rspec() const { return _rspec; }

  friend class Assembler;
  friend class MacroAssembler;
  friend class Address;
  friend class LIR_Assembler;
};

// Convenience classes
class RuntimeAddress: public AddressLiteral {

 public:

  RuntimeAddress(address target) : AddressLiteral(target, relocInfo::runtime_call_type) {}

};

class ExternalAddress: public AddressLiteral {
 private:
  static relocInfo::relocType reloc_for_target(address target) {
    // Sometimes ExternalAddress is used for values which aren't
    // exactly addresses, like the card table base.
    // external_word_type can't be used for values in the first page
    // so just skip the reloc in that case.
    return external_word_Relocation::can_be_relocated(target) ? relocInfo::external_word_type : relocInfo::none;
  }

 public:

  ExternalAddress(address target) : AddressLiteral(target, reloc_for_target(target)) {}

};

class InternalAddress: public AddressLiteral {

 public:

  InternalAddress(address target) : AddressLiteral(target, relocInfo::internal_word_type) {}

};

// x86 can do array addressing as a single operation since disp can be an
// absolute address; amd64 can't. We create a class that expresses the concept
// but does extra magic on amd64 to get the final result.

class ArrayAddress VALUE_OBJ_CLASS_SPEC {
 private:

  AddressLiteral _base;
  Address        _index;

 public:

  ArrayAddress() {}
  ArrayAddress(AddressLiteral base, Address index): _base(base), _index(index) {}
  AddressLiteral base() { return _base; }
  Address index() { return _index; }

};

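// Usage sketch (illustrative; 'dispatch_table' is a hypothetical symbol):
//   ArrayAddress entry(ExternalAddress((address)dispatch_table),
//                      Address(noreg, rbx, Address::times_ptr));
// On 32bit this folds into one absolute-plus-index operand; on amd64 the
// MacroAssembler does the promised extra magic to materialize the base first.
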
class InstructionAttr;

// The 64-bit size reflects the fxsave image, which is 512 bytes, plus the
// new xsave area used when EVEX is enabled, which is another 2176 bytes.
// See the fxsave and xsave (EVEX enabled) documentation for the layout.
const int FPUStateSizeInWords = NOT_LP64(27) LP64_ONLY(2688 / wordSize);

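// Worked out: 512 + 2176 = 2688 bytes, i.e. 2688 / 8 = 336 words on LP64.
// (The 32bit value of 27 words matches the 108-byte legacy FNSAVE image.)
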
// The Intel x86/Amd64 Assembler: Pure assembler doing NO optimizations on the instruction
// level (e.g. mov rax, 0 is not translated into xor rax, rax!); i.e., what you write
// is what you get. The Assembler is generating code into a CodeBuffer.

class Assembler : public AbstractAssembler  {
  friend class AbstractAssembler; // for the non-virtual hack
  friend class LIR_Assembler; // as_Address()
  friend class StubGenerator;

 public:
  enum Condition {                     // The x86 condition codes used for conditional jumps/moves.
    zero          = 0x4,
    notZero       = 0x5,
    equal         = 0x4,
    notEqual      = 0x5,
    less          = 0xc,
    lessEqual     = 0xe,
    greater       = 0xf,
    greaterEqual  = 0xd,
    below         = 0x2,
    belowEqual    = 0x6,
    above         = 0x7,
    aboveEqual    = 0x3,
    overflow      = 0x0,
    noOverflow    = 0x1,
    carrySet      = 0x2,
    carryClear    = 0x3,
    negative      = 0x8,
    positive      = 0x9,
    parity        = 0xa,
    noParity      = 0xb
  };

  enum Prefix {
    // segment overrides
    CS_segment = 0x2e,
    SS_segment = 0x36,
    DS_segment = 0x3e,
    ES_segment = 0x26,
    FS_segment = 0x64,
    GS_segment = 0x65,

    REX        = 0x40,

    REX_B      = 0x41,
    REX_X      = 0x42,
    REX_XB     = 0x43,
    REX_R      = 0x44,
    REX_RB     = 0x45,
    REX_RX     = 0x46,
    REX_RXB    = 0x47,

    REX_W      = 0x48,

    REX_WB     = 0x49,
    REX_WX     = 0x4A,
    REX_WXB    = 0x4B,
    REX_WR     = 0x4C,
    REX_WRB    = 0x4D,
    REX_WRX    = 0x4E,
    REX_WRXB   = 0x4F,

    VEX_3bytes = 0xC4,
    VEX_2bytes = 0xC5,
    EVEX_4bytes = 0x62,
    Prefix_EMPTY = 0x0
  };

  enum VexPrefix {
    VEX_B = 0x20,
    VEX_X = 0x40,
    VEX_R = 0x80,
    VEX_W = 0x80
  };

  enum ExexPrefix {
    EVEX_F  = 0x04,
    EVEX_V  = 0x08,
    EVEX_Rb = 0x10,
    EVEX_X  = 0x40,
    EVEX_Z  = 0x80
  };

  enum VexSimdPrefix {
    VEX_SIMD_NONE = 0x0,
    VEX_SIMD_66   = 0x1,
    VEX_SIMD_F3   = 0x2,
    VEX_SIMD_F2   = 0x3
  };

  enum VexOpcode {
    VEX_OPCODE_NONE  = 0x0,
    VEX_OPCODE_0F    = 0x1,
    VEX_OPCODE_0F_38 = 0x2,
    VEX_OPCODE_0F_3A = 0x3,
    VEX_OPCODE_MASK  = 0x1F
  };

  enum AvxVectorLen {
    AVX_128bit = 0x0,
    AVX_256bit = 0x1,
    AVX_512bit = 0x2,
    AVX_NoVec  = 0x4
  };

  enum EvexTupleType {
    EVEX_FV   = 0,
    EVEX_HV   = 4,
    EVEX_FVM  = 6,
    EVEX_T1S  = 7,
    EVEX_T1F  = 11,
    EVEX_T2   = 13,
    EVEX_T4   = 15,
    EVEX_T8   = 17,
    EVEX_HVM  = 18,
    EVEX_QVM  = 19,
    EVEX_OVM  = 20,
    EVEX_M128 = 21,
    EVEX_DUP  = 22,
    EVEX_ETUP = 23
  };

  enum EvexInputSizeInBits {
    EVEX_8bit  = 0,
    EVEX_16bit = 1,
    EVEX_32bit = 2,
    EVEX_64bit = 3,
    EVEX_NObit = 4
  };

  enum WhichOperand {
    // input to locate_operand, and format code for relocations
    imm_operand  = 0,            // embedded 32-bit|64-bit immediate operand
    disp32_operand = 1,          // embedded 32-bit displacement or address
    call32_operand = 2,          // embedded 32-bit self-relative displacement
#ifndef _LP64
    _WhichOperand_limit = 3
#else
    narrow_oop_operand = 3,      // embedded 32-bit immediate narrow oop
    _WhichOperand_limit = 4
#endif
  };



  // NOTE: The general philosophy of the declarations here is that 64bit versions
  // of instructions are freely declared without the need for wrapping them in an ifdef.
  // (Some dangerous instructions are ifdef'd out of inappropriate jvms.)
  // In the .cpp file the implementations are wrapped so that they are dropped out
  // of the resulting jvm. This is done mostly to keep the footprint of MINIMAL
  // to the size it was prior to merging up the 32bit and 64bit assemblers.
  //
  // This does mean you'll get a linker/runtime error if you use a 64bit-only
  // instruction in a 32bit vm. This is somewhat unfortunate but keeps the ifdef noise down.

private:

  bool _legacy_mode_bw;
  bool _legacy_mode_dq;
  bool _legacy_mode_vl;
  bool _legacy_mode_vlbw;
  bool _is_managed;

  class InstructionAttr *_attributes;

  // 64bit prefixes
  int prefix_and_encode(int reg_enc, bool byteinst = false);
  int prefixq_and_encode(int reg_enc);

  int prefix_and_encode(int dst_enc, int src_enc) {
    return prefix_and_encode(dst_enc, false, src_enc, false);
  }
  int prefix_and_encode(int dst_enc, bool dst_is_byte, int src_enc, bool src_is_byte);
  int prefixq_and_encode(int dst_enc, int src_enc);

  void prefix(Register reg);
  void prefix(Register dst, Register src, Prefix p);
  void prefix(Register dst, Address adr, Prefix p);
  void prefix(Address adr);
  void prefixq(Address adr);

  void prefix(Address adr, Register reg,  bool byteinst = false);
  void prefix(Address adr, XMMRegister reg);
  void prefixq(Address adr, Register reg);
  void prefixq(Address adr, XMMRegister reg);

  void prefetch_prefix(Address src);

  void rex_prefix(Address adr, XMMRegister xreg,
                  VexSimdPrefix pre, VexOpcode opc, bool rex_w);
  int  rex_prefix_and_encode(int dst_enc, int src_enc,
                             VexSimdPrefix pre, VexOpcode opc, bool rex_w);

  void vex_prefix(bool vex_r, bool vex_b, bool vex_x, int nds_enc, VexSimdPrefix pre, VexOpcode opc);

  void evex_prefix(bool vex_r, bool vex_b, bool vex_x, bool evex_r, bool evex_v,
                   int nds_enc, VexSimdPrefix pre, VexOpcode opc);

  void vex_prefix(Address adr, int nds_enc, int xreg_enc,
                  VexSimdPrefix pre, VexOpcode opc,
                  InstructionAttr *attributes);

  int  vex_prefix_and_encode(int dst_enc, int nds_enc, int src_enc,
                             VexSimdPrefix pre, VexOpcode opc,
                             InstructionAttr *attributes);

  void simd_prefix(XMMRegister xreg, XMMRegister nds, Address adr, VexSimdPrefix pre,
                   VexOpcode opc, InstructionAttr *attributes);

  int simd_prefix_and_encode(XMMRegister dst, XMMRegister nds, XMMRegister src, VexSimdPrefix pre,
                             VexOpcode opc, InstructionAttr *attributes);

  // Helper functions for groups of instructions
  void emit_arith_b(int op1, int op2, Register dst, int imm8);

  void emit_arith(int op1, int op2, Register dst, int32_t imm32);
  // Force generation of a 4 byte immediate value even if it fits into 8bit
  void emit_arith_imm32(int op1, int op2, Register dst, int32_t imm32);
  void emit_arith(int op1, int op2, Register dst, Register src);

  bool emit_compressed_disp_byte(int &disp);

  void emit_operand(Register reg,
                    Register base, Register index, Address::ScaleFactor scale,
                    int disp,
                    RelocationHolder const& rspec,
                    int rip_relative_correction = 0);

  void emit_operand(Register reg, Address adr, int rip_relative_correction = 0);

  // operands that only take the original 32bit registers
  void emit_operand32(Register reg, Address adr);

  void emit_operand(XMMRegister reg,
                    Register base, Register index, Address::ScaleFactor scale,
                    int disp,
                    RelocationHolder const& rspec);

  void emit_operand(XMMRegister reg, Address adr);

  void emit_operand(MMXRegister reg, Address adr);

  // workaround gcc (3.2.1-7) bug
  void emit_operand(Address adr, MMXRegister reg);


  // Immediate-to-memory forms
  void emit_arith_operand(int op1, Register rm, Address adr, int32_t imm32);

  void emit_farith(int b1, int b2, int i);


 protected:
  #ifdef ASSERT
  void check_relocation(RelocationHolder const& rspec, int format);
  #endif

  void emit_data(jint data, relocInfo::relocType    rtype, int format);
  void emit_data(jint data, RelocationHolder const& rspec, int format);
  void emit_data64(jlong data, relocInfo::relocType rtype, int format = 0);
  void emit_data64(jlong data, RelocationHolder const& rspec, int format = 0);

  bool reachable(AddressLiteral adr) NOT_LP64({ return true;});

  // These are all easily abused and hence protected

  // 32BIT ONLY SECTION
#ifndef _LP64
  // Make these disappear in 64bit mode since they would never be correct
  void cmp_literal32(Register src1, int32_t imm32, RelocationHolder const& rspec);   // 32BIT ONLY
  void cmp_literal32(Address src1, int32_t imm32, RelocationHolder const& rspec);    // 32BIT ONLY

  void mov_literal32(Register dst, int32_t imm32, RelocationHolder const& rspec);    // 32BIT ONLY
  void mov_literal32(Address dst, int32_t imm32, RelocationHolder const& rspec);     // 32BIT ONLY

  void push_literal32(int32_t imm32, RelocationHolder const& rspec);                 // 32BIT ONLY
#else
  // 64BIT ONLY SECTION
  void mov_literal64(Register dst, intptr_t imm64, RelocationHolder const& rspec);   // 64BIT ONLY

  void cmp_narrow_oop(Register src1, int32_t imm32, RelocationHolder const& rspec);
  void cmp_narrow_oop(Address src1, int32_t imm32, RelocationHolder const& rspec);

  void mov_narrow_oop(Register dst, int32_t imm32, RelocationHolder const& rspec);
  void mov_narrow_oop(Address dst, int32_t imm32, RelocationHolder const& rspec);
#endif // _LP64

  // These are unique in that we are ensured by the caller that the 32bit
  // relative in these instructions will always be able to reach the potentially
  // 64bit address described by entry. Since they can take a 64bit address they
  // don't have the 32 suffix like the other instructions in this class.

  void call_literal(address entry, RelocationHolder const& rspec);
  void jmp_literal(address entry, RelocationHolder const& rspec);

  // Avoid-using-directly section
  // Instructions in this section are actually usable by anyone without danger
  // of failure but have performance issues that are addressed by enhanced
  // instructions which will do the proper thing based on the particular cpu.
  // We protect them because we don't trust you...

  // Don't use the following inc() and dec() methods directly. INC & DEC
  // instructions could cause a partial flag stall since they don't set the
  // CF flag. Use the MacroAssembler::decrement() & MacroAssembler::increment()
  // methods, which call inc() & dec() or add() & sub() in accordance with
  // the product flag UseIncDec.

  void decl(Register dst);
  void decl(Address dst);
  void decq(Register dst);
  void decq(Address dst);

  void incl(Register dst);
  void incl(Address dst);
  void incq(Register dst);
  void incq(Address dst);

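  // Sketch of the preferred pattern (illustrative): rather than incl(rax),
  //   __ increment(rax);  // expands to incl(rax) or addl(rax, 1)
  //                       // depending on the UseIncDec product flag
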
  // New cpus require use of movsd and movss to avoid partial register stall
  // when loading from memory. But for old Opteron use movlpd instead of movsd.
  // The selection is done in MacroAssembler::movdbl() and movflt().

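  // So scalar FP loads are normally written via the macro assembler (sketch):
  //   __ movdbl(xmm0, Address(rsp, 0));  // movsd, or movlpd on old Opteron
  //   __ movflt(xmm1, Address(rsp, 8));  // movss as appropriate
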
  // Move Scalar Single-Precision Floating-Point Values
  void movss(XMMRegister dst, Address src);
  void movss(XMMRegister dst, XMMRegister src);
  void movss(Address dst, XMMRegister src);

  // Move Scalar Double-Precision Floating-Point Values
  void movsd(XMMRegister dst, Address src);
  void movsd(XMMRegister dst, XMMRegister src);
  void movsd(Address dst, XMMRegister src);
  void movlpd(XMMRegister dst, Address src);

  // New cpus require use of movaps and movapd to avoid partial register stall
  // when moving between registers.
  void movaps(XMMRegister dst, XMMRegister src);
  void movapd(XMMRegister dst, XMMRegister src);

  // End avoid using directly


  // Instruction prefixes
  void prefix(Prefix p);

 public:

  // Creation
  Assembler(CodeBuffer* code) : AbstractAssembler(code) {
    init_attributes();
  }

  // Decoding
  static address locate_operand(address inst, WhichOperand which);
  static address locate_next_instruction(address inst);

  // Utilities
  static bool is_polling_page_far() NOT_LP64({ return false;});
  static bool query_compressed_disp_byte(int disp, bool is_evex_inst, int vector_len,
                                         int cur_tuple_type, int in_size_in_bits, int cur_encoding);

  // Generic instructions
  // Does 32bit or 64bit as needed for the platform. In some sense these
  // belong in macro assembler but there is no need for both varieties to exist

  void init_attributes(void) {
    _legacy_mode_bw = (VM_Version::supports_avx512bw() == false);
    _legacy_mode_dq = (VM_Version::supports_avx512dq() == false);
    _legacy_mode_vl = (VM_Version::supports_avx512vl() == false);
    _legacy_mode_vlbw = (VM_Version::supports_avx512vlbw() == false);
    _is_managed = false;
    _attributes = NULL;
  }

  void set_attributes(InstructionAttr *attributes) { _attributes = attributes; }
  void clear_attributes(void) { _attributes = NULL; }

  void set_managed(void) { _is_managed = true; }
  void clear_managed(void) { _is_managed = false; }
  bool is_managed(void) { return _is_managed; }

  void lea(Register dst, Address src);

  void mov(Register dst, Register src);

  void pusha();
  void popa();

  void pushf();
  void popf();

  void push(int32_t imm32);

  void push(Register src);

  void pop(Register dst);

  // These are dummies to prevent surprise implicit conversions to Register
  void push(void* v);
  void pop(void* v);

  // These do register sized moves/scans
  void rep_mov();
  void rep_stos();
  void rep_stosb();
  void repne_scan();
#ifdef _LP64
  void repne_scanl();
#endif

  // Vanilla instructions in lexical order

  void adcl(Address dst, int32_t imm32);
  void adcl(Address dst, Register src);
  void adcl(Register dst, int32_t imm32);
  void adcl(Register dst, Address src);
  void adcl(Register dst, Register src);

  void adcq(Register dst, int32_t imm32);
  void adcq(Register dst, Address src);
  void adcq(Register dst, Register src);

  void addl(Address dst, int32_t imm32);
  void addl(Address dst, Register src);
  void addl(Register dst, int32_t imm32);
  void addl(Register dst, Address src);
  void addl(Register dst, Register src);

  void addq(Address dst, int32_t imm32);
  void addq(Address dst, Register src);
  void addq(Register dst, int32_t imm32);
  void addq(Register dst, Address src);
  void addq(Register dst, Register src);

#ifdef _LP64
  // Add Unsigned Integers with Carry Flag
  void adcxq(Register dst, Register src);

  // Add Unsigned Integers with Overflow Flag
  void adoxq(Register dst, Register src);
#endif

  void addr_nop_4();
  void addr_nop_5();
  void addr_nop_7();
  void addr_nop_8();

  // Add Scalar Double-Precision Floating-Point Values
  void addsd(XMMRegister dst, Address src);
  void addsd(XMMRegister dst, XMMRegister src);

  // Add Scalar Single-Precision Floating-Point Values
  void addss(XMMRegister dst, Address src);
  void addss(XMMRegister dst, XMMRegister src);

  // AES instructions
  void aesdec(XMMRegister dst, Address src);
  void aesdec(XMMRegister dst, XMMRegister src);
  void aesdeclast(XMMRegister dst, Address src);
  void aesdeclast(XMMRegister dst, XMMRegister src);
  void aesenc(XMMRegister dst, Address src);
  void aesenc(XMMRegister dst, XMMRegister src);
  void aesenclast(XMMRegister dst, Address src);
  void aesenclast(XMMRegister dst, XMMRegister src);


  void andl(Address  dst, int32_t imm32);
  void andl(Register dst, int32_t imm32);
  void andl(Register dst, Address src);
  void andl(Register dst, Register src);

  void andq(Address  dst, int32_t imm32);
  void andq(Register dst, int32_t imm32);
  void andq(Register dst, Address src);
  void andq(Register dst, Register src);

  // BMI instructions
  void andnl(Register dst, Register src1, Register src2);
  void andnl(Register dst, Register src1, Address src2);
  void andnq(Register dst, Register src1, Register src2);
  void andnq(Register dst, Register src1, Address src2);

  void blsil(Register dst, Register src);
  void blsil(Register dst, Address src);
  void blsiq(Register dst, Register src);
  void blsiq(Register dst, Address src);

  void blsmskl(Register dst, Register src);
  void blsmskl(Register dst, Address src);
  void blsmskq(Register dst, Register src);
  void blsmskq(Register dst, Address src);

  void blsrl(Register dst, Register src);
  void blsrl(Register dst, Address src);
  void blsrq(Register dst, Register src);
  void blsrq(Register dst, Address src);

  void bsfl(Register dst, Register src);
  void bsrl(Register dst, Register src);

#ifdef _LP64
  void bsfq(Register dst, Register src);
  void bsrq(Register dst, Register src);
#endif

  void bswapl(Register reg);

  void bswapq(Register reg);

  void call(Label& L, relocInfo::relocType rtype);
  void call(Register reg);  // push pc; pc <- reg
  void call(Address adr);   // push pc; pc <- adr

  void cdql();

  void cdqq();

  void cld();

  void clflush(Address adr);

  void cmovl(Condition cc, Register dst, Register src);
  void cmovl(Condition cc, Register dst, Address src);

  void cmovq(Condition cc, Register dst, Register src);
  void cmovq(Condition cc, Register dst, Address src);


  void cmpb(Address dst, int imm8);

  void cmpl(Address dst, int32_t imm32);

  void cmpl(Register dst, int32_t imm32);
  void cmpl(Register dst, Register src);
  void cmpl(Register dst, Address src);

  void cmpq(Address dst, int32_t imm32);
  void cmpq(Address dst, Register src);

  void cmpq(Register dst, int32_t imm32);
  void cmpq(Register dst, Register src);
  void cmpq(Register dst, Address src);

  // these are dummies used to catch attempting to convert NULL to Register
  void cmpl(Register dst, void* junk); // dummy
  void cmpq(Register dst, void* junk); // dummy

  void cmpw(Address dst, int imm16);

  void cmpxchg8 (Address adr);

  void cmpxchgb(Register reg, Address adr);
  void cmpxchgl(Register reg, Address adr);

  void cmpxchgq(Register reg, Address adr);

  // Ordered Compare Scalar Double-Precision Floating-Point Values and set EFLAGS
  void comisd(XMMRegister dst, Address src);
  void comisd(XMMRegister dst, XMMRegister src);

  // Ordered Compare Scalar Single-Precision Floating-Point Values and set EFLAGS
  void comiss(XMMRegister dst, Address src);
  void comiss(XMMRegister dst, XMMRegister src);

  // Identify processor type and features
  void cpuid();

  // CRC32C
  void crc32(Register crc, Register v, int8_t sizeInBytes);
  void crc32(Register crc, Address adr, int8_t sizeInBytes);

  // Convert Scalar Double-Precision Floating-Point Value to Scalar Single-Precision Floating-Point Value
  void cvtsd2ss(XMMRegister dst, XMMRegister src);
  void cvtsd2ss(XMMRegister dst, Address src);

  // Convert Doubleword Integer to Scalar Double-Precision Floating-Point Value
  void cvtsi2sdl(XMMRegister dst, Register src);
  void cvtsi2sdl(XMMRegister dst, Address src);
  void cvtsi2sdq(XMMRegister dst, Register src);
  void cvtsi2sdq(XMMRegister dst, Address src);

  // Convert Doubleword Integer to Scalar Single-Precision Floating-Point Value
  void cvtsi2ssl(XMMRegister dst, Register src);
  void cvtsi2ssl(XMMRegister dst, Address src);
  void cvtsi2ssq(XMMRegister dst, Register src);
  void cvtsi2ssq(XMMRegister dst, Address src);

  // Convert Packed Signed Doubleword Integers to Packed Double-Precision Floating-Point Value
  void cvtdq2pd(XMMRegister dst, XMMRegister src);

  // Convert Packed Signed Doubleword Integers to Packed Single-Precision Floating-Point Value
  void cvtdq2ps(XMMRegister dst, XMMRegister src);

  // Convert Scalar Single-Precision Floating-Point Value to Scalar Double-Precision Floating-Point Value
  void cvtss2sd(XMMRegister dst, XMMRegister src);
  void cvtss2sd(XMMRegister dst, Address src);

  // Convert with Truncation Scalar Double-Precision Floating-Point Value to Doubleword Integer
  void cvttsd2sil(Register dst, Address src);
  void cvttsd2sil(Register dst, XMMRegister src);
  void cvttsd2siq(Register dst, XMMRegister src);

  // Convert with Truncation Scalar Single-Precision Floating-Point Value to Doubleword Integer
  void cvttss2sil(Register dst, XMMRegister src);
  void cvttss2siq(Register dst, XMMRegister src);

  void cvttpd2dq(XMMRegister dst, XMMRegister src);

  // Divide Scalar Double-Precision Floating-Point Values
  void divsd(XMMRegister dst, Address src);
  void divsd(XMMRegister dst, XMMRegister src);

  // Divide Scalar Single-Precision Floating-Point Values
  void divss(XMMRegister dst, Address src);
  void divss(XMMRegister dst, XMMRegister src);

  void emms();

  void fabs();

  void fadd(int i);

  void fadd_d(Address src);
  void fadd_s(Address src);

  // "Alternate" versions of x87 instructions place result down in FPU
  // stack instead of on TOS

  void fadda(int i); // "alternate" fadd
  void faddp(int i = 1);

  void fchs();

  void fcom(int i);

  void fcomp(int i = 1);
  void fcomp_d(Address src);
  void fcomp_s(Address src);

  void fcompp();

  void fcos();

  void fdecstp();

  void fdiv(int i);
  void fdiv_d(Address src);
  void fdivr_s(Address src);
  void fdiva(int i);  // "alternate" fdiv
  void fdivp(int i = 1);

  void fdivr(int i);
  void fdivr_d(Address src);
  void fdiv_s(Address src);

  void fdivra(int i); // "alternate" reversed fdiv

  void fdivrp(int i = 1);

  void ffree(int i = 0);

  void fild_d(Address adr);
  void fild_s(Address adr);

  void fincstp();

  void finit();

  void fist_s (Address adr);
  void fistp_d(Address adr);
  void fistp_s(Address adr);

  void fld1();

  void fld_d(Address adr);
  void fld_s(Address adr);
  void fld_s(int index);
  void fld_x(Address adr);  // extended-precision (80-bit) format

  void fldcw(Address src);

  void fldenv(Address src);

  void fldlg2();

  void fldln2();

  void fldz();

  void flog();
  void flog10();

  void fmul(int i);

  void fmul_d(Address src);
  void fmul_s(Address src);

  void fmula(int i);  // "alternate" fmul

  void fmulp(int i = 1);

  void fnsave(Address dst);

  void fnstcw(Address src);

  void fnstsw_ax();

  void fprem();
  void fprem1();

  void frstor(Address src);

  void fsin();

  void fsqrt();

  void fst_d(Address adr);
  void fst_s(Address adr);

  void fstp_d(Address adr);
  void fstp_d(int index);
  void fstp_s(Address adr);
  void fstp_x(Address adr); // extended-precision (80-bit) format

  void fsub(int i);
  void fsub_d(Address src);
  void fsub_s(Address src);

  void fsuba(int i);  // "alternate" fsub

  void fsubp(int i = 1);

  void fsubr(int i);
  void fsubr_d(Address src);
  void fsubr_s(Address src);

  void fsubra(int i); // "alternate" reversed fsub

  void fsubrp(int i = 1);

  void ftan();

  void ftst();

  void fucomi(int i = 1);
  void fucomip(int i = 1);

  void fwait();

  void fxch(int i = 1);

  void fxrstor(Address src);
  void xrstor(Address src);

  void fxsave(Address dst);
  void xsave(Address dst);

  void fyl2x();
  void frndint();
  void f2xm1();
  void fldl2e();

  void hlt();

  void idivl(Register src);
  void divl(Register src); // Unsigned division

#ifdef _LP64
  void idivq(Register src);
#endif

  void imull(Register src);
  void imull(Register dst, Register src);
  void imull(Register dst, Register src, int value);
  void imull(Register dst, Address src);

#ifdef _LP64
  void imulq(Register dst, Register src);
  void imulq(Register dst, Register src, int value);
  void imulq(Register dst, Address src);
#endif

  // jcc is the generic conditional branch generator; it is used both for
  // branches to run-time routines and for branches to labels. jcc
  // takes a branch opcode (cc) and a label (L) and generates
  // either a backward branch or a forward branch and links it
  // to the label fixup chain. Usage:
  //
  // Label L;      // unbound label
  // jcc(cc, L);   // forward branch to unbound label
  // bind(L);      // bind label to the current pc
  // jcc(cc, L);   // backward branch to bound label
  // bind(L);      // illegal: a label may be bound only once
  //
  // Note: The same Label can be used for forward and backward branches
  // but it may be bound only once.

  void jcc(Condition cc, Label& L, bool maybe_short = true);

  // Conditional jump to an 8-bit offset to L.
  // WARNING: be very careful using this for forward jumps.  If the label is
  // not bound within an 8-bit offset of this instruction, a run-time error
  // will occur.
  void jccb(Condition cc, Label& L);

  void jmp(Address entry);    // pc <- entry

  // Label operations & relative jumps (PPUM Appendix D)
  void jmp(Label& L, bool maybe_short = true);   // unconditional jump to L

  void jmp(Register entry); // pc <- entry

  // Unconditional 8-bit offset jump to L.
  // WARNING: be very careful using this for forward jumps.  If the label is
  // not bound within an 8-bit offset of this instruction, a run-time error
  // will occur.
  void jmpb(Label& L);

  void ldmxcsr( Address src );

  void leal(Register dst, Address src);

  void leaq(Register dst, Address src);

  void lfence();

  void lock();

  void lzcntl(Register dst, Register src);

#ifdef _LP64
  void lzcntq(Register dst, Register src);
#endif

  enum Membar_mask_bits {
    StoreStore = 1 << 3,
    LoadStore  = 1 << 2,
    StoreLoad  = 1 << 1,
    LoadLoad   = 1 << 0
  };

  // Serializes memory and blows flags
  void membar(Membar_mask_bits order_constraint) {
    if (os::is_MP()) {
      // We only have to handle StoreLoad
      if (order_constraint & StoreLoad) {
        // All usable chips support "locked" instructions which suffice
        // as barriers, and are much faster than the alternative of
        // using cpuid instruction. We use here a locked add [esp-C],0.
        // This is conveniently otherwise a no-op except for blowing
        // flags, and introducing a false dependency on target memory
        // location. We can't do anything with flags, but we can avoid
        // memory dependencies in the current method by locked-adding
        // somewhere else on the stack. Doing [esp+C] will collide with
        // something on stack in current method, hence we go for [esp-C].
        // It is convenient since it is almost always in data cache, for
        // any small C.  We need to step back from SP to avoid data
        // dependencies with other things on below SP (callee-saves, for
        // example). Without a clear way to figure out the minimal safe
        // distance from SP, it makes sense to step back the complete
        // cache line, as this will also avoid possible second-order effects
        // with locked ops against the cache line. Our choice of offset
        // is bounded by x86 operand encoding, which should stay within
1309        //
        //
        // Any change to this code may need to revisit other places in
        // the code where this idiom is used, in particular the
        // orderAccess code.

        int offset = -VM_Version::L1_line_size();
        if (offset < -128) {
          offset = -128;
        }

        lock();
        addl(Address(rsp, offset), 0); // Assert the lock# signal here
      }
    }
  }

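  // Typical use (illustrative), e.g. the barrier after a volatile store:
  //   __ membar(Assembler::Membar_mask_bits(Assembler::StoreLoad));
  // which on MP systems emits the locked 'addl [rsp + offset], 0' above.
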
1326
1327  // Moves
1328
1329  void mov64(Register dst, int64_t imm64);
1330
1331  void movb(Address dst, Register src);
1332  void movb(Address dst, int imm8);
1333  void movb(Register dst, Address src);
1334
1335  void movddup(XMMRegister dst, XMMRegister src);
1336
1337  void kmovbl(KRegister dst, Register src);
1338  void kmovbl(Register dst, KRegister src);
1339  void kmovwl(KRegister dst, Register src);
1340  void kmovwl(Register dst, KRegister src);
1341  void kmovdl(KRegister dst, Register src);
1342  void kmovdl(Register dst, KRegister src);
1343  void kmovql(KRegister dst, KRegister src);
1344  void kmovql(Address dst, KRegister src);
1345  void kmovql(KRegister dst, Address src);
1346  void kmovql(KRegister dst, Register src);
1347  void kmovql(Register dst, KRegister src);
1348
1349  void kortestbl(KRegister dst, KRegister src);
1350  void kortestwl(KRegister dst, KRegister src);
1351  void kortestdl(KRegister dst, KRegister src);
1352  void kortestql(KRegister dst, KRegister src);
1353
1354  void movdl(XMMRegister dst, Register src);
1355  void movdl(Register dst, XMMRegister src);
1356  void movdl(XMMRegister dst, Address src);
1357  void movdl(Address dst, XMMRegister src);
1358
1359  // Move Double Quadword
1360  void movdq(XMMRegister dst, Register src);
1361  void movdq(Register dst, XMMRegister src);
1362
1363  // Move Aligned Double Quadword
1364  void movdqa(XMMRegister dst, XMMRegister src);
1365  void movdqa(XMMRegister dst, Address src);
1366
1367  // Move Unaligned Double Quadword
1368  void movdqu(Address     dst, XMMRegister src);
1369  void movdqu(XMMRegister dst, Address src);
1370  void movdqu(XMMRegister dst, XMMRegister src);
1371
1372  // Move Unaligned 256bit Vector
1373  void vmovdqu(Address dst, XMMRegister src);
1374  void vmovdqu(XMMRegister dst, Address src);
1375  void vmovdqu(XMMRegister dst, XMMRegister src);
1376
1377   // Move Unaligned 512bit Vector
1378  void evmovdqub(Address dst, XMMRegister src, int vector_len);
1379  void evmovdqub(XMMRegister dst, Address src, int vector_len);
1380  void evmovdqub(XMMRegister dst, XMMRegister src, int vector_len);
1381  void evmovdquw(Address dst, XMMRegister src, int vector_len);
1382  void evmovdquw(XMMRegister dst, Address src, int vector_len);
1383  void evmovdquw(XMMRegister dst, XMMRegister src, int vector_len);
1384  void evmovdqul(Address dst, XMMRegister src, int vector_len);
1385  void evmovdqul(XMMRegister dst, Address src, int vector_len);
1386  void evmovdqul(XMMRegister dst, XMMRegister src, int vector_len);
1387  void evmovdquq(Address dst, XMMRegister src, int vector_len);
1388  void evmovdquq(XMMRegister dst, Address src, int vector_len);
1389  void evmovdquq(XMMRegister dst, XMMRegister src, int vector_len);
1390
1391  // Move lower 64bit to high 64bit in 128bit register
1392  void movlhps(XMMRegister dst, XMMRegister src);
1393
1394  void movl(Register dst, int32_t imm32);
1395  void movl(Address dst, int32_t imm32);
1396  void movl(Register dst, Register src);
1397  void movl(Register dst, Address src);
1398  void movl(Address dst, Register src);
1399
1400  // These dummies prevent using movl from converting a zero (like NULL) into Register
1401  // by giving the compiler two choices it can't resolve
1402
1403  void movl(Address  dst, void* junk);
1404  void movl(Register dst, void* junk);
1405
1406#ifdef _LP64
1407  void movq(Register dst, Register src);
1408  void movq(Register dst, Address src);
1409  void movq(Address  dst, Register src);
1410#endif
1411
1412  void movq(Address     dst, MMXRegister src );
1413  void movq(MMXRegister dst, Address src );
1414
1415#ifdef _LP64
1416  // These dummies prevent using movq from converting a zero (like NULL) into Register
1417  // by giving the compiler two choices it can't resolve
1418
1419  void movq(Address  dst, void* dummy);
1420  void movq(Register dst, void* dummy);
1421#endif
1422
1423  // Move Quadword
1424  void movq(Address     dst, XMMRegister src);
1425  void movq(XMMRegister dst, Address src);
1426
1427  void movsbl(Register dst, Address src);
1428  void movsbl(Register dst, Register src);
1429
1430#ifdef _LP64
1431  void movsbq(Register dst, Address src);
1432  void movsbq(Register dst, Register src);
1433
1434  // Move signed 32bit immediate to 64bit extending sign
1435  void movslq(Address  dst, int32_t imm64);
1436  void movslq(Register dst, int32_t imm64);
1437
1438  void movslq(Register dst, Address src);
1439  void movslq(Register dst, Register src);
1440  void movslq(Register dst, void* src); // Dummy declaration to cause NULL to be ambiguous
1441#endif
1442
1443  void movswl(Register dst, Address src);
1444  void movswl(Register dst, Register src);
1445
1446#ifdef _LP64
1447  void movswq(Register dst, Address src);
1448  void movswq(Register dst, Register src);
1449#endif
1450
1451  void movw(Address dst, int imm16);
1452  void movw(Register dst, Address src);
1453  void movw(Address dst, Register src);
1454
1455  void movzbl(Register dst, Address src);
1456  void movzbl(Register dst, Register src);
1457
1458#ifdef _LP64
1459  void movzbq(Register dst, Address src);
1460  void movzbq(Register dst, Register src);
1461#endif
1462
1463  void movzwl(Register dst, Address src);
1464  void movzwl(Register dst, Register src);
1465
1466#ifdef _LP64
1467  void movzwq(Register dst, Address src);
1468  void movzwq(Register dst, Register src);
1469#endif
1470
1471  // Unsigned multiply with RAX destination register
1472  void mull(Address src);
1473  void mull(Register src);
1474
1475#ifdef _LP64
1476  void mulq(Address src);
1477  void mulq(Register src);
1478  void mulxq(Register dst1, Register dst2, Register src);
1479#endif
1480
1481  // Multiply Scalar Double-Precision Floating-Point Values
1482  void mulsd(XMMRegister dst, Address src);
1483  void mulsd(XMMRegister dst, XMMRegister src);
1484
1485  // Multiply Scalar Single-Precision Floating-Point Values
1486  void mulss(XMMRegister dst, Address src);
1487  void mulss(XMMRegister dst, XMMRegister src);
1488
1489  void negl(Register dst);
1490
1491#ifdef _LP64
1492  void negq(Register dst);
1493#endif
1494
1495  void nop(int i = 1);
1496
1497  void notl(Register dst);
1498
1499#ifdef _LP64
1500  void notq(Register dst);
1501#endif
1502
1503  void orl(Address dst, int32_t imm32);
1504  void orl(Register dst, int32_t imm32);
1505  void orl(Register dst, Address src);
1506  void orl(Register dst, Register src);
1507  void orl(Address dst, Register src);
1508
1509  void orq(Address dst, int32_t imm32);
1510  void orq(Register dst, int32_t imm32);
1511  void orq(Register dst, Address src);
1512  void orq(Register dst, Register src);
1513
1514  // Pack with unsigned saturation
1515  void packuswb(XMMRegister dst, XMMRegister src);
1516  void packuswb(XMMRegister dst, Address src);
1517  void vpackuswb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1518
  // Permutation of 64bit words
  void vpermq(XMMRegister dst, XMMRegister src, int imm8, int vector_len);
  void vpermq(XMMRegister dst, XMMRegister src, int imm8);
1522
1523  void pause();
1524
1525  // SSE4.2 string instructions
1526  void pcmpestri(XMMRegister xmm1, XMMRegister xmm2, int imm8);
1527  void pcmpestri(XMMRegister xmm1, Address src, int imm8);
1528
1529  void pcmpeqb(XMMRegister dst, XMMRegister src);
1530  void vpcmpeqb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1531  void evpcmpeqb(KRegister kdst, XMMRegister nds, XMMRegister src, int vector_len);
1532  void evpcmpeqb(KRegister kdst, XMMRegister nds, Address src, int vector_len);
1533
1534  void pcmpeqw(XMMRegister dst, XMMRegister src);
1535  void vpcmpeqw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1536  void evpcmpeqw(KRegister kdst, XMMRegister nds, XMMRegister src, int vector_len);
1537  void evpcmpeqw(KRegister kdst, XMMRegister nds, Address src, int vector_len);
1538
1539  void pcmpeqd(XMMRegister dst, XMMRegister src);
1540  void vpcmpeqd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1541  void evpcmpeqd(KRegister kdst, XMMRegister nds, XMMRegister src, int vector_len);
1542  void evpcmpeqd(KRegister kdst, XMMRegister nds, Address src, int vector_len);
1543
1544  void pcmpeqq(XMMRegister dst, XMMRegister src);
1545  void vpcmpeqq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1546  void evpcmpeqq(KRegister kdst, XMMRegister nds, XMMRegister src, int vector_len);
1547  void evpcmpeqq(KRegister kdst, XMMRegister nds, Address src, int vector_len);
1548
1549  void pmovmskb(Register dst, XMMRegister src);
1550  void vpmovmskb(Register dst, XMMRegister src);
1551
1552  // SSE 4.1 extract
1553  void pextrd(Register dst, XMMRegister src, int imm8);
1554  void pextrq(Register dst, XMMRegister src, int imm8);
1555  void pextrd(Address dst, XMMRegister src, int imm8);
1556  void pextrq(Address dst, XMMRegister src, int imm8);
1557  void pextrb(Address dst, XMMRegister src, int imm8);
1558  // SSE 2 extract
1559  void pextrw(Register dst, XMMRegister src, int imm8);
1560  void pextrw(Address dst, XMMRegister src, int imm8);
1561
1562  // SSE 4.1 insert
1563  void pinsrd(XMMRegister dst, Register src, int imm8);
1564  void pinsrq(XMMRegister dst, Register src, int imm8);
1565  void pinsrd(XMMRegister dst, Address src, int imm8);
1566  void pinsrq(XMMRegister dst, Address src, int imm8);
1567  void pinsrb(XMMRegister dst, Address src, int imm8);
1568  // SSE 2 insert
1569  void pinsrw(XMMRegister dst, Register src, int imm8);
1570  void pinsrw(XMMRegister dst, Address src, int imm8);

  // SSE4.1 packed move
  void pmovzxbw(XMMRegister dst, XMMRegister src);
  void pmovzxbw(XMMRegister dst, Address src);

  void vpmovzxbw(XMMRegister dst, Address src, int vector_len);

#ifndef _LP64 // no 32-bit push/pop on amd64
  void popl(Address dst);
#endif

#ifdef _LP64
  void popq(Address dst);
#endif

  void popcntl(Register dst, Address src);
  void popcntl(Register dst, Register src);

#ifdef _LP64
  void popcntq(Register dst, Address src);
  void popcntq(Register dst, Register src);
#endif

  // Prefetches (SSE, SSE2, 3DNOW only)

  void prefetchnta(Address src);
  void prefetchr(Address src);
  void prefetcht0(Address src);
  void prefetcht1(Address src);
  void prefetcht2(Address src);
  void prefetchw(Address src);

  // Shuffle Bytes
  void pshufb(XMMRegister dst, XMMRegister src);
  void pshufb(XMMRegister dst, Address src);

  // Shuffle Packed Doublewords
  void pshufd(XMMRegister dst, XMMRegister src, int mode);
  void pshufd(XMMRegister dst, Address src,     int mode);

  // Shuffle Packed Low Words
  void pshuflw(XMMRegister dst, XMMRegister src, int mode);
  void pshuflw(XMMRegister dst, Address src,     int mode);

  // Shift Double Quadword Right Logical by bytes (immediate count)
  void psrldq(XMMRegister dst, int shift);
  // Shift Double Quadword Left Logical by bytes (immediate count)
  void pslldq(XMMRegister dst, int shift);
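  // Illustrative sketch (not part of the original header): the shift count is
  // in bytes, so psrldq can move the high half of a register into the low
  // half (register choice and "__" are assumptions):
  //   __ psrldq(xmm0, 8);        // xmm0[63:0] = old xmm0[127:64], rest zeroed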

  // Logical Compare 128bit
  void ptest(XMMRegister dst, XMMRegister src);
  void ptest(XMMRegister dst, Address src);
  // Logical Compare 256bit
  void vptest(XMMRegister dst, XMMRegister src);
  void vptest(XMMRegister dst, Address src);

  // Interleave Low Bytes
  void punpcklbw(XMMRegister dst, XMMRegister src);
  void punpcklbw(XMMRegister dst, Address src);

  // Interleave Low Doublewords
  void punpckldq(XMMRegister dst, XMMRegister src);
  void punpckldq(XMMRegister dst, Address src);

  // Interleave Low Quadwords
  void punpcklqdq(XMMRegister dst, XMMRegister src);

#ifndef _LP64 // no 32-bit push/pop on amd64
  void pushl(Address src);
#endif

  void pushq(Address src);

  void rcll(Register dst, int imm8);

  void rclq(Register dst, int imm8);

  void rcrq(Register dst, int imm8);

  void rcpps(XMMRegister dst, XMMRegister src);

  void rcpss(XMMRegister dst, XMMRegister src);

  void rdtsc();

  void ret(int imm16);

#ifdef _LP64
  void rorq(Register dst, int imm8);
  void rorxq(Register dst, Register src, int imm8);
#endif

  void sahf();

  void sarl(Register dst, int imm8);
  void sarl(Register dst);

  void sarq(Register dst, int imm8);
  void sarq(Register dst);

  void sbbl(Address dst, int32_t imm32);
  void sbbl(Register dst, int32_t imm32);
  void sbbl(Register dst, Address src);
  void sbbl(Register dst, Register src);

  void sbbq(Address dst, int32_t imm32);
  void sbbq(Register dst, int32_t imm32);
  void sbbq(Register dst, Address src);
  void sbbq(Register dst, Register src);

  void setb(Condition cc, Register dst);

  void palignr(XMMRegister dst, XMMRegister src, int imm8);
  void pblendw(XMMRegister dst, XMMRegister src, int imm8);

  void sha1rnds4(XMMRegister dst, XMMRegister src, int imm8);
  void sha1nexte(XMMRegister dst, XMMRegister src);
  void sha1msg1(XMMRegister dst, XMMRegister src);
  void sha1msg2(XMMRegister dst, XMMRegister src);
  // xmm0 is an implicit additional source for the following instruction.
  void sha256rnds2(XMMRegister dst, XMMRegister src);
  void sha256msg1(XMMRegister dst, XMMRegister src);
  void sha256msg2(XMMRegister dst, XMMRegister src);

  void shldl(Register dst, Register src);
  void shldl(Register dst, Register src, int8_t imm8);

  void shll(Register dst, int imm8);
  void shll(Register dst);

  void shlq(Register dst, int imm8);
  void shlq(Register dst);

  void shrdl(Register dst, Register src);

  void shrl(Register dst, int imm8);
  void shrl(Register dst);

  void shrq(Register dst, int imm8);
  void shrq(Register dst);

  void smovl(); // QQQ generic?

  // Compute Square Root of Scalar Double-Precision Floating-Point Value
  void sqrtsd(XMMRegister dst, Address src);
  void sqrtsd(XMMRegister dst, XMMRegister src);

  // Compute Square Root of Scalar Single-Precision Floating-Point Value
  void sqrtss(XMMRegister dst, Address src);
  void sqrtss(XMMRegister dst, XMMRegister src);

  void std();

  void stmxcsr( Address dst );

  void subl(Address dst, int32_t imm32);
  void subl(Address dst, Register src);
  void subl(Register dst, int32_t imm32);
  void subl(Register dst, Address src);
  void subl(Register dst, Register src);

  void subq(Address dst, int32_t imm32);
  void subq(Address dst, Register src);
  void subq(Register dst, int32_t imm32);
  void subq(Register dst, Address src);
  void subq(Register dst, Register src);

  // Force generation of a 4-byte immediate value even if it fits into 8 bits
  void subl_imm32(Register dst, int32_t imm32);
  void subq_imm32(Register dst, int32_t imm32);
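  // Illustrative sketch (not part of the original header): the _imm32 forms
  // keep the instruction length independent of the immediate's value, which
  // matters for code whose size must be fixed, e.g. a site that may later be
  // patched ("__" and the operands are assumptions):
  //   __ subl(rsp, 16);          // assembler may pick the short 8-bit form
  //   __ subl_imm32(rsp, 16);    // always encodes a full 4-byte immediate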

  // Subtract Scalar Double-Precision Floating-Point Values
  void subsd(XMMRegister dst, Address src);
  void subsd(XMMRegister dst, XMMRegister src);

  // Subtract Scalar Single-Precision Floating-Point Values
  void subss(XMMRegister dst, Address src);
  void subss(XMMRegister dst, XMMRegister src);

  void testb(Register dst, int imm8);
  void testb(Address dst, int imm8);

  void testl(Register dst, int32_t imm32);
  void testl(Register dst, Register src);
  void testl(Register dst, Address src);

  void testq(Register dst, int32_t imm32);
  void testq(Register dst, Register src);

  // BMI - count trailing zeros
  void tzcntl(Register dst, Register src);
  void tzcntq(Register dst, Register src);

  // Unordered Compare Scalar Double-Precision Floating-Point Values and set EFLAGS
  void ucomisd(XMMRegister dst, Address src);
  void ucomisd(XMMRegister dst, XMMRegister src);

  // Unordered Compare Scalar Single-Precision Floating-Point Values and set EFLAGS
  void ucomiss(XMMRegister dst, Address src);
  void ucomiss(XMMRegister dst, XMMRegister src);

  void xabort(int8_t imm8);

  void xaddl(Address dst, Register src);

  void xaddq(Address dst, Register src);

  void xbegin(Label& abort, relocInfo::relocType rtype = relocInfo::none);

  void xchgl(Register reg, Address adr);
  void xchgl(Register dst, Register src);

  void xchgq(Register reg, Address adr);
  void xchgq(Register dst, Register src);

  void xend();

  // Get Value of Extended Control Register
  void xgetbv();

  void xorl(Register dst, int32_t imm32);
  void xorl(Register dst, Address src);
  void xorl(Register dst, Register src);

  void xorb(Register dst, Address src);

  void xorq(Register dst, Address src);
  void xorq(Register dst, Register src);

  void set_byte_if_not_zero(Register dst); // sets reg to 1 if not zero, otherwise 0

  // AVX 3-operand scalar instructions (encoded with the VEX prefix)

  void vaddsd(XMMRegister dst, XMMRegister nds, Address src);
  void vaddsd(XMMRegister dst, XMMRegister nds, XMMRegister src);
  void vaddss(XMMRegister dst, XMMRegister nds, Address src);
  void vaddss(XMMRegister dst, XMMRegister nds, XMMRegister src);
  void vdivsd(XMMRegister dst, XMMRegister nds, Address src);
  void vdivsd(XMMRegister dst, XMMRegister nds, XMMRegister src);
  void vdivss(XMMRegister dst, XMMRegister nds, Address src);
  void vdivss(XMMRegister dst, XMMRegister nds, XMMRegister src);
  void vmulsd(XMMRegister dst, XMMRegister nds, Address src);
  void vmulsd(XMMRegister dst, XMMRegister nds, XMMRegister src);
  void vmulss(XMMRegister dst, XMMRegister nds, Address src);
  void vmulss(XMMRegister dst, XMMRegister nds, XMMRegister src);
  void vsubsd(XMMRegister dst, XMMRegister nds, Address src);
  void vsubsd(XMMRegister dst, XMMRegister nds, XMMRegister src);
  void vsubss(XMMRegister dst, XMMRegister nds, Address src);
  void vsubss(XMMRegister dst, XMMRegister nds, XMMRegister src);


  //====================VECTOR ARITHMETIC=====================================

  // Add Packed Floating-Point Values
  void addpd(XMMRegister dst, XMMRegister src);
  void addpd(XMMRegister dst, Address src);
  void addps(XMMRegister dst, XMMRegister src);
  void vaddpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
  void vaddps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
  void vaddpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
  void vaddps(XMMRegister dst, XMMRegister nds, Address src, int vector_len);

  // Subtract Packed Floating-Point Values
  void subpd(XMMRegister dst, XMMRegister src);
  void subps(XMMRegister dst, XMMRegister src);
  void vsubpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
  void vsubps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
  void vsubpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
  void vsubps(XMMRegister dst, XMMRegister nds, Address src, int vector_len);

  // Multiply Packed Floating-Point Values
  void mulpd(XMMRegister dst, XMMRegister src);
  void mulpd(XMMRegister dst, Address src);
  void mulps(XMMRegister dst, XMMRegister src);
  void vmulpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
  void vmulps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
  void vmulpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
  void vmulps(XMMRegister dst, XMMRegister nds, Address src, int vector_len);

  // Divide Packed Floating-Point Values
  void divpd(XMMRegister dst, XMMRegister src);
  void divps(XMMRegister dst, XMMRegister src);
  void vdivpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
  void vdivps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
  void vdivpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
  void vdivps(XMMRegister dst, XMMRegister nds, Address src, int vector_len);

  // Sqrt Packed Floating-Point Values - Double precision only
  void vsqrtpd(XMMRegister dst, XMMRegister src, int vector_len);
  void vsqrtpd(XMMRegister dst, Address src, int vector_len);

  // Bitwise Logical AND of Packed Floating-Point Values
  void andpd(XMMRegister dst, XMMRegister src);
  void andps(XMMRegister dst, XMMRegister src);
  void vandpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
  void vandps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
  void vandpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
  void vandps(XMMRegister dst, XMMRegister nds, Address src, int vector_len);

  void unpckhpd(XMMRegister dst, XMMRegister src);
  void unpcklpd(XMMRegister dst, XMMRegister src);

  // Bitwise Logical XOR of Packed Floating-Point Values
  void xorpd(XMMRegister dst, XMMRegister src);
  void xorps(XMMRegister dst, XMMRegister src);
  void vxorpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
  void vxorps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
  void vxorpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
  void vxorps(XMMRegister dst, XMMRegister nds, Address src, int vector_len);

  // Add horizontal packed integers
  void vphaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
  void vphaddd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
  void phaddw(XMMRegister dst, XMMRegister src);
  void phaddd(XMMRegister dst, XMMRegister src);

  // Add packed integers
  void paddb(XMMRegister dst, XMMRegister src);
  void paddw(XMMRegister dst, XMMRegister src);
  void paddd(XMMRegister dst, XMMRegister src);
  void paddd(XMMRegister dst, Address src);
  void paddq(XMMRegister dst, XMMRegister src);
  void vpaddb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
  void vpaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
  void vpaddd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
  void vpaddq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
  void vpaddb(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
  void vpaddw(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
  void vpaddd(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
  void vpaddq(XMMRegister dst, XMMRegister nds, Address src, int vector_len);

  // Sub packed integers
  void psubb(XMMRegister dst, XMMRegister src);
  void psubw(XMMRegister dst, XMMRegister src);
  void psubd(XMMRegister dst, XMMRegister src);
  void psubq(XMMRegister dst, XMMRegister src);
  void vpsubb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
  void vpsubw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
  void vpsubd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
  void vpsubq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
  void vpsubb(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
  void vpsubw(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
  void vpsubd(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
  void vpsubq(XMMRegister dst, XMMRegister nds, Address src, int vector_len);

  // Multiply packed integers (shorts and ints; the qword forms require AVX-512DQ)
  void pmullw(XMMRegister dst, XMMRegister src);
  void pmulld(XMMRegister dst, XMMRegister src);
  void vpmullw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
  void vpmulld(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
  void vpmullq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
  void vpmullw(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
  void vpmulld(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
  void vpmullq(XMMRegister dst, XMMRegister nds, Address src, int vector_len);

  // Shift left packed integers
  void psllw(XMMRegister dst, int shift);
  void pslld(XMMRegister dst, int shift);
  void psllq(XMMRegister dst, int shift);
  void psllw(XMMRegister dst, XMMRegister shift);
  void pslld(XMMRegister dst, XMMRegister shift);
  void psllq(XMMRegister dst, XMMRegister shift);
  void vpsllw(XMMRegister dst, XMMRegister src, int shift, int vector_len);
  void vpslld(XMMRegister dst, XMMRegister src, int shift, int vector_len);
  void vpsllq(XMMRegister dst, XMMRegister src, int shift, int vector_len);
  void vpsllw(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);
  void vpslld(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);
  void vpsllq(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);

  // Logical shift right packed integers
  void psrlw(XMMRegister dst, int shift);
  void psrld(XMMRegister dst, int shift);
  void psrlq(XMMRegister dst, int shift);
  void psrlw(XMMRegister dst, XMMRegister shift);
  void psrld(XMMRegister dst, XMMRegister shift);
  void psrlq(XMMRegister dst, XMMRegister shift);
  void vpsrlw(XMMRegister dst, XMMRegister src, int shift, int vector_len);
  void vpsrld(XMMRegister dst, XMMRegister src, int shift, int vector_len);
  void vpsrlq(XMMRegister dst, XMMRegister src, int shift, int vector_len);
  void vpsrlw(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);
  void vpsrld(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);
  void vpsrlq(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);

  // Arithmetic shift right packed integers (only shorts and ints, no instructions for longs)
  void psraw(XMMRegister dst, int shift);
  void psrad(XMMRegister dst, int shift);
  void psraw(XMMRegister dst, XMMRegister shift);
  void psrad(XMMRegister dst, XMMRegister shift);
  void vpsraw(XMMRegister dst, XMMRegister src, int shift, int vector_len);
  void vpsrad(XMMRegister dst, XMMRegister src, int shift, int vector_len);
  void vpsraw(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);
  void vpsrad(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);

  // And packed integers
  void pand(XMMRegister dst, XMMRegister src);
  void vpand(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
  void vpand(XMMRegister dst, XMMRegister nds, Address src, int vector_len);

  // Andn packed integers
  void pandn(XMMRegister dst, XMMRegister src);

  // Or packed integers
  void por(XMMRegister dst, XMMRegister src);
  void vpor(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
  void vpor(XMMRegister dst, XMMRegister nds, Address src, int vector_len);

  // Xor packed integers
  void pxor(XMMRegister dst, XMMRegister src);
  void vpxor(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
  void vpxor(XMMRegister dst, XMMRegister nds, Address src, int vector_len);

  // vinserti forms
  void vinserti128(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8);
  void vinserti128(XMMRegister dst, XMMRegister nds, Address src, uint8_t imm8);
  void vinserti32x4(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8);
  void vinserti32x4(XMMRegister dst, XMMRegister nds, Address src, uint8_t imm8);
  void vinserti64x4(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8);

  // vinsertf forms
  void vinsertf128(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8);
  void vinsertf128(XMMRegister dst, XMMRegister nds, Address src, uint8_t imm8);
  void vinsertf32x4(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8);
  void vinsertf32x4(XMMRegister dst, XMMRegister nds, Address src, uint8_t imm8);
  void vinsertf64x4(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8);
  void vinsertf64x4(XMMRegister dst, XMMRegister nds, Address src, uint8_t imm8);

  // vextracti forms
  void vextracti128(XMMRegister dst, XMMRegister src, uint8_t imm8);
  void vextracti128(Address dst, XMMRegister src, uint8_t imm8);
  void vextracti32x4(XMMRegister dst, XMMRegister src, uint8_t imm8);
  void vextracti32x4(Address dst, XMMRegister src, uint8_t imm8);
  void vextracti64x2(XMMRegister dst, XMMRegister src, uint8_t imm8);
  void vextracti64x4(XMMRegister dst, XMMRegister src, uint8_t imm8);

  // vextractf forms
  void vextractf128(XMMRegister dst, XMMRegister src, uint8_t imm8);
  void vextractf128(Address dst, XMMRegister src, uint8_t imm8);
  void vextractf32x4(XMMRegister dst, XMMRegister src, uint8_t imm8);
  void vextractf32x4(Address dst, XMMRegister src, uint8_t imm8);
  void vextractf64x2(XMMRegister dst, XMMRegister src, uint8_t imm8);
  void vextractf64x4(XMMRegister dst, XMMRegister src, uint8_t imm8);
  void vextractf64x4(Address dst, XMMRegister src, uint8_t imm8);

  // legacy xmm-sourced word/dword replicate
  void vpbroadcastw(XMMRegister dst, XMMRegister src);
  void vpbroadcastd(XMMRegister dst, XMMRegister src);

  // xmm/mem-sourced byte/word/dword/qword replicate
  void evpbroadcastb(XMMRegister dst, XMMRegister src, int vector_len);
  void evpbroadcastb(XMMRegister dst, Address src, int vector_len);
  void evpbroadcastw(XMMRegister dst, XMMRegister src, int vector_len);
  void evpbroadcastw(XMMRegister dst, Address src, int vector_len);
  void evpbroadcastd(XMMRegister dst, XMMRegister src, int vector_len);
  void evpbroadcastd(XMMRegister dst, Address src, int vector_len);
  void evpbroadcastq(XMMRegister dst, XMMRegister src, int vector_len);
  void evpbroadcastq(XMMRegister dst, Address src, int vector_len);

  // scalar single/double-precision replicate
  void evpbroadcastss(XMMRegister dst, XMMRegister src, int vector_len);
  void evpbroadcastss(XMMRegister dst, Address src, int vector_len);
  void evpbroadcastsd(XMMRegister dst, XMMRegister src, int vector_len);
  void evpbroadcastsd(XMMRegister dst, Address src, int vector_len);

  // gpr-sourced byte/word/dword/qword replicate
  void evpbroadcastb(XMMRegister dst, Register src, int vector_len);
  void evpbroadcastw(XMMRegister dst, Register src, int vector_len);
  void evpbroadcastd(XMMRegister dst, Register src, int vector_len);
  void evpbroadcastq(XMMRegister dst, Register src, int vector_len);
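  // Illustrative sketch (not part of the original header): the gpr-sourced
  // forms splat a general register across every lane of the destination
  // (registers and "__" are assumptions):
  //   __ evpbroadcastd(xmm0, rax, Assembler::AVX_512bit); // all 16 dwords = EAX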

  // Carry-Less Multiplication Quadword
  void pclmulqdq(XMMRegister dst, XMMRegister src, int mask);
  void vpclmulqdq(XMMRegister dst, XMMRegister nds, XMMRegister src, int mask);

  // AVX instruction used to clear the upper 128 bits of the YMM registers and
  // so avoid the transition penalty between the AVX and SSE states. There is no
  // penalty if legacy SSE instructions are encoded using the VEX prefix because
  // they always clear the upper 128 bits. It should be used before calling
  // runtime code and native libraries.
  void vzeroupper();
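  // Illustrative sketch (not part of the original header): a typical use just
  // before leaving compiled code, so subsequent legacy SSE code does not pay
  // the AVX-to-SSE transition penalty (the call target is a placeholder and
  // the call itself is MacroAssembler-level, shown only for context):
  //   __ vzeroupper();
  //   __ call(RuntimeAddress(native_entry));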

  // AVX support for vectorized conditional move (double). The following two
  // instructions are only used together.
  void cmppd(XMMRegister dst, XMMRegister nds, XMMRegister src, int cop, int vector_len);
  void vpblendd(XMMRegister dst, XMMRegister nds, XMMRegister src1, XMMRegister src2, int vector_len);
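  // Illustrative sketch (not part of the original header) of the coupled use
  // for a vectorized conditional move of doubles; the predicate value 0x1
  // (less-than) and all register choices are assumptions:
  //   __ cmppd(xmm0, xmm1, xmm2, 0x1, Assembler::AVX_256bit);      // per-lane mask
  //   __ vpblendd(xmm3, xmm4, xmm5, xmm0, Assembler::AVX_256bit);  // blend the two
  //                                 // sources under the mask computed in xmm0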


 protected:
  // The following instructions require 16-byte aligned addresses in SSE mode.
  // They should be called only from the corresponding MacroAssembler instructions.
  void andpd(XMMRegister dst, Address src);
  void andps(XMMRegister dst, Address src);
  void xorpd(XMMRegister dst, Address src);
  void xorps(XMMRegister dst, Address src);

};

// The Intel x86/AMD64 assembler attributes: all fields enclosed here guide
// encoding-level decisions. The specific set functions are for specialized use;
// otherwise the defaults, or whatever was supplied at object construction, apply.
class InstructionAttr {
public:
  InstructionAttr(
    int vector_len,     // The length of the vector to be applied in encoding - for both AVX and EVEX
    bool rex_vex_w,     // Width of data: false for 32 bits or less, true for 64-bit or specially defined data
    bool legacy_mode,   // If true, the instruction is encoded as AVX or earlier; if false, EVEX encoding may be chosen
    bool no_reg_mask,   // When true, k0 is used if EVEX encoding is chosen; otherwise k1 is used
    bool uses_vl)       // The instruction may have legacy constraints based on vector length for EVEX
    :
      _avx_vector_len(vector_len),
      _rex_vex_w(rex_vex_w),
      _legacy_mode(legacy_mode),
      _no_reg_mask(no_reg_mask),
      _uses_vl(uses_vl),
      _tuple_type(Assembler::EVEX_ETUP),
      _input_size_in_bits(Assembler::EVEX_NObit),
      _is_evex_instruction(false),
      _evex_encoding(0),
      _is_clear_context(false),
      _is_extended_context(false),
      _current_assembler(NULL) {
    if (UseAVX < 3) _legacy_mode = true;
  }
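  // Illustrative sketch (not part of the original header): a typical
  // construction for a 256-bit operation on 32-bit lanes that may be promoted
  // to EVEX encoding when AVX-512 is available (the argument values are
  // assumptions):
  //   InstructionAttr attributes(Assembler::AVX_256bit, /* rex_vex_w */ false,
  //                              /* legacy_mode */ false, /* no_reg_mask */ false,
  //                              /* uses_vl */ true);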

  ~InstructionAttr() {
    if (_current_assembler != NULL) {
      _current_assembler->clear_attributes();
    }
    _current_assembler = NULL;
  }

private:
  int  _avx_vector_len;
  bool _rex_vex_w;
  bool _legacy_mode;
  bool _no_reg_mask;
  bool _uses_vl;
  int  _tuple_type;
  int  _input_size_in_bits;
  bool _is_evex_instruction;
  int  _evex_encoding;
  bool _is_clear_context;
  bool _is_extended_context;

  Assembler *_current_assembler;

public:
  // Query functions for field accessors
  int  get_vector_len(void) const { return _avx_vector_len; }
  bool is_rex_vex_w(void) const { return _rex_vex_w; }
  bool is_legacy_mode(void) const { return _legacy_mode; }
  bool is_no_reg_mask(void) const { return _no_reg_mask; }
  bool uses_vl(void) const { return _uses_vl; }
  int  get_tuple_type(void) const { return _tuple_type; }
  int  get_input_size(void) const { return _input_size_in_bits; }
  bool is_evex_instruction(void) const { return _is_evex_instruction; }
  int  get_evex_encoding(void) const { return _evex_encoding; }
  bool is_clear_context(void) const { return _is_clear_context; }
  bool is_extended_context(void) const { return _is_extended_context; }

  // Set the vector length manually
  void set_vector_len(int vector_len) { _avx_vector_len = vector_len; }

  // Set the instruction to be encoded in AVX mode
  void set_is_legacy_mode(void) { _legacy_mode = true; }

  // Set the current instruction to be encoded as an EVEX instruction
  void set_is_evex_instruction(void) { _is_evex_instruction = true; }

  // Internal encoding data used in compressed immediate offset programming
  void set_evex_encoding(int value) { _evex_encoding = value; }

  // Set the EVEX.Z field, used to clear all non-directed XMM/YMM/ZMM components
  void set_is_clear_context(void) { _is_clear_context = true; }

  // Map back to the current assembler so that we can manage object-level association
  void set_current_assembler(Assembler *current_assembler) { _current_assembler = current_assembler; }

  // Address modifiers used for compressed displacement calculation
  void set_address_attributes(int tuple_type, int input_size_in_bits) {
    if (VM_Version::supports_evex()) {
      _tuple_type = tuple_type;
      _input_size_in_bits = input_size_in_bits;
    }
  }
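  // Illustrative sketch (not part of the original header): EVEX compresses an
  // 8-bit displacement as disp8*N, where N is derived from the tuple type and
  // input size recorded here; a full-vector access whose displacement is a
  // multiple of the vector width can then use the short disp8 form instead of
  // a 4-byte displacement. The pairing shown is an assumption:
  //   attributes.set_address_attributes(/* tuple_type */ Assembler::EVEX_FV,
  //                                     /* input_size_in_bits */ Assembler::EVEX_32bit);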

};

#endif // CPU_X86_VM_ASSEMBLER_X86_HPP