assembler_x86.hpp revision 8961:a589f73b79f4
/*
 * Copyright (c) 1997, 2013, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#ifndef CPU_X86_VM_ASSEMBLER_X86_HPP
#define CPU_X86_VM_ASSEMBLER_X86_HPP

#include "asm/register.hpp"
#include "vm_version_x86.hpp"

class BiasedLockingCounters;

// Contains all the definitions needed for x86 assembly code generation.

// Calling convention
class Argument VALUE_OBJ_CLASS_SPEC {
 public:
  enum {
#ifdef _LP64
#ifdef _WIN64
    n_int_register_parameters_c   = 4, // rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...)
    n_float_register_parameters_c = 4, // xmm0 - xmm3 (c_farg0, c_farg1, ...)
#else
    n_int_register_parameters_c   = 6, // rdi, rsi, rdx, rcx, r8, r9 (c_rarg0, c_rarg1, ...)
    n_float_register_parameters_c = 8, // xmm0 - xmm7 (c_farg0, c_farg1, ...)
#endif // _WIN64
    n_int_register_parameters_j   = 6, // j_rarg0, j_rarg1, ...
    n_float_register_parameters_j = 8  // j_farg0, j_farg1, ...
#else
    n_register_parameters = 0   // 0 registers used to pass arguments
#endif // _LP64
  };
};


#ifdef _LP64
// Symbolically name the register arguments used by the c calling convention.
// Windows is different from linux/solaris. So much for standards...

#ifdef _WIN64

REGISTER_DECLARATION(Register, c_rarg0, rcx);
REGISTER_DECLARATION(Register, c_rarg1, rdx);
REGISTER_DECLARATION(Register, c_rarg2, r8);
REGISTER_DECLARATION(Register, c_rarg3, r9);

REGISTER_DECLARATION(XMMRegister, c_farg0, xmm0);
REGISTER_DECLARATION(XMMRegister, c_farg1, xmm1);
REGISTER_DECLARATION(XMMRegister, c_farg2, xmm2);
REGISTER_DECLARATION(XMMRegister, c_farg3, xmm3);

#else

REGISTER_DECLARATION(Register, c_rarg0, rdi);
REGISTER_DECLARATION(Register, c_rarg1, rsi);
REGISTER_DECLARATION(Register, c_rarg2, rdx);
REGISTER_DECLARATION(Register, c_rarg3, rcx);
REGISTER_DECLARATION(Register, c_rarg4, r8);
REGISTER_DECLARATION(Register, c_rarg5, r9);

REGISTER_DECLARATION(XMMRegister, c_farg0, xmm0);
REGISTER_DECLARATION(XMMRegister, c_farg1, xmm1);
REGISTER_DECLARATION(XMMRegister, c_farg2, xmm2);
REGISTER_DECLARATION(XMMRegister, c_farg3, xmm3);
REGISTER_DECLARATION(XMMRegister, c_farg4, xmm4);
REGISTER_DECLARATION(XMMRegister, c_farg5, xmm5);
REGISTER_DECLARATION(XMMRegister, c_farg6, xmm6);
REGISTER_DECLARATION(XMMRegister, c_farg7, xmm7);

#endif // _WIN64

// Symbolically name the register arguments used by the Java calling convention.
// We have control over the convention for java so we can do what we please.
// What pleases us is to offset the java calling convention so that when
// we call a suitable jni method the arguments are lined up and we don't
// have to do any shuffling. A suitable jni method is non-static and takes a
// small number of arguments (two fewer args on windows).
//
//        |-------------------------------------------------------|
//        | c_rarg0   c_rarg1  c_rarg2 c_rarg3 c_rarg4 c_rarg5    |
//        |-------------------------------------------------------|
//        | rcx       rdx      r8      r9      rdi*    rsi*       | windows (* not a c_rarg)
//        | rdi       rsi      rdx     rcx     r8      r9         | solaris/linux
//        |-------------------------------------------------------|
//        | j_rarg5   j_rarg0  j_rarg1 j_rarg2 j_rarg3 j_rarg4    |
//        |-------------------------------------------------------|
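//
// An illustrative note (not part of the original header): for a non-static
// JNI method the C signature is (JNIEnv*, jobject receiver, ...), so the
// receiver arriving in j_rarg0 == c_rarg1 and the remaining Java arguments
// are already in the right C argument registers; only the JNIEnv* in c_rarg0
// (== j_rarg5) remains to be filled in before the call.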

REGISTER_DECLARATION(Register, j_rarg0, c_rarg1);
REGISTER_DECLARATION(Register, j_rarg1, c_rarg2);
REGISTER_DECLARATION(Register, j_rarg2, c_rarg3);
// Windows runs out of register args here
#ifdef _WIN64
REGISTER_DECLARATION(Register, j_rarg3, rdi);
REGISTER_DECLARATION(Register, j_rarg4, rsi);
#else
REGISTER_DECLARATION(Register, j_rarg3, c_rarg4);
REGISTER_DECLARATION(Register, j_rarg4, c_rarg5);
#endif /* _WIN64 */
REGISTER_DECLARATION(Register, j_rarg5, c_rarg0);

REGISTER_DECLARATION(XMMRegister, j_farg0, xmm0);
REGISTER_DECLARATION(XMMRegister, j_farg1, xmm1);
REGISTER_DECLARATION(XMMRegister, j_farg2, xmm2);
REGISTER_DECLARATION(XMMRegister, j_farg3, xmm3);
REGISTER_DECLARATION(XMMRegister, j_farg4, xmm4);
REGISTER_DECLARATION(XMMRegister, j_farg5, xmm5);
REGISTER_DECLARATION(XMMRegister, j_farg6, xmm6);
REGISTER_DECLARATION(XMMRegister, j_farg7, xmm7);

REGISTER_DECLARATION(Register, rscratch1, r10);  // volatile
REGISTER_DECLARATION(Register, rscratch2, r11);  // volatile

REGISTER_DECLARATION(Register, r12_heapbase, r12); // callee-saved
REGISTER_DECLARATION(Register, r15_thread, r15); // callee-saved

#else
// rscratch1 will appear in 32bit code that is dead but of course must compile.
// Using noreg ensures that if the dead code is incorrectly live and executed it
// will cause an assertion failure.
#define rscratch1 noreg
#define rscratch2 noreg

#endif // _LP64

// JSR 292
// On x86, the SP does not have to be saved when invoking method handle intrinsics
// or compiled lambda forms. We indicate that by setting rbp_mh_SP_save to noreg.
REGISTER_DECLARATION(Register, rbp_mh_SP_save, noreg);

// Address is an abstraction used to represent a memory location
// using any of the amd64 addressing modes with one object.
//
// Note: A register location is represented via a Register, not
//       via an address for efficiency & simplicity reasons.
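//
// For example (illustrative only):
//   Address(rbx, rcx, Address::times_8, 16)  // [rbx + rcx*8 + 16]
//   Address(rsp, wordSize)                   // [rsp + wordSize]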

class ArrayAddress;

class Address VALUE_OBJ_CLASS_SPEC {
 public:
  enum ScaleFactor {
    no_scale = -1,
    times_1  =  0,
    times_2  =  1,
    times_4  =  2,
    times_8  =  3,
    times_ptr = LP64_ONLY(times_8) NOT_LP64(times_4)
  };
  static ScaleFactor times(int size) {
    assert(size >= 1 && size <= 8 && is_power_of_2(size), "bad scale size");
    if (size == 8)  return times_8;
    if (size == 4)  return times_4;
    if (size == 2)  return times_2;
    return times_1;
  }
  static int scale_size(ScaleFactor scale) {
    assert(scale != no_scale, "");
    assert(((1 << (int)times_1) == 1 &&
            (1 << (int)times_2) == 2 &&
            (1 << (int)times_4) == 4 &&
            (1 << (int)times_8) == 8), "");
    return (1 << (int)scale);
  }

 private:
  Register         _base;
  Register         _index;
  ScaleFactor      _scale;
  int              _disp;
  RelocationHolder _rspec;

  // Easily misused constructors make them private
  // %%% can we make these go away?
  NOT_LP64(Address(address loc, RelocationHolder spec);)
  Address(int disp, address loc, relocInfo::relocType rtype);
  Address(int disp, address loc, RelocationHolder spec);

 public:

  int disp() { return _disp; }
  // creation
  Address()
    : _base(noreg),
      _index(noreg),
      _scale(no_scale),
      _disp(0) {
  }

  // No default displacement otherwise Register can be implicitly
  // converted to 0(Register) which is quite a different animal.

  Address(Register base, int disp)
    : _base(base),
      _index(noreg),
      _scale(no_scale),
      _disp(disp) {
  }

  Address(Register base, Register index, ScaleFactor scale, int disp = 0)
    : _base (base),
      _index(index),
      _scale(scale),
      _disp (disp) {
    assert(!index->is_valid() == (scale == Address::no_scale),
           "inconsistent address");
  }

  Address(Register base, RegisterOrConstant index, ScaleFactor scale = times_1, int disp = 0)
    : _base (base),
      _index(index.register_or_noreg()),
      _scale(scale),
      _disp (disp + (index.constant_or_zero() * scale_size(scale))) {
    if (!index.is_register())  scale = Address::no_scale;
    assert(!_index->is_valid() == (scale == Address::no_scale),
           "inconsistent address");
  }

  Address plus_disp(int disp) const {
    Address a = (*this);
    a._disp += disp;
    return a;
  }
  Address plus_disp(RegisterOrConstant disp, ScaleFactor scale = times_1) const {
    Address a = (*this);
    a._disp += disp.constant_or_zero() * scale_size(scale);
    if (disp.is_register()) {
      assert(!a.index()->is_valid(), "competing indexes");
      a._index = disp.as_register();
      a._scale = scale;
    }
    return a;
  }
  bool is_same_address(Address a) const {
    // disregard _rspec
    return _base == a._base && _disp == a._disp && _index == a._index && _scale == a._scale;
  }

  // The following two overloads are used in connection with the
  // ByteSize type (see sizes.hpp).  They simplify the use of
  // ByteSize'd arguments in assembly code. Note that their equivalent
  // for the optimized build are the member functions with int disp
  // argument since ByteSize is mapped to an int type in that case.
  //
  // Note: DO NOT introduce similar overloaded functions for WordSize
  // arguments as in the optimized mode, both ByteSize and WordSize
  // are mapped to the same type and thus the compiler cannot make a
  // distinction anymore (=> compiler errors).

#ifdef ASSERT
  Address(Register base, ByteSize disp)
    : _base(base),
      _index(noreg),
      _scale(no_scale),
      _disp(in_bytes(disp)) {
  }

  Address(Register base, Register index, ScaleFactor scale, ByteSize disp)
    : _base(base),
      _index(index),
      _scale(scale),
      _disp(in_bytes(disp)) {
    assert(!index->is_valid() == (scale == Address::no_scale),
           "inconsistent address");
  }

  Address(Register base, RegisterOrConstant index, ScaleFactor scale, ByteSize disp)
    : _base (base),
      _index(index.register_or_noreg()),
      _scale(scale),
      _disp (in_bytes(disp) + (index.constant_or_zero() * scale_size(scale))) {
    if (!index.is_register())  scale = Address::no_scale;
    assert(!_index->is_valid() == (scale == Address::no_scale),
           "inconsistent address");
  }

#endif // ASSERT

  // accessors
  bool        uses(Register reg) const { return _base == reg || _index == reg; }
  Register    base()             const { return _base;  }
  Register    index()            const { return _index; }
  ScaleFactor scale()            const { return _scale; }
  int         disp()             const { return _disp;  }

  // Convert the raw encoding form into the form expected by the constructor for
  // Address.  An index of 4 (rsp) corresponds to having no index, so convert
  // that to noreg for the Address constructor.
  static Address make_raw(int base, int index, int scale, int disp, relocInfo::relocType disp_reloc);

  static Address make_array(ArrayAddress);

 private:
  bool base_needs_rex() const {
    return _base != noreg && _base->encoding() >= 8;
  }

  bool index_needs_rex() const {
    return _index != noreg && _index->encoding() >= 8;
  }

  relocInfo::relocType reloc() const { return _rspec.type(); }

  friend class Assembler;
  friend class MacroAssembler;
  friend class LIR_Assembler; // base/index/scale/disp
};

//
// AddressLiteral has been split out from Address because operands of this type
// need to be treated specially on 32bit vs. 64bit platforms. By splitting it out
// the few instructions that need to deal with address literals are unique and the
// MacroAssembler does not have to implement every instruction in the Assembler
// in order to search for address literals that may need special handling depending
// on the instruction and the platform. A small step on the way to merging the
// i486/amd64 directories.
//
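// For example (an illustrative sketch, not from the original header): an
// AddressLiteral built as ExternalAddress((address)&some_global) can be
// reached rip-relative when the target is near, or have its ea loaded into
// a register first when it is far (see reachable() later in this file).
//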
class AddressLiteral VALUE_OBJ_CLASS_SPEC {
  friend class ArrayAddress;
  RelocationHolder _rspec;
  // Typically when we use AddressLiterals we want to use their rval.
  // However in some situations we want the lval (effective address) of
  // the item. We provide a special factory for making those lvals.
  bool _is_lval;

  // If the target is far we'll need to load the ea of this to
  // a register to reach it. Otherwise if near we can do rip
  // relative addressing.

  address          _target;

 protected:
  // creation
  AddressLiteral()
    : _is_lval(false),
      _target(NULL)
  {}

 public:

  AddressLiteral(address target, relocInfo::relocType rtype);

  AddressLiteral(address target, RelocationHolder const& rspec)
    : _rspec(rspec),
      _is_lval(false),
      _target(target)
  {}

  AddressLiteral addr() {
    AddressLiteral ret = *this;
    ret._is_lval = true;
    return ret;
  }

 private:

  address target() { return _target; }
  bool is_lval() { return _is_lval; }

  relocInfo::relocType reloc() const { return _rspec.type(); }
  const RelocationHolder& rspec() const { return _rspec; }

  friend class Assembler;
  friend class MacroAssembler;
  friend class Address;
  friend class LIR_Assembler;
};

// Convenience classes
class RuntimeAddress: public AddressLiteral {

 public:

  RuntimeAddress(address target) : AddressLiteral(target, relocInfo::runtime_call_type) {}

};

class ExternalAddress: public AddressLiteral {
 private:
  static relocInfo::relocType reloc_for_target(address target) {
    // Sometimes ExternalAddress is used for values which aren't
    // exactly addresses, like the card table base.
    // external_word_type can't be used for values in the first page
    // so just skip the reloc in that case.
    return external_word_Relocation::can_be_relocated(target) ? relocInfo::external_word_type : relocInfo::none;
  }

 public:

  ExternalAddress(address target) : AddressLiteral(target, reloc_for_target(target)) {}

};

class InternalAddress: public AddressLiteral {

 public:

  InternalAddress(address target) : AddressLiteral(target, relocInfo::internal_word_type) {}

};

// x86 can do array addressing as a single operation since disp can be an
// absolute address; amd64 can't. We create a class that expresses the concept
// but does extra magic on amd64 to get the final result.
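//
// For example (illustrative only; table_base is a hypothetical address):
//   ArrayAddress(ExternalAddress(table_base),
//                Address(noreg, rbx, Address::times_8))
// names the 8-byte table entry table_base[rbx].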

class ArrayAddress VALUE_OBJ_CLASS_SPEC {
 private:

  AddressLiteral _base;
  Address        _index;

 public:

  ArrayAddress() {}
  ArrayAddress(AddressLiteral base, Address index): _base(base), _index(index) {}
  AddressLiteral base() { return _base; }
  Address index() { return _index; }

};

const int FPUStateSizeInWords = NOT_LP64(27) LP64_ONLY( 512*2 / wordSize);

// The Intel x86/AMD64 Assembler: Pure assembler doing NO optimizations on the instruction
// level (e.g. mov rax, 0 is not translated into xor rax, rax!); i.e., what you write
// is what you get. The Assembler generates code into a CodeBuffer.

class Assembler : public AbstractAssembler  {
  friend class AbstractAssembler; // for the non-virtual hack
  friend class LIR_Assembler; // as_Address()
  friend class StubGenerator;

 public:
  enum Condition {                     // The x86 condition codes used for conditional jumps/moves.
    zero          = 0x4,
    notZero       = 0x5,
    equal         = 0x4,
    notEqual      = 0x5,
    less          = 0xc,
    lessEqual     = 0xe,
    greater       = 0xf,
    greaterEqual  = 0xd,
    below         = 0x2,
    belowEqual    = 0x6,
    above         = 0x7,
    aboveEqual    = 0x3,
    overflow      = 0x0,
    noOverflow    = 0x1,
    carrySet      = 0x2,
    carryClear    = 0x3,
    negative      = 0x8,
    positive      = 0x9,
    parity        = 0xa,
    noParity      = 0xb
  };
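
  // An illustrative note (not part of the original header): less/greater are
  // the signed comparisons and below/above the unsigned ones, so after
  // cmpl(rax, rbx), jcc(less, L) branches iff rax < rbx as signed 32-bit
  // values. Note the aliases above: equal == zero and carrySet == below.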

  enum Prefix {
    // segment overrides
    CS_segment = 0x2e,
    SS_segment = 0x36,
    DS_segment = 0x3e,
    ES_segment = 0x26,
    FS_segment = 0x64,
    GS_segment = 0x65,

    REX        = 0x40,

    REX_B      = 0x41,
    REX_X      = 0x42,
    REX_XB     = 0x43,
    REX_R      = 0x44,
    REX_RB     = 0x45,
    REX_RX     = 0x46,
    REX_RXB    = 0x47,

    REX_W      = 0x48,

    REX_WB     = 0x49,
    REX_WX     = 0x4A,
    REX_WXB    = 0x4B,
    REX_WR     = 0x4C,
    REX_WRB    = 0x4D,
    REX_WRX    = 0x4E,
    REX_WRXB   = 0x4F,

    VEX_3bytes = 0xC4,
    VEX_2bytes = 0xC5,
    EVEX_4bytes = 0x62
  };

  enum VexPrefix {
    VEX_B = 0x20,
    VEX_X = 0x40,
    VEX_R = 0x80,
    VEX_W = 0x80
  };

  enum ExexPrefix {
    EVEX_F  = 0x04,
    EVEX_V  = 0x08,
    EVEX_Rb = 0x10,
    EVEX_X  = 0x40,
    EVEX_Z  = 0x80
  };

  enum VexSimdPrefix {
    VEX_SIMD_NONE = 0x0,
    VEX_SIMD_66   = 0x1,
    VEX_SIMD_F3   = 0x2,
    VEX_SIMD_F2   = 0x3
  };

  enum VexOpcode {
    VEX_OPCODE_NONE  = 0x0,
    VEX_OPCODE_0F    = 0x1,
    VEX_OPCODE_0F_38 = 0x2,
    VEX_OPCODE_0F_3A = 0x3
  };

  enum AvxVectorLen {
    AVX_128bit = 0x0,
    AVX_256bit = 0x1,
    AVX_512bit = 0x2,
    AVX_NoVec  = 0x4
  };

  enum EvexTupleType {
    EVEX_FV   = 0,
    EVEX_HV   = 4,
    EVEX_FVM  = 6,
    EVEX_T1S  = 7,
    EVEX_T1F  = 11,
    EVEX_T2   = 13,
    EVEX_T4   = 15,
    EVEX_T8   = 17,
    EVEX_HVM  = 18,
    EVEX_QVM  = 19,
    EVEX_OVM  = 20,
    EVEX_M128 = 21,
    EVEX_DUP  = 22,
    EVEX_ETUP = 23
  };

  enum EvexInputSizeInBits {
    EVEX_8bit  = 0,
    EVEX_16bit = 1,
    EVEX_32bit = 2,
    EVEX_64bit = 3
  };

  enum WhichOperand {
    // input to locate_operand, and format code for relocations
    imm_operand  = 0,            // embedded 32-bit|64-bit immediate operand
    disp32_operand = 1,          // embedded 32-bit displacement or address
    call32_operand = 2,          // embedded 32-bit self-relative displacement
#ifndef _LP64
    _WhichOperand_limit = 3
#else
    narrow_oop_operand = 3,      // embedded 32-bit immediate narrow oop
    _WhichOperand_limit = 4
#endif
  };


  // NOTE: The general philosophy of the declarations here is that 64bit versions
  // of instructions are freely declared without the need for wrapping them in an ifdef.
  // (Some dangerous instructions are ifdef'd out of inappropriate jvms.)
  // In the .cpp file the implementations are wrapped so that they are dropped out
  // of the resulting jvm. This is done mostly to keep the footprint of MINIMAL
  // to the size it was prior to merging up the 32bit and 64bit assemblers.
  //
  // This does mean you'll get a linker/runtime error if you use a 64bit only instruction
  // in a 32bit vm. This is somewhat unfortunate but keeps the ifdef noise down.

private:

  int evex_encoding;
  int input_size_in_bits;
  int avx_vector_len;
  int tuple_type;
  bool is_evex_instruction;

  // 64bit prefixes
  int prefix_and_encode(int reg_enc, bool byteinst = false);
  int prefixq_and_encode(int reg_enc);

  int prefix_and_encode(int dst_enc, int src_enc, bool byteinst = false);
  int prefixq_and_encode(int dst_enc, int src_enc);

  void prefix(Register reg);
  void prefix(Address adr);
  void prefixq(Address adr);

  void prefix(Address adr, Register reg,  bool byteinst = false);
  void prefix(Address adr, XMMRegister reg);
  void prefixq(Address adr, Register reg);
  void prefixq(Address adr, XMMRegister reg);

  void prefetch_prefix(Address src);

  void rex_prefix(Address adr, XMMRegister xreg,
                  VexSimdPrefix pre, VexOpcode opc, bool rex_w);
  int  rex_prefix_and_encode(int dst_enc, int src_enc,
                             VexSimdPrefix pre, VexOpcode opc, bool rex_w);

  void vex_prefix(bool vex_r, bool vex_b, bool vex_x, bool vex_w,
                  int nds_enc, VexSimdPrefix pre, VexOpcode opc,
                  int vector_len);

  void evex_prefix(bool vex_r, bool vex_b, bool vex_x, bool vex_w, bool evex_r, bool evex_v,
                   int nds_enc, VexSimdPrefix pre, VexOpcode opc,
                   bool is_extended_context, bool is_merge_context,
                   int vector_len, bool no_mask_reg);

  void vex_prefix(Address adr, int nds_enc, int xreg_enc,
                  VexSimdPrefix pre, VexOpcode opc,
                  bool vex_w, int vector_len,
                  bool legacy_mode = false, bool no_mask_reg = false);

  void vex_prefix(XMMRegister dst, XMMRegister nds, Address src,
                  VexSimdPrefix pre, int vector_len = AVX_128bit,
                  bool no_mask_reg = false, bool legacy_mode = false) {
    int dst_enc = dst->encoding();
    int nds_enc = nds->is_valid() ? nds->encoding() : 0;
    vex_prefix(src, nds_enc, dst_enc, pre, VEX_OPCODE_0F, false, vector_len, legacy_mode, no_mask_reg);
  }

  void vex_prefix_q(XMMRegister dst, XMMRegister nds, Address src,
                    VexSimdPrefix pre, int vector_len = AVX_128bit,
                    bool no_mask_reg = false) {
    int dst_enc = dst->encoding();
    int nds_enc = nds->is_valid() ? nds->encoding() : 0;
    vex_prefix(src, nds_enc, dst_enc, pre, VEX_OPCODE_0F, true, vector_len, false, no_mask_reg);
  }

  void vex_prefix_0F38(Register dst, Register nds, Address src, bool no_mask_reg = false) {
    bool vex_w = false;
    int vector_len = AVX_128bit;
    vex_prefix(src, nds->encoding(), dst->encoding(),
               VEX_SIMD_NONE, VEX_OPCODE_0F_38, vex_w,
               vector_len, false, no_mask_reg);
  }

  void vex_prefix_0F38_legacy(Register dst, Register nds, Address src, bool no_mask_reg = false) {
    bool vex_w = false;
    int vector_len = AVX_128bit;
    vex_prefix(src, nds->encoding(), dst->encoding(),
               VEX_SIMD_NONE, VEX_OPCODE_0F_38, vex_w,
               vector_len, true, no_mask_reg);
  }

  void vex_prefix_0F38_q(Register dst, Register nds, Address src, bool no_mask_reg = false) {
    bool vex_w = true;
    int vector_len = AVX_128bit;
    vex_prefix(src, nds->encoding(), dst->encoding(),
               VEX_SIMD_NONE, VEX_OPCODE_0F_38, vex_w,
               vector_len, false, no_mask_reg);
  }

  void vex_prefix_0F38_q_legacy(Register dst, Register nds, Address src, bool no_mask_reg = false) {
    bool vex_w = true;
    int vector_len = AVX_128bit;
    vex_prefix(src, nds->encoding(), dst->encoding(),
               VEX_SIMD_NONE, VEX_OPCODE_0F_38, vex_w,
               vector_len, true, no_mask_reg);
  }

  int  vex_prefix_and_encode(int dst_enc, int nds_enc, int src_enc,
                             VexSimdPrefix pre, VexOpcode opc,
                             bool vex_w, int vector_len,
                             bool legacy_mode, bool no_mask_reg);

  int  vex_prefix_0F38_and_encode(Register dst, Register nds, Register src, bool no_mask_reg = false) {
    bool vex_w = false;
    int vector_len = AVX_128bit;
    return vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(),
                                 VEX_SIMD_NONE, VEX_OPCODE_0F_38, vex_w, vector_len,
                                 false, no_mask_reg);
  }

  int  vex_prefix_0F38_and_encode_legacy(Register dst, Register nds, Register src, bool no_mask_reg = false) {
    bool vex_w = false;
    int vector_len = AVX_128bit;
    return vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(),
                                 VEX_SIMD_NONE, VEX_OPCODE_0F_38, vex_w, vector_len,
                                 true, no_mask_reg);
  }

  int  vex_prefix_0F38_and_encode_q(Register dst, Register nds, Register src, bool no_mask_reg = false) {
    bool vex_w = true;
    int vector_len = AVX_128bit;
    return vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(),
                                 VEX_SIMD_NONE, VEX_OPCODE_0F_38, vex_w, vector_len,
                                 false, no_mask_reg);
  }

  int  vex_prefix_0F38_and_encode_q_legacy(Register dst, Register nds, Register src, bool no_mask_reg = false) {
    bool vex_w = true;
    int vector_len = AVX_128bit;
    return vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(),
                                 VEX_SIMD_NONE, VEX_OPCODE_0F_38, vex_w, vector_len,
                                 true, no_mask_reg);
  }

  int  vex_prefix_and_encode(XMMRegister dst, XMMRegister nds, XMMRegister src,
                             VexSimdPrefix pre, int vector_len = AVX_128bit,
                             VexOpcode opc = VEX_OPCODE_0F, bool legacy_mode = false,
                             bool no_mask_reg = false) {
    int src_enc = src->encoding();
    int dst_enc = dst->encoding();
    int nds_enc = nds->is_valid() ? nds->encoding() : 0;
    return vex_prefix_and_encode(dst_enc, nds_enc, src_enc, pre, opc, false, vector_len, legacy_mode, no_mask_reg);
  }

  void simd_prefix(XMMRegister xreg, XMMRegister nds, Address adr,
                   VexSimdPrefix pre, bool no_mask_reg, VexOpcode opc = VEX_OPCODE_0F,
                   bool rex_w = false, int vector_len = AVX_128bit, bool legacy_mode = false);

  void simd_prefix(XMMRegister dst, Address src, VexSimdPrefix pre,
                   bool no_mask_reg, VexOpcode opc = VEX_OPCODE_0F) {
    simd_prefix(dst, xnoreg, src, pre, no_mask_reg, opc);
  }

  void simd_prefix(Address dst, XMMRegister src, VexSimdPrefix pre, bool no_mask_reg) {
    simd_prefix(src, dst, pre, no_mask_reg);
  }
  void simd_prefix_q(XMMRegister dst, XMMRegister nds, Address src,
                     VexSimdPrefix pre, bool no_mask_reg = false) {
    bool rex_w = true;
    simd_prefix(dst, nds, src, pre, no_mask_reg, VEX_OPCODE_0F, rex_w);
  }

  int simd_prefix_and_encode(XMMRegister dst, XMMRegister nds, XMMRegister src,
                             VexSimdPrefix pre, bool no_mask_reg,
                             VexOpcode opc = VEX_OPCODE_0F,
                             bool rex_w = false, int vector_len = AVX_128bit,
                             bool legacy_mode = false);

  int kreg_prefix_and_encode(KRegister dst, KRegister nds, KRegister src,
                             VexSimdPrefix pre, bool no_mask_reg,
                             VexOpcode opc = VEX_OPCODE_0F,
                             bool rex_w = false, int vector_len = AVX_128bit);

  int kreg_prefix_and_encode(KRegister dst, KRegister nds, Register src,
                             VexSimdPrefix pre, bool no_mask_reg,
                             VexOpcode opc = VEX_OPCODE_0F,
                             bool rex_w = false, int vector_len = AVX_128bit);

  // Move/convert 32-bit integer value.
  int simd_prefix_and_encode(XMMRegister dst, XMMRegister nds, Register src,
                             VexSimdPrefix pre, bool no_mask_reg) {
    // It is OK to cast from Register to XMMRegister to pass the argument here
    // since only the encoding is used in simd_prefix_and_encode() and the
    // number of Gen and Xmm registers is the same.
    return simd_prefix_and_encode(dst, nds, as_XMMRegister(src->encoding()), pre, no_mask_reg, VEX_OPCODE_0F);
  }
  int simd_prefix_and_encode(XMMRegister dst, Register src, VexSimdPrefix pre, bool no_mask_reg) {
    return simd_prefix_and_encode(dst, xnoreg, src, pre, no_mask_reg);
  }
  int simd_prefix_and_encode(Register dst, XMMRegister src,
                             VexSimdPrefix pre, VexOpcode opc = VEX_OPCODE_0F,
                             bool no_mask_reg = false) {
    return simd_prefix_and_encode(as_XMMRegister(dst->encoding()), xnoreg, src, pre, no_mask_reg, opc);
  }

  // Move/convert 64-bit integer value.
  int simd_prefix_and_encode_q(XMMRegister dst, XMMRegister nds, Register src,
                               VexSimdPrefix pre, bool no_mask_reg = false) {
    bool rex_w = true;
    return simd_prefix_and_encode(dst, nds, as_XMMRegister(src->encoding()), pre, no_mask_reg, VEX_OPCODE_0F, rex_w);
  }
  int simd_prefix_and_encode_q(XMMRegister dst, Register src, VexSimdPrefix pre, bool no_mask_reg) {
    return simd_prefix_and_encode_q(dst, xnoreg, src, pre, no_mask_reg);
  }
  int simd_prefix_and_encode_q(Register dst, XMMRegister src,
                               VexSimdPrefix pre, VexOpcode opc = VEX_OPCODE_0F,
                               bool no_mask_reg = false) {
    bool rex_w = true;
    return simd_prefix_and_encode(as_XMMRegister(dst->encoding()), xnoreg, src, pre, no_mask_reg, opc, rex_w);
  }

  // Helper functions for groups of instructions
  void emit_arith_b(int op1, int op2, Register dst, int imm8);

  void emit_arith(int op1, int op2, Register dst, int32_t imm32);
  // Force generation of a 4 byte immediate value even if it fits into 8bit
  void emit_arith_imm32(int op1, int op2, Register dst, int32_t imm32);
  void emit_arith(int op1, int op2, Register dst, Register src);

  void emit_simd_arith(int opcode, XMMRegister dst, Address src, VexSimdPrefix pre, bool no_mask_reg = false, bool legacy_mode = false);
  void emit_simd_arith_q(int opcode, XMMRegister dst, Address src, VexSimdPrefix pre, bool no_mask_reg = false);
  void emit_simd_arith(int opcode, XMMRegister dst, XMMRegister src, VexSimdPrefix pre, bool no_mask_reg = false, bool legacy_mode = false);
  void emit_simd_arith_q(int opcode, XMMRegister dst, XMMRegister src, VexSimdPrefix pre, bool no_mask_reg = false);
  void emit_simd_arith_nonds(int opcode, XMMRegister dst, Address src, VexSimdPrefix pre, bool no_mask_reg = false);
  void emit_simd_arith_nonds_q(int opcode, XMMRegister dst, Address src, VexSimdPrefix pre, bool no_mask_reg = false);
  void emit_simd_arith_nonds(int opcode, XMMRegister dst, XMMRegister src, VexSimdPrefix pre, bool no_mask_reg = false, bool legacy_mode = false);
  void emit_simd_arith_nonds_q(int opcode, XMMRegister dst, XMMRegister src, VexSimdPrefix pre, bool no_mask_reg = false);
  void emit_vex_arith(int opcode, XMMRegister dst, XMMRegister nds,
                      Address src, VexSimdPrefix pre, int vector_len,
                      bool no_mask_reg = false, bool legacy_mode = false);
  void emit_vex_arith_q(int opcode, XMMRegister dst, XMMRegister nds,
                        Address src, VexSimdPrefix pre, int vector_len,
                        bool no_mask_reg = false);
  void emit_vex_arith(int opcode, XMMRegister dst, XMMRegister nds,
                      XMMRegister src, VexSimdPrefix pre, int vector_len,
                      bool no_mask_reg = false, bool legacy_mode = false);
  void emit_vex_arith_q(int opcode, XMMRegister dst, XMMRegister nds,
                        XMMRegister src, VexSimdPrefix pre, int vector_len,
                        bool no_mask_reg = false);

  bool emit_compressed_disp_byte(int &disp);

  void emit_operand(Register reg,
                    Register base, Register index, Address::ScaleFactor scale,
                    int disp,
                    RelocationHolder const& rspec,
                    int rip_relative_correction = 0);

  void emit_operand(Register reg, Address adr, int rip_relative_correction = 0);

  // operands that only take the original 32bit registers
  void emit_operand32(Register reg, Address adr);

  void emit_operand(XMMRegister reg,
                    Register base, Register index, Address::ScaleFactor scale,
                    int disp,
                    RelocationHolder const& rspec);

  void emit_operand(XMMRegister reg, Address adr);

  void emit_operand(MMXRegister reg, Address adr);

  // workaround gcc (3.2.1-7) bug
  void emit_operand(Address adr, MMXRegister reg);


  // Immediate-to-memory forms
  void emit_arith_operand(int op1, Register rm, Address adr, int32_t imm32);

  void emit_farith(int b1, int b2, int i);


 protected:
  #ifdef ASSERT
  void check_relocation(RelocationHolder const& rspec, int format);
  #endif

  void emit_data(jint data, relocInfo::relocType    rtype, int format);
  void emit_data(jint data, RelocationHolder const& rspec, int format);
  void emit_data64(jlong data, relocInfo::relocType rtype, int format = 0);
  void emit_data64(jlong data, RelocationHolder const& rspec, int format = 0);

  bool reachable(AddressLiteral adr) NOT_LP64({ return true;});

  // These are all easily abused and hence protected

  // 32BIT ONLY SECTION
#ifndef _LP64
  // Make these disappear in 64bit mode since they would never be correct
  void cmp_literal32(Register src1, int32_t imm32, RelocationHolder const& rspec);   // 32BIT ONLY
  void cmp_literal32(Address src1, int32_t imm32, RelocationHolder const& rspec);    // 32BIT ONLY

  void mov_literal32(Register dst, int32_t imm32, RelocationHolder const& rspec);    // 32BIT ONLY
  void mov_literal32(Address dst, int32_t imm32, RelocationHolder const& rspec);     // 32BIT ONLY

  void push_literal32(int32_t imm32, RelocationHolder const& rspec);                 // 32BIT ONLY
#else
  // 64BIT ONLY SECTION
  void mov_literal64(Register dst, intptr_t imm64, RelocationHolder const& rspec);   // 64BIT ONLY

  void cmp_narrow_oop(Register src1, int32_t imm32, RelocationHolder const& rspec);
  void cmp_narrow_oop(Address src1, int32_t imm32, RelocationHolder const& rspec);

  void mov_narrow_oop(Register dst, int32_t imm32, RelocationHolder const& rspec);
  void mov_narrow_oop(Address dst, int32_t imm32, RelocationHolder const& rspec);
#endif // _LP64

  // These are unique in that we are ensured by the caller that the 32bit
  // relative in these instructions will always be able to reach the potentially
  // 64bit address described by entry. Since they can take a 64bit address they
  // don't have the 32 suffix like the other instructions in this class.

  void call_literal(address entry, RelocationHolder const& rspec);
  void jmp_literal(address entry, RelocationHolder const& rspec);

  // Avoid using the following directly.
  // Instructions in this section are actually usable by anyone without danger
  // of failure but have performance issues that are addressed by enhanced
  // instructions which will do the proper thing based on the particular cpu.
  // We protect them because we don't trust you...

  // Don't use the next inc() and dec() methods directly. INC & DEC instructions
  // could cause a partial flag stall since they don't set the CF flag.
  // Use MacroAssembler::decrement() & MacroAssembler::increment() methods
  // which call inc() & dec() or add() & sub() in accordance with
  // the product flag UseIncDec value.

  void decl(Register dst);
  void decl(Address dst);
  void decq(Register dst);
  void decq(Address dst);

  void incl(Register dst);
  void incl(Address dst);
  void incq(Register dst);
  void incq(Address dst);
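
  // (Illustrative note, not part of the original header: with UseIncDec off,
  // MacroAssembler::increment(rax) emits addl(rax, 1) instead of incl(rax),
  // trading a slightly longer encoding for flag-stall safety.)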

  // New cpus require use of movsd and movss to avoid partial register stall
  // when loading from memory. But for old Opteron use movlpd instead of movsd.
  // The selection is done in MacroAssembler::movdbl() and movflt().

  // Move Scalar Single-Precision Floating-Point Values
  void movss(XMMRegister dst, Address src);
  void movss(XMMRegister dst, XMMRegister src);
  void movss(Address dst, XMMRegister src);

  // Move Scalar Double-Precision Floating-Point Values
  void movsd(XMMRegister dst, Address src);
  void movsd(XMMRegister dst, XMMRegister src);
  void movsd(Address dst, XMMRegister src);
  void movlpd(XMMRegister dst, Address src);

  // New cpus require use of movaps and movapd to avoid partial register stall
  // when moving between registers.
  void movaps(XMMRegister dst, XMMRegister src);
  void movapd(XMMRegister dst, XMMRegister src);

  // End avoid using directly


  // Instruction prefixes
  void prefix(Prefix p);

 public:

  // Creation
  Assembler(CodeBuffer* code) : AbstractAssembler(code) {
    init_attributes();
  }

  // Decoding
  static address locate_operand(address inst, WhichOperand which);
  static address locate_next_instruction(address inst);

  // Utilities
  static bool is_polling_page_far() NOT_LP64({ return false;});
  static bool query_compressed_disp_byte(int disp, bool is_evex_inst, int vector_len,
                                         int cur_tuple_type, int in_size_in_bits, int cur_encoding);

  // Generic instructions
  // Does 32bit or 64bit as needed for the platform. In some sense these
  // belong in macro assembler but there is no need for both varieties to exist.

  void init_attributes(void) {
    evex_encoding = 0;
    input_size_in_bits = 0;
    avx_vector_len = AVX_NoVec;
    tuple_type = EVEX_ETUP;
    is_evex_instruction = false;
  }

  void lea(Register dst, Address src);

  void mov(Register dst, Register src);

  void pusha();
  void popa();

  void pushf();
  void popf();

  void push(int32_t imm32);

  void push(Register src);

  void pop(Register dst);

  // These are dummies to prevent surprise implicit conversions to Register
  void push(void* v);
  void pop(void* v);

  // These do register sized moves/scans
  void rep_mov();
  void rep_stos();
  void rep_stosb();
  void repne_scan();
#ifdef _LP64
  void repne_scanl();
#endif

  // Vanilla instructions in lexical order

  void adcl(Address dst, int32_t imm32);
  void adcl(Address dst, Register src);
  void adcl(Register dst, int32_t imm32);
  void adcl(Register dst, Address src);
  void adcl(Register dst, Register src);

  void adcq(Register dst, int32_t imm32);
  void adcq(Register dst, Address src);
  void adcq(Register dst, Register src);

  void addl(Address dst, int32_t imm32);
  void addl(Address dst, Register src);
  void addl(Register dst, int32_t imm32);
  void addl(Register dst, Address src);
  void addl(Register dst, Register src);

  void addq(Address dst, int32_t imm32);
  void addq(Address dst, Register src);
  void addq(Register dst, int32_t imm32);
  void addq(Register dst, Address src);
  void addq(Register dst, Register src);

#ifdef _LP64
  // Add Unsigned Integers with Carry Flag
  void adcxq(Register dst, Register src);

  // Add Unsigned Integers with Overflow Flag
  void adoxq(Register dst, Register src);
#endif

  void addr_nop_4();
  void addr_nop_5();
  void addr_nop_7();
  void addr_nop_8();

  // Add Scalar Double-Precision Floating-Point Values
  void addsd(XMMRegister dst, Address src);
  void addsd(XMMRegister dst, XMMRegister src);

  // Add Scalar Single-Precision Floating-Point Values
  void addss(XMMRegister dst, Address src);
  void addss(XMMRegister dst, XMMRegister src);

  // AES instructions
  void aesdec(XMMRegister dst, Address src);
  void aesdec(XMMRegister dst, XMMRegister src);
  void aesdeclast(XMMRegister dst, Address src);
  void aesdeclast(XMMRegister dst, XMMRegister src);
  void aesenc(XMMRegister dst, Address src);
  void aesenc(XMMRegister dst, XMMRegister src);
  void aesenclast(XMMRegister dst, Address src);
  void aesenclast(XMMRegister dst, XMMRegister src);


  void andl(Address  dst, int32_t imm32);
  void andl(Register dst, int32_t imm32);
  void andl(Register dst, Address src);
  void andl(Register dst, Register src);

  void andq(Address  dst, int32_t imm32);
  void andq(Register dst, int32_t imm32);
  void andq(Register dst, Address src);
  void andq(Register dst, Register src);

  // BMI instructions
  void andnl(Register dst, Register src1, Register src2);
  void andnl(Register dst, Register src1, Address src2);
  void andnq(Register dst, Register src1, Register src2);
  void andnq(Register dst, Register src1, Address src2);

  void blsil(Register dst, Register src);
  void blsil(Register dst, Address src);
  void blsiq(Register dst, Register src);
  void blsiq(Register dst, Address src);

  void blsmskl(Register dst, Register src);
  void blsmskl(Register dst, Address src);
  void blsmskq(Register dst, Register src);
  void blsmskq(Register dst, Address src);

  void blsrl(Register dst, Register src);
  void blsrl(Register dst, Address src);
  void blsrq(Register dst, Register src);
  void blsrq(Register dst, Address src);

  void bsfl(Register dst, Register src);
  void bsrl(Register dst, Register src);

#ifdef _LP64
  void bsfq(Register dst, Register src);
  void bsrq(Register dst, Register src);
#endif

  void bswapl(Register reg);

  void bswapq(Register reg);

  void call(Label& L, relocInfo::relocType rtype);
  void call(Register reg);  // push pc; pc <- reg
  void call(Address adr);   // push pc; pc <- adr

  void cdql();

  void cdqq();

  void cld();

  void clflush(Address adr);

  void cmovl(Condition cc, Register dst, Register src);
  void cmovl(Condition cc, Register dst, Address src);

  void cmovq(Condition cc, Register dst, Register src);
  void cmovq(Condition cc, Register dst, Address src);


  void cmpb(Address dst, int imm8);

  void cmpl(Address dst, int32_t imm32);

  void cmpl(Register dst, int32_t imm32);
  void cmpl(Register dst, Register src);
  void cmpl(Register dst, Address src);

  void cmpq(Address dst, int32_t imm32);
  void cmpq(Address dst, Register src);

  void cmpq(Register dst, int32_t imm32);
  void cmpq(Register dst, Register src);
  void cmpq(Register dst, Address src);

  // these are dummies used to catch attempting to convert NULL to Register
  void cmpl(Register dst, void* junk); // dummy
  void cmpq(Register dst, void* junk); // dummy

  void cmpw(Address dst, int imm16);

  void cmpxchg8 (Address adr);

  void cmpxchgb(Register reg, Address adr);
  void cmpxchgl(Register reg, Address adr);

  void cmpxchgq(Register reg, Address adr);

  // Ordered Compare Scalar Double-Precision Floating-Point Values and set EFLAGS
  void comisd(XMMRegister dst, Address src);
  void comisd(XMMRegister dst, XMMRegister src);

  // Ordered Compare Scalar Single-Precision Floating-Point Values and set EFLAGS
  void comiss(XMMRegister dst, Address src);
  void comiss(XMMRegister dst, XMMRegister src);

  // Identify processor type and features
  void cpuid();

  // Convert Scalar Double-Precision Floating-Point Value to Scalar Single-Precision Floating-Point Value
  void cvtsd2ss(XMMRegister dst, XMMRegister src);
  void cvtsd2ss(XMMRegister dst, Address src);

  // Convert Doubleword Integer to Scalar Double-Precision Floating-Point Value
  void cvtsi2sdl(XMMRegister dst, Register src);
  void cvtsi2sdl(XMMRegister dst, Address src);
  void cvtsi2sdq(XMMRegister dst, Register src);
  void cvtsi2sdq(XMMRegister dst, Address src);

  // Convert Doubleword Integer to Scalar Single-Precision Floating-Point Value
  void cvtsi2ssl(XMMRegister dst, Register src);
  void cvtsi2ssl(XMMRegister dst, Address src);
  void cvtsi2ssq(XMMRegister dst, Register src);
  void cvtsi2ssq(XMMRegister dst, Address src);

  // Convert Packed Signed Doubleword Integers to Packed Double-Precision Floating-Point Value
  void cvtdq2pd(XMMRegister dst, XMMRegister src);

  // Convert Packed Signed Doubleword Integers to Packed Single-Precision Floating-Point Value
  void cvtdq2ps(XMMRegister dst, XMMRegister src);

  // Convert Scalar Single-Precision Floating-Point Value to Scalar Double-Precision Floating-Point Value
  void cvtss2sd(XMMRegister dst, XMMRegister src);
  void cvtss2sd(XMMRegister dst, Address src);

  // Convert with Truncation Scalar Double-Precision Floating-Point Value to Doubleword Integer
  void cvttsd2sil(Register dst, Address src);
  void cvttsd2sil(Register dst, XMMRegister src);
  void cvttsd2siq(Register dst, XMMRegister src);

  // Convert with Truncation Scalar Single-Precision Floating-Point Value to Doubleword Integer
  void cvttss2sil(Register dst, XMMRegister src);
  void cvttss2siq(Register dst, XMMRegister src);

  // Divide Scalar Double-Precision Floating-Point Values
  void divsd(XMMRegister dst, Address src);
  void divsd(XMMRegister dst, XMMRegister src);

  // Divide Scalar Single-Precision Floating-Point Values
  void divss(XMMRegister dst, Address src);
  void divss(XMMRegister dst, XMMRegister src);

  void emms();

  void fabs();

  void fadd(int i);

  void fadd_d(Address src);
  void fadd_s(Address src);

  // "Alternate" versions of x87 instructions place result down in FPU
  // stack instead of on TOS

  void fadda(int i); // "alternate" fadd
  void faddp(int i = 1);

  void fchs();

  void fcom(int i);

  void fcomp(int i = 1);
  void fcomp_d(Address src);
  void fcomp_s(Address src);

  void fcompp();

  void fcos();

  void fdecstp();

  void fdiv(int i);
  void fdiv_d(Address src);
  void fdivr_s(Address src);
  void fdiva(int i);  // "alternate" fdiv
  void fdivp(int i = 1);

  void fdivr(int i);
  void fdivr_d(Address src);
  void fdiv_s(Address src);

  void fdivra(int i); // "alternate" reversed fdiv

  void fdivrp(int i = 1);

  void ffree(int i = 0);

  void fild_d(Address adr);
  void fild_s(Address adr);

  void fincstp();

  void finit();

  void fist_s (Address adr);
  void fistp_d(Address adr);
  void fistp_s(Address adr);

  void fld1();

  void fld_d(Address adr);
  void fld_s(Address adr);
  void fld_s(int index);
  void fld_x(Address adr);  // extended-precision (80-bit) format

  void fldcw(Address src);

  void fldenv(Address src);

  void fldlg2();

  void fldln2();

  void fldz();

  void flog();
  void flog10();

  void fmul(int i);

  void fmul_d(Address src);
  void fmul_s(Address src);

  void fmula(int i);  // "alternate" fmul

  void fmulp(int i = 1);

  void fnsave(Address dst);

  void fnstcw(Address src);

  void fnstsw_ax();

  void fprem();
  void fprem1();

  void frstor(Address src);

  void fsin();

  void fsqrt();

  void fst_d(Address adr);
  void fst_s(Address adr);

  void fstp_d(Address adr);
  void fstp_d(int index);
  void fstp_s(Address adr);
  void fstp_x(Address adr); // extended-precision (80-bit) format

  void fsub(int i);
  void fsub_d(Address src);
  void fsub_s(Address src);

  void fsuba(int i);  // "alternate" fsub

  void fsubp(int i = 1);

  void fsubr(int i);
  void fsubr_d(Address src);
  void fsubr_s(Address src);

  void fsubra(int i); // "alternate" reversed fsub

  void fsubrp(int i = 1);

  void ftan();

  void ftst();

  void fucomi(int i = 1);
  void fucomip(int i = 1);

  void fwait();

  void fxch(int i = 1);

  void fxrstor(Address src);

  void fxsave(Address dst);

  void fyl2x();
  void frndint();
  void f2xm1();
  void fldl2e();

  void hlt();

  void idivl(Register src);
  void divl(Register src); // Unsigned division

#ifdef _LP64
  void idivq(Register src);
#endif

  void imull(Register dst, Register src);
  void imull(Register dst, Register src, int value);
  void imull(Register dst, Address src);

#ifdef _LP64
  void imulq(Register dst, Register src);
  void imulq(Register dst, Register src, int value);
  void imulq(Register dst, Address src);
#endif

  // jcc is the generic conditional branch generator; it is used for branches
  // to labels. jcc takes a branch opcode (cc) and a label (L) and generates
  // either a backward branch or a forward branch and links it
  // to the label fixup chain. Usage:
  //
  // Label L;      // unbound label
  // jcc(cc, L);   // forward branch to unbound label
  // bind(L);      // bind label to the current pc
  // jcc(cc, L);   // backward branch to bound label
  // bind(L);      // illegal: a label may be bound only once
  //
  // Note: The same Label can be used for forward and backward branches
  // but it may be bound only once.

  void jcc(Condition cc, Label& L, bool maybe_short = true);

  // Conditional jump via an 8-bit relative offset to L.
  // WARNING: be very careful using this for forward jumps.  If the label is
  // not bound within an 8-bit offset of this instruction, a run-time error
  // will occur.
  void jccb(Condition cc, Label& L);

  void jmp(Address entry);    // pc <- entry

  // Label operations & relative jumps (PPUM Appendix D)
  void jmp(Label& L, bool maybe_short = true);   // unconditional jump to L

  void jmp(Register entry); // pc <- entry

  // Unconditional 8-bit offset jump to L.
  // WARNING: be very careful using this for forward jumps.  If the label is
  // not bound within an 8-bit offset of this instruction, a run-time error
  // will occur.
  void jmpb(Label& L);

  void ldmxcsr(Address src);

  void leal(Register dst, Address src);

  void leaq(Register dst, Address src);

  void lfence();

  void lock();

  void lzcntl(Register dst, Register src);

#ifdef _LP64
  void lzcntq(Register dst, Register src);
#endif

  enum Membar_mask_bits {
    StoreStore = 1 << 3,
    LoadStore  = 1 << 2,
    StoreLoad  = 1 << 1,
    LoadLoad   = 1 << 0
  };

  // Serializes memory and blows flags
  void membar(Membar_mask_bits order_constraint) {
    if (os::is_MP()) {
      // We only have to handle StoreLoad
      if (order_constraint & StoreLoad) {
        // All usable chips support "locked" instructions which suffice
        // as barriers, and are much faster than the alternative of
        // using cpuid instruction. We use here a locked add [esp-C],0.
        // This is conveniently otherwise a no-op except for blowing
        // flags, and introducing a false dependency on target memory
        // location. We can't do anything with flags, but we can avoid
        // memory dependencies in the current method by locked-adding
        // somewhere else on the stack. Doing [esp+C] will collide with
        // something on stack in current method, hence we go for [esp-C].
        // It is convenient since it is almost always in data cache, for
        // any small C.  We need to step back from SP to avoid data
        // dependencies with other things below SP (callee-saves, for
        // example). Without a clear way to figure out the minimal safe
        // distance from SP, it makes sense to step back the complete
        // cache line, as this will also avoid possible second-order effects
        // with locked ops against the cache line. Our choice of offset
        // is bounded by x86 operand encoding, which should stay within
        // [-128; +127] to have the 8-bit displacement encoding.
        //
        // Any change to this code may need to revisit other places in
        // the code where this idiom is used, in particular the
        // orderAccess code.

        int offset = -VM_Version::L1_line_size();
        if (offset < -128) {
          offset = -128;
        }

        lock();
        addl(Address(rsp, offset), 0); // Assert the lock# signal here
      }
    }
  }
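
  // A usage sketch (illustrative, not part of the original header): after a
  // store, membar(Membar_mask_bits(StoreLoad)) emits lock addl [rsp-C], 0,
  // fencing the store against subsequent loads; the other mask bits are
  // no-ops on x86's strong memory model, which is why only StoreLoad is
  // handled above.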
1471
1472  void mfence();
1473
1474  // Moves
1475
1476  void mov64(Register dst, int64_t imm64);
1477
1478  void movb(Address dst, Register src);
1479  void movb(Address dst, int imm8);
1480  void movb(Register dst, Address src);
1481
1482  void kmovq(KRegister dst, KRegister src);
1483  void kmovql(KRegister dst, Register src);
1484  void kmovdl(KRegister dst, Register src);
1485  void kmovq(Address dst, KRegister src);
1486  void kmovq(KRegister dst, Address src);
1487
1488  void movdl(XMMRegister dst, Register src);
1489  void movdl(Register dst, XMMRegister src);
1490  void movdl(XMMRegister dst, Address src);
1491  void movdl(Address dst, XMMRegister src);
1492
1493  // Move Double Quadword
1494  void movdq(XMMRegister dst, Register src);
1495  void movdq(Register dst, XMMRegister src);
1496
1497  // Move Aligned Double Quadword
1498  void movdqa(XMMRegister dst, XMMRegister src);
1499  void movdqa(XMMRegister dst, Address src);
1500
1501  // Move Unaligned Double Quadword
1502  void movdqu(Address     dst, XMMRegister src);
1503  void movdqu(XMMRegister dst, Address src);
1504  void movdqu(XMMRegister dst, XMMRegister src);
1505
1506  // Move Unaligned 256bit Vector
1507  void vmovdqu(Address dst, XMMRegister src);
1508  void vmovdqu(XMMRegister dst, Address src);
1509  void vmovdqu(XMMRegister dst, XMMRegister src);
1510
1511   // Move Unaligned 512bit Vector
1512  void evmovdqu(Address dst, XMMRegister src, int vector_len);
1513  void evmovdqu(XMMRegister dst, Address src, int vector_len);
1514  void evmovdqu(XMMRegister dst, XMMRegister src, int vector_len);
1515
1516  // Move lower 64bit to high 64bit in 128bit register
1517  void movlhps(XMMRegister dst, XMMRegister src);
1518
1519  void movl(Register dst, int32_t imm32);
1520  void movl(Address dst, int32_t imm32);
1521  void movl(Register dst, Register src);
1522  void movl(Register dst, Address src);
1523  void movl(Address dst, Register src);
1524
1525  // These dummies prevent using movl from converting a zero (like NULL) into Register
1526  // by giving the compiler two choices it can't resolve
1527
1528  void movl(Address  dst, void* junk);
1529  void movl(Register dst, void* junk);
1530
1531#ifdef _LP64
1532  void movq(Register dst, Register src);
1533  void movq(Register dst, Address src);
1534  void movq(Address  dst, Register src);
1535#endif
1536
1537  void movq(Address     dst, MMXRegister src);
1538  void movq(MMXRegister dst, Address src);
1539
1540#ifdef _LP64
1541  // These dummies prevent movq from silently converting a zero (like NULL) into a
1542  // Register by giving the compiler two overloads it cannot resolve
1543
1544  void movq(Address  dst, void* dummy);
1545  void movq(Register dst, void* dummy);
1546#endif
1547
1548  // Move Quadword
1549  void movq(Address     dst, XMMRegister src);
1550  void movq(XMMRegister dst, Address src);
1551
1552  void movsbl(Register dst, Address src);
1553  void movsbl(Register dst, Register src);
1554
1555#ifdef _LP64
1556  void movsbq(Register dst, Address src);
1557  void movsbq(Register dst, Register src);
1558
1559  // Move signed 32bit immediate to 64bit, sign-extending
1560  void movslq(Address  dst, int32_t imm32);
1561  void movslq(Register dst, int32_t imm32);
1562
1563  void movslq(Register dst, Address src);
1564  void movslq(Register dst, Register src);
1565  void movslq(Register dst, void* src); // Dummy declaration to cause NULL to be ambiguous
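
  // Sketch with hypothetical operands: movslq(rax, -1) sign-extends the
  // 32-bit immediate, leaving 0xFFFFFFFFFFFFFFFF in rax, whereas
  // movl(rax, -1) zeroes the upper half, leaving 0x00000000FFFFFFFF.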
1566#endif
1567
1568  void movswl(Register dst, Address src);
1569  void movswl(Register dst, Register src);
1570
1571#ifdef _LP64
1572  void movswq(Register dst, Address src);
1573  void movswq(Register dst, Register src);
1574#endif
1575
1576  void movw(Address dst, int imm16);
1577  void movw(Register dst, Address src);
1578  void movw(Address dst, Register src);
1579
1580  void movzbl(Register dst, Address src);
1581  void movzbl(Register dst, Register src);
1582
1583#ifdef _LP64
1584  void movzbq(Register dst, Address src);
1585  void movzbq(Register dst, Register src);
1586#endif
1587
1588  void movzwl(Register dst, Address src);
1589  void movzwl(Register dst, Register src);
1590
1591#ifdef _LP64
1592  void movzwq(Register dst, Address src);
1593  void movzwq(Register dst, Register src);
1594#endif
1595
1596  // Unsigned multiply; the double-width result is left in RDX:RAX (EDX:EAX for mull)
1597  void mull(Address src);
1598  void mull(Register src);
1599
1600#ifdef _LP64
1601  void mulq(Address src);
1602  void mulq(Register src);
1603  void mulxq(Register dst1, Register dst2, Register src);
1604#endif
1605
1606  // Multiply Scalar Double-Precision Floating-Point Values
1607  void mulsd(XMMRegister dst, Address src);
1608  void mulsd(XMMRegister dst, XMMRegister src);
1609
1610  // Multiply Scalar Single-Precision Floating-Point Values
1611  void mulss(XMMRegister dst, Address src);
1612  void mulss(XMMRegister dst, XMMRegister src);
1613
1614  void negl(Register dst);
1615
1616#ifdef _LP64
1617  void negq(Register dst);
1618#endif
1619
1620  void nop(int i = 1);
1621
1622  void notl(Register dst);
1623
1624#ifdef _LP64
1625  void notq(Register dst);
1626#endif
1627
1628  void orl(Address dst, int32_t imm32);
1629  void orl(Register dst, int32_t imm32);
1630  void orl(Register dst, Address src);
1631  void orl(Register dst, Register src);
1632  void orl(Address dst, Register src);
1633
1634  void orq(Address dst, int32_t imm32);
1635  void orq(Register dst, int32_t imm32);
1636  void orq(Register dst, Address src);
1637  void orq(Register dst, Register src);
1638
1639  // Pack with unsigned saturation
1640  void packuswb(XMMRegister dst, XMMRegister src);
1641  void packuswb(XMMRegister dst, Address src);
1642  void vpackuswb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
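
  // Sketch with hypothetical lane values: packuswb narrows signed 16-bit
  // lanes to unsigned 8-bit lanes, saturating out-of-range values, e.g.
  // 0x0123 (291) -> 0xFF and 0xFF80 (-128) -> 0x00.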
1643
1644  // Permutation of 64bit words
1645  void vpermq(XMMRegister dst, XMMRegister src, int imm8, int vector_len);
1646
1647  void pause();
1648
1649  // SSE4.2 string instructions
1650  void pcmpestri(XMMRegister xmm1, XMMRegister xmm2, int imm8);
1651  void pcmpestri(XMMRegister xmm1, Address src, int imm8);
1652
1653  // SSE4.1 extract
1654  void pextrd(Register dst, XMMRegister src, int imm8);
1655  void pextrq(Register dst, XMMRegister src, int imm8);
1656
1657  // SSE4.1 insert
1658  void pinsrd(XMMRegister dst, Register src, int imm8);
1659  void pinsrq(XMMRegister dst, Register src, int imm8);
1660
1661  // SSE4.1 packed move
1662  void pmovzxbw(XMMRegister dst, XMMRegister src);
1663  void pmovzxbw(XMMRegister dst, Address src);
1664
1665#ifndef _LP64 // no 32bit push/pop on amd64
1666  void popl(Address dst);
1667#endif
1668
1669#ifdef _LP64
1670  void popq(Address dst);
1671#endif
1672
1673  void popcntl(Register dst, Address src);
1674  void popcntl(Register dst, Register src);
1675
1676#ifdef _LP64
1677  void popcntq(Register dst, Address src);
1678  void popcntq(Register dst, Register src);
1679#endif
1680
1681  // Prefetches (SSE, SSE2, 3DNOW only)
1682
1683  void prefetchnta(Address src);
1684  void prefetchr(Address src);
1685  void prefetcht0(Address src);
1686  void prefetcht1(Address src);
1687  void prefetcht2(Address src);
1688  void prefetchw(Address src);
1689
1690  // Shuffle Bytes
1691  void pshufb(XMMRegister dst, XMMRegister src);
1692  void pshufb(XMMRegister dst, Address src);
1693
1694  // Shuffle Packed Doublewords
1695  void pshufd(XMMRegister dst, XMMRegister src, int mode);
1696  void pshufd(XMMRegister dst, Address src,     int mode);
1697
1698  // Shuffle Packed Low Words
1699  void pshuflw(XMMRegister dst, XMMRegister src, int mode);
1700  void pshuflw(XMMRegister dst, Address src,     int mode);
1701
1702  // Shift Right by bytes Logical DoubleQuadword Immediate
1703  void psrldq(XMMRegister dst, int shift);
1704  // Shift Left by bytes Logical DoubleQuadword Immediate
1705  void pslldq(XMMRegister dst, int shift);
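
  // Sketch with hypothetical operands: the shift count is in bytes, not
  // bits, so psrldq(xmm0, 8) moves the high quadword of xmm0 into the
  // low quadword and zero-fills the rest.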
1706
1707  // Logical Compare 128bit
1708  void ptest(XMMRegister dst, XMMRegister src);
1709  void ptest(XMMRegister dst, Address src);
1710  // Logical Compare 256bit
1711  void vptest(XMMRegister dst, XMMRegister src);
1712  void vptest(XMMRegister dst, Address src);
1713
1714  // Interleave Low Bytes
1715  void punpcklbw(XMMRegister dst, XMMRegister src);
1716  void punpcklbw(XMMRegister dst, Address src);
1717
1718  // Interleave Low Doublewords
1719  void punpckldq(XMMRegister dst, XMMRegister src);
1720  void punpckldq(XMMRegister dst, Address src);
1721
1722  // Interleave Low Quadwords
1723  void punpcklqdq(XMMRegister dst, XMMRegister src);
1724
1725#ifndef _LP64 // no 32bit push/pop on amd64
1726  void pushl(Address src);
1727#endif
1728
1729  void pushq(Address src);
1730
1731  void rcll(Register dst, int imm8);
1732
1733  void rclq(Register dst, int imm8);
1734
1735  void rcrq(Register dst, int imm8);
1736
1737  void rdtsc();
1738
1739  void ret(int imm16);
1740
1741#ifdef _LP64
1742  void rorq(Register dst, int imm8);
1743  void rorxq(Register dst, Register src, int imm8);
1744#endif
1745
1746  void sahf();
1747
1748  void sarl(Register dst, int imm8);
1749  void sarl(Register dst);
1750
1751  void sarq(Register dst, int imm8);
1752  void sarq(Register dst);
1753
1754  void sbbl(Address dst, int32_t imm32);
1755  void sbbl(Register dst, int32_t imm32);
1756  void sbbl(Register dst, Address src);
1757  void sbbl(Register dst, Register src);
1758
1759  void sbbq(Address dst, int32_t imm32);
1760  void sbbq(Register dst, int32_t imm32);
1761  void sbbq(Register dst, Address src);
1762  void sbbq(Register dst, Register src);
1763
1764  void setb(Condition cc, Register dst);
1765
1766  void shldl(Register dst, Register src);
1767
1768  void shll(Register dst, int imm8);
1769  void shll(Register dst);
1770
1771  void shlq(Register dst, int imm8);
1772  void shlq(Register dst);
1773
1774  void shrdl(Register dst, Register src);
1775
1776  void shrl(Register dst, int imm8);
1777  void shrl(Register dst);
1778
1779  void shrq(Register dst, int imm8);
1780  void shrq(Register dst);
1781
1782  void smovl(); // QQQ generic?
1783
1784  // Compute Square Root of Scalar Double-Precision Floating-Point Value
1785  void sqrtsd(XMMRegister dst, Address src);
1786  void sqrtsd(XMMRegister dst, XMMRegister src);
1787
1788  // Compute Square Root of Scalar Single-Precision Floating-Point Value
1789  void sqrtss(XMMRegister dst, Address src);
1790  void sqrtss(XMMRegister dst, XMMRegister src);
1791
1792  void std();
1793
1794  void stmxcsr(Address dst);
1795
1796  void subl(Address dst, int32_t imm32);
1797  void subl(Address dst, Register src);
1798  void subl(Register dst, int32_t imm32);
1799  void subl(Register dst, Address src);
1800  void subl(Register dst, Register src);
1801
1802  void subq(Address dst, int32_t imm32);
1803  void subq(Address dst, Register src);
1804  void subq(Register dst, int32_t imm32);
1805  void subq(Register dst, Address src);
1806  void subq(Register dst, Register src);
1807
1808  // Force generation of a 4-byte immediate value even if it fits into 8 bits
1809  void subl_imm32(Register dst, int32_t imm32);
1810  void subq_imm32(Register dst, int32_t imm32);
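
  // Sketch with hypothetical operands: subl_imm32(rbx, 16) encodes the
  // immediate as a full 32 bits (the 81 /5 form) rather than the
  // sign-extended 8-bit form (83 /5), presumably so the instruction size
  // stays fixed at sites that are patched or measured later.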
1811
1812  // Subtract Scalar Double-Precision Floating-Point Values
1813  void subsd(XMMRegister dst, Address src);
1814  void subsd(XMMRegister dst, XMMRegister src);
1815
1816  // Subtract Scalar Single-Precision Floating-Point Values
1817  void subss(XMMRegister dst, Address src);
1818  void subss(XMMRegister dst, XMMRegister src);
1819
1820  void testb(Register dst, int imm8);
1821
1822  void testl(Register dst, int32_t imm32);
1823  void testl(Register dst, Register src);
1824  void testl(Register dst, Address src);
1825
1826  void testq(Register dst, int32_t imm32);
1827  void testq(Register dst, Register src);
1828
1829  // BMI1 - count trailing zeros
1830  void tzcntl(Register dst, Register src);
1831  void tzcntq(Register dst, Register src);
1832
1833  // Unordered Compare Scalar Double-Precision Floating-Point Values and set EFLAGS
1834  void ucomisd(XMMRegister dst, Address src);
1835  void ucomisd(XMMRegister dst, XMMRegister src);
1836
1837  // Unordered Compare Scalar Single-Precision Floating-Point Values and set EFLAGS
1838  void ucomiss(XMMRegister dst, Address src);
1839  void ucomiss(XMMRegister dst, XMMRegister src);
1840
1841  void xabort(int8_t imm8);
1842
1843  void xaddl(Address dst, Register src);
1844
1845  void xaddq(Address dst, Register src);
1846
1847  void xbegin(Label& abort, relocInfo::relocType rtype = relocInfo::none);
1848
1849  void xchgl(Register reg, Address adr);
1850  void xchgl(Register dst, Register src);
1851
1852  void xchgq(Register reg, Address adr);
1853  void xchgq(Register dst, Register src);
1854
1855  void xend();
1856
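  // A minimal RTM sketch with hypothetical labels (real uses also need a
  // retry policy around the abort path):
  //
  //   Label abort, done;
  //   xbegin(abort);   // falls through once the transaction has started
  //   ...              // transactional work
  //   xend();          // commit
  //   jmp(done);
  //   bind(abort);     // aborts land here, with the status code in EAX
  //   ...              // fallback, e.g. acquire the lock the hard way
  //   bind(done);
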
1857  // Get Value of Extended Control Register
1858  void xgetbv();
1859
1860  void xorl(Register dst, int32_t imm32);
1861  void xorl(Register dst, Address src);
1862  void xorl(Register dst, Register src);
1863
1864  void xorq(Register dst, Address src);
1865  void xorq(Register dst, Register src);
1866
1867  void set_byte_if_not_zero(Register dst); // sets reg to 1 if not zero, otherwise 0
1868
1869  // AVX 3-operands scalar instructions (encoded with VEX prefix)
1870
1871  void vaddsd(XMMRegister dst, XMMRegister nds, Address src);
1872  void vaddsd(XMMRegister dst, XMMRegister nds, XMMRegister src);
1873  void vaddss(XMMRegister dst, XMMRegister nds, Address src);
1874  void vaddss(XMMRegister dst, XMMRegister nds, XMMRegister src);
1875  void vdivsd(XMMRegister dst, XMMRegister nds, Address src);
1876  void vdivsd(XMMRegister dst, XMMRegister nds, XMMRegister src);
1877  void vdivss(XMMRegister dst, XMMRegister nds, Address src);
1878  void vdivss(XMMRegister dst, XMMRegister nds, XMMRegister src);
1879  void vmulsd(XMMRegister dst, XMMRegister nds, Address src);
1880  void vmulsd(XMMRegister dst, XMMRegister nds, XMMRegister src);
1881  void vmulss(XMMRegister dst, XMMRegister nds, Address src);
1882  void vmulss(XMMRegister dst, XMMRegister nds, XMMRegister src);
1883  void vsubsd(XMMRegister dst, XMMRegister nds, Address src);
1884  void vsubsd(XMMRegister dst, XMMRegister nds, XMMRegister src);
1885  void vsubss(XMMRegister dst, XMMRegister nds, Address src);
1886  void vsubss(XMMRegister dst, XMMRegister nds, XMMRegister src);
1887
1888
1889  //====================VECTOR ARITHMETIC=====================================
1890
1891  // Add Packed Floating-Point Values
1892  void addpd(XMMRegister dst, XMMRegister src);
1893  void addps(XMMRegister dst, XMMRegister src);
1894  void vaddpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1895  void vaddps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1896  void vaddpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
1897  void vaddps(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
1898
1899  // Subtract Packed Floating-Point Values
1900  void subpd(XMMRegister dst, XMMRegister src);
1901  void subps(XMMRegister dst, XMMRegister src);
1902  void vsubpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1903  void vsubps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1904  void vsubpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
1905  void vsubps(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
1906
1907  // Multiply Packed Floating-Point Values
1908  void mulpd(XMMRegister dst, XMMRegister src);
1909  void mulps(XMMRegister dst, XMMRegister src);
1910  void vmulpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1911  void vmulps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1912  void vmulpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
1913  void vmulps(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
1914
1915  // Divide Packed Floating-Point Values
1916  void divpd(XMMRegister dst, XMMRegister src);
1917  void divps(XMMRegister dst, XMMRegister src);
1918  void vdivpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1919  void vdivps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1920  void vdivpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
1921  void vdivps(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
1922
1923  // Sqrt Packed Floating-Point Values - Double precision only
1924  void vsqrtpd(XMMRegister dst, XMMRegister src, int vector_len);
1925  void vsqrtpd(XMMRegister dst, Address src, int vector_len);
1926
1927  // Bitwise Logical AND of Packed Floating-Point Values
1928  void andpd(XMMRegister dst, XMMRegister src);
1929  void andps(XMMRegister dst, XMMRegister src);
1930  void vandpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1931  void vandps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1932  void vandpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
1933  void vandps(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
1934
1935  // Bitwise Logical XOR of Packed Floating-Point Values
1936  void xorpd(XMMRegister dst, XMMRegister src);
1937  void xorps(XMMRegister dst, XMMRegister src);
1938  void vxorpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1939  void vxorps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1940  void vxorpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
1941  void vxorps(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
1942
1943  // Add horizontal packed integers
1944  void vphaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1945  void vphaddd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1946  void phaddw(XMMRegister dst, XMMRegister src);
1947  void phaddd(XMMRegister dst, XMMRegister src);
1948
1949  // Add packed integers
1950  void paddb(XMMRegister dst, XMMRegister src);
1951  void paddw(XMMRegister dst, XMMRegister src);
1952  void paddd(XMMRegister dst, XMMRegister src);
1953  void paddq(XMMRegister dst, XMMRegister src);
1954  void vpaddb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1955  void vpaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1956  void vpaddd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1957  void vpaddq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1958  void vpaddb(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
1959  void vpaddw(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
1960  void vpaddd(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
1961  void vpaddq(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
1962
1963  // Sub packed integers
1964  void psubb(XMMRegister dst, XMMRegister src);
1965  void psubw(XMMRegister dst, XMMRegister src);
1966  void psubd(XMMRegister dst, XMMRegister src);
1967  void psubq(XMMRegister dst, XMMRegister src);
1968  void vpsubb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1969  void vpsubw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1970  void vpsubd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1971  void vpsubq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1972  void vpsubb(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
1973  void vpsubw(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
1974  void vpsubd(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
1975  void vpsubq(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
1976
1977  // Multiply packed integers (shorts and ints; longs only via the AVX-512 vpmullq below)
1978  void pmullw(XMMRegister dst, XMMRegister src);
1979  void pmulld(XMMRegister dst, XMMRegister src);
1980  void vpmullw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1981  void vpmulld(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1982  void vpmullq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1983  void vpmullw(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
1984  void vpmulld(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
1985  void vpmullq(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
1986
1987  // Shift left packed integers
1988  void psllw(XMMRegister dst, int shift);
1989  void pslld(XMMRegister dst, int shift);
1990  void psllq(XMMRegister dst, int shift);
1991  void psllw(XMMRegister dst, XMMRegister shift);
1992  void pslld(XMMRegister dst, XMMRegister shift);
1993  void psllq(XMMRegister dst, XMMRegister shift);
1994  void vpsllw(XMMRegister dst, XMMRegister src, int shift, int vector_len);
1995  void vpslld(XMMRegister dst, XMMRegister src, int shift, int vector_len);
1996  void vpsllq(XMMRegister dst, XMMRegister src, int shift, int vector_len);
1997  void vpsllw(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);
1998  void vpslld(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);
1999  void vpsllq(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);
2000
2001  // Logical shift right packed integers
2002  void psrlw(XMMRegister dst, int shift);
2003  void psrld(XMMRegister dst, int shift);
2004  void psrlq(XMMRegister dst, int shift);
2005  void psrlw(XMMRegister dst, XMMRegister shift);
2006  void psrld(XMMRegister dst, XMMRegister shift);
2007  void psrlq(XMMRegister dst, XMMRegister shift);
2008  void vpsrlw(XMMRegister dst, XMMRegister src, int shift, int vector_len);
2009  void vpsrld(XMMRegister dst, XMMRegister src, int shift, int vector_len);
2010  void vpsrlq(XMMRegister dst, XMMRegister src, int shift, int vector_len);
2011  void vpsrlw(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);
2012  void vpsrld(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);
2013  void vpsrlq(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);
2014
2015  // Arithmetic shift right packed integers (only shorts and ints, no instructions for longs)
2016  void psraw(XMMRegister dst, int shift);
2017  void psrad(XMMRegister dst, int shift);
2018  void psraw(XMMRegister dst, XMMRegister shift);
2019  void psrad(XMMRegister dst, XMMRegister shift);
2020  void vpsraw(XMMRegister dst, XMMRegister src, int shift, int vector_len);
2021  void vpsrad(XMMRegister dst, XMMRegister src, int shift, int vector_len);
2022  void vpsraw(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);
2023  void vpsrad(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);
2024
2025  // And packed integers
2026  void pand(XMMRegister dst, XMMRegister src);
2027  void vpand(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
2028  void vpand(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
2029
2030  // Or packed integers
2031  void por(XMMRegister dst, XMMRegister src);
2032  void vpor(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
2033  void vpor(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
2034
2035  // Xor packed integers
2036  void pxor(XMMRegister dst, XMMRegister src);
2037  void vpxor(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
2038  void vpxor(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
2039
2040  // Copy low 128bit into high 128bit of YMM registers.
2041  void vinsertf128h(XMMRegister dst, XMMRegister nds, XMMRegister src);
2042  void vinserti128h(XMMRegister dst, XMMRegister nds, XMMRegister src);
2043  void vextractf128h(XMMRegister dst, XMMRegister src);
2044  void vextracti128h(XMMRegister dst, XMMRegister src);
2045
2046  // Load/store the high 128bit of YMM registers without destroying the other half.
2047  void vinsertf128h(XMMRegister dst, Address src);
2048  void vinserti128h(XMMRegister dst, Address src);
2049  void vextractf128h(Address dst, XMMRegister src);
2050  void vextracti128h(Address dst, XMMRegister src);
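
  // Sketch with hypothetical operands: vextractf128h(Address(rsp, 0), xmm0)
  // spills only bits 255:128 of the ymm register aliased to xmm0, and
  // vinsertf128h(xmm0, Address(rsp, 0)) reloads them while leaving bits
  // 127:0 untouched.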
2051
2052  // Copy low 256bit into high 256bit of ZMM registers.
2053  void vinserti64x4h(XMMRegister dst, XMMRegister nds, XMMRegister src);
2054  void vinsertf64x4h(XMMRegister dst, XMMRegister nds, XMMRegister src);
2055  void vextracti64x4h(XMMRegister dst, XMMRegister src);
2056  void vextractf64x4h(XMMRegister dst, XMMRegister src);
2057  void vextractf64x4h(Address dst, XMMRegister src);
2058  void vinsertf64x4h(XMMRegister dst, Address src);
2059
2060  // Copy targeted 128bit segments of the ZMM registers
2061  void vextracti64x2h(XMMRegister dst, XMMRegister src, int value);
2062  void vextractf64x2h(XMMRegister dst, XMMRegister src, int value);
2063  void vextractf32x4h(XMMRegister dst, XMMRegister src, int value);
2064
2065  // duplicate 4-byte integer data from src into 8 locations in dest
2066  void vpbroadcastd(XMMRegister dst, XMMRegister src);
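
  // Sketch with hypothetical operands: if the low dword of xmm1 holds 7,
  // vpbroadcastd(xmm0, xmm1) replicates 7 into all eight dword lanes of
  // the 256-bit destination.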
2067
2068  // duplicate n-byte integer data from src into vector_len locations in dest
2069  void evpbroadcastb(XMMRegister dst, XMMRegister src, int vector_len);
2070  void evpbroadcastb(XMMRegister dst, Address src, int vector_len);
2071  void evpbroadcastw(XMMRegister dst, XMMRegister src, int vector_len);
2072  void evpbroadcastw(XMMRegister dst, Address src, int vector_len);
2073  void evpbroadcastd(XMMRegister dst, XMMRegister src, int vector_len);
2074  void evpbroadcastd(XMMRegister dst, Address src, int vector_len);
2075  void evpbroadcastq(XMMRegister dst, XMMRegister src, int vector_len);
2076  void evpbroadcastq(XMMRegister dst, Address src, int vector_len);
2077
2078  void evpbroadcastss(XMMRegister dst, XMMRegister src, int vector_len);
2079  void evpbroadcastss(XMMRegister dst, Address src, int vector_len);
2080  void evpbroadcastsd(XMMRegister dst, XMMRegister src, int vector_len);
2081  void evpbroadcastsd(XMMRegister dst, Address src, int vector_len);
2082
2083  void evpbroadcastb(XMMRegister dst, Register src, int vector_len);
2084  void evpbroadcastw(XMMRegister dst, Register src, int vector_len);
2085  void evpbroadcastd(XMMRegister dst, Register src, int vector_len);
2086  void evpbroadcastq(XMMRegister dst, Register src, int vector_len);
2087
2088  // Carry-Less Multiplication Quadword
2089  void pclmulqdq(XMMRegister dst, XMMRegister src, int mask);
2090  void vpclmulqdq(XMMRegister dst, XMMRegister nds, XMMRegister src, int mask);
2091
2092  // AVX instruction which is used to clear the upper 128 bits of YMM registers and
2093  // to avoid the transition penalty between AVX and SSE states. There is no
2094  // penalty if legacy SSE instructions are encoded using the VEX prefix because
2095  // they always clear the upper 128 bits. It should be used before calling
2096  // runtime code and native libraries.
2097  void vzeroupper();
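
  // A minimal usage sketch (hypothetical call site): after any 256-bit
  // AVX work and before transferring control to code that may use legacy
  // SSE encodings, emit
  //
  //   vzeroupper();   // then make the runtime or native call
  //
  // so the callee's SSE instructions do not pay the transition penalty.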
2098
2099 protected:
2100  // The following instructions require 16-byte address alignment in SSE mode.
2101  // They should be called only from the corresponding MacroAssembler instructions.
2102  void andpd(XMMRegister dst, Address src);
2103  void andps(XMMRegister dst, Address src);
2104  void xorpd(XMMRegister dst, Address src);
2105  void xorps(XMMRegister dst, Address src);
2106
2107};
2108
2109#endif // CPU_X86_VM_ASSEMBLER_X86_HPP
2110