assembler_x86.hpp revision 3888:f0c2369fda5a
1/*
2 * Copyright (c) 1997, 2012, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation.
8 *
9 * This code is distributed in the hope that it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
12 * version 2 for more details (a copy is included in the LICENSE file that
13 * accompanied this code).
14 *
15 * You should have received a copy of the GNU General Public License version
16 * 2 along with this work; if not, write to the Free Software Foundation,
17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
18 *
19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
20 * or visit www.oracle.com if you need additional information or have any
21 * questions.
22 *
23 */
24
25#ifndef CPU_X86_VM_ASSEMBLER_X86_HPP
26#define CPU_X86_VM_ASSEMBLER_X86_HPP
27
28#include "asm/register.hpp"
29
30class BiasedLockingCounters;
31
32// Contains all the definitions needed for x86 assembly code generation.
33
34// Calling convention
35class Argument VALUE_OBJ_CLASS_SPEC {
36 public:
37  enum {
38#ifdef _LP64
39#ifdef _WIN64
40    n_int_register_parameters_c   = 4, // rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...)
41    n_float_register_parameters_c = 4,  // xmm0 - xmm3 (c_farg0, c_farg1, ... )
42#else
43    n_int_register_parameters_c   = 6, // rdi, rsi, rdx, rcx, r8, r9 (c_rarg0, c_rarg1, ...)
44    n_float_register_parameters_c = 8,  // xmm0 - xmm7 (c_farg0, c_farg1, ... )
45#endif // _WIN64
46    n_int_register_parameters_j   = 6, // j_rarg0, j_rarg1, ...
47    n_float_register_parameters_j = 8  // j_farg0, j_farg1, ...
48#else
49    n_register_parameters = 0   // 0 registers used to pass arguments
50#endif // _LP64
51  };
52};
53
54
55#ifdef _LP64
56// Symbolically name the register arguments used by the c calling convention.
57// Windows is different from linux/solaris. So much for standards...
58
59#ifdef _WIN64
60
61REGISTER_DECLARATION(Register, c_rarg0, rcx);
62REGISTER_DECLARATION(Register, c_rarg1, rdx);
63REGISTER_DECLARATION(Register, c_rarg2, r8);
64REGISTER_DECLARATION(Register, c_rarg3, r9);
65
66REGISTER_DECLARATION(XMMRegister, c_farg0, xmm0);
67REGISTER_DECLARATION(XMMRegister, c_farg1, xmm1);
68REGISTER_DECLARATION(XMMRegister, c_farg2, xmm2);
69REGISTER_DECLARATION(XMMRegister, c_farg3, xmm3);
70
71#else
72
73REGISTER_DECLARATION(Register, c_rarg0, rdi);
74REGISTER_DECLARATION(Register, c_rarg1, rsi);
75REGISTER_DECLARATION(Register, c_rarg2, rdx);
76REGISTER_DECLARATION(Register, c_rarg3, rcx);
77REGISTER_DECLARATION(Register, c_rarg4, r8);
78REGISTER_DECLARATION(Register, c_rarg5, r9);
79
80REGISTER_DECLARATION(XMMRegister, c_farg0, xmm0);
81REGISTER_DECLARATION(XMMRegister, c_farg1, xmm1);
82REGISTER_DECLARATION(XMMRegister, c_farg2, xmm2);
83REGISTER_DECLARATION(XMMRegister, c_farg3, xmm3);
84REGISTER_DECLARATION(XMMRegister, c_farg4, xmm4);
85REGISTER_DECLARATION(XMMRegister, c_farg5, xmm5);
86REGISTER_DECLARATION(XMMRegister, c_farg6, xmm6);
87REGISTER_DECLARATION(XMMRegister, c_farg7, xmm7);
88
89#endif // _WIN64
90
// Symbolically name the register arguments used by the Java calling convention.
// We have control over the convention for java so we can do what we please.
// What pleases us is to offset the java calling convention so that when
// we call a suitable jni method the arguments are already lined up and we
// have to do very little shuffling. A suitable jni method is non-static and
// has a small number of arguments (two fewer args on windows).
97//
98//        |-------------------------------------------------------|
99//        | c_rarg0   c_rarg1  c_rarg2 c_rarg3 c_rarg4 c_rarg5    |
100//        |-------------------------------------------------------|
101//        | rcx       rdx      r8      r9      rdi*    rsi*       | windows (* not a c_rarg)
102//        | rdi       rsi      rdx     rcx     r8      r9         | solaris/linux
103//        |-------------------------------------------------------|
104//        | j_rarg5   j_rarg0  j_rarg1 j_rarg2 j_rarg3 j_rarg4    |
105//        |-------------------------------------------------------|
106
107REGISTER_DECLARATION(Register, j_rarg0, c_rarg1);
108REGISTER_DECLARATION(Register, j_rarg1, c_rarg2);
109REGISTER_DECLARATION(Register, j_rarg2, c_rarg3);
110// Windows runs out of register args here
111#ifdef _WIN64
112REGISTER_DECLARATION(Register, j_rarg3, rdi);
113REGISTER_DECLARATION(Register, j_rarg4, rsi);
114#else
115REGISTER_DECLARATION(Register, j_rarg3, c_rarg4);
116REGISTER_DECLARATION(Register, j_rarg4, c_rarg5);
117#endif /* _WIN64 */
118REGISTER_DECLARATION(Register, j_rarg5, c_rarg0);
119
120REGISTER_DECLARATION(XMMRegister, j_farg0, xmm0);
121REGISTER_DECLARATION(XMMRegister, j_farg1, xmm1);
122REGISTER_DECLARATION(XMMRegister, j_farg2, xmm2);
123REGISTER_DECLARATION(XMMRegister, j_farg3, xmm3);
124REGISTER_DECLARATION(XMMRegister, j_farg4, xmm4);
125REGISTER_DECLARATION(XMMRegister, j_farg5, xmm5);
126REGISTER_DECLARATION(XMMRegister, j_farg6, xmm6);
127REGISTER_DECLARATION(XMMRegister, j_farg7, xmm7);
128
129REGISTER_DECLARATION(Register, rscratch1, r10);  // volatile
130REGISTER_DECLARATION(Register, rscratch2, r11);  // volatile
131
132REGISTER_DECLARATION(Register, r12_heapbase, r12); // callee-saved
133REGISTER_DECLARATION(Register, r15_thread, r15); // callee-saved
134
135#else
// rscratch1 will appear in 32bit code that is dead but of course must compile.
// Using noreg ensures that if the dead code is incorrectly live and executed,
// it will cause an assertion failure.
139#define rscratch1 noreg
140#define rscratch2 noreg
141
142#endif // _LP64
143
144// JSR 292 fixed register usages:
145REGISTER_DECLARATION(Register, rbp_mh_SP_save, rbp);
146
147// Address is an abstraction used to represent a memory location
148// using any of the amd64 addressing modes with one object.
149//
150// Note: A register location is represented via a Register, not
151//       via an address for efficiency & simplicity reasons.
152
153class ArrayAddress;
154
155class Address VALUE_OBJ_CLASS_SPEC {
156 public:
157  enum ScaleFactor {
158    no_scale = -1,
159    times_1  =  0,
160    times_2  =  1,
161    times_4  =  2,
162    times_8  =  3,
163    times_ptr = LP64_ONLY(times_8) NOT_LP64(times_4)
164  };
165  static ScaleFactor times(int size) {
166    assert(size >= 1 && size <= 8 && is_power_of_2(size), "bad scale size");
167    if (size == 8)  return times_8;
168    if (size == 4)  return times_4;
169    if (size == 2)  return times_2;
170    return times_1;
171  }
172  static int scale_size(ScaleFactor scale) {
173    assert(scale != no_scale, "");
174    assert(((1 << (int)times_1) == 1 &&
175            (1 << (int)times_2) == 2 &&
176            (1 << (int)times_4) == 4 &&
177            (1 << (int)times_8) == 8), "");
178    return (1 << (int)scale);
179  }
180
181 private:
182  Register         _base;
183  Register         _index;
184  ScaleFactor      _scale;
185  int              _disp;
186  RelocationHolder _rspec;
187
188  // Easily misused constructors make them private
189  // %%% can we make these go away?
190  NOT_LP64(Address(address loc, RelocationHolder spec);)
191  Address(int disp, address loc, relocInfo::relocType rtype);
192  Address(int disp, address loc, RelocationHolder spec);
193
194 public:
195
196 int disp() { return _disp; }
197  // creation
198  Address()
199    : _base(noreg),
200      _index(noreg),
201      _scale(no_scale),
202      _disp(0) {
203  }
204
205  // No default displacement otherwise Register can be implicitly
206  // converted to 0(Register) which is quite a different animal.
207
208  Address(Register base, int disp)
209    : _base(base),
210      _index(noreg),
211      _scale(no_scale),
212      _disp(disp) {
213  }
214
215  Address(Register base, Register index, ScaleFactor scale, int disp = 0)
216    : _base (base),
217      _index(index),
218      _scale(scale),
219      _disp (disp) {
220    assert(!index->is_valid() == (scale == Address::no_scale),
221           "inconsistent address");
222  }
223
224  Address(Register base, RegisterOrConstant index, ScaleFactor scale = times_1, int disp = 0)
225    : _base (base),
226      _index(index.register_or_noreg()),
227      _scale(scale),
228      _disp (disp + (index.constant_or_zero() * scale_size(scale))) {
229    if (!index.is_register())  scale = Address::no_scale;
230    assert(!_index->is_valid() == (scale == Address::no_scale),
231           "inconsistent address");
232  }
233
234  Address plus_disp(int disp) const {
235    Address a = (*this);
236    a._disp += disp;
237    return a;
238  }
239  Address plus_disp(RegisterOrConstant disp, ScaleFactor scale = times_1) const {
240    Address a = (*this);
241    a._disp += disp.constant_or_zero() * scale_size(scale);
242    if (disp.is_register()) {
243      assert(!a.index()->is_valid(), "competing indexes");
244      a._index = disp.as_register();
245      a._scale = scale;
246    }
247    return a;
248  }
249  bool is_same_address(Address a) const {
250    // disregard _rspec
251    return _base == a._base && _disp == a._disp && _index == a._index && _scale == a._scale;
252  }
253
254  // The following two overloads are used in connection with the
255  // ByteSize type (see sizes.hpp).  They simplify the use of
256  // ByteSize'd arguments in assembly code. Note that their equivalent
257  // for the optimized build are the member functions with int disp
258  // argument since ByteSize is mapped to an int type in that case.
259  //
260  // Note: DO NOT introduce similar overloaded functions for WordSize
261  // arguments as in the optimized mode, both ByteSize and WordSize
262  // are mapped to the same type and thus the compiler cannot make a
263  // distinction anymore (=> compiler errors).
264
265#ifdef ASSERT
266  Address(Register base, ByteSize disp)
267    : _base(base),
268      _index(noreg),
269      _scale(no_scale),
270      _disp(in_bytes(disp)) {
271  }
272
273  Address(Register base, Register index, ScaleFactor scale, ByteSize disp)
274    : _base(base),
275      _index(index),
276      _scale(scale),
277      _disp(in_bytes(disp)) {
278    assert(!index->is_valid() == (scale == Address::no_scale),
279           "inconsistent address");
280  }
281
282  Address(Register base, RegisterOrConstant index, ScaleFactor scale, ByteSize disp)
283    : _base (base),
284      _index(index.register_or_noreg()),
285      _scale(scale),
286      _disp (in_bytes(disp) + (index.constant_or_zero() * scale_size(scale))) {
287    if (!index.is_register())  scale = Address::no_scale;
288    assert(!_index->is_valid() == (scale == Address::no_scale),
289           "inconsistent address");
290  }
291
292#endif // ASSERT
293
294  // accessors
295  bool        uses(Register reg) const { return _base == reg || _index == reg; }
296  Register    base()             const { return _base;  }
297  Register    index()            const { return _index; }
298  ScaleFactor scale()            const { return _scale; }
299  int         disp()             const { return _disp;  }
300
301  // Convert the raw encoding form into the form expected by the constructor for
302  // Address.  An index of 4 (rsp) corresponds to having no index, so convert
303  // that to noreg for the Address constructor.
304  static Address make_raw(int base, int index, int scale, int disp, relocInfo::relocType disp_reloc);
305
306  static Address make_array(ArrayAddress);
307
308 private:
309  bool base_needs_rex() const {
310    return _base != noreg && _base->encoding() >= 8;
311  }
312
313  bool index_needs_rex() const {
314    return _index != noreg &&_index->encoding() >= 8;
315  }
316
317  relocInfo::relocType reloc() const { return _rspec.type(); }
318
319  friend class Assembler;
320  friend class MacroAssembler;
321  friend class LIR_Assembler; // base/index/scale/disp
322};
323
324//
325// AddressLiteral has been split out from Address because operands of this type
326// need to be treated specially on 32bit vs. 64bit platforms. By splitting it out
327// the few instructions that need to deal with address literals are unique and the
328// MacroAssembler does not have to implement every instruction in the Assembler
329// in order to search for address literals that may need special handling depending
// on the instruction and the platform. A small step on the way to merging the
// i486/amd64 directories.
332//
333class AddressLiteral VALUE_OBJ_CLASS_SPEC {
334  friend class ArrayAddress;
335  RelocationHolder _rspec;
336  // Typically we use AddressLiterals we want to use their rval
337  // However in some situations we want the lval (effect address) of the item.
338  // We provide a special factory for making those lvals.
339  bool _is_lval;
340
341  // If the target is far we'll need to load the ea of this to
342  // a register to reach it. Otherwise if near we can do rip
343  // relative addressing.
344
345  address          _target;
346
347 protected:
348  // creation
349  AddressLiteral()
350    : _is_lval(false),
351      _target(NULL)
352  {}
353
354  public:
355
356
357  AddressLiteral(address target, relocInfo::relocType rtype);
358
359  AddressLiteral(address target, RelocationHolder const& rspec)
360    : _rspec(rspec),
361      _is_lval(false),
362      _target(target)
363  {}
364
365  AddressLiteral addr() {
366    AddressLiteral ret = *this;
367    ret._is_lval = true;
368    return ret;
369  }
370
371
372 private:
373
374  address target() { return _target; }
375  bool is_lval() { return _is_lval; }
376
377  relocInfo::relocType reloc() const { return _rspec.type(); }
378  const RelocationHolder& rspec() const { return _rspec; }
379
380  friend class Assembler;
381  friend class MacroAssembler;
382  friend class Address;
383  friend class LIR_Assembler;
384};
385
// Convenience classes
387class RuntimeAddress: public AddressLiteral {
388
389  public:
390
391  RuntimeAddress(address target) : AddressLiteral(target, relocInfo::runtime_call_type) {}
392
393};
394
395class ExternalAddress: public AddressLiteral {
396 private:
397  static relocInfo::relocType reloc_for_target(address target) {
398    // Sometimes ExternalAddress is used for values which aren't
399    // exactly addresses, like the card table base.
400    // external_word_type can't be used for values in the first page
401    // so just skip the reloc in that case.
402    return external_word_Relocation::can_be_relocated(target) ? relocInfo::external_word_type : relocInfo::none;
403  }
404
405 public:
406
407  ExternalAddress(address target) : AddressLiteral(target, reloc_for_target(target)) {}
408
409};
410
411class InternalAddress: public AddressLiteral {
412
413  public:
414
415  InternalAddress(address target) : AddressLiteral(target, relocInfo::internal_word_type) {}
416
417};
418
// x86 can do array addressing as a single operation since disp can be an absolute
// address; amd64 can't. We create a class that expresses the concept but does extra
// magic on amd64 to get the final result.
422
423class ArrayAddress VALUE_OBJ_CLASS_SPEC {
424  private:
425
426  AddressLiteral _base;
427  Address        _index;
428
429  public:
430
431  ArrayAddress() {};
432  ArrayAddress(AddressLiteral base, Address index): _base(base), _index(index) {};
433  AddressLiteral base() { return _base; }
434  Address index() { return _index; }
435
436};
437
438const int FPUStateSizeInWords = NOT_LP64(27) LP64_ONLY( 512 / wordSize);
439
440// The Intel x86/Amd64 Assembler: Pure assembler doing NO optimizations on the instruction
441// level (e.g. mov rax, 0 is not translated into xor rax, rax!); i.e., what you write
442// is what you get. The Assembler is generating code into a CodeBuffer.
443
444class Assembler : public AbstractAssembler  {
445  friend class AbstractAssembler; // for the non-virtual hack
446  friend class LIR_Assembler; // as_Address()
447  friend class StubGenerator;
448
449 public:
450  enum Condition {                     // The x86 condition codes used for conditional jumps/moves.
451    zero          = 0x4,
452    notZero       = 0x5,
453    equal         = 0x4,
454    notEqual      = 0x5,
455    less          = 0xc,
456    lessEqual     = 0xe,
457    greater       = 0xf,
458    greaterEqual  = 0xd,
459    below         = 0x2,
460    belowEqual    = 0x6,
461    above         = 0x7,
462    aboveEqual    = 0x3,
463    overflow      = 0x0,
464    noOverflow    = 0x1,
465    carrySet      = 0x2,
466    carryClear    = 0x3,
467    negative      = 0x8,
468    positive      = 0x9,
469    parity        = 0xa,
470    noParity      = 0xb
471  };
472
473  enum Prefix {
474    // segment overrides
475    CS_segment = 0x2e,
476    SS_segment = 0x36,
477    DS_segment = 0x3e,
478    ES_segment = 0x26,
479    FS_segment = 0x64,
480    GS_segment = 0x65,
481
482    REX        = 0x40,
483
484    REX_B      = 0x41,
485    REX_X      = 0x42,
486    REX_XB     = 0x43,
487    REX_R      = 0x44,
488    REX_RB     = 0x45,
489    REX_RX     = 0x46,
490    REX_RXB    = 0x47,
491
492    REX_W      = 0x48,
493
494    REX_WB     = 0x49,
495    REX_WX     = 0x4A,
496    REX_WXB    = 0x4B,
497    REX_WR     = 0x4C,
498    REX_WRB    = 0x4D,
499    REX_WRX    = 0x4E,
500    REX_WRXB   = 0x4F,
501
502    VEX_3bytes = 0xC4,
503    VEX_2bytes = 0xC5
504  };
505
506  enum VexPrefix {
507    VEX_B = 0x20,
508    VEX_X = 0x40,
509    VEX_R = 0x80,
510    VEX_W = 0x80
511  };
512
513  enum VexSimdPrefix {
514    VEX_SIMD_NONE = 0x0,
515    VEX_SIMD_66   = 0x1,
516    VEX_SIMD_F3   = 0x2,
517    VEX_SIMD_F2   = 0x3
518  };
519
520  enum VexOpcode {
521    VEX_OPCODE_NONE  = 0x0,
522    VEX_OPCODE_0F    = 0x1,
523    VEX_OPCODE_0F_38 = 0x2,
524    VEX_OPCODE_0F_3A = 0x3
525  };
526
527  enum WhichOperand {
528    // input to locate_operand, and format code for relocations
529    imm_operand  = 0,            // embedded 32-bit|64-bit immediate operand
530    disp32_operand = 1,          // embedded 32-bit displacement or address
531    call32_operand = 2,          // embedded 32-bit self-relative displacement
532#ifndef _LP64
533    _WhichOperand_limit = 3
534#else
535     narrow_oop_operand = 3,     // embedded 32-bit immediate narrow oop
536    _WhichOperand_limit = 4
537#endif
538  };
539
540
541
  // NOTE: The general philosophy of the declarations here is that 64bit versions
  // of instructions are freely declared without the need for wrapping them in an ifdef.
  // (Some dangerous instructions are ifdef'd out of inappropriate jvm's.)
  // In the .cpp file the implementations are wrapped so that they are dropped out
  // of the resulting jvm. This is done mostly to keep the footprint of KERNEL
  // to the size it was prior to merging up the 32bit and 64bit assemblers.
  //
  // This does mean you'll get a linker/runtime error if you use a 64bit only instruction
  // in a 32bit vm. This is somewhat unfortunate but keeps the ifdef noise down.
551
552private:
553
554
555  // 64bit prefixes
556  int prefix_and_encode(int reg_enc, bool byteinst = false);
557  int prefixq_and_encode(int reg_enc);
558
559  int prefix_and_encode(int dst_enc, int src_enc, bool byteinst = false);
560  int prefixq_and_encode(int dst_enc, int src_enc);
561
562  void prefix(Register reg);
563  void prefix(Address adr);
564  void prefixq(Address adr);
565
566  void prefix(Address adr, Register reg,  bool byteinst = false);
567  void prefix(Address adr, XMMRegister reg);
568  void prefixq(Address adr, Register reg);
569  void prefixq(Address adr, XMMRegister reg);
570
571  void prefetch_prefix(Address src);
572
573  void rex_prefix(Address adr, XMMRegister xreg,
574                  VexSimdPrefix pre, VexOpcode opc, bool rex_w);
575  int  rex_prefix_and_encode(int dst_enc, int src_enc,
576                             VexSimdPrefix pre, VexOpcode opc, bool rex_w);
577
578  void vex_prefix(bool vex_r, bool vex_b, bool vex_x, bool vex_w,
579                  int nds_enc, VexSimdPrefix pre, VexOpcode opc,
580                  bool vector256);
581
582  void vex_prefix(Address adr, int nds_enc, int xreg_enc,
583                  VexSimdPrefix pre, VexOpcode opc,
584                  bool vex_w, bool vector256);
585
586  void vex_prefix(XMMRegister dst, XMMRegister nds, Address src,
587                  VexSimdPrefix pre, bool vector256 = false) {
588    int dst_enc = dst->encoding();
589    int nds_enc = nds->is_valid() ? nds->encoding() : 0;
590    vex_prefix(src, nds_enc, dst_enc, pre, VEX_OPCODE_0F, false, vector256);
591  }
592
593  int  vex_prefix_and_encode(int dst_enc, int nds_enc, int src_enc,
594                             VexSimdPrefix pre, VexOpcode opc,
595                             bool vex_w, bool vector256);
596
597  int  vex_prefix_and_encode(XMMRegister dst, XMMRegister nds, XMMRegister src,
598                             VexSimdPrefix pre, bool vector256 = false,
599                             VexOpcode opc = VEX_OPCODE_0F) {
600    int src_enc = src->encoding();
601    int dst_enc = dst->encoding();
602    int nds_enc = nds->is_valid() ? nds->encoding() : 0;
603    return vex_prefix_and_encode(dst_enc, nds_enc, src_enc, pre, opc, false, vector256);
604  }
605
606  void simd_prefix(XMMRegister xreg, XMMRegister nds, Address adr,
607                   VexSimdPrefix pre, VexOpcode opc = VEX_OPCODE_0F,
608                   bool rex_w = false, bool vector256 = false);
609
610  void simd_prefix(XMMRegister dst, Address src,
611                   VexSimdPrefix pre, VexOpcode opc = VEX_OPCODE_0F) {
612    simd_prefix(dst, xnoreg, src, pre, opc);
613  }
614
615  void simd_prefix(Address dst, XMMRegister src, VexSimdPrefix pre) {
616    simd_prefix(src, dst, pre);
617  }
618  void simd_prefix_q(XMMRegister dst, XMMRegister nds, Address src,
619                     VexSimdPrefix pre) {
620    bool rex_w = true;
621    simd_prefix(dst, nds, src, pre, VEX_OPCODE_0F, rex_w);
622  }
623
624  int simd_prefix_and_encode(XMMRegister dst, XMMRegister nds, XMMRegister src,
625                             VexSimdPrefix pre, VexOpcode opc = VEX_OPCODE_0F,
626                             bool rex_w = false, bool vector256 = false);
627
628  // Move/convert 32-bit integer value.
629  int simd_prefix_and_encode(XMMRegister dst, XMMRegister nds, Register src,
630                             VexSimdPrefix pre) {
631    // It is OK to cast from Register to XMMRegister to pass argument here
632    // since only encoding is used in simd_prefix_and_encode() and number of
633    // Gen and Xmm registers are the same.
634    return simd_prefix_and_encode(dst, nds, as_XMMRegister(src->encoding()), pre);
635  }
636  int simd_prefix_and_encode(XMMRegister dst, Register src, VexSimdPrefix pre) {
637    return simd_prefix_and_encode(dst, xnoreg, src, pre);
638  }
639  int simd_prefix_and_encode(Register dst, XMMRegister src,
640                             VexSimdPrefix pre, VexOpcode opc = VEX_OPCODE_0F) {
641    return simd_prefix_and_encode(as_XMMRegister(dst->encoding()), xnoreg, src, pre, opc);
642  }
643
644  // Move/convert 64-bit integer value.
645  int simd_prefix_and_encode_q(XMMRegister dst, XMMRegister nds, Register src,
646                               VexSimdPrefix pre) {
647    bool rex_w = true;
648    return simd_prefix_and_encode(dst, nds, as_XMMRegister(src->encoding()), pre, VEX_OPCODE_0F, rex_w);
649  }
650  int simd_prefix_and_encode_q(XMMRegister dst, Register src, VexSimdPrefix pre) {
651    return simd_prefix_and_encode_q(dst, xnoreg, src, pre);
652  }
653  int simd_prefix_and_encode_q(Register dst, XMMRegister src,
654                             VexSimdPrefix pre, VexOpcode opc = VEX_OPCODE_0F) {
655    bool rex_w = true;
656    return simd_prefix_and_encode(as_XMMRegister(dst->encoding()), xnoreg, src, pre, opc, rex_w);
657  }
658
659  // Helper functions for groups of instructions
660  void emit_arith_b(int op1, int op2, Register dst, int imm8);
661
662  void emit_arith(int op1, int op2, Register dst, int32_t imm32);
663  // Force generation of a 4 byte immediate value even if it fits into 8bit
664  void emit_arith_imm32(int op1, int op2, Register dst, int32_t imm32);
665  void emit_arith(int op1, int op2, Register dst, Register src);
666
667  void emit_simd_arith(int opcode, XMMRegister dst, Address src, VexSimdPrefix pre);
668  void emit_simd_arith(int opcode, XMMRegister dst, XMMRegister src, VexSimdPrefix pre);
669  void emit_simd_arith_nonds(int opcode, XMMRegister dst, Address src, VexSimdPrefix pre);
670  void emit_simd_arith_nonds(int opcode, XMMRegister dst, XMMRegister src, VexSimdPrefix pre);
671  void emit_vex_arith(int opcode, XMMRegister dst, XMMRegister nds,
672                      Address src, VexSimdPrefix pre, bool vector256);
673  void emit_vex_arith(int opcode, XMMRegister dst, XMMRegister nds,
674                      XMMRegister src, VexSimdPrefix pre, bool vector256);
675
676  void emit_operand(Register reg,
677                    Register base, Register index, Address::ScaleFactor scale,
678                    int disp,
679                    RelocationHolder const& rspec,
680                    int rip_relative_correction = 0);
681
682  void emit_operand(Register reg, Address adr, int rip_relative_correction = 0);
683
684  // operands that only take the original 32bit registers
685  void emit_operand32(Register reg, Address adr);
686
687  void emit_operand(XMMRegister reg,
688                    Register base, Register index, Address::ScaleFactor scale,
689                    int disp,
690                    RelocationHolder const& rspec);
691
692  void emit_operand(XMMRegister reg, Address adr);
693
694  void emit_operand(MMXRegister reg, Address adr);
695
696  // workaround gcc (3.2.1-7) bug
697  void emit_operand(Address adr, MMXRegister reg);
698
699
700  // Immediate-to-memory forms
701  void emit_arith_operand(int op1, Register rm, Address adr, int32_t imm32);
702
703  void emit_farith(int b1, int b2, int i);
704
705
706 protected:
707  #ifdef ASSERT
708  void check_relocation(RelocationHolder const& rspec, int format);
709  #endif
710
711  void emit_data(jint data, relocInfo::relocType    rtype, int format);
712  void emit_data(jint data, RelocationHolder const& rspec, int format);
713  void emit_data64(jlong data, relocInfo::relocType rtype, int format = 0);
714  void emit_data64(jlong data, RelocationHolder const& rspec, int format = 0);
715
716  bool reachable(AddressLiteral adr) NOT_LP64({ return true;});
717
718  // These are all easily abused and hence protected
719
720  // 32BIT ONLY SECTION
721#ifndef _LP64
722  // Make these disappear in 64bit mode since they would never be correct
723  void cmp_literal32(Register src1, int32_t imm32, RelocationHolder const& rspec);   // 32BIT ONLY
724  void cmp_literal32(Address src1, int32_t imm32, RelocationHolder const& rspec);    // 32BIT ONLY
725
726  void mov_literal32(Register dst, int32_t imm32, RelocationHolder const& rspec);    // 32BIT ONLY
727  void mov_literal32(Address dst, int32_t imm32, RelocationHolder const& rspec);     // 32BIT ONLY
728
729  void push_literal32(int32_t imm32, RelocationHolder const& rspec);                 // 32BIT ONLY
730#else
731  // 64BIT ONLY SECTION
732  void mov_literal64(Register dst, intptr_t imm64, RelocationHolder const& rspec);   // 64BIT ONLY
733
734  void cmp_narrow_oop(Register src1, int32_t imm32, RelocationHolder const& rspec);
735  void cmp_narrow_oop(Address src1, int32_t imm32, RelocationHolder const& rspec);
736
737  void mov_narrow_oop(Register dst, int32_t imm32, RelocationHolder const& rspec);
738  void mov_narrow_oop(Address dst, int32_t imm32, RelocationHolder const& rspec);
739#endif // _LP64
740
741  // These are unique in that we are ensured by the caller that the 32bit
742  // relative in these instructions will always be able to reach the potentially
743  // 64bit address described by entry. Since they can take a 64bit address they
744  // don't have the 32 suffix like the other instructions in this class.
745
746  void call_literal(address entry, RelocationHolder const& rspec);
747  void jmp_literal(address entry, RelocationHolder const& rspec);
748
  // Avoid using directly section
  // Instructions in this section are actually usable by anyone without danger
  // of failure but have performance issues that are addressed by enhanced
  // instructions which will do the proper thing based on the particular cpu.
  // We protect them because we don't trust you...
754
755  // Don't use next inc() and dec() methods directly. INC & DEC instructions
756  // could cause a partial flag stall since they don't set CF flag.
757  // Use MacroAssembler::decrement() & MacroAssembler::increment() methods
758  // which call inc() & dec() or add() & sub() in accordance with
759  // the product flag UseIncDec value.
760
761  void decl(Register dst);
762  void decl(Address dst);
763  void decq(Register dst);
764  void decq(Address dst);
765
766  void incl(Register dst);
767  void incl(Address dst);
768  void incq(Register dst);
769  void incq(Address dst);
770
771  // New cpus require use of movsd and movss to avoid partial register stall
772  // when loading from memory. But for old Opteron use movlpd instead of movsd.
773  // The selection is done in MacroAssembler::movdbl() and movflt().
774
775  // Move Scalar Single-Precision Floating-Point Values
776  void movss(XMMRegister dst, Address src);
777  void movss(XMMRegister dst, XMMRegister src);
778  void movss(Address dst, XMMRegister src);
779
780  // Move Scalar Double-Precision Floating-Point Values
781  void movsd(XMMRegister dst, Address src);
782  void movsd(XMMRegister dst, XMMRegister src);
783  void movsd(Address dst, XMMRegister src);
784  void movlpd(XMMRegister dst, Address src);
785
786  // New cpus require use of movaps and movapd to avoid partial register stall
787  // when moving between registers.
788  void movaps(XMMRegister dst, XMMRegister src);
789  void movapd(XMMRegister dst, XMMRegister src);
790
791  // End avoid using directly
792
793
794  // Instruction prefixes
795  void prefix(Prefix p);
796
797  public:
798
799  // Creation
800  Assembler(CodeBuffer* code) : AbstractAssembler(code) {}
801
802  // Decoding
803  static address locate_operand(address inst, WhichOperand which);
804  static address locate_next_instruction(address inst);
805
806  // Utilities
807  static bool is_polling_page_far() NOT_LP64({ return false;});
808
809  // Generic instructions
810  // Does 32bit or 64bit as needed for the platform. In some sense these
811  // belong in macro assembler but there is no need for both varieties to exist
812
813  void lea(Register dst, Address src);
814
815  void mov(Register dst, Register src);
816
817  void pusha();
818  void popa();
819
820  void pushf();
821  void popf();
822
823  void push(int32_t imm32);
824
825  void push(Register src);
826
827  void pop(Register dst);
828
829  // These are dummies to prevent surprise implicit conversions to Register
830  void push(void* v);
831  void pop(void* v);
832
833  // These do register sized moves/scans
834  void rep_mov();
835  void rep_set();
836  void repne_scan();
837#ifdef _LP64
838  void repne_scanl();
839#endif
840
841  // Vanilla instructions in lexical order
842
843  void adcl(Address dst, int32_t imm32);
844  void adcl(Address dst, Register src);
845  void adcl(Register dst, int32_t imm32);
846  void adcl(Register dst, Address src);
847  void adcl(Register dst, Register src);
848
849  void adcq(Register dst, int32_t imm32);
850  void adcq(Register dst, Address src);
851  void adcq(Register dst, Register src);
852
853  void addl(Address dst, int32_t imm32);
854  void addl(Address dst, Register src);
855  void addl(Register dst, int32_t imm32);
856  void addl(Register dst, Address src);
857  void addl(Register dst, Register src);
858
859  void addq(Address dst, int32_t imm32);
860  void addq(Address dst, Register src);
861  void addq(Register dst, int32_t imm32);
862  void addq(Register dst, Address src);
863  void addq(Register dst, Register src);
864
865  void addr_nop_4();
866  void addr_nop_5();
867  void addr_nop_7();
868  void addr_nop_8();
869
870  // Add Scalar Double-Precision Floating-Point Values
871  void addsd(XMMRegister dst, Address src);
872  void addsd(XMMRegister dst, XMMRegister src);
873
874  // Add Scalar Single-Precision Floating-Point Values
875  void addss(XMMRegister dst, Address src);
876  void addss(XMMRegister dst, XMMRegister src);
877
878  // AES instructions
879  void aesdec(XMMRegister dst, Address src);
880  void aesdec(XMMRegister dst, XMMRegister src);
881  void aesdeclast(XMMRegister dst, Address src);
882  void aesdeclast(XMMRegister dst, XMMRegister src);
883  void aesenc(XMMRegister dst, Address src);
884  void aesenc(XMMRegister dst, XMMRegister src);
885  void aesenclast(XMMRegister dst, Address src);
886  void aesenclast(XMMRegister dst, XMMRegister src);
887
888
889  void andl(Address  dst, int32_t imm32);
890  void andl(Register dst, int32_t imm32);
891  void andl(Register dst, Address src);
892  void andl(Register dst, Register src);
893
894  void andq(Address  dst, int32_t imm32);
895  void andq(Register dst, int32_t imm32);
896  void andq(Register dst, Address src);
897  void andq(Register dst, Register src);
898
899  void bsfl(Register dst, Register src);
900  void bsrl(Register dst, Register src);
901
902#ifdef _LP64
903  void bsfq(Register dst, Register src);
904  void bsrq(Register dst, Register src);
905#endif
906
907  void bswapl(Register reg);
908
909  void bswapq(Register reg);
910
911  void call(Label& L, relocInfo::relocType rtype);
912  void call(Register reg);  // push pc; pc <- reg
913  void call(Address adr);   // push pc; pc <- adr
914
915  void cdql();
916
917  void cdqq();
918
919  void cld();
920
921  void clflush(Address adr);
922
923  void cmovl(Condition cc, Register dst, Register src);
924  void cmovl(Condition cc, Register dst, Address src);
925
926  void cmovq(Condition cc, Register dst, Register src);
927  void cmovq(Condition cc, Register dst, Address src);
928
929
930  void cmpb(Address dst, int imm8);
931
932  void cmpl(Address dst, int32_t imm32);
933
934  void cmpl(Register dst, int32_t imm32);
935  void cmpl(Register dst, Register src);
936  void cmpl(Register dst, Address src);
937
938  void cmpq(Address dst, int32_t imm32);
939  void cmpq(Address dst, Register src);
940
941  void cmpq(Register dst, int32_t imm32);
942  void cmpq(Register dst, Register src);
943  void cmpq(Register dst, Address src);
944
945  // these are dummies used to catch attempting to convert NULL to Register
946  void cmpl(Register dst, void* junk); // dummy
947  void cmpq(Register dst, void* junk); // dummy
948
949  void cmpw(Address dst, int imm16);
950
951  void cmpxchg8 (Address adr);
952
953  void cmpxchgl(Register reg, Address adr);
954
955  void cmpxchgq(Register reg, Address adr);
956
957  // Ordered Compare Scalar Double-Precision Floating-Point Values and set EFLAGS
958  void comisd(XMMRegister dst, Address src);
959  void comisd(XMMRegister dst, XMMRegister src);
960
961  // Ordered Compare Scalar Single-Precision Floating-Point Values and set EFLAGS
962  void comiss(XMMRegister dst, Address src);
963  void comiss(XMMRegister dst, XMMRegister src);
964
965  // Identify processor type and features
966  void cpuid();
967
968  // Convert Scalar Double-Precision Floating-Point Value to Scalar Single-Precision Floating-Point Value
969  void cvtsd2ss(XMMRegister dst, XMMRegister src);
970  void cvtsd2ss(XMMRegister dst, Address src);
971
972  // Convert Doubleword Integer to Scalar Double-Precision Floating-Point Value
973  void cvtsi2sdl(XMMRegister dst, Register src);
974  void cvtsi2sdl(XMMRegister dst, Address src);
975  void cvtsi2sdq(XMMRegister dst, Register src);
976  void cvtsi2sdq(XMMRegister dst, Address src);
977
978  // Convert Doubleword Integer to Scalar Single-Precision Floating-Point Value
979  void cvtsi2ssl(XMMRegister dst, Register src);
980  void cvtsi2ssl(XMMRegister dst, Address src);
981  void cvtsi2ssq(XMMRegister dst, Register src);
982  void cvtsi2ssq(XMMRegister dst, Address src);
983
984  // Convert Packed Signed Doubleword Integers to Packed Double-Precision Floating-Point Value
985  void cvtdq2pd(XMMRegister dst, XMMRegister src);
986
987  // Convert Packed Signed Doubleword Integers to Packed Single-Precision Floating-Point Value
988  void cvtdq2ps(XMMRegister dst, XMMRegister src);
989
990  // Convert Scalar Single-Precision Floating-Point Value to Scalar Double-Precision Floating-Point Value
991  void cvtss2sd(XMMRegister dst, XMMRegister src);
992  void cvtss2sd(XMMRegister dst, Address src);
993
994  // Convert with Truncation Scalar Double-Precision Floating-Point Value to Doubleword Integer
995  void cvttsd2sil(Register dst, Address src);
996  void cvttsd2sil(Register dst, XMMRegister src);
997  void cvttsd2siq(Register dst, XMMRegister src);
998
999  // Convert with Truncation Scalar Single-Precision Floating-Point Value to Doubleword Integer
1000  void cvttss2sil(Register dst, XMMRegister src);
1001  void cvttss2siq(Register dst, XMMRegister src);
1002
1003  // Divide Scalar Double-Precision Floating-Point Values
1004  void divsd(XMMRegister dst, Address src);
1005  void divsd(XMMRegister dst, XMMRegister src);
1006
1007  // Divide Scalar Single-Precision Floating-Point Values
1008  void divss(XMMRegister dst, Address src);
1009  void divss(XMMRegister dst, XMMRegister src);
1010
1011  void emms();
1012
1013  void fabs();
1014
1015  void fadd(int i);
1016
1017  void fadd_d(Address src);
1018  void fadd_s(Address src);
1019
1020  // "Alternate" versions of x87 instructions place result down in FPU
1021  // stack instead of on TOS
1022
1023  void fadda(int i); // "alternate" fadd
1024  void faddp(int i = 1);
1025
1026  void fchs();
1027
1028  void fcom(int i);
1029
1030  void fcomp(int i = 1);
1031  void fcomp_d(Address src);
1032  void fcomp_s(Address src);
1033
1034  void fcompp();
1035
1036  void fcos();
1037
1038  void fdecstp();
1039
1040  void fdiv(int i);
1041  void fdiv_d(Address src);
1042  void fdivr_s(Address src);
1043  void fdiva(int i);  // "alternate" fdiv
1044  void fdivp(int i = 1);
1045
1046  void fdivr(int i);
1047  void fdivr_d(Address src);
1048  void fdiv_s(Address src);
1049
1050  void fdivra(int i); // "alternate" reversed fdiv
1051
1052  void fdivrp(int i = 1);
1053
1054  void ffree(int i = 0);
1055
1056  void fild_d(Address adr);
1057  void fild_s(Address adr);
1058
1059  void fincstp();
1060
1061  void finit();
1062
1063  void fist_s (Address adr);
1064  void fistp_d(Address adr);
1065  void fistp_s(Address adr);
1066
1067  void fld1();
1068
1069  void fld_d(Address adr);
1070  void fld_s(Address adr);
1071  void fld_s(int index);
1072  void fld_x(Address adr);  // extended-precision (80-bit) format
1073
1074  void fldcw(Address src);
1075
1076  void fldenv(Address src);
1077
1078  void fldlg2();
1079
1080  void fldln2();
1081
1082  void fldz();
1083
1084  void flog();
1085  void flog10();
1086
1087  void fmul(int i);
1088
1089  void fmul_d(Address src);
1090  void fmul_s(Address src);
1091
1092  void fmula(int i);  // "alternate" fmul
1093
1094  void fmulp(int i = 1);
1095
1096  void fnsave(Address dst);
1097
1098  void fnstcw(Address src);
1099
1100  void fnstsw_ax();
1101
1102  void fprem();
1103  void fprem1();
1104
1105  void frstor(Address src);
1106
1107  void fsin();
1108
1109  void fsqrt();
1110
1111  void fst_d(Address adr);
1112  void fst_s(Address adr);
1113
1114  void fstp_d(Address adr);
1115  void fstp_d(int index);
1116  void fstp_s(Address adr);
1117  void fstp_x(Address adr); // extended-precision (80-bit) format
1118
1119  void fsub(int i);
1120  void fsub_d(Address src);
1121  void fsub_s(Address src);
1122
1123  void fsuba(int i);  // "alternate" fsub
1124
1125  void fsubp(int i = 1);
1126
1127  void fsubr(int i);
1128  void fsubr_d(Address src);
1129  void fsubr_s(Address src);
1130
1131  void fsubra(int i); // "alternate" reversed fsub
1132
1133  void fsubrp(int i = 1);
1134
1135  void ftan();
1136
1137  void ftst();
1138
1139  void fucomi(int i = 1);
1140  void fucomip(int i = 1);
1141
1142  void fwait();
1143
1144  void fxch(int i = 1);
1145
1146  void fxrstor(Address src);
1147
1148  void fxsave(Address dst);
1149
1150  void fyl2x();
1151  void frndint();
1152  void f2xm1();
1153  void fldl2e();
1154
1155  void hlt();
1156
1157  void idivl(Register src);
1158  void divl(Register src); // Unsigned division
1159
1160  void idivq(Register src);
1161
1162  void imull(Register dst, Register src);
1163  void imull(Register dst, Register src, int value);
1164
1165  void imulq(Register dst, Register src);
1166  void imulq(Register dst, Register src, int value);
1167
1168
1169  // jcc is the generic conditional branch generator to run-time
1170  // routines; jcc is used for branches to labels. jcc
1171  // takes a branch opcode (cc) and a label (L) and generates
1172  // either a backward branch or a forward branch and links it
1173  // to the label fixup chain. Usage:
1174  //
1175  // Label L;      // unbound label
1176  // jcc(cc, L);   // forward branch to unbound label
1177  // bind(L);      // bind label to the current pc
1178  // jcc(cc, L);   // backward branch to bound label
1179  // bind(L);      // illegal: a label may be bound only once
1180  //
1181  // Note: The same Label can be used for forward and backward branches
1182  // but it may be bound only once.
1183
1184  void jcc(Condition cc, Label& L, bool maybe_short = true);
1185
1186  // Conditional jump to an 8-bit offset to L.
1187  // WARNING: be very careful using this for forward jumps.  If the label is
1188  // not bound within an 8-bit offset of this instruction, a run-time error
1189  // will occur.
1190  void jccb(Condition cc, Label& L);
1191
1192  void jmp(Address entry);    // pc <- entry
1193
1194  // Label operations & relative jumps (PPUM Appendix D)
1195  void jmp(Label& L, bool maybe_short = true);   // unconditional jump to L
1196
1197  void jmp(Register entry); // pc <- entry
1198
1199  // Unconditional 8-bit offset jump to L.
1200  // WARNING: be very careful using this for forward jumps.  If the label is
1201  // not bound within an 8-bit offset of this instruction, a run-time error
1202  // will occur.
1203  void jmpb(Label& L);
1204
1205  void ldmxcsr( Address src );
1206
1207  void leal(Register dst, Address src);
1208
1209  void leaq(Register dst, Address src);
1210
1211  void lfence();
1212
1213  void lock();
1214
1215  void lzcntl(Register dst, Register src);
1216
1217#ifdef _LP64
1218  void lzcntq(Register dst, Register src);
1219#endif
1220
1221  enum Membar_mask_bits {  // one distinct bit per ordering constraint, so a constraint can be tested with '&'
1222    StoreStore = 1 << 3,  // 0x8
1223    LoadStore  = 1 << 2,  // 0x4
1224    StoreLoad  = 1 << 1,  // 0x2
1225    LoadLoad   = 1 << 0   // 0x1
1226  };
1227
1228  // Emits a memory barrier for order_constraint. The emitted code serializes memory and clobbers flags.
1229  void membar(Membar_mask_bits order_constraint) {
1230    if (os::is_MP()) {  // nothing is emitted when os::is_MP() is false
1231      // Only the StoreLoad bit causes code to be emitted; the other constraint bits emit nothing here.
1232      if (order_constraint & StoreLoad) {
1233        // All usable chips support "locked" instructions, which suffice
1234        // as barriers and are much faster than the alternative of
1235        // using the cpuid instruction. Here we use a locked add [esp],0.
1236        // This is conveniently a no-op apart from clobbering the
1237        // flags.
1238        // Any change to this code may require revisiting other places in
1239        // the code where this idiom is used, in particular the
1240        // orderAccess code.
1241        lock();
1242        addl(Address(rsp, 0), 0);// Assert the lock# signal here
1243      }
1244    }
1245  }
1246
1247  void mfence();
1248
1249  // Moves
1250
1251  void mov64(Register dst, int64_t imm64);
1252
1253  void movb(Address dst, Register src);
1254  void movb(Address dst, int imm8);
1255  void movb(Register dst, Address src);
1256
1257  void movdl(XMMRegister dst, Register src);
1258  void movdl(Register dst, XMMRegister src);
1259  void movdl(XMMRegister dst, Address src);
1260  void movdl(Address dst, XMMRegister src);
1261
1262  // Move Double Quadword
1263  void movdq(XMMRegister dst, Register src);
1264  void movdq(Register dst, XMMRegister src);
1265
1266  // Move Aligned Double Quadword
1267  void movdqa(XMMRegister dst, XMMRegister src);
1268
1269  // Move Unaligned Double Quadword
1270  void movdqu(Address     dst, XMMRegister src);
1271  void movdqu(XMMRegister dst, Address src);
1272  void movdqu(XMMRegister dst, XMMRegister src);
1273
1274  // Move Unaligned 256bit Vector
1275  void vmovdqu(Address dst, XMMRegister src);
1276  void vmovdqu(XMMRegister dst, Address src);
1277  void vmovdqu(XMMRegister dst, XMMRegister src);
1278
1279  // Move lower 64bit to high 64bit in 128bit register
1280  void movlhps(XMMRegister dst, XMMRegister src);
1281
1282  void movl(Register dst, int32_t imm32);
1283  void movl(Address dst, int32_t imm32);
1284  void movl(Register dst, Register src);
1285  void movl(Register dst, Address src);
1286  void movl(Address dst, Register src);
1287
1288  // These dummies prevent using movl from converting a zero (like NULL) into Register
1289  // by giving the compiler two choices it can't resolve
1290
1291  void movl(Address  dst, void* junk);
1292  void movl(Register dst, void* junk);
1293
1294#ifdef _LP64
1295  void movq(Register dst, Register src);
1296  void movq(Register dst, Address src);
1297  void movq(Address  dst, Register src);
1298#endif
1299
1300  void movq(Address     dst, MMXRegister src );
1301  void movq(MMXRegister dst, Address src );
1302
1303#ifdef _LP64
1304  // These dummies prevent using movq from converting a zero (like NULL) into Register
1305  // by giving the compiler two choices it can't resolve
1306
1307  void movq(Address  dst, void* dummy);
1308  void movq(Register dst, void* dummy);
1309#endif
1310
1311  // Move Quadword
1312  void movq(Address     dst, XMMRegister src);
1313  void movq(XMMRegister dst, Address src);
1314
1315  void movsbl(Register dst, Address src);
1316  void movsbl(Register dst, Register src);
1317
1318#ifdef _LP64
1319  void movsbq(Register dst, Address src);
1320  void movsbq(Register dst, Register src);
1321
1322  // Move signed 32bit immediate to 64bit extending sign
1323  void movslq(Address  dst, int32_t imm64);
1324  void movslq(Register dst, int32_t imm64);
1325
1326  void movslq(Register dst, Address src);
1327  void movslq(Register dst, Register src);
1328  void movslq(Register dst, void* src); // Dummy declaration to cause NULL to be ambiguous
1329#endif
1330
1331  void movswl(Register dst, Address src);
1332  void movswl(Register dst, Register src);
1333
1334#ifdef _LP64
1335  void movswq(Register dst, Address src);
1336  void movswq(Register dst, Register src);
1337#endif
1338
1339  void movw(Address dst, int imm16);
1340  void movw(Register dst, Address src);
1341  void movw(Address dst, Register src);
1342
1343  void movzbl(Register dst, Address src);
1344  void movzbl(Register dst, Register src);
1345
1346#ifdef _LP64
1347  void movzbq(Register dst, Address src);
1348  void movzbq(Register dst, Register src);
1349#endif
1350
1351  void movzwl(Register dst, Address src);
1352  void movzwl(Register dst, Register src);
1353
1354#ifdef _LP64
1355  void movzwq(Register dst, Address src);
1356  void movzwq(Register dst, Register src);
1357#endif
1358
1359  void mull(Address src);
1360  void mull(Register src);
1361
1362  // Multiply Scalar Double-Precision Floating-Point Values
1363  void mulsd(XMMRegister dst, Address src);
1364  void mulsd(XMMRegister dst, XMMRegister src);
1365
1366  // Multiply Scalar Single-Precision Floating-Point Values
1367  void mulss(XMMRegister dst, Address src);
1368  void mulss(XMMRegister dst, XMMRegister src);
1369
1370  void negl(Register dst);
1371
1372#ifdef _LP64
1373  void negq(Register dst);
1374#endif
1375
1376  void nop(int i = 1);
1377
1378  void notl(Register dst);
1379
1380#ifdef _LP64
1381  void notq(Register dst);
1382#endif
1383
1384  void orl(Address dst, int32_t imm32);
1385  void orl(Register dst, int32_t imm32);
1386  void orl(Register dst, Address src);
1387  void orl(Register dst, Register src);
1388
1389  void orq(Address dst, int32_t imm32);
1390  void orq(Register dst, int32_t imm32);
1391  void orq(Register dst, Address src);
1392  void orq(Register dst, Register src);
1393
1394  // Pack with unsigned saturation
1395  void packuswb(XMMRegister dst, XMMRegister src);
1396  void packuswb(XMMRegister dst, Address src);
1397
1398  // SSE4.2 string instructions
1399  void pcmpestri(XMMRegister xmm1, XMMRegister xmm2, int imm8);
1400  void pcmpestri(XMMRegister xmm1, Address src, int imm8);
1401
1402  // SSE4.1 packed move
1403  void pmovzxbw(XMMRegister dst, XMMRegister src);
1404  void pmovzxbw(XMMRegister dst, Address src);
1405
1406#ifndef _LP64 // no 32bit push/pop on amd64
1407  void popl(Address dst);
1408#endif
1409
1410#ifdef _LP64
1411  void popq(Address dst);
1412#endif
1413
1414  void popcntl(Register dst, Address src);
1415  void popcntl(Register dst, Register src);
1416
1417#ifdef _LP64
1418  void popcntq(Register dst, Address src);
1419  void popcntq(Register dst, Register src);
1420#endif
1421
1422  // Prefetches (SSE, SSE2, 3DNOW only)
1423
1424  void prefetchnta(Address src);
1425  void prefetchr(Address src);
1426  void prefetcht0(Address src);
1427  void prefetcht1(Address src);
1428  void prefetcht2(Address src);
1429  void prefetchw(Address src);
1430
1431  // Shuffle Bytes
1432  void pshufb(XMMRegister dst, XMMRegister src);
1433  void pshufb(XMMRegister dst, Address src);
1434
1435  // Shuffle Packed Doublewords
1436  void pshufd(XMMRegister dst, XMMRegister src, int mode);
1437  void pshufd(XMMRegister dst, Address src,     int mode);
1438
1439  // Shuffle Packed Low Words
1440  void pshuflw(XMMRegister dst, XMMRegister src, int mode);
1441  void pshuflw(XMMRegister dst, Address src,     int mode);
1442
1443  // Shift Right by bytes Logical DoubleQuadword Immediate
1444  void psrldq(XMMRegister dst, int shift);
1445
1446  // Logical Compare Double Quadword
1447  void ptest(XMMRegister dst, XMMRegister src);
1448  void ptest(XMMRegister dst, Address src);
1449
1450  // Interleave Low Bytes
1451  void punpcklbw(XMMRegister dst, XMMRegister src);
1452  void punpcklbw(XMMRegister dst, Address src);
1453
1454  // Interleave Low Doublewords
1455  void punpckldq(XMMRegister dst, XMMRegister src);
1456  void punpckldq(XMMRegister dst, Address src);
1457
1458  // Interleave Low Quadwords
1459  void punpcklqdq(XMMRegister dst, XMMRegister src);
1460
1461#ifndef _LP64 // no 32bit push/pop on amd64
1462  void pushl(Address src);
1463#endif
1464
1465  void pushq(Address src);
1466
1467  void rcll(Register dst, int imm8);
1468
1469  void rclq(Register dst, int imm8);
1470
1471  void ret(int imm16);
1472
1473  void sahf();
1474
1475  void sarl(Register dst, int imm8);
1476  void sarl(Register dst);
1477
1478  void sarq(Register dst, int imm8);
1479  void sarq(Register dst);
1480
1481  void sbbl(Address dst, int32_t imm32);
1482  void sbbl(Register dst, int32_t imm32);
1483  void sbbl(Register dst, Address src);
1484  void sbbl(Register dst, Register src);
1485
1486  void sbbq(Address dst, int32_t imm32);
1487  void sbbq(Register dst, int32_t imm32);
1488  void sbbq(Register dst, Address src);
1489  void sbbq(Register dst, Register src);
1490
1491  void setb(Condition cc, Register dst);
1492
1493  void shldl(Register dst, Register src);
1494
1495  void shll(Register dst, int imm8);
1496  void shll(Register dst);
1497
1498  void shlq(Register dst, int imm8);
1499  void shlq(Register dst);
1500
1501  void shrdl(Register dst, Register src);
1502
1503  void shrl(Register dst, int imm8);
1504  void shrl(Register dst);
1505
1506  void shrq(Register dst, int imm8);
1507  void shrq(Register dst);
1508
1509  void smovl(); // QQQ generic?
1510
1511  // Compute Square Root of Scalar Double-Precision Floating-Point Value
1512  void sqrtsd(XMMRegister dst, Address src);
1513  void sqrtsd(XMMRegister dst, XMMRegister src);
1514
1515  // Compute Square Root of Scalar Single-Precision Floating-Point Value
1516  void sqrtss(XMMRegister dst, Address src);
1517  void sqrtss(XMMRegister dst, XMMRegister src);
1518
1519  void std();
1520
1521  void stmxcsr( Address dst );
1522
1523  void subl(Address dst, int32_t imm32);
1524  void subl(Address dst, Register src);
1525  void subl(Register dst, int32_t imm32);
1526  void subl(Register dst, Address src);
1527  void subl(Register dst, Register src);
1528
1529  void subq(Address dst, int32_t imm32);
1530  void subq(Address dst, Register src);
1531  void subq(Register dst, int32_t imm32);
1532  void subq(Register dst, Address src);
1533  void subq(Register dst, Register src);
1534
1535  // Force generation of a 4 byte immediate value even if it fits into 8bit
1536  void subl_imm32(Register dst, int32_t imm32);
1537  void subq_imm32(Register dst, int32_t imm32);
1538
1539  // Subtract Scalar Double-Precision Floating-Point Values
1540  void subsd(XMMRegister dst, Address src);
1541  void subsd(XMMRegister dst, XMMRegister src);
1542
1543  // Subtract Scalar Single-Precision Floating-Point Values
1544  void subss(XMMRegister dst, Address src);
1545  void subss(XMMRegister dst, XMMRegister src);
1546
1547  void testb(Register dst, int imm8);
1548
1549  void testl(Register dst, int32_t imm32);
1550  void testl(Register dst, Register src);
1551  void testl(Register dst, Address src);
1552
1553  void testq(Register dst, int32_t imm32);
1554  void testq(Register dst, Register src);
1555
1556
1557  // Unordered Compare Scalar Double-Precision Floating-Point Values and set EFLAGS
1558  void ucomisd(XMMRegister dst, Address src);
1559  void ucomisd(XMMRegister dst, XMMRegister src);
1560
1561  // Unordered Compare Scalar Single-Precision Floating-Point Values and set EFLAGS
1562  void ucomiss(XMMRegister dst, Address src);
1563  void ucomiss(XMMRegister dst, XMMRegister src);
1564
1565  void xaddl(Address dst, Register src);
1566
1567  void xaddq(Address dst, Register src);
1568
1569  void xchgl(Register reg, Address adr);
1570  void xchgl(Register dst, Register src);
1571
1572  void xchgq(Register reg, Address adr);
1573  void xchgq(Register dst, Register src);
1574
1575  // Get Value of Extended Control Register
1576  void xgetbv();
1577
1578  void xorl(Register dst, int32_t imm32);
1579  void xorl(Register dst, Address src);
1580  void xorl(Register dst, Register src);
1581
1582  void xorq(Register dst, Address src);
1583  void xorq(Register dst, Register src);
1584
1585  void set_byte_if_not_zero(Register dst); // sets reg to 1 if not zero, otherwise 0
1586
1587  // AVX 3-operands scalar instructions (encoded with VEX prefix)
1588
1589  void vaddsd(XMMRegister dst, XMMRegister nds, Address src);
1590  void vaddsd(XMMRegister dst, XMMRegister nds, XMMRegister src);
1591  void vaddss(XMMRegister dst, XMMRegister nds, Address src);
1592  void vaddss(XMMRegister dst, XMMRegister nds, XMMRegister src);
1593  void vdivsd(XMMRegister dst, XMMRegister nds, Address src);
1594  void vdivsd(XMMRegister dst, XMMRegister nds, XMMRegister src);
1595  void vdivss(XMMRegister dst, XMMRegister nds, Address src);
1596  void vdivss(XMMRegister dst, XMMRegister nds, XMMRegister src);
1597  void vmulsd(XMMRegister dst, XMMRegister nds, Address src);
1598  void vmulsd(XMMRegister dst, XMMRegister nds, XMMRegister src);
1599  void vmulss(XMMRegister dst, XMMRegister nds, Address src);
1600  void vmulss(XMMRegister dst, XMMRegister nds, XMMRegister src);
1601  void vsubsd(XMMRegister dst, XMMRegister nds, Address src);
1602  void vsubsd(XMMRegister dst, XMMRegister nds, XMMRegister src);
1603  void vsubss(XMMRegister dst, XMMRegister nds, Address src);
1604  void vsubss(XMMRegister dst, XMMRegister nds, XMMRegister src);
1605
1606
1607  //====================VECTOR ARITHMETIC=====================================
1608
1609  // Add Packed Floating-Point Values
1610  void addpd(XMMRegister dst, XMMRegister src);
1611  void addps(XMMRegister dst, XMMRegister src);
1612  void vaddpd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
1613  void vaddps(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
1614  void vaddpd(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
1615  void vaddps(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
1616
1617  // Subtract Packed Floating-Point Values
1618  void subpd(XMMRegister dst, XMMRegister src);
1619  void subps(XMMRegister dst, XMMRegister src);
1620  void vsubpd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
1621  void vsubps(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
1622  void vsubpd(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
1623  void vsubps(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
1624
1625  // Multiply Packed Floating-Point Values
1626  void mulpd(XMMRegister dst, XMMRegister src);
1627  void mulps(XMMRegister dst, XMMRegister src);
1628  void vmulpd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
1629  void vmulps(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
1630  void vmulpd(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
1631  void vmulps(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
1632
1633  // Divide Packed Floating-Point Values
1634  void divpd(XMMRegister dst, XMMRegister src);
1635  void divps(XMMRegister dst, XMMRegister src);
1636  void vdivpd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
1637  void vdivps(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
1638  void vdivpd(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
1639  void vdivps(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
1640
1641  // Bitwise Logical AND of Packed Floating-Point Values
1642  void andpd(XMMRegister dst, XMMRegister src);
1643  void andps(XMMRegister dst, XMMRegister src);
1644  void vandpd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
1645  void vandps(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
1646  void vandpd(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
1647  void vandps(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
1648
1649  // Bitwise Logical XOR of Packed Floating-Point Values
1650  void xorpd(XMMRegister dst, XMMRegister src);
1651  void xorps(XMMRegister dst, XMMRegister src);
1652  void vxorpd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
1653  void vxorps(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
1654  void vxorpd(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
1655  void vxorps(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
1656
1657  // Add packed integers
1658  void paddb(XMMRegister dst, XMMRegister src);
1659  void paddw(XMMRegister dst, XMMRegister src);
1660  void paddd(XMMRegister dst, XMMRegister src);
1661  void paddq(XMMRegister dst, XMMRegister src);
1662  void vpaddb(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
1663  void vpaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
1664  void vpaddd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
1665  void vpaddq(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
1666  void vpaddb(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
1667  void vpaddw(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
1668  void vpaddd(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
1669  void vpaddq(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
1670
1671  // Sub packed integers
1672  void psubb(XMMRegister dst, XMMRegister src);
1673  void psubw(XMMRegister dst, XMMRegister src);
1674  void psubd(XMMRegister dst, XMMRegister src);
1675  void psubq(XMMRegister dst, XMMRegister src);
1676  void vpsubb(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
1677  void vpsubw(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
1678  void vpsubd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
1679  void vpsubq(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
1680  void vpsubb(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
1681  void vpsubw(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
1682  void vpsubd(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
1683  void vpsubq(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
1684
1685  // Multiply packed integers (only shorts and ints)
1686  void pmullw(XMMRegister dst, XMMRegister src);
1687  void pmulld(XMMRegister dst, XMMRegister src);
1688  void vpmullw(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
1689  void vpmulld(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
1690  void vpmullw(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
1691  void vpmulld(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
1692
1693  // Shift left packed integers
1694  void psllw(XMMRegister dst, int shift);
1695  void pslld(XMMRegister dst, int shift);
1696  void psllq(XMMRegister dst, int shift);
1697  void psllw(XMMRegister dst, XMMRegister shift);
1698  void pslld(XMMRegister dst, XMMRegister shift);
1699  void psllq(XMMRegister dst, XMMRegister shift);
1700  void vpsllw(XMMRegister dst, XMMRegister src, int shift, bool vector256);
1701  void vpslld(XMMRegister dst, XMMRegister src, int shift, bool vector256);
1702  void vpsllq(XMMRegister dst, XMMRegister src, int shift, bool vector256);
1703  void vpsllw(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256);
1704  void vpslld(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256);
1705  void vpsllq(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256);
1706
1707  // Logical shift right packed integers
1708  void psrlw(XMMRegister dst, int shift);
1709  void psrld(XMMRegister dst, int shift);
1710  void psrlq(XMMRegister dst, int shift);
1711  void psrlw(XMMRegister dst, XMMRegister shift);
1712  void psrld(XMMRegister dst, XMMRegister shift);
1713  void psrlq(XMMRegister dst, XMMRegister shift);
1714  void vpsrlw(XMMRegister dst, XMMRegister src, int shift, bool vector256);
1715  void vpsrld(XMMRegister dst, XMMRegister src, int shift, bool vector256);
1716  void vpsrlq(XMMRegister dst, XMMRegister src, int shift, bool vector256);
1717  void vpsrlw(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256);
1718  void vpsrld(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256);
1719  void vpsrlq(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256);
1720
1721  // Arithmetic shift right packed integers (only shorts and ints, no instructions for longs)
1722  void psraw(XMMRegister dst, int shift);
1723  void psrad(XMMRegister dst, int shift);
1724  void psraw(XMMRegister dst, XMMRegister shift);
1725  void psrad(XMMRegister dst, XMMRegister shift);
1726  void vpsraw(XMMRegister dst, XMMRegister src, int shift, bool vector256);
1727  void vpsrad(XMMRegister dst, XMMRegister src, int shift, bool vector256);
1728  void vpsraw(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256);
1729  void vpsrad(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256);
1730
1731  // And packed integers.
      // pand takes a register src. vpand is overloaded on an XMMRegister or an
      // Address src and additionally takes an nds register operand and a
      // vector256 flag (NOTE(review): nds is presumably the non-destructive
      // first source operand; confirm against the implementation).
1732  void pand(XMMRegister dst, XMMRegister src);
1733  void vpand(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
1734  void vpand(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
1735
1736  // Or packed integers.
      // por takes a register src. vpor is overloaded on an XMMRegister or an
      // Address src and additionally takes an nds register operand and a
      // vector256 flag (NOTE(review): nds is presumably the non-destructive
      // first source operand; confirm against the implementation).
1737  void por(XMMRegister dst, XMMRegister src);
1738  void vpor(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
1739  void vpor(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
1740
1741  // Xor packed integers.
      // pxor takes a register src. vpxor is overloaded on an XMMRegister or an
      // Address src and additionally takes an nds register operand and a
      // vector256 flag (NOTE(review): nds is presumably the non-destructive
      // first source operand; confirm against the implementation).
1742  void pxor(XMMRegister dst, XMMRegister src);
1743  void vpxor(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
1744  void vpxor(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
1745
1746  // Copy low 128 bits into high 128 bits of YMM registers.
1747  void vinsertf128h(XMMRegister dst, XMMRegister nds, XMMRegister src);
1748  void vinserti128h(XMMRegister dst, XMMRegister nds, XMMRegister src);
1749
1750  // Load/store the high 128 bits of a YMM register without destroying the other half.
      // The vinsert*128h overloads take an Address src; the vextract*128h
      // overloads take an Address dst.
1751  void vinsertf128h(XMMRegister dst, Address src);
1752  void vinserti128h(XMMRegister dst, Address src);
1753  void vextractf128h(Address dst, XMMRegister src);
1754  void vextracti128h(Address dst, XMMRegister src);
1755
1756  // AVX instruction which is used to clear the upper 128 bits of YMM registers and
1757  // to avoid the transition penalty between AVX and SSE states. There is no
1758  // penalty if legacy SSE instructions are encoded using the VEX prefix because
1759  // they always clear the upper 128 bits. It should be used before calling
1760  // runtime code and native libraries.
1761  void vzeroupper();
1762
1763 protected:
1764  // The following instructions require 16-byte address alignment in SSE mode.
1765  // They should be called only from the corresponding MacroAssembler instructions.
1766  void andpd(XMMRegister dst, Address src);
1767  void andps(XMMRegister dst, Address src);
1768  void xorpd(XMMRegister dst, Address src);
1769  void xorps(XMMRegister dst, Address src);
1770
1771};
1772
1773#endif // CPU_X86_VM_ASSEMBLER_X86_HPP
1774