/*
 * Copyright (c) 1997, 2017, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#ifndef CPU_X86_VM_MACROASSEMBLER_X86_HPP
#define CPU_X86_VM_MACROASSEMBLER_X86_HPP

#include "asm/assembler.hpp"
#include "utilities/macros.hpp"
#include "runtime/rtmLocking.hpp"

// MacroAssembler extends Assembler by frequently used macros.
//
// Instructions for which a 'better' code sequence exists depending
// on arguments should also go in here.

class MacroAssembler: public Assembler {
  friend class LIR_Assembler;
  friend class Runtime1;      // as_Address()

 protected:

  Address as_Address(AddressLiteral adr);
  Address as_Address(ArrayAddress adr);

  // Support for VM calls
  //
  // This is the base routine called by the different versions of call_VM_leaf. The interpreter
  // may customize this version by overriding it for its purposes (e.g., to save/restore
  // additional registers when doing a VM call).

  virtual void call_VM_leaf_base(
    address entry_point,               // the entry point
    int     number_of_arguments        // the number of arguments to pop after the call
  );

  // This is the base routine called by the different versions of call_VM. The interpreter
  // may customize this version by overriding it for its purposes (e.g., to save/restore
  // additional registers when doing a VM call).
  //
  // If no java_thread register is specified (noreg) then rdi will be used instead. call_VM_base
  // returns the register which contains the thread upon return. If a thread register has been
  // specified, the return value will correspond to that register. If no last_java_sp is specified
  // (noreg) then rsp will be used instead.
  virtual void call_VM_base(           // returns the register containing the thread upon return
    Register oop_result,               // where an oop-result ends up if any; use noreg otherwise
    Register java_thread,              // the thread if computed before     ; use noreg otherwise
    Register last_java_sp,             // to set up last_Java_frame in stubs; use noreg otherwise
    address  entry_point,              // the entry point
    int      number_of_arguments,      // the number of arguments (w/o thread) to pop after the call
    bool     check_exceptions          // whether to check for pending exceptions after return
  );

  // These routines should emit JVMTI PopFrame and ForceEarlyReturn handling code.
  // The implementation is only non-empty for the InterpreterMacroAssembler,
  // as only the interpreter handles PopFrame and ForceEarlyReturn requests.
  virtual void check_and_handle_popframe(Register java_thread);
  virtual void check_and_handle_earlyret(Register java_thread);

  void call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions = true);

  // helpers for FPU flag access
  // tmp is a temporary register, if none is available use noreg
  void save_rax   (Register tmp);
  void restore_rax(Register tmp);

 public:
  MacroAssembler(CodeBuffer* code) : Assembler(code) {}

  // Support for NULL-checks
  //
  // Generates code that causes a NULL OS exception if the content of reg is NULL.
  // If the accessed location is M[reg + offset] and the offset is known, provide the
  // offset. No explicit code generation is needed if the offset is within a certain
  // range (0 <= offset <= page_size).

  void null_check(Register reg, int offset = -1);
  static bool needs_explicit_null_check(intptr_t offset);
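
  // Illustrative use (not part of the declarations above; assumes the usual
  // '#define __ masm->' idiom): a receiver null check before a field access at
  // a known small offset relies on the implicit OS trap, so no code is emitted:
  //
  //   __ null_check(rcx, oopDesc::klass_offset_in_bytes());  // offset within page: implicit
  //   __ null_check(rcx);                                    // unknown offset: explicit test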

  // Required platform-specific helpers for Label::patch_instructions.
  // They _shadow_ the declarations in AbstractAssembler, which are undefined.
  void pd_patch_instruction(address branch, address target) {
    unsigned char op = branch[0];
    assert(op == 0xE8 /* call */ ||
        op == 0xE9 /* jmp */ ||
        op == 0xEB /* short jmp */ ||
        (op & 0xF0) == 0x70 /* short jcc */ ||
        (op == 0x0F && (branch[1] & 0xF0) == 0x80) /* jcc */ ||
        (op == 0xC7 && branch[1] == 0xF8) /* xbegin */,
        "Invalid opcode at patch point");

    if (op == 0xEB || (op & 0xF0) == 0x70) {
      // short offset operators (jmp and jcc)
      char* disp = (char*) &branch[1];
      int imm8 = target - (address) &disp[1];
      guarantee(this->is8bit(imm8), "Short forward jump exceeds 8-bit offset");
      *disp = imm8;
    } else {
      int* disp = (int*) &branch[(op == 0x0F || op == 0xC7) ? 2 : 1];
      int imm32 = target - (address) &disp[1];
      *disp = imm32;
    }
  }
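
  // Worked example (illustrative): for a rel32 jmp whose opcode byte is at
  // address B and whose target is T, the 4-byte displacement sits at B+1 and
  // the instruction ends at B+5, so the patched value is T - (B+5) -- exactly
  // 'target - (address) &disp[1]' above. The short forms work the same way
  // with a 1-byte displacement at B+1 and instruction end B+2.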

  // The following 4 methods return the offset of the appropriate move instruction

  // Support for fast byte/short loading with zero extension (depending on particular CPU)
  int load_unsigned_byte(Register dst, Address src);
  int load_unsigned_short(Register dst, Address src);

  // Support for fast byte/short loading with sign extension (depending on particular CPU)
  int load_signed_byte(Register dst, Address src);
  int load_signed_short(Register dst, Address src);

  // Support for sign-extension (hi:lo = extend_sign(lo))
  void extend_sign(Register hi, Register lo);

  // Load and store values by size and signed-ness
  void load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed, Register dst2 = noreg);
  void store_sized_value(Address dst, Register src, size_t size_in_bytes, Register src2 = noreg);

  // Support for inc/dec with optimal instruction selection depending on value

  void increment(Register reg, int value = 1) { LP64_ONLY(incrementq(reg, value)) NOT_LP64(incrementl(reg, value)) ; }
  void decrement(Register reg, int value = 1) { LP64_ONLY(decrementq(reg, value)) NOT_LP64(decrementl(reg, value)) ; }
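
  // Illustrative use (hypothetical, assuming '#define __ masm->'): shared code
  // uses these so the operand width never appears explicitly:
  //
  //   __ increment(rbx);      // typically incq on LP64, incl on 32-bit
  //   __ decrement(rcx, 8);   // typically subq/subl for values other than 1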

  void decrementl(Address dst, int value = 1);
  void decrementl(Register reg, int value = 1);

  void decrementq(Register reg, int value = 1);
  void decrementq(Address dst, int value = 1);

  void incrementl(Address dst, int value = 1);
  void incrementl(Register reg, int value = 1);

  void incrementq(Register reg, int value = 1);
  void incrementq(Address dst, int value = 1);

  // special instructions for EVEX
  void setvectmask(Register dst, Register src);
  void restorevectmask();

  // Support optimal SSE move instructions.
  void movflt(XMMRegister dst, XMMRegister src) {
    if (UseXmmRegToRegMoveAll) { movaps(dst, src); return; }
    else                       { movss (dst, src); return; }
  }
  void movflt(XMMRegister dst, Address src) { movss(dst, src); }
  void movflt(XMMRegister dst, AddressLiteral src);
  void movflt(Address dst, XMMRegister src) { movss(dst, src); }

  void movdbl(XMMRegister dst, XMMRegister src) {
    if (UseXmmRegToRegMoveAll) { movapd(dst, src); return; }
    else                       { movsd (dst, src); return; }
  }

  void movdbl(XMMRegister dst, AddressLiteral src);

  void movdbl(XMMRegister dst, Address src) {
    if (UseXmmLoadAndClearUpper) { movsd (dst, src); return; }
    else                         { movlpd(dst, src); return; }
  }
  void movdbl(Address dst, XMMRegister src) { movsd(dst, src); }
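
  // Illustrative note: callers use movflt/movdbl instead of naming a specific
  // SSE move, so the best encoding (movaps/movss, movapd/movsd/movlpd) follows
  // the CPU tuning flags above. A hypothetical caller:
  //
  //   __ movdbl(xmm0, Address(rsp, 0));   // load double, flag-dependent form
  //   __ movdbl(Address(rsp, 0), xmm0);   // store side is always movsd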

  void incrementl(AddressLiteral dst);
  void incrementl(ArrayAddress dst);

  void incrementq(AddressLiteral dst);

  // Alignment
  void align(int modulus);
  void align(int modulus, int target);

  // A 5 byte nop that is safe for patching (see patch_verified_entry)
  void fat_nop();

  // Stack frame creation/removal
  void enter();
  void leave();

  // Support for getting the JavaThread pointer (i.e., a reference to thread-local information)
  // The pointer will be loaded into the thread register.
  void get_thread(Register thread);


  // Support for VM calls
  //
  // It is imperative that all calls into the VM are handled via the call_VM macros.
  // They make sure that the stack linkage is set up correctly. call_VM's correspond
  // to ENTRY/ENTRY_X entry points while call_VM_leaf's correspond to LEAF entry points.


  void call_VM(Register oop_result,
               address entry_point,
               bool check_exceptions = true);
  void call_VM(Register oop_result,
               address entry_point,
               Register arg_1,
               bool check_exceptions = true);
  void call_VM(Register oop_result,
               address entry_point,
               Register arg_1, Register arg_2,
               bool check_exceptions = true);
  void call_VM(Register oop_result,
               address entry_point,
               Register arg_1, Register arg_2, Register arg_3,
               bool check_exceptions = true);
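
  // Illustrative sketch (hypothetical entry point, assuming the usual
  // '#define __ masm->' idiom): a VM call that returns an oop in rax and
  // checks for a pending exception on return looks like
  //
  //   __ call_VM(rax, CAST_FROM_FN_PTR(address, SomeRuntime::entry), arg1_reg);
  //
  // where SomeRuntime::entry and arg1_reg are placeholders; the macro sets up
  // last_Java_frame and passes the current thread as the implicit first argument.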

  // Overloadings with last_Java_sp
  void call_VM(Register oop_result,
               Register last_java_sp,
               address entry_point,
               int number_of_arguments = 0,
               bool check_exceptions = true);
  void call_VM(Register oop_result,
               Register last_java_sp,
               address entry_point,
               Register arg_1,
               bool check_exceptions = true);
  void call_VM(Register oop_result,
               Register last_java_sp,
               address entry_point,
               Register arg_1, Register arg_2,
               bool check_exceptions = true);
  void call_VM(Register oop_result,
               Register last_java_sp,
               address entry_point,
               Register arg_1, Register arg_2, Register arg_3,
               bool check_exceptions = true);

  void get_vm_result  (Register oop_result, Register thread);
  void get_vm_result_2(Register metadata_result, Register thread);

  // These always tightly bind to MacroAssembler::call_VM_base
  // bypassing the virtual implementation
  void super_call_VM(Register oop_result, Register last_java_sp, address entry_point, int number_of_arguments = 0, bool check_exceptions = true);
  void super_call_VM(Register oop_result, Register last_java_sp, address entry_point, Register arg_1, bool check_exceptions = true);
  void super_call_VM(Register oop_result, Register last_java_sp, address entry_point, Register arg_1, Register arg_2, bool check_exceptions = true);
  void super_call_VM(Register oop_result, Register last_java_sp, address entry_point, Register arg_1, Register arg_2, Register arg_3, bool check_exceptions = true);
  void super_call_VM(Register oop_result, Register last_java_sp, address entry_point, Register arg_1, Register arg_2, Register arg_3, Register arg_4, bool check_exceptions = true);

  void call_VM_leaf0(address entry_point);
  void call_VM_leaf(address entry_point,
                    int number_of_arguments = 0);
  void call_VM_leaf(address entry_point,
                    Register arg_1);
  void call_VM_leaf(address entry_point,
                    Register arg_1, Register arg_2);
  void call_VM_leaf(address entry_point,
                    Register arg_1, Register arg_2, Register arg_3);

  // These always tightly bind to MacroAssembler::call_VM_leaf_base
  // bypassing the virtual implementation
  void super_call_VM_leaf(address entry_point);
  void super_call_VM_leaf(address entry_point, Register arg_1);
  void super_call_VM_leaf(address entry_point, Register arg_1, Register arg_2);
  void super_call_VM_leaf(address entry_point, Register arg_1, Register arg_2, Register arg_3);
  void super_call_VM_leaf(address entry_point, Register arg_1, Register arg_2, Register arg_3, Register arg_4);

  // last Java Frame (fills frame anchor)
  void set_last_Java_frame(Register thread,
                           Register last_java_sp,
                           Register last_java_fp,
                           address last_java_pc);

  // thread in the default location (r15_thread on 64bit)
  void set_last_Java_frame(Register last_java_sp,
                           Register last_java_fp,
                           address last_java_pc);

  void reset_last_Java_frame(Register thread, bool clear_fp);

  // thread in the default location (r15_thread on 64bit)
  void reset_last_Java_frame(bool clear_fp);

  // Stores
  void store_check(Register obj);                // store check for obj - register is destroyed afterwards
  void store_check(Register obj, Address dst);   // same as above, dst is exact store location (reg. is destroyed)

  void resolve_jobject(Register value, Register thread, Register tmp);
  void clear_jweak_tag(Register possibly_jweak);

#if INCLUDE_ALL_GCS

  void g1_write_barrier_pre(Register obj,
                            Register pre_val,
                            Register thread,
                            Register tmp,
                            bool tosca_live,
                            bool expand_call);

  void g1_write_barrier_post(Register store_addr,
                             Register new_val,
                             Register thread,
                             Register tmp,
                             Register tmp2);

#endif // INCLUDE_ALL_GCS

  // C 'boolean' to Java boolean: x == 0 ? 0 : 1
  void c2bool(Register x);

  // C++ bool manipulation

  void movbool(Register dst, Address src);
  void movbool(Address dst, bool boolconst);
  void movbool(Address dst, Register src);
  void testbool(Register dst);

  void load_mirror(Register mirror, Register method);

  // oop manipulations
  void load_klass(Register dst, Register src);
  void store_klass(Register dst, Register src);

  void load_heap_oop(Register dst, Address src);
  void load_heap_oop_not_null(Register dst, Address src);
  void store_heap_oop(Address dst, Register src);
  void cmp_heap_oop(Register src1, Address src2, Register tmp = noreg);

  // Used for storing NULL. All other oop constants should be
  // stored using routines that take a jobject.
  void store_heap_oop_null(Address dst);

  void load_prototype_header(Register dst, Register src);

#ifdef _LP64
  void store_klass_gap(Register dst, Register src);

  // This dummy is to prevent a call to store_heap_oop from
  // converting a zero (like NULL) into a Register by giving
  // the compiler two choices it can't resolve

  void store_heap_oop(Address dst, void* dummy);

  void encode_heap_oop(Register r);
  void decode_heap_oop(Register r);
  void encode_heap_oop_not_null(Register r);
  void decode_heap_oop_not_null(Register r);
  void encode_heap_oop_not_null(Register dst, Register src);
  void decode_heap_oop_not_null(Register dst, Register src);

  void set_narrow_oop(Register dst, jobject obj);
  void set_narrow_oop(Address dst, jobject obj);
  void cmp_narrow_oop(Register dst, jobject obj);
  void cmp_narrow_oop(Address dst, jobject obj);

  void encode_klass_not_null(Register r);
  void decode_klass_not_null(Register r);
  void encode_klass_not_null(Register dst, Register src);
  void decode_klass_not_null(Register dst, Register src);
  void set_narrow_klass(Register dst, Klass* k);
  void set_narrow_klass(Address dst, Klass* k);
  void cmp_narrow_klass(Register dst, Klass* k);
  void cmp_narrow_klass(Address dst, Klass* k);
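
  // Illustrative sketch: with UseCompressedOops on LP64, storing the oop in
  // 'val' to the field at 'dst' is roughly (hypothetical caller)
  //
  //   __ encode_heap_oop(val);   // oop -> narrow oop (subtract base, shift)
  //   __ movl(dst, val);         // 32-bit store of the narrow oop
  //
  // which is the shape store_heap_oop expands to; the _not_null variants skip
  // the null handling when the oop is known to be non-null.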

  // Returns the byte size of the instructions generated by decode_klass_not_null()
  // when compressed klass pointers are being used.
  static int instr_size_for_decode_klass_not_null();

  // if heap base register is used - reinit it with the correct value
  void reinit_heapbase();

  DEBUG_ONLY(void verify_heapbase(const char* msg);)

#endif // _LP64

  // Int division/remainder for Java
  // (as idivl, but checks for special case as described in JVM spec.)
  // returns idivl instruction offset for implicit exception handling
  int corrected_idivl(Register reg);

  // Long division/remainder for Java
  // (as idivq, but checks for special case as described in JVM spec.)
  // returns idivq instruction offset for implicit exception handling
  int corrected_idivq(Register reg);

  void int3();

  // Long operation macros for a 32bit cpu
  // Long negation for Java
  void lneg(Register hi, Register lo);

  // Long multiplication for Java
  // (destroys contents of eax, ebx, ecx and edx)
  void lmul(int x_rsp_offset, int y_rsp_offset); // rdx:rax = x * y

  // Long shifts for Java
  // (semantics as described in JVM spec.)
  void lshl(Register hi, Register lo);                               // hi:lo << (rcx & 0x3f)
  void lshr(Register hi, Register lo, bool sign_extension = false);  // hi:lo >> (rcx & 0x3f)

  // Long compare for Java
  // (semantics as described in JVM spec.)
  void lcmp2int(Register x_hi, Register x_lo, Register y_hi, Register y_lo); // x_hi = lcmp(x, y)


  // misc

  // Sign extension
  void sign_extend_short(Register reg);
  void sign_extend_byte(Register reg);

  // Division by power of 2, rounding towards 0
  void division_with_shift(Register reg, int shift_value);

  // Compares the top-most stack entries on the FPU stack and sets the eflags as follows:
  //
  // CF (corresponds to C0) if x < y
  // PF (corresponds to C2) if unordered
  // ZF (corresponds to C3) if x = y
  //
  // The arguments are in reversed order on the stack (i.e., top of stack is first argument).
  // tmp is a temporary register, if none is available use noreg (only matters for non-P6 code)
  void fcmp(Register tmp);
  // Variant of the above which allows y to be further down the stack
  // and which only pops x and y if specified. If pop_right is
  // specified then pop_left must also be specified.
  void fcmp(Register tmp, int index, bool pop_left, bool pop_right);

  // Floating-point comparison for Java
  // Compares the top-most stack entries on the FPU stack and stores the result in dst.
  // The arguments are in reversed order on the stack (i.e., top of stack is first argument).
  // (semantics as described in JVM spec.)
  void fcmp2int(Register dst, bool unordered_is_less);
  // Variant of the above which allows y to be further down the stack
  // and which only pops x and y if specified. If pop_right is
  // specified then pop_left must also be specified.
  void fcmp2int(Register dst, bool unordered_is_less, int index, bool pop_left, bool pop_right);
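
  // Illustrative mapping to the bytecodes: fcmpl/dcmpl treat an unordered
  // result as less (unordered_is_less = true) and fcmpg/dcmpg as greater, so
  //
  //   __ fcmp2int(rax, /*unordered_is_less=*/ true);   // Java fcmpl / dcmpl
  //
  // leaves -1, 0 or +1 in rax as the JVM spec requires.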

  // Floating-point remainder for Java (ST0 = ST0 fremr ST1, ST1 is empty afterwards)
  // tmp is a temporary register, if none is available use noreg
  void fremr(Register tmp);

  // dst = c = a * b + c
  void fmad(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c);
  void fmaf(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c);


  // same as fcmp2int, but using SSE2
  void cmpss2int(XMMRegister opr1, XMMRegister opr2, Register dst, bool unordered_is_less);
  void cmpsd2int(XMMRegister opr1, XMMRegister opr2, Register dst, bool unordered_is_less);

  // branch to L if FPU flag C2 is set/not set
  // tmp is a temporary register, if none is available use noreg
  void jC2 (Register tmp, Label& L);
  void jnC2(Register tmp, Label& L);

  // Pop ST (ffree & fincstp combined)
  void fpop();

  // Load float value from 'address'. If UseSSE >= 1, the value is loaded into
  // register xmm0. Otherwise, the value is loaded onto the FPU stack.
  void load_float(Address src);

  // Store float value to 'address'. If UseSSE >= 1, the value is stored
  // from register xmm0. Otherwise, the value is stored from the FPU stack.
  void store_float(Address dst);

  // Load double value from 'address'. If UseSSE >= 2, the value is loaded into
  // register xmm0. Otherwise, the value is loaded onto the FPU stack.
  void load_double(Address src);

  // Store double value to 'address'. If UseSSE >= 2, the value is stored
  // from register xmm0. Otherwise, the value is stored from the FPU stack.
  void store_double(Address dst);

  // pushes double TOS element of FPU stack on CPU stack; pops from FPU stack
  void push_fTOS();

  // pops double TOS element from CPU stack and pushes on FPU stack
  void pop_fTOS();

  void empty_FPU_stack();

  void push_IU_state();
  void pop_IU_state();

  void push_FPU_state();
  void pop_FPU_state();

  void push_CPU_state();
  void pop_CPU_state();

  // Round 'reg' up to a multiple of 'modulus' (which must be a power of two)
  void round_to(Register reg, int modulus);

  // Callee saved registers handling
  void push_callee_saved_registers();
  void pop_callee_saved_registers();

  // allocation
  void eden_allocate(
    Register obj,                      // result: pointer to object after successful allocation
    Register var_size_in_bytes,        // object size in bytes if unknown at compile time; invalid otherwise
    int      con_size_in_bytes,        // object size in bytes if   known at compile time
    Register t1,                       // temp register
    Label&   slow_case                 // continuation point if fast allocation fails
  );
  void tlab_allocate(
    Register obj,                      // result: pointer to object after successful allocation
    Register var_size_in_bytes,        // object size in bytes if unknown at compile time; invalid otherwise
    int      con_size_in_bytes,        // object size in bytes if   known at compile time
    Register t1,                       // temp register
    Register t2,                       // temp register
    Label&   slow_case                 // continuation point if fast allocation fails
  );
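
  // Illustrative fast-path shape (hypothetical caller; 'instance_size' is a
  // placeholder compile-time size): allocate from the TLAB and fall back to a
  // runtime call on failure:
  //
  //   Label slow;
  //   __ tlab_allocate(rax, noreg, instance_size, rbx, rcx, slow);
  //   // ... initialize the header and fields of the object in rax ...
  //   __ bind(slow);   // out-of-line: call the VM to allocate instead
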
  Register tlab_refill(Label& retry_tlab, Label& try_eden, Label& slow_case); // returns TLS address
  void zero_memory(Register address, Register length_in_bytes, int offset_in_bytes, Register temp);

  void incr_allocated_bytes(Register thread,
                            Register var_size_in_bytes, int con_size_in_bytes,
                            Register t1 = noreg);

  // interface method calling
  void lookup_interface_method(Register recv_klass,
                               Register intf_klass,
                               RegisterOrConstant itable_index,
                               Register method_result,
                               Register scan_temp,
                               Label& no_such_interface);

  // virtual method calling
  void lookup_virtual_method(Register recv_klass,
                             RegisterOrConstant vtable_index,
                             Register method_result);

  // Test sub_klass against super_klass, with fast and slow paths.

  // The fast path produces a tri-state answer: yes / no / maybe-slow.
  // One of the three labels can be NULL, meaning take the fall-through.
  // If super_check_offset is -1, the value is loaded up from super_klass.
  // No registers are killed, except temp_reg.
  void check_klass_subtype_fast_path(Register sub_klass,
                                     Register super_klass,
                                     Register temp_reg,
                                     Label* L_success,
                                     Label* L_failure,
                                     Label* L_slow_path,
                RegisterOrConstant super_check_offset = RegisterOrConstant(-1));

  // The rest of the type check; must be wired to a corresponding fast path.
  // It does not repeat the fast path logic, so don't use it standalone.
  // The temp_reg and temp2_reg can be noreg, if no temps are available.
  // Updates the sub's secondary super cache as necessary.
  // If set_cond_codes, condition codes will be Z on success, NZ on failure.
  void check_klass_subtype_slow_path(Register sub_klass,
                                     Register super_klass,
                                     Register temp_reg,
                                     Register temp2_reg,
                                     Label* L_success,
                                     Label* L_failure,
                                     bool set_cond_codes = false);

  // Simplified, combined version, good for typical uses.
  // Falls through on failure.
  void check_klass_subtype(Register sub_klass,
                           Register super_klass,
                           Register temp_reg,
                           Label& L_success);
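
  // Illustrative use (hypothetical register assignment): the combined form
  // branches to L_success when the subtype test succeeds and falls through on
  // failure:
  //
  //   Label ok;
  //   __ check_klass_subtype(rsi, rax, rcx, ok);   // sub, super, temp
  //   // ... failure path falls through here ...
  //   __ bind(ok);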

  // method handles (JSR 292)
  Address argument_address(RegisterOrConstant arg_slot, int extra_slot_offset = 0);

  //----
  void set_word_if_not_zero(Register reg); // sets reg to 1 if not zero, otherwise 0

  // Debugging

  // only if +VerifyOops
  // TODO: Make these macros with file and line like sparc version!
  void verify_oop(Register reg, const char* s = "broken oop");
  void verify_oop_addr(Address addr, const char * s = "broken oop addr");

  // TODO: verify method and klass metadata (compare against vptr?)
  void _verify_method_ptr(Register reg, const char * msg, const char * file, int line) {}
  void _verify_klass_ptr(Register reg, const char * msg, const char * file, int line) {}

#define verify_method_ptr(reg) _verify_method_ptr(reg, "broken method " #reg, __FILE__, __LINE__)
#define verify_klass_ptr(reg) _verify_klass_ptr(reg, "broken klass " #reg, __FILE__, __LINE__)

  // only if +VerifyFPU
  void verify_FPU(int stack_depth, const char* s = "illegal FPU state");

  // Verify or restore cpu control state after JNI call
  void restore_cpu_control_state_after_jni();

  // prints msg, dumps registers and stops execution
  void stop(const char* msg);

  // prints msg and continues
  void warn(const char* msg);

  // dumps registers and other state
  void print_state();

  static void debug32(int rdi, int rsi, int rbp, int rsp, int rbx, int rdx, int rcx, int rax, int eip, char* msg);
  static void debug64(char* msg, int64_t pc, int64_t regs[]);
  static void print_state32(int rdi, int rsi, int rbp, int rsp, int rbx, int rdx, int rcx, int rax, int eip);
  static void print_state64(int64_t pc, int64_t regs[]);

  void os_breakpoint();

  void untested()                                { stop("untested"); }

  void unimplemented(const char* what = "")      { char* b = new char[1024];  jio_snprintf(b, 1024, "unimplemented: %s", what);  stop(b); }

  void should_not_reach_here()                   { stop("should not reach here"); }

  void print_CPU_state();

  // Stack overflow checking
  void bang_stack_with_offset(int offset) {
    // stack grows down, caller passes positive offset
    assert(offset > 0, "must bang with positive offset");
    movl(Address(rsp, (-offset)), rax);
  }
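
  // Worked example (illustrative): covering n shadow pages means banging at
  // page-sized steps below rsp:
  //
  //   for (int p = 1; p <= n_shadow_pages; p++) {
  //     __ bang_stack_with_offset(p * os::vm_page_size());
  //   }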

  // Writes to stack successive pages until offset reached to check for
  // stack overflow + shadow pages.  Also, clobbers tmp
  void bang_stack_size(Register size, Register tmp);

  // Check for reserved stack access in method being exited (for JIT)
  void reserved_stack_check();

  virtual RegisterOrConstant delayed_value_impl(intptr_t* delayed_value_addr,
                                                Register tmp,
                                                int offset);

  // Support for serializing memory accesses between threads
  void serialize_memory(Register thread, Register tmp);

  void verify_tlab();

  // Biased locking support
  // lock_reg and obj_reg must be loaded up with the appropriate values.
  // swap_reg must be rax, and is killed.
  // tmp_reg is optional. If it is supplied (i.e., != noreg) it will
  // be killed; if not supplied, push/pop will be used internally to
  // allocate a temporary (inefficient, avoid if possible).
  // Optional slow case is for implementations (interpreter and C1) which branch to
  // slow case directly. Leaves condition codes set for C2's Fast_Lock node.
  // Returns offset of first potentially-faulting instruction for null
  // check info (currently consumed only by C1). If
  // swap_reg_contains_mark is true then returns -1 as it is assumed
  // the calling code has already passed any potential faults.
  int biased_locking_enter(Register lock_reg, Register obj_reg,
                           Register swap_reg, Register tmp_reg,
                           bool swap_reg_contains_mark,
                           Label& done, Label* slow_case = NULL,
                           BiasedLockingCounters* counters = NULL);
  void biased_locking_exit (Register obj_reg, Register temp_reg, Label& done);
#ifdef COMPILER2
  // Code used by cmpFastLock and cmpFastUnlock mach instructions in .ad file.
  // See full description in macroAssembler_x86.cpp.
  void fast_lock(Register obj, Register box, Register tmp,
                 Register scr, Register cx1, Register cx2,
                 BiasedLockingCounters* counters,
                 RTMLockingCounters* rtm_counters,
                 RTMLockingCounters* stack_rtm_counters,
                 Metadata* method_data,
                 bool use_rtm, bool profile_rtm);
  void fast_unlock(Register obj, Register box, Register tmp, bool use_rtm);
#if INCLUDE_RTM_OPT
  void rtm_counters_update(Register abort_status, Register rtm_counters);
  void branch_on_random_using_rdtsc(Register tmp, Register scr, int count, Label& brLabel);
  void rtm_abort_ratio_calculation(Register tmp, Register rtm_counters_reg,
                                   RTMLockingCounters* rtm_counters,
                                   Metadata* method_data);
  void rtm_profiling(Register abort_status_Reg, Register rtm_counters_Reg,
                     RTMLockingCounters* rtm_counters, Metadata* method_data, bool profile_rtm);
  void rtm_retry_lock_on_abort(Register retry_count, Register abort_status, Label& retryLabel);
  void rtm_retry_lock_on_busy(Register retry_count, Register box, Register tmp, Register scr, Label& retryLabel);
  void rtm_stack_locking(Register obj, Register tmp, Register scr,
                         Register retry_on_abort_count,
                         RTMLockingCounters* stack_rtm_counters,
                         Metadata* method_data, bool profile_rtm,
                         Label& DONE_LABEL, Label& IsInflated);
  void rtm_inflated_locking(Register obj, Register box, Register tmp,
                            Register scr, Register retry_on_busy_count,
                            Register retry_on_abort_count,
                            RTMLockingCounters* rtm_counters,
                            Metadata* method_data, bool profile_rtm,
                            Label& DONE_LABEL);
#endif
#endif

  Condition negate_condition(Condition cond);

  // Instructions that use AddressLiteral operands. These instructions can handle 32bit/64bit
  // operands. In general the names are modified to avoid hiding the instruction in Assembler
  // so that we don't need to implement all the varieties in the Assembler with trivial wrappers
  // here in MacroAssembler. The major exception to this rule is call.

  // Arithmetics


  void addptr(Address dst, int32_t src) { LP64_ONLY(addq(dst, src)) NOT_LP64(addl(dst, src)) ; }
  void addptr(Address dst, Register src);

  void addptr(Register dst, Address src) { LP64_ONLY(addq(dst, src)) NOT_LP64(addl(dst, src)); }
  void addptr(Register dst, int32_t src);
  void addptr(Register dst, Register src);
  void addptr(Register dst, RegisterOrConstant src) {
    if (src.is_constant()) addptr(dst, (int) src.as_constant());
    else                   addptr(dst,       src.as_register());
  }
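
  // Illustrative note: the *ptr forms below all follow the same pattern --
  // expand to the 'q' instruction on LP64 and the 'l' instruction on 32-bit --
  // so code that manipulates pointer-sized values never names the width, e.g.
  //
  //   __ addptr(rsp, wordSize);   // addq on LP64, addl on 32-bit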

  void andptr(Register dst, int32_t src);
  void andptr(Register src1, Register src2) { LP64_ONLY(andq(src1, src2)) NOT_LP64(andl(src1, src2)) ; }

  void cmp8(AddressLiteral src1, int imm);

  // renamed to drag out the casting of address to int32_t/intptr_t
  void cmp32(Register src1, int32_t imm);

  void cmp32(AddressLiteral src1, int32_t imm);
  // compare reg - mem, or reg - &mem
  void cmp32(Register src1, AddressLiteral src2);

  void cmp32(Register src1, Address src2);

#ifndef _LP64
  void cmpklass(Address dst, Metadata* obj);
  void cmpklass(Register dst, Metadata* obj);
  void cmpoop(Address dst, jobject obj);
  void cmpoop(Register dst, jobject obj);
#endif // !_LP64

  // NOTE src2 must be the lval. This is NOT a mem-mem compare
  void cmpptr(Address src1, AddressLiteral src2);

  void cmpptr(Register src1, AddressLiteral src2);

  void cmpptr(Register src1, Register src2) { LP64_ONLY(cmpq(src1, src2)) NOT_LP64(cmpl(src1, src2)) ; }
  void cmpptr(Register src1, Address src2) { LP64_ONLY(cmpq(src1, src2)) NOT_LP64(cmpl(src1, src2)) ; }
  // void cmpptr(Address src1, Register src2) { LP64_ONLY(cmpq(src1, src2)) NOT_LP64(cmpl(src1, src2)) ; }

  void cmpptr(Register src1, int32_t src2) { LP64_ONLY(cmpq(src1, src2)) NOT_LP64(cmpl(src1, src2)) ; }
  void cmpptr(Address src1, int32_t src2) { LP64_ONLY(cmpq(src1, src2)) NOT_LP64(cmpl(src1, src2)) ; }

  // cmp64 to avoid hiding cmpq
  void cmp64(Register src1, AddressLiteral src);

  void cmpxchgptr(Register reg, Address adr);

  void locked_cmpxchgptr(Register reg, AddressLiteral adr);


  void imulptr(Register dst, Register src) { LP64_ONLY(imulq(dst, src)) NOT_LP64(imull(dst, src)); }
  void imulptr(Register dst, Register src, int imm32) { LP64_ONLY(imulq(dst, src, imm32)) NOT_LP64(imull(dst, src, imm32)); }


  void negptr(Register dst) { LP64_ONLY(negq(dst)) NOT_LP64(negl(dst)); }

  void notptr(Register dst) { LP64_ONLY(notq(dst)) NOT_LP64(notl(dst)); }

  void shlptr(Register dst, int32_t shift);
  void shlptr(Register dst) { LP64_ONLY(shlq(dst)) NOT_LP64(shll(dst)); }

  void shrptr(Register dst, int32_t shift);
  void shrptr(Register dst) { LP64_ONLY(shrq(dst)) NOT_LP64(shrl(dst)); }

  void sarptr(Register dst) { LP64_ONLY(sarq(dst)) NOT_LP64(sarl(dst)); }
  void sarptr(Register dst, int32_t src) { LP64_ONLY(sarq(dst, src)) NOT_LP64(sarl(dst, src)); }

  void subptr(Address dst, int32_t src) { LP64_ONLY(subq(dst, src)) NOT_LP64(subl(dst, src)); }

  void subptr(Register dst, Address src) { LP64_ONLY(subq(dst, src)) NOT_LP64(subl(dst, src)); }
  void subptr(Register dst, int32_t src);
  // Force generation of a 4 byte immediate value even if it fits into 8bit
  void subptr_imm32(Register dst, int32_t src);
  void subptr(Register dst, Register src);
  void subptr(Register dst, RegisterOrConstant src) {
    if (src.is_constant()) subptr(dst, (int) src.as_constant());
    else                   subptr(dst,       src.as_register());
  }

  void sbbptr(Address dst, int32_t src) { LP64_ONLY(sbbq(dst, src)) NOT_LP64(sbbl(dst, src)); }
  void sbbptr(Register dst, int32_t src) { LP64_ONLY(sbbq(dst, src)) NOT_LP64(sbbl(dst, src)); }

  void xchgptr(Register src1, Register src2) { LP64_ONLY(xchgq(src1, src2)) NOT_LP64(xchgl(src1, src2)) ; }
  void xchgptr(Register src1, Address src2) { LP64_ONLY(xchgq(src1, src2)) NOT_LP64(xchgl(src1, src2)) ; }

  void xaddptr(Address src1, Register src2) { LP64_ONLY(xaddq(src1, src2)) NOT_LP64(xaddl(src1, src2)) ; }



  // Helper functions for statistics gathering.
  // Conditionally (atomically, on MPs) increments passed counter address, preserving condition codes.
  void cond_inc32(Condition cond, AddressLiteral counter_addr);
  // Unconditional atomic increment.
  void atomic_incl(Address counter_addr);
  void atomic_incl(AddressLiteral counter_addr, Register scr = rscratch1);
#ifdef _LP64
  void atomic_incq(Address counter_addr);
  void atomic_incq(AddressLiteral counter_addr, Register scr = rscratch1);
#endif
  void atomic_incptr(AddressLiteral counter_addr, Register scr = rscratch1) { LP64_ONLY(atomic_incq(counter_addr, scr)) NOT_LP64(atomic_incl(counter_addr, scr)) ; }
  void atomic_incptr(Address counter_addr) { LP64_ONLY(atomic_incq(counter_addr)) NOT_LP64(atomic_incl(counter_addr)) ; }

  void lea(Register dst, AddressLiteral adr);
  void lea(Address dst, AddressLiteral adr);
  void lea(Register dst, Address adr) { Assembler::lea(dst, adr); }

  void leal32(Register dst, Address src) { leal(dst, src); }

  // Import other testl() methods from the parent class or else
  // they will be hidden by the following overriding declaration.
  using Assembler::testl;
  void testl(Register dst, AddressLiteral src);

  void orptr(Register dst, Address src) { LP64_ONLY(orq(dst, src)) NOT_LP64(orl(dst, src)); }
  void orptr(Register dst, Register src) { LP64_ONLY(orq(dst, src)) NOT_LP64(orl(dst, src)); }
  void orptr(Register dst, int32_t src) { LP64_ONLY(orq(dst, src)) NOT_LP64(orl(dst, src)); }
  void orptr(Address dst, int32_t imm32) { LP64_ONLY(orq(dst, imm32)) NOT_LP64(orl(dst, imm32)); }

  void testptr(Register src, int32_t imm32) { LP64_ONLY(testq(src, imm32)) NOT_LP64(testl(src, imm32)); }
  void testptr(Register src1, Register src2);

  void xorptr(Register dst, Register src) { LP64_ONLY(xorq(dst, src)) NOT_LP64(xorl(dst, src)); }
  void xorptr(Register dst, Address src) { LP64_ONLY(xorq(dst, src)) NOT_LP64(xorl(dst, src)); }

  // Calls

  void call(Label& L, relocInfo::relocType rtype);
  void call(Register entry);

  // NOTE: this call transfers to the effective address of entry NOT
  // the address contained by entry, because that is more natural
  // for jumps/calls.
  void call(AddressLiteral entry);

  // Emit the CompiledIC call idiom
  void ic_call(address entry, jint method_index = 0);

  // Jumps

  // NOTE: these jumps transfer to the effective address of dst NOT
  // the address contained by dst, because that is more natural
  // for jumps/calls.
  void jump(AddressLiteral dst);
  void jump_cc(Condition cc, AddressLiteral dst);

  // 32bit can do a case table jump in one instruction but we no longer allow the base
  // to be installed in the Address class. This jump transfers to the address
  // contained in the location described by entry (not the address of entry).
  void jump(ArrayAddress entry);

  // Floating

  void andpd(XMMRegister dst, Address src) { Assembler::andpd(dst, src); }
  void andpd(XMMRegister dst, AddressLiteral src);
  void andpd(XMMRegister dst, XMMRegister src) { Assembler::andpd(dst, src); }

  void andps(XMMRegister dst, XMMRegister src) { Assembler::andps(dst, src); }
  void andps(XMMRegister dst, Address src) { Assembler::andps(dst, src); }
  void andps(XMMRegister dst, AddressLiteral src);

  void comiss(XMMRegister dst, XMMRegister src) { Assembler::comiss(dst, src); }
  void comiss(XMMRegister dst, Address src) { Assembler::comiss(dst, src); }
  void comiss(XMMRegister dst, AddressLiteral src);

  void comisd(XMMRegister dst, XMMRegister src) { Assembler::comisd(dst, src); }
  void comisd(XMMRegister dst, Address src) { Assembler::comisd(dst, src); }
  void comisd(XMMRegister dst, AddressLiteral src);

  void fadd_s(Address src)        { Assembler::fadd_s(src); }
  void fadd_s(AddressLiteral src) { Assembler::fadd_s(as_Address(src)); }

  void fldcw(Address src) { Assembler::fldcw(src); }
  void fldcw(AddressLiteral src);

  void fld_s(int index)   { Assembler::fld_s(index); }
  void fld_s(Address src) { Assembler::fld_s(src); }
  void fld_s(AddressLiteral src);

  void fld_d(Address src) { Assembler::fld_d(src); }
  void fld_d(AddressLiteral src);

  void fld_x(Address src) { Assembler::fld_x(src); }
  void fld_x(AddressLiteral src);

  void fmul_s(Address src)        { Assembler::fmul_s(src); }
  void fmul_s(AddressLiteral src) { Assembler::fmul_s(as_Address(src)); }

  void ldmxcsr(Address src) { Assembler::ldmxcsr(src); }
  void ldmxcsr(AddressLiteral src);

#ifdef _LP64
 private:
  void sha256_AVX2_one_round_compute(
    Register  reg_old_h,
    Register  reg_a,
    Register  reg_b,
    Register  reg_c,
    Register  reg_d,
    Register  reg_e,
    Register  reg_f,
    Register  reg_g,
    Register  reg_h,
    int iter);
  void sha256_AVX2_four_rounds_compute_first(int start);
  void sha256_AVX2_four_rounds_compute_last(int start);
  void sha256_AVX2_one_round_and_sched(
        XMMRegister xmm_0,     /* == ymm4 on 0, 1, 2, 3 iterations, then rotate 4 registers left on 4, 8, 12 iterations */
        XMMRegister xmm_1,     /* ymm5 */  /* full cycle is 16 iterations */
        XMMRegister xmm_2,     /* ymm6 */
        XMMRegister xmm_3,     /* ymm7 */
        Register    reg_a,      /* == eax on iteration 0, then rotate 8 registers right on each next iteration */
        Register    reg_b,      /* ebx */    /* full cycle is 8 iterations */
        Register    reg_c,      /* edi */
        Register    reg_d,      /* esi */
        Register    reg_e,      /* r8d */
        Register    reg_f,      /* r9d */
        Register    reg_g,      /* r10d */
        Register    reg_h,      /* r11d */
        int iter);

  void addm(int disp, Register r1, Register r2);

 public:
  void sha256_AVX2(XMMRegister msg, XMMRegister state0, XMMRegister state1, XMMRegister msgtmp0,
                   XMMRegister msgtmp1, XMMRegister msgtmp2, XMMRegister msgtmp3, XMMRegister msgtmp4,
                   Register buf, Register state, Register ofs, Register limit, Register rsp,
                   bool multi_block, XMMRegister shuf_mask);
#endif

#ifdef _LP64
 private:
  void sha512_AVX2_one_round_compute(Register old_h, Register a, Register b, Register c, Register d,
                                     Register e, Register f, Register g, Register h, int iteration);

  void sha512_AVX2_one_round_and_schedule(XMMRegister xmm4, XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7,
                                          Register a, Register b, Register c, Register d, Register e, Register f,
                                          Register g, Register h, int iteration);

  void addmq(int disp, Register r1, Register r2);
 public:
  void sha512_AVX2(XMMRegister msg, XMMRegister state0, XMMRegister state1, XMMRegister msgtmp0,
                   XMMRegister msgtmp1, XMMRegister msgtmp2, XMMRegister msgtmp3, XMMRegister msgtmp4,
                   Register buf, Register state, Register ofs, Register limit, Register rsp, bool multi_block,
                   XMMRegister shuf_mask);
#endif

  void fast_sha1(XMMRegister abcd, XMMRegister e0, XMMRegister e1, XMMRegister msg0,
                 XMMRegister msg1, XMMRegister msg2, XMMRegister msg3, XMMRegister shuf_mask,
                 Register buf, Register state, Register ofs, Register limit, Register rsp,
                 bool multi_block);

#ifdef _LP64
  void fast_sha256(XMMRegister msg, XMMRegister state0, XMMRegister state1, XMMRegister msgtmp0,
                   XMMRegister msgtmp1, XMMRegister msgtmp2, XMMRegister msgtmp3, XMMRegister msgtmp4,
                   Register buf, Register state, Register ofs, Register limit, Register rsp,
                   bool multi_block, XMMRegister shuf_mask);
#else
  void fast_sha256(XMMRegister msg, XMMRegister state0, XMMRegister state1, XMMRegister msgtmp0,
                   XMMRegister msgtmp1, XMMRegister msgtmp2, XMMRegister msgtmp3, XMMRegister msgtmp4,
                   Register buf, Register state, Register ofs, Register limit, Register rsp,
                   bool multi_block);
#endif

  void fast_exp(XMMRegister xmm0, XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3,
                XMMRegister xmm4, XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7,
                Register rax, Register rcx, Register rdx, Register tmp);

#ifdef _LP64
  void fast_log(XMMRegister xmm0, XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3,
                XMMRegister xmm4, XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7,
                Register rax, Register rcx, Register rdx, Register tmp1, Register tmp2);

  void fast_log10(XMMRegister xmm0, XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3,
                  XMMRegister xmm4, XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7,
                  Register rax, Register rcx, Register rdx, Register r11);

  void fast_pow(XMMRegister xmm0, XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3, XMMRegister xmm4,
                XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7, Register rax, Register rcx,
                Register rdx, Register tmp1, Register tmp2, Register tmp3, Register tmp4);

  void fast_sin(XMMRegister xmm0, XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3,
                XMMRegister xmm4, XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7,
                Register rax, Register rbx, Register rcx, Register rdx, Register tmp1, Register tmp2,
                Register tmp3, Register tmp4);

  void fast_cos(XMMRegister xmm0, XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3,
                XMMRegister xmm4, XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7,
                Register rax, Register rcx, Register rdx, Register tmp1,
                Register tmp2, Register tmp3, Register tmp4);
  void fast_tan(XMMRegister xmm0, XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3,
                XMMRegister xmm4, XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7,
                Register rax, Register rcx, Register rdx, Register tmp1,
                Register tmp2, Register tmp3, Register tmp4);
#else
  void fast_log(XMMRegister xmm0, XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3,
                XMMRegister xmm4, XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7,
                Register rax, Register rcx, Register rdx, Register tmp1);

  void fast_log10(XMMRegister xmm0, XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3,
                  XMMRegister xmm4, XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7,
                  Register rax, Register rcx, Register rdx, Register tmp);

  void fast_pow(XMMRegister xmm0, XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3, XMMRegister xmm4,
                XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7, Register rax, Register rcx,
                Register rdx, Register tmp);

  void fast_sin(XMMRegister xmm0, XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3,
                XMMRegister xmm4, XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7,
                Register rax, Register rbx, Register rdx);

  void fast_cos(XMMRegister xmm0, XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3,
                XMMRegister xmm4, XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7,
                Register rax, Register rcx, Register rdx, Register tmp);

  void libm_sincos_huge(XMMRegister xmm0, XMMRegister xmm1, Register eax, Register ecx,
                        Register edx, Register ebx, Register esi, Register edi,
                        Register ebp, Register esp);

  void libm_reduce_pi04l(Register eax, Register ecx, Register edx, Register ebx,
                         Register esi, Register edi, Register ebp, Register esp);

  void libm_tancot_huge(XMMRegister xmm0, XMMRegister xmm1, Register eax, Register ecx,
                        Register edx, Register ebx, Register esi, Register edi,
                        Register ebp, Register esp);

  void fast_tan(XMMRegister xmm0, XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3,
                XMMRegister xmm4, XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7,
                Register rax, Register rcx, Register rdx, Register tmp);
#endif

  void increase_precision();
  void restore_precision();

private:

  // these are private because users should be doing movflt/movdbl

  void movss(Address dst, XMMRegister src)     { Assembler::movss(dst, src); }
  void movss(XMMRegister dst, XMMRegister src) { Assembler::movss(dst, src); }
  void movss(XMMRegister dst, Address src)     { Assembler::movss(dst, src); }
  void movss(XMMRegister dst, AddressLiteral src);

  void movlpd(XMMRegister dst, Address src)    { Assembler::movlpd(dst, src); }
  void movlpd(XMMRegister dst, AddressLiteral src);

public:

  void addsd(XMMRegister dst, XMMRegister src)    { Assembler::addsd(dst, src); }
  void addsd(XMMRegister dst, Address src)        { Assembler::addsd(dst, src); }
  void addsd(XMMRegister dst, AddressLiteral src);

  void addss(XMMRegister dst, XMMRegister src)    { Assembler::addss(dst, src); }
  void addss(XMMRegister dst, Address src)        { Assembler::addss(dst, src); }
  void addss(XMMRegister dst, AddressLiteral src);

  void addpd(XMMRegister dst, XMMRegister src)    { Assembler::addpd(dst, src); }
  void addpd(XMMRegister dst, Address src)        { Assembler::addpd(dst, src); }
  void addpd(XMMRegister dst, AddressLiteral src);

  void divsd(XMMRegister dst, XMMRegister src)    { Assembler::divsd(dst, src); }
  void divsd(XMMRegister dst, Address src)        { Assembler::divsd(dst, src); }
  void divsd(XMMRegister dst, AddressLiteral src);

  void divss(XMMRegister dst, XMMRegister src)    { Assembler::divss(dst, src); }
  void divss(XMMRegister dst, Address src)        { Assembler::divss(dst, src); }
  void divss(XMMRegister dst, AddressLiteral src);

  // Move Unaligned Double Quadword
  void movdqu(Address     dst, XMMRegister src);
  void movdqu(XMMRegister dst, Address src);
  void movdqu(XMMRegister dst, XMMRegister src);
  void movdqu(XMMRegister dst, AddressLiteral src, Register scratchReg = rscratch1);
  // AVX Unaligned forms
  void vmovdqu(Address     dst, XMMRegister src);
  void vmovdqu(XMMRegister dst, Address src);
  void vmovdqu(XMMRegister dst, XMMRegister src);
  void vmovdqu(XMMRegister dst, AddressLiteral src);

  // Move Aligned Double Quadword
  void movdqa(XMMRegister dst, Address src)       { Assembler::movdqa(dst, src); }
  void movdqa(XMMRegister dst, XMMRegister src)   { Assembler::movdqa(dst, src); }
  void movdqa(XMMRegister dst, AddressLiteral src);

  void movsd(XMMRegister dst, XMMRegister src) { Assembler::movsd(dst, src); }
  void movsd(Address dst, XMMRegister src)     { Assembler::movsd(dst, src); }
  void movsd(XMMRegister dst, Address src)     { Assembler::movsd(dst, src); }
  void movsd(XMMRegister dst, AddressLiteral src);

  void mulpd(XMMRegister dst, XMMRegister src)    { Assembler::mulpd(dst, src); }
  void mulpd(XMMRegister dst, Address src)        { Assembler::mulpd(dst, src); }
  void mulpd(XMMRegister dst, AddressLiteral src);

  void mulsd(XMMRegister dst, XMMRegister src)    { Assembler::mulsd(dst, src); }
  void mulsd(XMMRegister dst, Address src)        { Assembler::mulsd(dst, src); }
  void mulsd(XMMRegister dst, AddressLiteral src);

  void mulss(XMMRegister dst, XMMRegister src)    { Assembler::mulss(dst, src); }
  void mulss(XMMRegister dst, Address src)        { Assembler::mulss(dst, src); }
  void mulss(XMMRegister dst, AddressLiteral src);

  // Carry-Less Multiplication Quadword
  void pclmulldq(XMMRegister dst, XMMRegister src) {
    // 0x00 - multiply lower 64 bits [0:63]
    Assembler::pclmulqdq(dst, src, 0x00);
  }
  void pclmulhdq(XMMRegister dst, XMMRegister src) {
    // 0x11 - multiply upper 64 bits [64:127]
    Assembler::pclmulqdq(dst, src, 0x11);
  }
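
  // Illustrative note: the 0x00/0x11 selectors give the two 64x64->128 bit
  // carry-less products used by CRC-style folding loops; a hypothetical
  // folding step (xcrc, xK, xtmp are placeholder registers):
  //
  //   __ movdqa(xtmp, xcrc);
  //   __ pclmulldq(xcrc, xK);   // fold low  64 bits
  //   __ pclmulhdq(xtmp, xK);   // fold high 64 bits
  //   __ pxor(xcrc, xtmp);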

  void pcmpeqb(XMMRegister dst, XMMRegister src);
  void pcmpeqw(XMMRegister dst, XMMRegister src);

  void pcmpestri(XMMRegister dst, Address src, int imm8);
  void pcmpestri(XMMRegister dst, XMMRegister src, int imm8);

  void pmovzxbw(XMMRegister dst, XMMRegister src);
  void pmovzxbw(XMMRegister dst, Address src);

  void pmovmskb(Register dst, XMMRegister src);

  void ptest(XMMRegister dst, XMMRegister src);

  void sqrtsd(XMMRegister dst, XMMRegister src)    { Assembler::sqrtsd(dst, src); }
  void sqrtsd(XMMRegister dst, Address src)        { Assembler::sqrtsd(dst, src); }
  void sqrtsd(XMMRegister dst, AddressLiteral src);

  void sqrtss(XMMRegister dst, XMMRegister src)    { Assembler::sqrtss(dst, src); }
  void sqrtss(XMMRegister dst, Address src)        { Assembler::sqrtss(dst, src); }
  void sqrtss(XMMRegister dst, AddressLiteral src);

  void subsd(XMMRegister dst, XMMRegister src)    { Assembler::subsd(dst, src); }
  void subsd(XMMRegister dst, Address src)        { Assembler::subsd(dst, src); }
  void subsd(XMMRegister dst, AddressLiteral src);

  void subss(XMMRegister dst, XMMRegister src)    { Assembler::subss(dst, src); }
  void subss(XMMRegister dst, Address src)        { Assembler::subss(dst, src); }
  void subss(XMMRegister dst, AddressLiteral src);

  void ucomiss(XMMRegister dst, XMMRegister src) { Assembler::ucomiss(dst, src); }
  void ucomiss(XMMRegister dst, Address src)     { Assembler::ucomiss(dst, src); }
  void ucomiss(XMMRegister dst, AddressLiteral src);

  void ucomisd(XMMRegister dst, XMMRegister src) { Assembler::ucomisd(dst, src); }
  void ucomisd(XMMRegister dst, Address src)     { Assembler::ucomisd(dst, src); }
  void ucomisd(XMMRegister dst, AddressLiteral src);

  // Bitwise Logical XOR of Packed Double-Precision Floating-Point Values
  void xorpd(XMMRegister dst, XMMRegister src);
  void xorpd(XMMRegister dst, Address src)     { Assembler::xorpd(dst, src); }
  void xorpd(XMMRegister dst, AddressLiteral src);

  // Bitwise Logical XOR of Packed Single-Precision Floating-Point Values
  void xorps(XMMRegister dst, XMMRegister src);
  void xorps(XMMRegister dst, Address src)     { Assembler::xorps(dst, src); }
  void xorps(XMMRegister dst, AddressLiteral src);

  // Shuffle Bytes
  void pshufb(XMMRegister dst, XMMRegister src) { Assembler::pshufb(dst, src); }
  void pshufb(XMMRegister dst, Address src)     { Assembler::pshufb(dst, src); }
  void pshufb(XMMRegister dst, AddressLiteral src);

  // AVX 3-operand instructions

  void vaddsd(XMMRegister dst, XMMRegister nds, XMMRegister src) { Assembler::vaddsd(dst, nds, src); }
  void vaddsd(XMMRegister dst, XMMRegister nds, Address src)     { Assembler::vaddsd(dst, nds, src); }
  void vaddsd(XMMRegister dst, XMMRegister nds, AddressLiteral src);

  void vaddss(XMMRegister dst, XMMRegister nds, XMMRegister src) { Assembler::vaddss(dst, nds, src); }
  void vaddss(XMMRegister dst, XMMRegister nds, Address src)     { Assembler::vaddss(dst, nds, src); }
  void vaddss(XMMRegister dst, XMMRegister nds, AddressLiteral src);

  void vabsss(XMMRegister dst, XMMRegister nds, XMMRegister src, AddressLiteral negate_field, int vector_len);
  void vabssd(XMMRegister dst, XMMRegister nds, XMMRegister src, AddressLiteral negate_field, int vector_len);

  void vpaddb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
  void vpaddb(XMMRegister dst, XMMRegister nds, Address src, int vector_len);

  void vpaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
  void vpaddw(XMMRegister dst, XMMRegister nds, Address src, int vector_len);

  void vpand(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { Assembler::vpand(dst, nds, src, vector_len); }
  void vpand(XMMRegister dst, XMMRegister nds, Address src, int vector_len) { Assembler::vpand(dst, nds, src, vector_len); }
  void vpand(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len);

  void vpbroadcastw(XMMRegister dst, XMMRegister src);

  void vpcmpeqb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
  void vpcmpeqw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);

  void vpmovzxbw(XMMRegister dst, Address src, int vector_len);
  void vpmovmskb(Register dst, XMMRegister src);

  void vpmullw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
  void vpmullw(XMMRegister dst, XMMRegister nds, Address src, int vector_len);

  void vpsubb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
  void vpsubb(XMMRegister dst, XMMRegister nds, Address src, int vector_len);

  void vpsubw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
  void vpsubw(XMMRegister dst, XMMRegister nds, Address src, int vector_len);

  void vpsraw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len);
  void vpsraw(XMMRegister dst, XMMRegister nds, int shift, int vector_len);

  void vpsrlw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len);
  void vpsrlw(XMMRegister dst, XMMRegister nds, int shift, int vector_len);

  void vpsllw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len);
  void vpsllw(XMMRegister dst, XMMRegister nds, int shift, int vector_len);

  void vptest(XMMRegister dst, XMMRegister src);

  void punpcklbw(XMMRegister dst, XMMRegister src);
  void punpcklbw(XMMRegister dst, Address src) { Assembler::punpcklbw(dst, src); }

  void pshufd(XMMRegister dst, Address src, int mode);
  void pshufd(XMMRegister dst, XMMRegister src, int mode) { Assembler::pshufd(dst, src, mode); }

  void pshuflw(XMMRegister dst, XMMRegister src, int mode);
  void pshuflw(XMMRegister dst, Address src, int mode) { Assembler::pshuflw(dst, src, mode); }

  void vandpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { Assembler::vandpd(dst, nds, src, vector_len); }
  void vandpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len)     { Assembler::vandpd(dst, nds, src, vector_len); }
  void vandpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len);

  void vandps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { Assembler::vandps(dst, nds, src, vector_len); }
  void vandps(XMMRegister dst, XMMRegister nds, Address src, int vector_len)     { Assembler::vandps(dst, nds, src, vector_len); }
  void vandps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len);

  void vdivsd(XMMRegister dst, XMMRegister nds, XMMRegister src) { Assembler::vdivsd(dst, nds, src); }
  void vdivsd(XMMRegister dst, XMMRegister nds, Address src)     { Assembler::vdivsd(dst, nds, src); }
  void vdivsd(XMMRegister dst, XMMRegister nds, AddressLiteral src);

  void vdivss(XMMRegister dst, XMMRegister nds, XMMRegister src) { Assembler::vdivss(dst, nds, src); }
  void vdivss(XMMRegister dst, XMMRegister nds, Address src)     { Assembler::vdivss(dst, nds, src); }
  void vdivss(XMMRegister dst, XMMRegister nds, AddressLiteral src);

  void vmulsd(XMMRegister dst, XMMRegister nds, XMMRegister src) { Assembler::vmulsd(dst, nds, src); }
  void vmulsd(XMMRegister dst, XMMRegister nds, Address src)     { Assembler::vmulsd(dst, nds, src); }
  void vmulsd(XMMRegister dst, XMMRegister nds, AddressLiteral src);

  void vmulss(XMMRegister dst, XMMRegister nds, XMMRegister src) { Assembler::vmulss(dst, nds, src); }
  void vmulss(XMMRegister dst, XMMRegister nds, Address src)     { Assembler::vmulss(dst, nds, src); }
  void vmulss(XMMRegister dst, XMMRegister nds, AddressLiteral src);

  void vsubsd(XMMRegister dst, XMMRegister nds, XMMRegister src) { Assembler::vsubsd(dst, nds, src); }
  void vsubsd(XMMRegister dst, XMMRegister nds, Address src)     { Assembler::vsubsd(dst, nds, src); }
  void vsubsd(XMMRegister dst, XMMRegister nds, AddressLiteral src);

  void vsubss(XMMRegister dst, XMMRegister nds, XMMRegister src) { Assembler::vsubss(dst, nds, src); }
  void vsubss(XMMRegister dst, XMMRegister nds, Address src)     { Assembler::vsubss(dst, nds, src); }
  void vsubss(XMMRegister dst, XMMRegister nds, AddressLiteral src);

  void vnegatess(XMMRegister dst, XMMRegister nds, AddressLiteral src);
  void vnegatesd(XMMRegister dst, XMMRegister nds, AddressLiteral src);

  // AVX Vector instructions

  void vxorpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { Assembler::vxorpd(dst, nds, src, vector_len); }
  void vxorpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len) { Assembler::vxorpd(dst, nds, src, vector_len); }
  void vxorpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len);

  void vxorps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { Assembler::vxorps(dst, nds, src, vector_len); }
  void vxorps(XMMRegister dst, XMMRegister nds, Address src, int vector_len) { Assembler::vxorps(dst, nds, src, vector_len); }
  void vxorps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len);

  void vpxor(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
    if (UseAVX > 1 || (vector_len < 1)) // vpxor 256 bit is available only in AVX2
      Assembler::vpxor(dst, nds, src, vector_len);
    else
      Assembler::vxorpd(dst, nds, src, vector_len);
  }
  void vpxor(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
    if (UseAVX > 1 || (vector_len < 1)) // vpxor 256 bit is available only in AVX2
      Assembler::vpxor(dst, nds, src, vector_len);
    else
      Assembler::vxorpd(dst, nds, src, vector_len);
  }
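
  // Note on the dispatch above: 256-bit integer XOR (vpxor) requires
  // AVX2, but XOR is purely bitwise, so on AVX1-only hardware the
  // floating-point form vxorpd produces an identical result. Either path
  // supports the usual zeroing idiom, e.g.
  // vpxor(xmm0, xmm0, xmm0, vector_len).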

  // Simple version for AVX2 256-bit vectors
  void vpxor(XMMRegister dst, XMMRegister src) { Assembler::vpxor(dst, dst, src, Assembler::AVX_256bit); }
  void vpxor(XMMRegister dst, Address src)     { Assembler::vpxor(dst, dst, src, Assembler::AVX_256bit); }

  void vinserti128(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8) {
    if (UseAVX > 2) {
      Assembler::vinserti32x4(dst, dst, src, imm8);
    } else if (UseAVX > 1) {
      // vinserti128 is available only in AVX2
      Assembler::vinserti128(dst, nds, src, imm8);
    } else {
      Assembler::vinsertf128(dst, nds, src, imm8);
    }
  }

  void vinserti128(XMMRegister dst, XMMRegister nds, Address src, uint8_t imm8) {
    if (UseAVX > 2) {
      Assembler::vinserti32x4(dst, dst, src, imm8);
    } else if (UseAVX > 1) {
      // vinserti128 is available only in AVX2
      Assembler::vinserti128(dst, nds, src, imm8);
    } else {
      Assembler::vinsertf128(dst, nds, src, imm8);
    }
  }

  void vextracti128(XMMRegister dst, XMMRegister src, uint8_t imm8) {
    if (UseAVX > 2) {
      Assembler::vextracti32x4(dst, src, imm8);
    } else if (UseAVX > 1) {
      // vextracti128 is available only in AVX2
      Assembler::vextracti128(dst, src, imm8);
    } else {
      Assembler::vextractf128(dst, src, imm8);
    }
  }

  void vextracti128(Address dst, XMMRegister src, uint8_t imm8) {
    if (UseAVX > 2) {
      Assembler::vextracti32x4(dst, src, imm8);
    } else if (UseAVX > 1) {
      // vextracti128 is available only in AVX2
      Assembler::vextracti128(dst, src, imm8);
    } else {
      Assembler::vextractf128(dst, src, imm8);
    }
  }
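
  // Note on the dispatch above: the EVEX (UseAVX > 2) paths merge into
  // 'dst' rather than 'nds'; the callers in this file always pass
  // nds == dst, so the result is the same. On AVX1-only hardware the
  // float forms vinsertf128/vextractf128 are used, which are
  // bit-identical for pure 128-bit data movement.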

  // 128-bit copy to/from the high 128 bits of 256-bit (YMM) vector registers
  void vinserti128_high(XMMRegister dst, XMMRegister src) {
    vinserti128(dst, dst, src, 1);
  }
  void vinserti128_high(XMMRegister dst, Address src) {
    vinserti128(dst, dst, src, 1);
  }
  void vextracti128_high(XMMRegister dst, XMMRegister src) {
    vextracti128(dst, src, 1);
  }
  void vextracti128_high(Address dst, XMMRegister src) {
    vextracti128(dst, src, 1);
  }
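
  // Illustrative idiom (a sketch, assuming xmm1 is free as a temporary):
  // a horizontal 32-bit integer sum over ymm0 can start by folding the
  // two 128-bit lanes together:
  //
  //   vextracti128_high(xmm1, xmm0);   // xmm1 = bits [255:128] of ymm0
  //   paddd(xmm0, xmm1);               // lower lane += upper lane
  //   // ... then reduce the remaining 128 bits (e.g. pshufd + paddd) ...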

  void vinsertf128_high(XMMRegister dst, XMMRegister src) {
    if (UseAVX > 2) {
      Assembler::vinsertf32x4(dst, dst, src, 1);
    } else {
      Assembler::vinsertf128(dst, dst, src, 1);
    }
  }

  void vinsertf128_high(XMMRegister dst, Address src) {
    if (UseAVX > 2) {
      Assembler::vinsertf32x4(dst, dst, src, 1);
    } else {
      Assembler::vinsertf128(dst, dst, src, 1);
    }
  }

  void vextractf128_high(XMMRegister dst, XMMRegister src) {
    if (UseAVX > 2) {
      Assembler::vextractf32x4(dst, src, 1);
    } else {
      Assembler::vextractf128(dst, src, 1);
    }
  }

  void vextractf128_high(Address dst, XMMRegister src) {
    if (UseAVX > 2) {
      Assembler::vextractf32x4(dst, src, 1);
    } else {
      Assembler::vextractf128(dst, src, 1);
    }
  }

  // 256-bit copy to/from the high 256 bits of 512-bit (ZMM) vector registers
  void vinserti64x4_high(XMMRegister dst, XMMRegister src) {
    Assembler::vinserti64x4(dst, dst, src, 1);
  }
  void vinsertf64x4_high(XMMRegister dst, XMMRegister src) {
    Assembler::vinsertf64x4(dst, dst, src, 1);
  }
  void vextracti64x4_high(XMMRegister dst, XMMRegister src) {
    Assembler::vextracti64x4(dst, src, 1);
  }
  void vextractf64x4_high(XMMRegister dst, XMMRegister src) {
    Assembler::vextractf64x4(dst, src, 1);
  }
  void vextractf64x4_high(Address dst, XMMRegister src) {
    Assembler::vextractf64x4(dst, src, 1);
  }
  void vinsertf64x4_high(XMMRegister dst, Address src) {
    Assembler::vinsertf64x4(dst, dst, src, 1);
  }

  // 128-bit copy to/from the low 128 bits of 256-bit (YMM) vector registers
  void vinserti128_low(XMMRegister dst, XMMRegister src) {
    vinserti128(dst, dst, src, 0);
  }
  void vinserti128_low(XMMRegister dst, Address src) {
    vinserti128(dst, dst, src, 0);
  }
  void vextracti128_low(XMMRegister dst, XMMRegister src) {
    vextracti128(dst, src, 0);
  }
  void vextracti128_low(Address dst, XMMRegister src) {
    vextracti128(dst, src, 0);
  }

  void vinsertf128_low(XMMRegister dst, XMMRegister src) {
    if (UseAVX > 2) {
      Assembler::vinsertf32x4(dst, dst, src, 0);
    } else {
      Assembler::vinsertf128(dst, dst, src, 0);
    }
  }

  void vinsertf128_low(XMMRegister dst, Address src) {
    if (UseAVX > 2) {
      Assembler::vinsertf32x4(dst, dst, src, 0);
    } else {
      Assembler::vinsertf128(dst, dst, src, 0);
    }
  }

  void vextractf128_low(XMMRegister dst, XMMRegister src) {
    if (UseAVX > 2) {
      Assembler::vextractf32x4(dst, src, 0);
    } else {
      Assembler::vextractf128(dst, src, 0);
    }
  }

  void vextractf128_low(Address dst, XMMRegister src) {
    if (UseAVX > 2) {
      Assembler::vextractf32x4(dst, src, 0);
    } else {
      Assembler::vextractf128(dst, src, 0);
    }
  }

  // 256-bit copy to/from the low 256 bits of 512-bit (ZMM) vector registers
  void vinserti64x4_low(XMMRegister dst, XMMRegister src) {
    Assembler::vinserti64x4(dst, dst, src, 0);
  }
  void vinsertf64x4_low(XMMRegister dst, XMMRegister src) {
    Assembler::vinsertf64x4(dst, dst, src, 0);
  }
  void vextracti64x4_low(XMMRegister dst, XMMRegister src) {
    Assembler::vextracti64x4(dst, src, 0);
  }
  void vextractf64x4_low(XMMRegister dst, XMMRegister src) {
    Assembler::vextractf64x4(dst, src, 0);
  }
  void vextractf64x4_low(Address dst, XMMRegister src) {
    Assembler::vextractf64x4(dst, src, 0);
  }
  void vinsertf64x4_low(XMMRegister dst, Address src) {
    Assembler::vinsertf64x4(dst, dst, src, 0);
  }

  // Carry-Less Multiplication Quadword
  void vpclmulldq(XMMRegister dst, XMMRegister nds, XMMRegister src) {
    // 0x00 - multiply lower 64 bits [0:63]
    Assembler::vpclmulqdq(dst, nds, src, 0x00);
  }
  void vpclmulhdq(XMMRegister dst, XMMRegister nds, XMMRegister src) {
    // 0x11 - multiply upper 64 bits [64:127]
    Assembler::vpclmulqdq(dst, nds, src, 0x11);
  }

  // Data

  void cmov32( Condition cc, Register dst, Address  src);
  void cmov32( Condition cc, Register dst, Register src);

  void cmov(   Condition cc, Register dst, Register src) { cmovptr(cc, dst, src); }

  void cmovptr(Condition cc, Register dst, Address  src) { LP64_ONLY(cmovq(cc, dst, src)) NOT_LP64(cmov32(cc, dst, src)); }
  void cmovptr(Condition cc, Register dst, Register src) { LP64_ONLY(cmovq(cc, dst, src)) NOT_LP64(cmov32(cc, dst, src)); }

  void movoop(Register dst, jobject obj);
  void movoop(Address dst, jobject obj);

  void mov_metadata(Register dst, Metadata* obj);
  void mov_metadata(Address dst, Metadata* obj);

  void movptr(ArrayAddress dst, Register src);
  // can this do an lea?
  void movptr(Register dst, ArrayAddress src);

  void movptr(Register dst, Address src);

#ifdef _LP64
  void movptr(Register dst, AddressLiteral src, Register scratch=rscratch1);
#else
  void movptr(Register dst, AddressLiteral src, Register scratch=noreg); // scratch reg is ignored on 32-bit
#endif

  void movptr(Register dst, intptr_t src);
  void movptr(Register dst, Register src);
  void movptr(Address dst, intptr_t src);

  void movptr(Address dst, Register src);

  void movptr(Register dst, RegisterOrConstant src) {
    if (src.is_constant()) movptr(dst, src.as_constant());
    else                   movptr(dst, src.as_register());
  }
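
  // Usage note (illustrative only): movptr is the pointer-width mov.
  // Materializing a caller-provided address and storing through it might
  // look like
  //
  //   movptr(rbx, (intptr_t)some_address);   // 'some_address' is hypothetical
  //   movptr(Address(rbx, 0), rcx);          // *rbx = rcx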

#ifdef _LP64
  // Generally the next two are only used for moving NULL, although there
  // are situations when initializing the mark word where they could be
  // used as well. They are dangerous.

  // They only exist on LP64, where int32_t and intptr_t are distinct
  // types; on 32-bit these declarations would be ambiguous with the
  // intptr_t overloads above.

  void movptr(Address dst, int32_t imm32);
  void movptr(Register dst, int32_t imm32);
#endif // _LP64

  // to avoid hiding movl
  void mov32(AddressLiteral dst, Register src);
  void mov32(Register dst, AddressLiteral src);

  // to avoid hiding movb
  void movbyte(ArrayAddress dst, int src);

  // Import other mov() methods from the parent class or else
  // they will be hidden by the following overriding declarations.
  using Assembler::movdl;
  using Assembler::movq;
  void movdl(XMMRegister dst, AddressLiteral src);
  void movq(XMMRegister dst, AddressLiteral src);

  // Can push value or effective address
  void pushptr(AddressLiteral src);

  void pushptr(Address src) { LP64_ONLY(pushq(src)) NOT_LP64(pushl(src)); }
  void popptr(Address src)  { LP64_ONLY(popq(src))  NOT_LP64(popl(src)); }

  void pushoop(jobject obj);
  void pushklass(Metadata* obj);
  // Sign-extend a 32-bit ('l') value to a pointer-sized element as needed.
  void movl2ptr(Register dst, Address src)  { LP64_ONLY(movslq(dst, src)) NOT_LP64(movl(dst, src)); }
  void movl2ptr(Register dst, Register src) { LP64_ONLY(movslq(dst, src)) NOT_LP64(if (dst != src) movl(dst, src)); }
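
  // Example (illustrative): after loading a Java 'int' array length with
  // movl, movl2ptr widens it before it is used in pointer arithmetic:
  //
  //   movl(rcx, Address(rsi, arrayOopDesc::length_offset_in_bytes()));
  //   movl2ptr(rcx, rcx);   // sign-extends on LP64, plain move on 32-bit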

  // C2 compiled method's prolog code.
  void verified_entry(int framesize, int stack_bang_size, bool fp_mode_24b);

  // Clear memory of size 'cnt' qwords, starting at 'base';
  // if 'is_large' is set, do not try to produce a short loop.
  void clear_mem(Register base, Register cnt, Register rtmp, bool is_large);
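
  // Usage sketch (assumes rax is available as the temporary): zero 'len'
  // qwords starting at the address in rdi, with 'len' in rcx:
  //
  //   clear_mem(rdi, rcx, rax, /*is_large=*/false);
  //
  // The implementation may pick rep-stos, a vector loop, or a short
  // unrolled loop depending on the flags and 'is_large'.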

#ifdef COMPILER2
  void string_indexof_char(Register str1, Register cnt1, Register ch, Register result,
                           XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp);

  // IndexOf strings.
  // Small strings are loaded through the stack if they cross a page boundary.
  void string_indexof(Register str1, Register str2,
                      Register cnt1, Register cnt2,
                      int int_cnt2,  Register result,
                      XMMRegister vec, Register tmp,
                      int ae);

  // IndexOf for constant substrings with size >= 8 elements
  // which don't need to be loaded through the stack. This is the smallest
  // code: no stack loads are needed, only a check of the string tail.
  void string_indexofC8(Register str1, Register str2,
                        Register cnt1, Register cnt2,
                        int int_cnt2,  Register result,
                        XMMRegister vec, Register tmp,
                        int ae);

  // Helper function for string_compare.
  void load_next_elements(Register elem1, Register elem2, Register str1, Register str2,
                          Address::ScaleFactor scale, Address::ScaleFactor scale1,
                          Address::ScaleFactor scale2, Register index, int ae);
  // Compare strings.
  void string_compare(Register str1, Register str2,
                      Register cnt1, Register cnt2, Register result,
                      XMMRegister vec1, int ae);
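
  // Note (an assumption based on usage elsewhere in HotSpot): the 'ae'
  // argument of these string routines encodes the element widths of the
  // two operands (Latin1 vs. UTF-16 combinations such as LL, UU, UL), as
  // defined by C2's StrIntrinsicNode.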

  // Search for a non-ASCII character (negative byte value) in a byte array;
  // return true if the array contains one, false otherwise.
  void has_negatives(Register ary1, Register len,
                     Register result, Register tmp1,
                     XMMRegister vec1, XMMRegister vec2);

  // Compare char[] or byte[] arrays.
  void arrays_equals(bool is_array_equ, Register ary1, Register ary2,
                     Register limit, Register result, Register chr,
                     XMMRegister vec1, XMMRegister vec2, bool is_char);

#endif

  // Fill primitive arrays
  void generate_fill(BasicType t, bool aligned,
                     Register to, Register value, Register count,
                     Register rtmp, XMMRegister xtmp);

  void encode_iso_array(Register src, Register dst, Register len,
                        XMMRegister tmp1, XMMRegister tmp2, XMMRegister tmp3,
                        XMMRegister tmp4, Register tmp5, Register result);

#ifdef _LP64
  void add2_with_carry(Register dest_hi, Register dest_lo, Register src1, Register src2);
  void multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
                             Register y, Register y_idx, Register z,
                             Register carry, Register product,
                             Register idx, Register kdx);
  void multiply_add_128_x_128(Register x_xstart, Register y, Register z,
                              Register yz_idx, Register idx,
                              Register carry, Register product, int offset);
  void multiply_128_x_128_bmi2_loop(Register y, Register z,
                                    Register carry, Register carry2,
                                    Register idx, Register jdx,
                                    Register yz_idx1, Register yz_idx2,
                                    Register tmp, Register tmp3, Register tmp4);
  void multiply_128_x_128_loop(Register x_xstart, Register y, Register z,
                               Register yz_idx, Register idx, Register jdx,
                               Register carry, Register product,
                               Register carry2);
  void multiply_to_len(Register x, Register xlen, Register y, Register ylen, Register z, Register zlen,
                       Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5);
  void square_rshift(Register x, Register len, Register z, Register tmp1, Register tmp3,
                     Register tmp4, Register tmp5, Register rdxReg, Register raxReg);
  void multiply_add_64_bmi2(Register sum, Register op1, Register op2, Register carry,
                            Register tmp2);
  void multiply_add_64(Register sum, Register op1, Register op2, Register carry,
                       Register rdxReg, Register raxReg);
  void add_one_64(Register z, Register zlen, Register carry, Register tmp1);
  void lshift_by_1(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2,
                   Register tmp3, Register tmp4);
  void square_to_len(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2,
                     Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg);

  void mul_add_128_x_32_loop(Register out, Register in, Register offset, Register len, Register tmp1,
                             Register tmp2, Register tmp3, Register tmp4, Register tmp5, Register rdxReg,
                             Register raxReg);
  void mul_add(Register out, Register in, Register offset, Register len, Register k, Register tmp1,
               Register tmp2, Register tmp3, Register tmp4, Register tmp5, Register rdxReg,
               Register raxReg);
  void vectorized_mismatch(Register obja, Register objb, Register length, Register log2_array_indxscale,
                           Register result, Register tmp1, Register tmp2,
                           XMMRegister vec1, XMMRegister vec2, XMMRegister vec3);
#endif

  // CRC32 code for java.util.zip.CRC32::updateBytes() intrinsic.
  void update_byte_crc32(Register crc, Register val, Register table);
  void kernel_crc32(Register crc, Register buf, Register len, Register table, Register tmp);

  // CRC32C code for java.util.zip.CRC32C::updateBytes() intrinsic.
  // Note on the naming convention:
  // Prefix w = register only used on a Westmere+ architecture
  // Prefix n = register only used on a Nehalem architecture
#ifdef _LP64
  void crc32c_ipl_alg4(Register in_out, uint32_t n,
                       Register tmp1, Register tmp2, Register tmp3);
#else
  void crc32c_ipl_alg4(Register in_out, uint32_t n,
                       Register tmp1, Register tmp2, Register tmp3,
                       XMMRegister xtmp1, XMMRegister xtmp2);
#endif
  void crc32c_pclmulqdq(XMMRegister w_xtmp1,
                        Register in_out,
                        uint32_t const_or_pre_comp_const_index, bool is_pclmulqdq_supported,
                        XMMRegister w_xtmp2,
                        Register tmp1,
                        Register n_tmp2, Register n_tmp3);
  void crc32c_rec_alt2(uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported, Register in_out, Register in1, Register in2,
                       XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
                       Register tmp1, Register tmp2,
                       Register n_tmp3);
  void crc32c_proc_chunk(uint32_t size, uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported,
                         Register in_out1, Register in_out2, Register in_out3,
                         Register tmp1, Register tmp2, Register tmp3,
                         XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
                         Register tmp4, Register tmp5,
                         Register n_tmp6);
  void crc32c_ipl_alg2_alt2(Register in_out, Register in1, Register in2,
                            Register tmp1, Register tmp2, Register tmp3,
                            Register tmp4, Register tmp5, Register tmp6,
                            XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
                            bool is_pclmulqdq_supported);
  // Fold 128-bit data chunk
  void fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, Register buf, int offset);
  void fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, XMMRegister xbuf);
  // Fold 8-bit data
  void fold_8bit_crc32(Register crc, Register table, Register tmp);
  void fold_8bit_crc32(XMMRegister crc, Register table, XMMRegister xtmp, Register tmp);
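
  // Background sketch (informal; the .cpp holds the authoritative code):
  // 128-bit folding reduces a long buffer by repeatedly replacing the
  // running 128-bit remainder x with
  //
  //   x' = (x_lo clmul K_lo) xor (x_hi clmul K_hi) xor next_128_bits
  //
  // where the constants in xK are precomputed powers x^a mod P(x) of the
  // CRC polynomial P, so the value modulo P is preserved while the
  // buffer shrinks.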

  // Compress char[] array to byte[].
  void char_array_compress(Register src, Register dst, Register len,
                           XMMRegister tmp1, XMMRegister tmp2, XMMRegister tmp3,
                           XMMRegister tmp4, Register tmp5, Register result);

  // Inflate byte[] array to char[].
  void byte_array_inflate(Register src, Register dst, Register len,
                          XMMRegister tmp1, Register tmp2);

};

/**
 * class SkipIfEqual:
 *
 * Instantiating this class will result in assembly code being output that will
 * jump around any code emitted between the creation of the instance and its
 * automatic destruction at the end of a scope block, depending on the value of
 * the flag passed to the constructor, which is checked at run-time.
 */
class SkipIfEqual {
 private:
  MacroAssembler* _masm;
  Label _label;

 public:
  SkipIfEqual(MacroAssembler*, const bool* flag_addr, bool value);
  ~SkipIfEqual();
};
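
// Illustrative use (a sketch; 'masm' and 'SomeBoolFlag' stand in for
// whatever the caller has in scope). The guarded code is skipped when
// *flag_addr == value:
//
//   {
//     SkipIfEqual skip_if(masm, &SomeBoolFlag, false);
//     // ... code emitted here executes only when SomeBoolFlag is true ...
//   }   // destructor binds the skip target here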

#endif // CPU_X86_VM_MACROASSEMBLER_X86_HPP