macroAssembler_x86.cpp revision 9867:3125c4a60cc9
/*
 * Copyright (c) 1997, 2015, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "compiler/disassembler.hpp"
#include "gc/shared/cardTableModRefBS.hpp"
#include "gc/shared/collectedHeap.inline.hpp"
#include "interpreter/interpreter.hpp"
#include "memory/resourceArea.hpp"
#include "memory/universe.hpp"
#include "oops/klass.inline.hpp"
#include "prims/methodHandles.hpp"
#include "runtime/biasedLocking.hpp"
#include "runtime/interfaceSupport.hpp"
#include "runtime/objectMonitor.hpp"
#include "runtime/os.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/stubRoutines.hpp"
#include "runtime/thread.hpp"
#include "utilities/macros.hpp"
#if INCLUDE_ALL_GCS
#include "gc/g1/g1CollectedHeap.inline.hpp"
#include "gc/g1/g1SATBCardTableModRefBS.hpp"
#include "gc/g1/heapRegion.hpp"
#endif // INCLUDE_ALL_GCS
#include "crc32c.h"
#ifdef COMPILER2
#include "opto/intrinsicnode.hpp"
#endif

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#define STOP(error) stop(error)
#else
#define BLOCK_COMMENT(str) block_comment(str)
#define STOP(error) block_comment(error); stop(error)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

#ifdef ASSERT
bool AbstractAssembler::pd_check_instruction_mark() { return true; }
#endif

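// Maps each condition code (indexed by its x86 encoding) to the opposite
// condition; used when the sense of a conditional branch must be reversed.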
static Assembler::Condition reverse[] = {
    Assembler::noOverflow     /* overflow      = 0x0 */ ,
    Assembler::overflow       /* noOverflow    = 0x1 */ ,
    Assembler::aboveEqual     /* carrySet      = 0x2, below         = 0x2 */ ,
    Assembler::below          /* aboveEqual    = 0x3, carryClear    = 0x3 */ ,
    Assembler::notZero        /* zero          = 0x4, equal         = 0x4 */ ,
    Assembler::zero           /* notZero       = 0x5, notEqual      = 0x5 */ ,
    Assembler::above          /* belowEqual    = 0x6 */ ,
    Assembler::belowEqual     /* above         = 0x7 */ ,
    Assembler::positive       /* negative      = 0x8 */ ,
    Assembler::negative       /* positive      = 0x9 */ ,
    Assembler::noParity       /* parity        = 0xa */ ,
    Assembler::parity         /* noParity      = 0xb */ ,
    Assembler::greaterEqual   /* less          = 0xc */ ,
    Assembler::less           /* greaterEqual  = 0xd */ ,
    Assembler::greater        /* lessEqual     = 0xe */ ,
    Assembler::lessEqual      /* greater       = 0xf, */

};


// Implementation of MacroAssembler

// First all the versions that have distinct versions depending on 32/64 bit
// Unless the difference is trivial (1 line or so).

#ifndef _LP64

// 32bit versions

Address MacroAssembler::as_Address(AddressLiteral adr) {
  return Address(adr.target(), adr.rspec());
}

Address MacroAssembler::as_Address(ArrayAddress adr) {
  return Address::make_array(adr);
}

void MacroAssembler::call_VM_leaf_base(address entry_point,
                                       int number_of_arguments) {
  call(RuntimeAddress(entry_point));
  increment(rsp, number_of_arguments * wordSize);
}

void MacroAssembler::cmpklass(Address src1, Metadata* obj) {
  cmp_literal32(src1, (int32_t)obj, metadata_Relocation::spec_for_immediate());
}

void MacroAssembler::cmpklass(Register src1, Metadata* obj) {
  cmp_literal32(src1, (int32_t)obj, metadata_Relocation::spec_for_immediate());
}

void MacroAssembler::cmpoop(Address src1, jobject obj) {
  cmp_literal32(src1, (int32_t)obj, oop_Relocation::spec_for_immediate());
}

void MacroAssembler::cmpoop(Register src1, jobject obj) {
  cmp_literal32(src1, (int32_t)obj, oop_Relocation::spec_for_immediate());
}

void MacroAssembler::extend_sign(Register hi, Register lo) {
  // According to Intel Doc. AP-526, "Integer Divide", p.18.
  if (VM_Version::is_P6() && hi == rdx && lo == rax) {
    cdql();
  } else {
    movl(hi, lo);
    sarl(hi, 31);
  }
}

void MacroAssembler::jC2(Register tmp, Label& L) {
  // set parity bit if FPU flag C2 is set (via rax)
  save_rax(tmp);
  fwait(); fnstsw_ax();
  sahf();
  restore_rax(tmp);
  // branch
  jcc(Assembler::parity, L);
}

void MacroAssembler::jnC2(Register tmp, Label& L) {
  // set parity bit if FPU flag C2 is set (via rax)
  save_rax(tmp);
  fwait(); fnstsw_ax();
  sahf();
  restore_rax(tmp);
  // branch
  jcc(Assembler::noParity, L);
}

// 32bit can do a case table jump in one instruction but we no longer allow the base
// to be installed in the Address class
void MacroAssembler::jump(ArrayAddress entry) {
  jmp(as_Address(entry));
}

// Note: y_lo will be destroyed
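// Result is returned in x_hi: -1 if x < y, 0 if x == y, +1 if x > y.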
void MacroAssembler::lcmp2int(Register x_hi, Register x_lo, Register y_hi, Register y_lo) {
  // Long compare for Java (semantics as described in JVM spec.)
  Label high, low, done;

  cmpl(x_hi, y_hi);
  jcc(Assembler::less, low);
  jcc(Assembler::greater, high);
  // x_hi is the return register
  xorl(x_hi, x_hi);
  cmpl(x_lo, y_lo);
  jcc(Assembler::below, low);
  jcc(Assembler::equal, done);

  bind(high);
  xorl(x_hi, x_hi);
  increment(x_hi);
  jmp(done);

  bind(low);
  xorl(x_hi, x_hi);
  decrementl(x_hi);

  bind(done);
}

void MacroAssembler::lea(Register dst, AddressLiteral src) {
    mov_literal32(dst, (int32_t)src.target(), src.rspec());
}

void MacroAssembler::lea(Address dst, AddressLiteral adr) {
  // leal(dst, as_Address(adr));
  // see note in movl as to why we must use a move
  mov_literal32(dst, (int32_t) adr.target(), adr.rspec());
}

void MacroAssembler::leave() {
  mov(rsp, rbp);
  pop(rbp);
}

void MacroAssembler::lmul(int x_rsp_offset, int y_rsp_offset) {
  // Multiplication of two Java long values stored on the stack
  // as illustrated below. Result is in rdx:rax.
  //
  // rsp ---> [  ??  ] \               \
  //            ....    | y_rsp_offset  |
  //          [ y_lo ] /  (in bytes)    | x_rsp_offset
  //          [ y_hi ]                  | (in bytes)
  //            ....                    |
  //          [ x_lo ]                 /
  //          [ x_hi ]
  //            ....
  //
  // Basic idea: lo(result) = lo(x_lo * y_lo)
  //             hi(result) = hi(x_lo * y_lo) + lo(x_hi * y_lo) + lo(x_lo * y_hi)
  Address x_hi(rsp, x_rsp_offset + wordSize); Address x_lo(rsp, x_rsp_offset);
  Address y_hi(rsp, y_rsp_offset + wordSize); Address y_lo(rsp, y_rsp_offset);
  Label quick;
  // load x_hi, y_hi and check if quick
  // multiplication is possible
  movl(rbx, x_hi);
  movl(rcx, y_hi);
  movl(rax, rbx);
  orl(rbx, rcx);                                 // rbx, = 0 <=> x_hi = 0 and y_hi = 0
  jcc(Assembler::zero, quick);                   // if rbx, = 0 do quick multiply
  // do full multiplication
  // 1st step
  mull(y_lo);                                    // x_hi * y_lo
  movl(rbx, rax);                                // save lo(x_hi * y_lo) in rbx,
  // 2nd step
  movl(rax, x_lo);
  mull(rcx);                                     // x_lo * y_hi
  addl(rbx, rax);                                // add lo(x_lo * y_hi) to rbx,
  // 3rd step
  bind(quick);                                   // note: rbx, = 0 if quick multiply!
  movl(rax, x_lo);
  mull(y_lo);                                    // x_lo * y_lo
  addl(rdx, rbx);                                // correct hi(x_lo * y_lo)
}

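// Two's-complement negation of the 64-bit value held in the register pair hi:lo.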
void MacroAssembler::lneg(Register hi, Register lo) {
  negl(lo);
  adcl(hi, 0);
  negl(hi);
}

void MacroAssembler::lshl(Register hi, Register lo) {
  // Java shift left long support (semantics as described in JVM spec., p.305)
  // (basic idea for shift counts s >= n: x << s == (x << n) << (s - n))
  // shift value is in rcx !
  assert(hi != rcx, "must not use rcx");
  assert(lo != rcx, "must not use rcx");
  const Register s = rcx;                        // shift count
  const int      n = BitsPerWord;
  Label L;
  andl(s, 0x3f);                                 // s := s & 0x3f (s < 0x40)
  cmpl(s, n);                                    // if (s < n)
  jcc(Assembler::less, L);                       // else (s >= n)
  movl(hi, lo);                                  // x := x << n
  xorl(lo, lo);
  // Note: subl(s, n) is not needed since the Intel shift instructions work rcx mod n!
  bind(L);                                       // s (mod n) < n
  shldl(hi, lo);                                 // x := x << s
  shll(lo);
}


void MacroAssembler::lshr(Register hi, Register lo, bool sign_extension) {
  // Java shift right long support (semantics as described in JVM spec., p.306 & p.310)
  // (basic idea for shift counts s >= n: x >> s == (x >> n) >> (s - n))
  assert(hi != rcx, "must not use rcx");
  assert(lo != rcx, "must not use rcx");
  const Register s = rcx;                        // shift count
  const int      n = BitsPerWord;
  Label L;
  andl(s, 0x3f);                                 // s := s & 0x3f (s < 0x40)
  cmpl(s, n);                                    // if (s < n)
  jcc(Assembler::less, L);                       // else (s >= n)
  movl(lo, hi);                                  // x := x >> n
  if (sign_extension) sarl(hi, 31);
  else                xorl(hi, hi);
  // Note: subl(s, n) is not needed since the Intel shift instructions work rcx mod n!
  bind(L);                                       // s (mod n) < n
  shrdl(lo, hi);                                 // x := x >> s
  if (sign_extension) sarl(hi);
  else                shrl(hi);
}

void MacroAssembler::movoop(Register dst, jobject obj) {
  mov_literal32(dst, (int32_t)obj, oop_Relocation::spec_for_immediate());
}

void MacroAssembler::movoop(Address dst, jobject obj) {
  mov_literal32(dst, (int32_t)obj, oop_Relocation::spec_for_immediate());
}

void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
  mov_literal32(dst, (int32_t)obj, metadata_Relocation::spec_for_immediate());
}

void MacroAssembler::mov_metadata(Address dst, Metadata* obj) {
  mov_literal32(dst, (int32_t)obj, metadata_Relocation::spec_for_immediate());
}

void MacroAssembler::movptr(Register dst, AddressLiteral src, Register scratch) {
  // scratch register is not used,
  // it is defined to match parameters of 64-bit version of this method.
  if (src.is_lval()) {
    mov_literal32(dst, (intptr_t)src.target(), src.rspec());
  } else {
    movl(dst, as_Address(src));
  }
}

void MacroAssembler::movptr(ArrayAddress dst, Register src) {
  movl(as_Address(dst), src);
}

void MacroAssembler::movptr(Register dst, ArrayAddress src) {
  movl(dst, as_Address(src));
}

// src should NEVER be a real pointer. Use AddressLiteral for true pointers
void MacroAssembler::movptr(Address dst, intptr_t src) {
  movl(dst, src);
}


void MacroAssembler::pop_callee_saved_registers() {
  pop(rcx);
  pop(rdx);
  pop(rdi);
  pop(rsi);
}

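// Move the double at the top of the stack (in memory) onto the FPU stack and
// release the two stack words; push_fTOS below performs the inverse.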
void MacroAssembler::pop_fTOS() {
  fld_d(Address(rsp, 0));
  addl(rsp, 2 * wordSize);
}

void MacroAssembler::push_callee_saved_registers() {
  push(rsi);
  push(rdi);
  push(rdx);
  push(rcx);
}

void MacroAssembler::push_fTOS() {
  subl(rsp, 2 * wordSize);
  fstp_d(Address(rsp, 0));
}


void MacroAssembler::pushoop(jobject obj) {
  push_literal32((int32_t)obj, oop_Relocation::spec_for_immediate());
}

void MacroAssembler::pushklass(Metadata* obj) {
  push_literal32((int32_t)obj, metadata_Relocation::spec_for_immediate());
}

void MacroAssembler::pushptr(AddressLiteral src) {
  if (src.is_lval()) {
    push_literal32((int32_t)src.target(), src.rspec());
  } else {
    pushl(as_Address(src));
  }
}

void MacroAssembler::set_word_if_not_zero(Register dst) {
  xorl(dst, dst);
  set_byte_if_not_zero(dst);
}

static void pass_arg0(MacroAssembler* masm, Register arg) {
  masm->push(arg);
}

static void pass_arg1(MacroAssembler* masm, Register arg) {
  masm->push(arg);
}

static void pass_arg2(MacroAssembler* masm, Register arg) {
  masm->push(arg);
}

static void pass_arg3(MacroAssembler* masm, Register arg) {
  masm->push(arg);
}

#ifndef PRODUCT
extern "C" void findpc(intptr_t x);
#endif

void MacroAssembler::debug32(int rdi, int rsi, int rbp, int rsp, int rbx, int rdx, int rcx, int rax, int eip, char* msg) {
  // In order to get locks to work, we need to fake an in_VM state
  JavaThread* thread = JavaThread::current();
  JavaThreadState saved_state = thread->thread_state();
  thread->set_thread_state(_thread_in_vm);
  if (ShowMessageBoxOnError) {
    JavaThread* thread = JavaThread::current();
    JavaThreadState saved_state = thread->thread_state();
    thread->set_thread_state(_thread_in_vm);
    if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
      ttyLocker ttyl;
      BytecodeCounter::print();
    }
    // To see where a verify_oop failed, get $ebx+40/X for this frame.
    // This is the value of eip which points to where verify_oop will return.
    if (os::message_box(msg, "Execution stopped, print registers?")) {
      print_state32(rdi, rsi, rbp, rsp, rbx, rdx, rcx, rax, eip);
      BREAKPOINT;
    }
  } else {
    ttyLocker ttyl;
    ::tty->print_cr("=============== DEBUG MESSAGE: %s ================\n", msg);
  }
  // Don't assert holding the ttyLock
  assert(false, "DEBUG MESSAGE: %s", msg);
  ThreadStateTransition::transition(thread, _thread_in_vm, saved_state);
}

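// Dump the saved register values, a window of stack words, and the
// instructions around eip; called from debug32() and print_state().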
void MacroAssembler::print_state32(int rdi, int rsi, int rbp, int rsp, int rbx, int rdx, int rcx, int rax, int eip) {
  ttyLocker ttyl;
  FlagSetting fs(Debugging, true);
  tty->print_cr("eip = 0x%08x", eip);
#ifndef PRODUCT
  if ((WizardMode || Verbose) && PrintMiscellaneous) {
    tty->cr();
    findpc(eip);
    tty->cr();
  }
#endif
#define PRINT_REG(rax) \
  { tty->print("%s = ", #rax); os::print_location(tty, rax); }
  PRINT_REG(rax);
  PRINT_REG(rbx);
  PRINT_REG(rcx);
  PRINT_REG(rdx);
  PRINT_REG(rdi);
  PRINT_REG(rsi);
  PRINT_REG(rbp);
  PRINT_REG(rsp);
#undef PRINT_REG
  // Print some words near top of stack.
  int* dump_sp = (int*) rsp;
  for (int col1 = 0; col1 < 8; col1++) {
    tty->print("(rsp+0x%03x) 0x%08x: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp);
    os::print_location(tty, *dump_sp++);
  }
  for (int row = 0; row < 16; row++) {
    tty->print("(rsp+0x%03x) 0x%08x: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp);
    for (int col = 0; col < 8; col++) {
      tty->print(" 0x%08x", *dump_sp++);
    }
    tty->cr();
  }
  // Print some instructions around pc:
  Disassembler::decode((address)eip-64, (address)eip);
  tty->print_cr("--------");
  Disassembler::decode((address)eip, (address)eip+32);
}

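// Halt with a diagnostic message: push the message address, the current eip
// and all registers, then call debug32() which can prompt for a breakpoint.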
void MacroAssembler::stop(const char* msg) {
  ExternalAddress message((address)msg);
  // push address of message
  pushptr(message.addr());
  { Label L; call(L, relocInfo::none); bind(L); }     // push eip
  pusha();                                            // push registers
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::debug32)));
  hlt();
}

void MacroAssembler::warn(const char* msg) {
  push_CPU_state();

  ExternalAddress message((address) msg);
  // push address of message
  pushptr(message.addr());

  call(RuntimeAddress(CAST_FROM_FN_PTR(address, warning)));
  addl(rsp, wordSize);       // discard argument
  pop_CPU_state();
}

void MacroAssembler::print_state() {
  { Label L; call(L, relocInfo::none); bind(L); }     // push eip
  pusha();                                            // push registers

  push_CPU_state();
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::print_state32)));
  pop_CPU_state();

  popa();
  addl(rsp, wordSize);
}

#else // _LP64

// 64 bit versions

Address MacroAssembler::as_Address(AddressLiteral adr) {
  // amd64 always does this as a pc-rel
  // we can be absolute or disp based on the instruction type
  // jmp/call are displacements others are absolute
  assert(!adr.is_lval(), "must be rval");
  assert(reachable(adr), "must be");
  return Address((int32_t)(intptr_t)(adr.target() - pc()), adr.target(), adr.reloc());

}

Address MacroAssembler::as_Address(ArrayAddress adr) {
  AddressLiteral base = adr.base();
  lea(rscratch1, base);
  Address index = adr.index();
  assert(index._disp == 0, "must not have disp"); // maybe it can?
  Address array(rscratch1, index._index, index._scale, index._disp);
  return array;
}

void MacroAssembler::call_VM_leaf_base(address entry_point, int num_args) {
  Label L, E;

#ifdef _WIN64
  // Windows always allocates space for its register args
  assert(num_args <= 4, "only register arguments supported");
  subq(rsp,  frame::arg_reg_save_area_bytes);
#endif

  // Align stack if necessary
  testl(rsp, 15);
  jcc(Assembler::zero, L);

  subq(rsp, 8);
  {
    call(RuntimeAddress(entry_point));
  }
  addq(rsp, 8);
  jmp(E);

  bind(L);
  {
    call(RuntimeAddress(entry_point));
  }

  bind(E);

#ifdef _WIN64
  // restore stack pointer
  addq(rsp, frame::arg_reg_save_area_bytes);
#endif

}

void MacroAssembler::cmp64(Register src1, AddressLiteral src2) {
  assert(!src2.is_lval(), "should use cmpptr");

  if (reachable(src2)) {
    cmpq(src1, as_Address(src2));
  } else {
    lea(rscratch1, src2);
    Assembler::cmpq(src1, Address(rscratch1, 0));
  }
}

int MacroAssembler::corrected_idivq(Register reg) {
  // Full implementation of Java ldiv and lrem; checks for special
  // case as described in JVM spec., p.243 & p.271.  The function
  // returns the (pc) offset of the idivl instruction - may be needed
  // for implicit exceptions.
  //
  //         normal case                           special case
  //
  // input : rax: dividend                         min_long
  //         reg: divisor   (may not be eax/edx)   -1
  //
  // output: rax: quotient  (= rax idiv reg)       min_long
  //         rdx: remainder (= rax irem reg)       0
  assert(reg != rax && reg != rdx, "reg cannot be rax or rdx register");
  static const int64_t min_long = 0x8000000000000000;
  Label normal_case, special_case;

  // check for special case
  cmp64(rax, ExternalAddress((address) &min_long));
  jcc(Assembler::notEqual, normal_case);
  xorl(rdx, rdx); // prepare rdx for possible special case (where
                  // remainder = 0)
  cmpq(reg, -1);
  jcc(Assembler::equal, special_case);

  // handle normal case
  bind(normal_case);
  cdqq();
  int idivq_offset = offset();
  idivq(reg);

  // normal and special case exit
  bind(special_case);

  return idivq_offset;
}

void MacroAssembler::decrementq(Register reg, int value) {
  if (value == min_jint) { subq(reg, value); return; }
  if (value <  0) { incrementq(reg, -value); return; }
  if (value == 0) {                        ; return; }
  if (value == 1 && UseIncDec) { decq(reg) ; return; }
  /* else */      { subq(reg, value)       ; return; }
}

void MacroAssembler::decrementq(Address dst, int value) {
  if (value == min_jint) { subq(dst, value); return; }
  if (value <  0) { incrementq(dst, -value); return; }
  if (value == 0) {                        ; return; }
  if (value == 1 && UseIncDec) { decq(dst) ; return; }
  /* else */      { subq(dst, value)       ; return; }
}

void MacroAssembler::incrementq(AddressLiteral dst) {
  if (reachable(dst)) {
    incrementq(as_Address(dst));
  } else {
    lea(rscratch1, dst);
    incrementq(Address(rscratch1, 0));
  }
}

void MacroAssembler::incrementq(Register reg, int value) {
  if (value == min_jint) { addq(reg, value); return; }
  if (value <  0) { decrementq(reg, -value); return; }
  if (value == 0) {                        ; return; }
  if (value == 1 && UseIncDec) { incq(reg) ; return; }
  /* else */      { addq(reg, value)       ; return; }
}

void MacroAssembler::incrementq(Address dst, int value) {
  if (value == min_jint) { addq(dst, value); return; }
  if (value <  0) { decrementq(dst, -value); return; }
  if (value == 0) {                        ; return; }
  if (value == 1 && UseIncDec) { incq(dst) ; return; }
  /* else */      { addq(dst, value)       ; return; }
}

// 32bit can do a case table jump in one instruction but we no longer allow the base
// to be installed in the Address class
void MacroAssembler::jump(ArrayAddress entry) {
  lea(rscratch1, entry.base());
  Address dispatch = entry.index();
  assert(dispatch._base == noreg, "must be");
  dispatch._base = rscratch1;
  jmp(dispatch);
}

void MacroAssembler::lcmp2int(Register x_hi, Register x_lo, Register y_hi, Register y_lo) {
  ShouldNotReachHere(); // 64bit doesn't use two regs
  cmpq(x_lo, y_lo);
}

void MacroAssembler::lea(Register dst, AddressLiteral src) {
    mov_literal64(dst, (intptr_t)src.target(), src.rspec());
}

void MacroAssembler::lea(Address dst, AddressLiteral adr) {
  mov_literal64(rscratch1, (intptr_t)adr.target(), adr.rspec());
  movptr(dst, rscratch1);
}

void MacroAssembler::leave() {
  // %%% is this really better? Why not on 32bit too?
  emit_int8((unsigned char)0xC9); // LEAVE
}

void MacroAssembler::lneg(Register hi, Register lo) {
  ShouldNotReachHere(); // 64bit doesn't use two regs
  negq(lo);
}

void MacroAssembler::movoop(Register dst, jobject obj) {
  mov_literal64(dst, (intptr_t)obj, oop_Relocation::spec_for_immediate());
}

void MacroAssembler::movoop(Address dst, jobject obj) {
  mov_literal64(rscratch1, (intptr_t)obj, oop_Relocation::spec_for_immediate());
  movq(dst, rscratch1);
}

void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
  mov_literal64(dst, (intptr_t)obj, metadata_Relocation::spec_for_immediate());
}

void MacroAssembler::mov_metadata(Address dst, Metadata* obj) {
  mov_literal64(rscratch1, (intptr_t)obj, metadata_Relocation::spec_for_immediate());
  movq(dst, rscratch1);
}

void MacroAssembler::movptr(Register dst, AddressLiteral src, Register scratch) {
  if (src.is_lval()) {
    mov_literal64(dst, (intptr_t)src.target(), src.rspec());
  } else {
    if (reachable(src)) {
      movq(dst, as_Address(src));
    } else {
      lea(scratch, src);
      movq(dst, Address(scratch, 0));
    }
  }
}

void MacroAssembler::movptr(ArrayAddress dst, Register src) {
  movq(as_Address(dst), src);
}

void MacroAssembler::movptr(Register dst, ArrayAddress src) {
  movq(dst, as_Address(src));
}

// src should NEVER be a real pointer. Use AddressLiteral for true pointers
void MacroAssembler::movptr(Address dst, intptr_t src) {
  mov64(rscratch1, src);
  movq(dst, rscratch1);
}

// These are mostly for initializing NULL
void MacroAssembler::movptr(Address dst, int32_t src) {
  movslq(dst, src);
}

void MacroAssembler::movptr(Register dst, int32_t src) {
  mov64(dst, (intptr_t)src);
}

void MacroAssembler::pushoop(jobject obj) {
  movoop(rscratch1, obj);
  push(rscratch1);
}

void MacroAssembler::pushklass(Metadata* obj) {
  mov_metadata(rscratch1, obj);
  push(rscratch1);
}

void MacroAssembler::pushptr(AddressLiteral src) {
  lea(rscratch1, src);
  if (src.is_lval()) {
    push(rscratch1);
  } else {
    pushq(Address(rscratch1, 0));
  }
}

void MacroAssembler::reset_last_Java_frame(bool clear_fp,
                                           bool clear_pc) {
  // we must set sp to zero to clear frame
  movptr(Address(r15_thread, JavaThread::last_Java_sp_offset()), NULL_WORD);
  // must clear fp, so that compiled frames are not confused; it is
  // possible that we need it only for debugging
  if (clear_fp) {
    movptr(Address(r15_thread, JavaThread::last_Java_fp_offset()), NULL_WORD);
  }

  if (clear_pc) {
    movptr(Address(r15_thread, JavaThread::last_Java_pc_offset()), NULL_WORD);
  }
}

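// Record the last Java frame anchor (sp, optional fp and pc) in the current
// thread so the runtime can walk the Java stack across the upcoming VM call.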
void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         address  last_java_pc) {
  // determine last_java_sp register
  if (!last_java_sp->is_valid()) {
    last_java_sp = rsp;
  }

  // last_java_fp is optional
  if (last_java_fp->is_valid()) {
    movptr(Address(r15_thread, JavaThread::last_Java_fp_offset()),
           last_java_fp);
  }

  // last_java_pc is optional
  if (last_java_pc != NULL) {
    Address java_pc(r15_thread,
                    JavaThread::frame_anchor_offset() + JavaFrameAnchor::last_Java_pc_offset());
    lea(rscratch1, InternalAddress(last_java_pc));
    movptr(java_pc, rscratch1);
  }

  movptr(Address(r15_thread, JavaThread::last_Java_sp_offset()), last_java_sp);
}

static void pass_arg0(MacroAssembler* masm, Register arg) {
  if (c_rarg0 != arg ) {
    masm->mov(c_rarg0, arg);
  }
}

static void pass_arg1(MacroAssembler* masm, Register arg) {
  if (c_rarg1 != arg ) {
    masm->mov(c_rarg1, arg);
  }
}

static void pass_arg2(MacroAssembler* masm, Register arg) {
  if (c_rarg2 != arg ) {
    masm->mov(c_rarg2, arg);
  }
}

static void pass_arg3(MacroAssembler* masm, Register arg) {
  if (c_rarg3 != arg ) {
    masm->mov(c_rarg3, arg);
  }
}

void MacroAssembler::stop(const char* msg) {
  address rip = pc();
  pusha(); // get regs on stack
  lea(c_rarg0, ExternalAddress((address) msg));
  lea(c_rarg1, InternalAddress(rip));
  movq(c_rarg2, rsp); // pass pointer to regs array
  andq(rsp, -16); // align stack as required by ABI
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::debug64)));
  hlt();
}

void MacroAssembler::warn(const char* msg) {
  push(rbp);
  movq(rbp, rsp);
  andq(rsp, -16);     // align stack as required by push_CPU_state and call
  push_CPU_state();   // keeps alignment at 16 bytes
  lea(c_rarg0, ExternalAddress((address) msg));
  call_VM_leaf(CAST_FROM_FN_PTR(address, warning), c_rarg0);
  pop_CPU_state();
  mov(rsp, rbp);
  pop(rbp);
}

void MacroAssembler::print_state() {
  address rip = pc();
  pusha();            // get regs on stack
  push(rbp);
  movq(rbp, rsp);
  andq(rsp, -16);     // align stack as required by push_CPU_state and call
  push_CPU_state();   // keeps alignment at 16 bytes

  lea(c_rarg0, InternalAddress(rip));
  lea(c_rarg1, Address(rbp, wordSize)); // pass pointer to regs array
  call_VM_leaf(CAST_FROM_FN_PTR(address, MacroAssembler::print_state64), c_rarg0, c_rarg1);

  pop_CPU_state();
  mov(rsp, rbp);
  pop(rbp);
  popa();
}

#ifndef PRODUCT
extern "C" void findpc(intptr_t x);
#endif

void MacroAssembler::debug64(char* msg, int64_t pc, int64_t regs[]) {
  // In order to get locks to work, we need to fake an in_VM state
  if (ShowMessageBoxOnError) {
    JavaThread* thread = JavaThread::current();
    JavaThreadState saved_state = thread->thread_state();
    thread->set_thread_state(_thread_in_vm);
#ifndef PRODUCT
    if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
      ttyLocker ttyl;
      BytecodeCounter::print();
    }
#endif
    // To see where a verify_oop failed, get $ebx+40/X for this frame.
    // XXX correct this offset for amd64
    // This is the value of eip which points to where verify_oop will return.
    if (os::message_box(msg, "Execution stopped, print registers?")) {
      print_state64(pc, regs);
      BREAKPOINT;
      assert(false, "start up GDB");
    }
    ThreadStateTransition::transition(thread, _thread_in_vm, saved_state);
  } else {
    ttyLocker ttyl;
    ::tty->print_cr("=============== DEBUG MESSAGE: %s ================\n",
                    msg);
    assert(false, "DEBUG MESSAGE: %s", msg);
  }
}

void MacroAssembler::print_state64(int64_t pc, int64_t regs[]) {
  ttyLocker ttyl;
  FlagSetting fs(Debugging, true);
  tty->print_cr("rip = 0x%016lx", pc);
#ifndef PRODUCT
  tty->cr();
  findpc(pc);
  tty->cr();
#endif
#define PRINT_REG(rax, value) \
  { tty->print("%s = ", #rax); os::print_location(tty, value); }
  PRINT_REG(rax, regs[15]);
  PRINT_REG(rbx, regs[12]);
  PRINT_REG(rcx, regs[14]);
  PRINT_REG(rdx, regs[13]);
  PRINT_REG(rdi, regs[8]);
  PRINT_REG(rsi, regs[9]);
  PRINT_REG(rbp, regs[10]);
  PRINT_REG(rsp, regs[11]);
  PRINT_REG(r8 , regs[7]);
  PRINT_REG(r9 , regs[6]);
  PRINT_REG(r10, regs[5]);
  PRINT_REG(r11, regs[4]);
  PRINT_REG(r12, regs[3]);
  PRINT_REG(r13, regs[2]);
  PRINT_REG(r14, regs[1]);
  PRINT_REG(r15, regs[0]);
#undef PRINT_REG
  // Print some words near top of stack.
  int64_t* rsp = (int64_t*) regs[11];
  int64_t* dump_sp = rsp;
  for (int col1 = 0; col1 < 8; col1++) {
    tty->print("(rsp+0x%03x) 0x%016lx: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (int64_t)dump_sp);
    os::print_location(tty, *dump_sp++);
  }
  for (int row = 0; row < 25; row++) {
    tty->print("(rsp+0x%03x) 0x%016lx: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (int64_t)dump_sp);
    for (int col = 0; col < 4; col++) {
      tty->print(" 0x%016lx", *dump_sp++);
    }
    tty->cr();
  }
  // Print some instructions around pc:
  Disassembler::decode((address)pc-64, (address)pc);
  tty->print_cr("--------");
  Disassembler::decode((address)pc, (address)pc+32);
}

#endif // _LP64

// Now versions that are common to 32/64 bit

void MacroAssembler::addptr(Register dst, int32_t imm32) {
  LP64_ONLY(addq(dst, imm32)) NOT_LP64(addl(dst, imm32));
}

void MacroAssembler::addptr(Register dst, Register src) {
  LP64_ONLY(addq(dst, src)) NOT_LP64(addl(dst, src));
}

void MacroAssembler::addptr(Address dst, Register src) {
  LP64_ONLY(addq(dst, src)) NOT_LP64(addl(dst, src));
}

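// The AddressLiteral variants below follow a common pattern: use the operand
// directly when it is reachable (always the case on 32-bit, within RIP-relative
// range on 64-bit), otherwise materialize the address in rscratch1 first.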
void MacroAssembler::addsd(XMMRegister dst, AddressLiteral src) {
  if (reachable(src)) {
    Assembler::addsd(dst, as_Address(src));
  } else {
    lea(rscratch1, src);
    Assembler::addsd(dst, Address(rscratch1, 0));
  }
}

void MacroAssembler::addss(XMMRegister dst, AddressLiteral src) {
  if (reachable(src)) {
    addss(dst, as_Address(src));
  } else {
    lea(rscratch1, src);
    addss(dst, Address(rscratch1, 0));
  }
}

void MacroAssembler::align(int modulus) {
  align(modulus, offset());
}

void MacroAssembler::align(int modulus, int target) {
  if (target % modulus != 0) {
    nop(modulus - (target % modulus));
  }
}

void MacroAssembler::andpd(XMMRegister dst, AddressLiteral src) {
  // Used in sign-masking with aligned address.
  assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
  if (reachable(src)) {
    Assembler::andpd(dst, as_Address(src));
  } else {
    lea(rscratch1, src);
    Assembler::andpd(dst, Address(rscratch1, 0));
  }
}

void MacroAssembler::andps(XMMRegister dst, AddressLiteral src) {
  // Used in sign-masking with aligned address.
  assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
  if (reachable(src)) {
    Assembler::andps(dst, as_Address(src));
  } else {
    lea(rscratch1, src);
    Assembler::andps(dst, Address(rscratch1, 0));
  }
}

void MacroAssembler::andptr(Register dst, int32_t imm32) {
  LP64_ONLY(andq(dst, imm32)) NOT_LP64(andl(dst, imm32));
}

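// Atomically increment a counter in memory (lock-prefixed on MP systems).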
void MacroAssembler::atomic_incl(Address counter_addr) {
  if (os::is_MP())
    lock();
  incrementl(counter_addr);
}

void MacroAssembler::atomic_incl(AddressLiteral counter_addr, Register scr) {
  if (reachable(counter_addr)) {
    atomic_incl(as_Address(counter_addr));
  } else {
    lea(scr, counter_addr);
    atomic_incl(Address(scr, 0));
  }
}

#ifdef _LP64
void MacroAssembler::atomic_incq(Address counter_addr) {
  if (os::is_MP())
    lock();
  incrementq(counter_addr);
}

void MacroAssembler::atomic_incq(AddressLiteral counter_addr, Register scr) {
  if (reachable(counter_addr)) {
    atomic_incq(as_Address(counter_addr));
  } else {
    lea(scr, counter_addr);
    atomic_incq(Address(scr, 0));
  }
}
#endif

// Writes to stack successive pages until offset reached to check for
// stack overflow + shadow pages.  This clobbers tmp.
void MacroAssembler::bang_stack_size(Register size, Register tmp) {
  movptr(tmp, rsp);
  // Bang stack for total size given plus shadow page size.
  // Bang one page at a time because large size can bang beyond yellow and
  // red zones.
  Label loop;
  bind(loop);
  movl(Address(tmp, (-os::vm_page_size())), size );
  subptr(tmp, os::vm_page_size());
  subl(size, os::vm_page_size());
  jcc(Assembler::greater, loop);

  // Bang down shadow pages too.
  // At this point, (tmp-0) is the last address touched, so don't
  // touch it again.  (It was touched as (tmp-pagesize) but then tmp
  // was post-decremented.)  Skip this address by starting at i=1, and
  // touch a few more pages below.  N.B.  It is important to touch all
  // the way down including all pages in the shadow zone.
  for (int i = 1; i < ((int)JavaThread::stack_shadow_zone_size() / os::vm_page_size()); i++) {
    // this could be any sized move but this can be a debugging crumb
    // so the bigger the better.
    movptr(Address(tmp, (-i*os::vm_page_size())), size );
  }
}

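// Check rsp against the thread's reserved-stack activation address and, if the
// check triggers, call the runtime to enable the reserved zone and jump to the
// delayed StackOverflowError stub.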
void MacroAssembler::reserved_stack_check() {
    // testing if reserved zone needs to be enabled
    Label no_reserved_zone_enabling;
    Register thread = NOT_LP64(rsi) LP64_ONLY(r15_thread);
    NOT_LP64(get_thread(rsi);)

    cmpptr(rsp, Address(thread, JavaThread::reserved_stack_activation_offset()));
    jcc(Assembler::below, no_reserved_zone_enabling);

    call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone), thread);
    jump(RuntimeAddress(StubRoutines::throw_delayed_StackOverflowError_entry()));
    should_not_reach_here();

    bind(no_reserved_zone_enabling);
}

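// Attempt biased locking: on success control reaches 'done'; if the object is
// not biasable (or biasing fails) execution continues at cas_label / slow_case.
// Returns the code offset of the mark word access used for implicit null checks.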
int MacroAssembler::biased_locking_enter(Register lock_reg,
                                         Register obj_reg,
                                         Register swap_reg,
                                         Register tmp_reg,
                                         bool swap_reg_contains_mark,
                                         Label& done,
                                         Label* slow_case,
                                         BiasedLockingCounters* counters) {
  assert(UseBiasedLocking, "why call this otherwise?");
  assert(swap_reg == rax, "swap_reg must be rax for cmpxchgq");
  assert(tmp_reg != noreg, "tmp_reg must be supplied");
  assert_different_registers(lock_reg, obj_reg, swap_reg, tmp_reg);
  assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits, "biased locking makes assumptions about bit layout");
  Address mark_addr      (obj_reg, oopDesc::mark_offset_in_bytes());
  Address saved_mark_addr(lock_reg, 0);

  if (PrintBiasedLockingStatistics && counters == NULL) {
    counters = BiasedLocking::counters();
  }
  // Biased locking
  // See whether the lock is currently biased toward our thread and
  // whether the epoch is still valid
  // Note that the runtime guarantees sufficient alignment of JavaThread
  // pointers to allow age to be placed into low bits
  // First check to see whether biasing is even enabled for this object
  Label cas_label;
  int null_check_offset = -1;
  if (!swap_reg_contains_mark) {
    null_check_offset = offset();
    movptr(swap_reg, mark_addr);
  }
  movptr(tmp_reg, swap_reg);
  andptr(tmp_reg, markOopDesc::biased_lock_mask_in_place);
  cmpptr(tmp_reg, markOopDesc::biased_lock_pattern);
  jcc(Assembler::notEqual, cas_label);
  // The bias pattern is present in the object's header. Need to check
  // whether the bias owner and the epoch are both still current.
#ifndef _LP64
  // Note that because there is no current thread register on x86_32 we
  // need to store off the mark word we read out of the object to
  // avoid reloading it and needing to recheck invariants below. This
  // store is unfortunate but it makes the overall code shorter and
  // simpler.
  movptr(saved_mark_addr, swap_reg);
#endif
  if (swap_reg_contains_mark) {
    null_check_offset = offset();
  }
  load_prototype_header(tmp_reg, obj_reg);
#ifdef _LP64
  orptr(tmp_reg, r15_thread);
  xorptr(tmp_reg, swap_reg);
  Register header_reg = tmp_reg;
#else
  xorptr(tmp_reg, swap_reg);
  get_thread(swap_reg);
  xorptr(swap_reg, tmp_reg);
  Register header_reg = swap_reg;
#endif
  andptr(header_reg, ~((int) markOopDesc::age_mask_in_place));
  if (counters != NULL) {
    cond_inc32(Assembler::zero,
               ExternalAddress((address) counters->biased_lock_entry_count_addr()));
  }
  jcc(Assembler::equal, done);

  Label try_revoke_bias;
  Label try_rebias;

  // At this point we know that the header has the bias pattern and
  // that we are not the bias owner in the current epoch. We need to
  // figure out more details about the state of the header in order to
  // know what operations can be legally performed on the object's
  // header.

  // If the low three bits in the xor result aren't clear, that means
  // the prototype header is no longer biased and we have to revoke
  // the bias on this object.
  testptr(header_reg, markOopDesc::biased_lock_mask_in_place);
  jccb(Assembler::notZero, try_revoke_bias);

  // Biasing is still enabled for this data type. See whether the
  // epoch of the current bias is still valid, meaning that the epoch
  // bits of the mark word are equal to the epoch bits of the
  // prototype header. (Note that the prototype header's epoch bits
  // only change at a safepoint.) If not, attempt to rebias the object
  // toward the current thread. Note that we must be absolutely sure
  // that the current epoch is invalid in order to do this because
  // otherwise the manipulations it performs on the mark word are
  // illegal.
  testptr(header_reg, markOopDesc::epoch_mask_in_place);
  jccb(Assembler::notZero, try_rebias);

  // The epoch of the current bias is still valid but we know nothing
  // about the owner; it might be set or it might be clear. Try to
  // acquire the bias of the object using an atomic operation. If this
  // fails we will go into the runtime to revoke the object's bias.
  // Note that we first construct the presumed unbiased header so we
  // don't accidentally blow away another thread's valid bias.
  NOT_LP64( movptr(swap_reg, saved_mark_addr); )
  andptr(swap_reg,
         markOopDesc::biased_lock_mask_in_place | markOopDesc::age_mask_in_place | markOopDesc::epoch_mask_in_place);
#ifdef _LP64
  movptr(tmp_reg, swap_reg);
  orptr(tmp_reg, r15_thread);
#else
  get_thread(tmp_reg);
  orptr(tmp_reg, swap_reg);
#endif
  if (os::is_MP()) {
    lock();
  }
  cmpxchgptr(tmp_reg, mark_addr); // compare tmp_reg and swap_reg
  // If the biasing toward our thread failed, this means that
  // another thread succeeded in biasing it toward itself and we
  // need to revoke that bias. The revocation will occur in the
  // interpreter runtime in the slow case.
  if (counters != NULL) {
    cond_inc32(Assembler::zero,
               ExternalAddress((address) counters->anonymously_biased_lock_entry_count_addr()));
  }
  if (slow_case != NULL) {
    jcc(Assembler::notZero, *slow_case);
  }
  jmp(done);

  bind(try_rebias);
  // At this point we know the epoch has expired, meaning that the
  // current "bias owner", if any, is actually invalid. Under these
  // circumstances _only_, we are allowed to use the current header's
  // value as the comparison value when doing the cas to acquire the
  // bias in the current epoch. In other words, we allow transfer of
  // the bias from one thread to another directly in this situation.
  //
  // FIXME: due to a lack of registers we currently blow away the age
  // bits in this situation. Should attempt to preserve them.
  load_prototype_header(tmp_reg, obj_reg);
#ifdef _LP64
  orptr(tmp_reg, r15_thread);
#else
  get_thread(swap_reg);
  orptr(tmp_reg, swap_reg);
  movptr(swap_reg, saved_mark_addr);
#endif
  if (os::is_MP()) {
    lock();
  }
  cmpxchgptr(tmp_reg, mark_addr); // compare tmp_reg and swap_reg
  // If the biasing toward our thread failed, then another thread
  // succeeded in biasing it toward itself and we need to revoke that
  // bias. The revocation will occur in the runtime in the slow case.
  if (counters != NULL) {
    cond_inc32(Assembler::zero,
               ExternalAddress((address) counters->rebiased_lock_entry_count_addr()));
  }
  if (slow_case != NULL) {
    jcc(Assembler::notZero, *slow_case);
  }
  jmp(done);

  bind(try_revoke_bias);
  // The prototype mark in the klass doesn't have the bias bit set any
  // more, indicating that objects of this data type are not supposed
  // to be biased any more. We are going to try to reset the mark of
  // this object to the prototype value and fall through to the
  // CAS-based locking scheme. Note that if our CAS fails, it means
  // that another thread raced us for the privilege of revoking the
  // bias of this particular object, so it's okay to continue in the
  // normal locking code.
  //
  // FIXME: due to a lack of registers we currently blow away the age
  // bits in this situation. Should attempt to preserve them.
  NOT_LP64( movptr(swap_reg, saved_mark_addr); )
  load_prototype_header(tmp_reg, obj_reg);
  if (os::is_MP()) {
    lock();
  }
  cmpxchgptr(tmp_reg, mark_addr); // compare tmp_reg and swap_reg
  // Fall through to the normal CAS-based lock, because no matter what
  // the result of the above CAS, some thread must have succeeded in
  // removing the bias bit from the object's header.
  if (counters != NULL) {
    cond_inc32(Assembler::zero,
               ExternalAddress((address) counters->revoked_lock_entry_count_addr()));
  }

  bind(cas_label);

  return null_check_offset;
}

void MacroAssembler::biased_locking_exit(Register obj_reg, Register temp_reg, Label& done) {
  assert(UseBiasedLocking, "why call this otherwise?");

  // Check for biased locking unlock case, which is a no-op
  // Note: we do not have to check the thread ID for two reasons.
  // First, the interpreter checks for IllegalMonitorStateException at
  // a higher level. Second, if the bias was revoked while we held the
  // lock, the object could not be rebiased toward another thread, so
  // the bias bit would be clear.
  movptr(temp_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
  andptr(temp_reg, markOopDesc::biased_lock_mask_in_place);
  cmpptr(temp_reg, markOopDesc::biased_lock_pattern);
  jcc(Assembler::equal, done);
}

#ifdef COMPILER2

#if INCLUDE_RTM_OPT

// Update rtm_counters based on abort status
// input: abort_status
//        rtm_counters (RTMLockingCounters*)
// flags are killed
void MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters) {

  atomic_incptr(Address(rtm_counters, RTMLockingCounters::abort_count_offset()));
  if (PrintPreciseRTMLockingStatistics) {
    for (int i = 0; i < RTMLockingCounters::ABORT_STATUS_LIMIT; i++) {
      Label check_abort;
      testl(abort_status, (1<<i));
      jccb(Assembler::equal, check_abort);
      atomic_incptr(Address(rtm_counters, RTMLockingCounters::abortX_count_offset() + (i * sizeof(uintx))));
      bind(check_abort);
    }
  }
}

// Branch if (random & (count-1) != 0), count is 2^n
// tmp, scr and flags are killed
void MacroAssembler::branch_on_random_using_rdtsc(Register tmp, Register scr, int count, Label& brLabel) {
  assert(tmp == rax, "");
  assert(scr == rdx, "");
  rdtsc(); // modifies EDX:EAX
  andptr(tmp, count-1);
  jccb(Assembler::notZero, brLabel);
}

// Perform abort ratio calculation, set no_rtm bit if high ratio
// input:  rtm_counters_Reg (RTMLockingCounters* address)
// tmpReg, rtm_counters_Reg and flags are killed
void MacroAssembler::rtm_abort_ratio_calculation(Register tmpReg,
                                                 Register rtm_counters_Reg,
                                                 RTMLockingCounters* rtm_counters,
                                                 Metadata* method_data) {
  Label L_done, L_check_always_rtm1, L_check_always_rtm2;

  if (RTMLockingCalculationDelay > 0) {
    // Delay calculation
    movptr(tmpReg, ExternalAddress((address) RTMLockingCounters::rtm_calculation_flag_addr()), tmpReg);
    testptr(tmpReg, tmpReg);
    jccb(Assembler::equal, L_done);
  }
  // Abort ratio calculation only if abort_count > RTMAbortThreshold
  //   Aborted transactions = abort_count * 100
  //   All transactions = total_count *  RTMTotalCountIncrRate
  //   Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio)

  movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::abort_count_offset()));
  cmpptr(tmpReg, RTMAbortThreshold);
  jccb(Assembler::below, L_check_always_rtm2);
  imulptr(tmpReg, tmpReg, 100);

  Register scrReg = rtm_counters_Reg;
  movptr(scrReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
  imulptr(scrReg, scrReg, RTMTotalCountIncrRate);
  imulptr(scrReg, scrReg, RTMAbortRatio);
  cmpptr(tmpReg, scrReg);
  jccb(Assembler::below, L_check_always_rtm1);
  if (method_data != NULL) {
    // set rtm_state to "no rtm" in MDO
    mov_metadata(tmpReg, method_data);
    if (os::is_MP()) {
      lock();
    }
    orl(Address(tmpReg, MethodData::rtm_state_offset_in_bytes()), NoRTM);
  }
  jmpb(L_done);
  bind(L_check_always_rtm1);
  // Reload RTMLockingCounters* address
  lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
  bind(L_check_always_rtm2);
  movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
  cmpptr(tmpReg, RTMLockingThreshold / RTMTotalCountIncrRate);
  jccb(Assembler::below, L_done);
  if (method_data != NULL) {
    // set rtm_state to "always rtm" in MDO
    mov_metadata(tmpReg, method_data);
    if (os::is_MP()) {
      lock();
    }
    orl(Address(tmpReg, MethodData::rtm_state_offset_in_bytes()), UseRTM);
  }
  bind(L_done);
}

// Update counters and perform abort ratio calculation
// input:  abort_status_Reg
// rtm_counters_Reg, flags are killed
void MacroAssembler::rtm_profiling(Register abort_status_Reg,
                                   Register rtm_counters_Reg,
                                   RTMLockingCounters* rtm_counters,
                                   Metadata* method_data,
                                   bool profile_rtm) {

  assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
  // update rtm counters based on rax value at abort
  // reads abort_status_Reg, updates flags
  lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
  rtm_counters_update(abort_status_Reg, rtm_counters_Reg);
  if (profile_rtm) {
    // Save abort status because abort_status_Reg is used by following code.
    if (RTMRetryCount > 0) {
      push(abort_status_Reg);
    }
    assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
    rtm_abort_ratio_calculation(abort_status_Reg, rtm_counters_Reg, rtm_counters, method_data);
    // restore abort status
    if (RTMRetryCount > 0) {
      pop(abort_status_Reg);
    }
  }
}

// Retry on abort if abort's status is 0x6: can retry (0x2) | memory conflict (0x4)
// inputs: retry_count_Reg
//       : abort_status_Reg
// output: retry_count_Reg decremented by 1
// flags are killed
void MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg, Label& retryLabel) {
  Label doneRetry;
  assert(abort_status_Reg == rax, "");
  // The abort reason bits are in eax (see all states in rtmLocking.hpp)
  // 0x6 = conflict on which we can retry (0x2) | memory conflict (0x4)
  // if reason is in 0x6 and retry count != 0 then retry
  andptr(abort_status_Reg, 0x6);
  jccb(Assembler::zero, doneRetry);
  testl(retry_count_Reg, retry_count_Reg);
  jccb(Assembler::zero, doneRetry);
  pause();
  decrementl(retry_count_Reg);
  jmp(retryLabel);
  bind(doneRetry);
}

// Spin and retry if lock is busy,
// inputs: box_Reg (monitor address)
//       : retry_count_Reg
// output: retry_count_Reg decremented by 1
//       : clear z flag if retry count exceeded
// tmp_Reg, scr_Reg, flags are killed
void MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register box_Reg,
                                            Register tmp_Reg, Register scr_Reg, Label& retryLabel) {
  Label SpinLoop, SpinExit, doneRetry;
  int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);

  testl(retry_count_Reg, retry_count_Reg);
  jccb(Assembler::zero, doneRetry);
  decrementl(retry_count_Reg);
  movptr(scr_Reg, RTMSpinLoopCount);

  bind(SpinLoop);
  pause();
  decrementl(scr_Reg);
  jccb(Assembler::lessEqual, SpinExit);
  movptr(tmp_Reg, Address(box_Reg, owner_offset));
  testptr(tmp_Reg, tmp_Reg);
  jccb(Assembler::notZero, SpinLoop);

  bind(SpinExit);
  jmp(retryLabel);
  bind(doneRetry);
  incrementl(retry_count_Reg); // clear z flag
}
1460
1461// Use RTM for normal stack locks
1462// Input: objReg (object to lock)
1463void MacroAssembler::rtm_stack_locking(Register objReg, Register tmpReg, Register scrReg,
1464                                       Register retry_on_abort_count_Reg,
1465                                       RTMLockingCounters* stack_rtm_counters,
1466                                       Metadata* method_data, bool profile_rtm,
1467                                       Label& DONE_LABEL, Label& IsInflated) {
1468  assert(UseRTMForStackLocks, "why call this otherwise?");
1469  assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
1470  assert(tmpReg == rax, "");
1471  assert(scrReg == rdx, "");
1472  Label L_rtm_retry, L_decrement_retry, L_on_abort;
1473
1474  if (RTMRetryCount > 0) {
1475    movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
1476    bind(L_rtm_retry);
1477  }
1478  movptr(tmpReg, Address(objReg, 0));
1479  testptr(tmpReg, markOopDesc::monitor_value);  // inflated vs stack-locked|neutral|biased
1480  jcc(Assembler::notZero, IsInflated);
1481
1482  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
1483    Label L_noincrement;
1484    if (RTMTotalCountIncrRate > 1) {
1485      // tmpReg, scrReg and flags are killed
1486      branch_on_random_using_rdtsc(tmpReg, scrReg, (int)RTMTotalCountIncrRate, L_noincrement);
1487    }
1488    assert(stack_rtm_counters != NULL, "should not be NULL when profiling RTM");
1489    atomic_incptr(ExternalAddress((address)stack_rtm_counters->total_count_addr()), scrReg);
1490    bind(L_noincrement);
1491  }
1492  xbegin(L_on_abort);
1493  movptr(tmpReg, Address(objReg, 0));       // fetch markword
1494  andptr(tmpReg, markOopDesc::biased_lock_mask_in_place); // look at 3 lock bits
1495  cmpptr(tmpReg, markOopDesc::unlocked_value);            // bits = 001 unlocked
1496  jcc(Assembler::equal, DONE_LABEL);        // all done if unlocked
1497
1498  Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
1499  if (UseRTMXendForLockBusy) {
1500    xend();
1501    movptr(abort_status_Reg, 0x2);   // Set the abort status to 2 (so we can retry)
1502    jmp(L_decrement_retry);
1503  }
1504  else {
1505    xabort(0);
1506  }
1507  bind(L_on_abort);
1508  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
1509    rtm_profiling(abort_status_Reg, scrReg, stack_rtm_counters, method_data, profile_rtm);
1510  }
1511  bind(L_decrement_retry);
1512  if (RTMRetryCount > 0) {
1513    // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
1514    rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
1515  }
1516}
1517
1518// Use RTM for inflating locks
1519// inputs: objReg (object to lock)
1520//         boxReg (on-stack box address (displaced header location) - KILLED)
1521//         tmpReg (ObjectMonitor address + markOopDesc::monitor_value)
1522void MacroAssembler::rtm_inflated_locking(Register objReg, Register boxReg, Register tmpReg,
1523                                          Register scrReg, Register retry_on_busy_count_Reg,
1524                                          Register retry_on_abort_count_Reg,
1525                                          RTMLockingCounters* rtm_counters,
1526                                          Metadata* method_data, bool profile_rtm,
1527                                          Label& DONE_LABEL) {
1528  assert(UseRTMLocking, "why call this otherwise?");
1529  assert(tmpReg == rax, "");
1530  assert(scrReg == rdx, "");
1531  Label L_rtm_retry, L_decrement_retry, L_on_abort;
1532  int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
1533
1534  // Without cast to int32_t a movptr will destroy r10 which is typically obj
1535  movptr(Address(boxReg, 0), (int32_t)intptr_t(markOopDesc::unused_mark()));
1536  movptr(boxReg, tmpReg); // Save ObjectMonitor address
1537
1538  if (RTMRetryCount > 0) {
1539    movl(retry_on_busy_count_Reg, RTMRetryCount);  // Retry on lock busy
1540    movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
1541    bind(L_rtm_retry);
1542  }
1543  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
1544    Label L_noincrement;
1545    if (RTMTotalCountIncrRate > 1) {
1546      // tmpReg, scrReg and flags are killed
1547      branch_on_random_using_rdtsc(tmpReg, scrReg, (int)RTMTotalCountIncrRate, L_noincrement);
1548    }
1549    assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
1550    atomic_incptr(ExternalAddress((address)rtm_counters->total_count_addr()), scrReg);
1551    bind(L_noincrement);
1552  }
1553  xbegin(L_on_abort);
1554  movptr(tmpReg, Address(objReg, 0));
1555  movptr(tmpReg, Address(tmpReg, owner_offset));
1556  testptr(tmpReg, tmpReg);
1557  jcc(Assembler::zero, DONE_LABEL);
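  // Owner == NULL inside the transaction means the monitor appears free, so we
  // elide the acquisition and fall through with the transaction still open; a
  // concurrent store to _owner conflicts with the load above and aborts us.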
1558  if (UseRTMXendForLockBusy) {
1559    xend();
1560    jmp(L_decrement_retry);
1561  }
1562  else {
1563    xabort(0);
1564  }
1565  bind(L_on_abort);
1566  Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
1567  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
1568    rtm_profiling(abort_status_Reg, scrReg, rtm_counters, method_data, profile_rtm);
1569  }
1570  if (RTMRetryCount > 0) {
1571    // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
1572    rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
1573  }
1574
1575  movptr(tmpReg, Address(boxReg, owner_offset)) ;
1576  testptr(tmpReg, tmpReg) ;
1577  jccb(Assembler::notZero, L_decrement_retry) ;
1578
1579  // Appears unlocked - try to swing _owner from null to non-null.
1580  // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
1581#ifdef _LP64
1582  Register threadReg = r15_thread;
1583#else
1584  get_thread(scrReg);
1585  Register threadReg = scrReg;
1586#endif
1587  if (os::is_MP()) {
1588    lock();
1589  }
1590  cmpxchgptr(threadReg, Address(boxReg, owner_offset)); // Updates tmpReg
1591
1592  if (RTMRetryCount > 0) {
1593    // success done else retry
1594    jccb(Assembler::equal, DONE_LABEL) ;
1595    bind(L_decrement_retry);
1596    // Spin and retry if lock is busy.
1597    rtm_retry_lock_on_busy(retry_on_busy_count_Reg, boxReg, tmpReg, scrReg, L_rtm_retry);
1598  }
1599  else {
1600    bind(L_decrement_retry);
1601  }
1602}
1603
1604#endif //  INCLUDE_RTM_OPT
1605
1606// Fast_Lock and Fast_Unlock used by C2
1607
1608// Because the transitions from emitted code to the runtime
1609// monitorenter/exit helper stubs are so slow it's critical that
1610// we inline both the stack-locking fast-path and the inflated fast path.
1611//
1612// See also: cmpFastLock and cmpFastUnlock.
1613//
1614// What follows is a specialized inline transliteration of the code
1615// in slow_enter() and slow_exit().  If we're concerned about I$ bloat
1616// another option would be to emit TrySlowEnter and TrySlowExit methods
1617// at startup-time.  These methods would accept arguments as
1618// (rax,=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
1619// indications in the icc.ZFlag.  Fast_Lock and Fast_Unlock would simply
1620// marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
1621// In practice, however, the # of lock sites is bounded and is usually small.
1622// Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
1623// if the processor uses simple bimodal branch predictors keyed by EIP,
1624// since the helper routines would be called from multiple synchronization
1625// sites.
1626//
1627// An even better approach would be write "MonitorEnter()" and "MonitorExit()"
1628// in java - using j.u.c and unsafe - and just bind the lock and unlock sites
1629// to those specialized methods.  That'd give us a mostly platform-independent
1630// implementation that the JITs could optimize and inline at their pleasure.
1631// Done correctly, the only time we'd need to cross to native code would be
1632// to park() or unpark() threads.  We'd also need a few more unsafe operators
1633// to (a) prevent compiler-JIT reordering of non-volatile accesses, and
1634// (b) explicit barriers or fence operations.
1635//
1636// TODO:
1637//
1638// *  Arrange for C2 to pass "Self" into Fast_Lock and Fast_Unlock in one of the registers (scr).
1639//    This avoids manifesting the Self pointer in the Fast_Lock and Fast_Unlock terminals.
1640//    Given TLAB allocation, Self is usually manifested in a register, so passing it into
1641//    the lock operators would typically be faster than reifying Self.
1642//
1643// *  Ideally I'd define the primitives as:
1644//       fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
1645//       fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
1646//    Unfortunately ADLC bugs prevent us from expressing the ideal form.
1647//    Instead, we're stuck with the rather awkward and brittle register assignments below.
1648//    Furthermore the register assignments are overconstrained, possibly resulting in
1649//    sub-optimal code near the synchronization site.
1650//
1651// *  Eliminate the sp-proximity tests and just use "== Self" tests instead.
1652//    Alternately, use a better sp-proximity test.
1653//
1654// *  Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
1655//    Either one is sufficient to uniquely identify a thread.
1656//    TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
1657//
1658// *  Intrinsify notify() and notifyAll() for the common cases where the
1659//    object is locked by the calling thread but the waitlist is empty.
1660//    This avoids the expensive JNI calls to JVM_Notify() and JVM_NotifyAll().
1661//
1662// *  use jccb and jmpb instead of jcc and jmp to improve code density.
1663//    But beware of excessive branch density on AMD Opterons.
1664//
1665// *  Both Fast_Lock and Fast_Unlock set the ICC.ZF to indicate success
1666//    or failure of the fast-path.  If the fast-path fails then we pass
1667//    control to the slow-path, typically in C.  In Fast_Lock and
1668//    Fast_Unlock we often branch to DONE_LABEL, just to find that C2
1669//    will emit a conditional branch immediately after the node.
1670//    So we have branches to branches and lots of ICC.ZF games.
1671//    Instead, it might be better to have C2 pass a "FailureLabel"
1672//    into Fast_Lock and Fast_Unlock.  In the case of success, control
1673//    will drop through the node.  ICC.ZF is undefined at exit.
1674//    In the case of failure, the node will branch directly to the
1675//    FailureLabel
1676
1677
1678// obj: object to lock
1679// box: on-stack box address (displaced header location) - KILLED
1680// rax,: tmp -- KILLED
1681// scr: tmp -- KILLED
1682void MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg,
1683                               Register scrReg, Register cx1Reg, Register cx2Reg,
1684                               BiasedLockingCounters* counters,
1685                               RTMLockingCounters* rtm_counters,
1686                               RTMLockingCounters* stack_rtm_counters,
1687                               Metadata* method_data,
1688                               bool use_rtm, bool profile_rtm) {
1689  // Ensure the register assignments are disjoint
1690  assert(tmpReg == rax, "");
1691
1692  if (use_rtm) {
1693    assert_different_registers(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg);
1694  } else {
1695    assert(cx1Reg == noreg, "");
1696    assert(cx2Reg == noreg, "");
1697    assert_different_registers(objReg, boxReg, tmpReg, scrReg);
1698  }
1699
1700  if (counters != NULL) {
1701    atomic_incl(ExternalAddress((address)counters->total_entry_count_addr()), scrReg);
1702  }
1703  if (EmitSync & 1) {
1704      // set box->dhw = markOopDesc::unused_mark()
1705      // Force all sync thru slow-path: slow_enter() and slow_exit()
1706      movptr (Address(boxReg, 0), (int32_t)intptr_t(markOopDesc::unused_mark()));
1707      cmpptr (rsp, (int32_t)NULL_WORD);
1708  } else {
1709    // Possible cases that we'll encounter in fast_lock
1710    // ------------------------------------------------
1711    // * Inflated
1712    //    -- unlocked
1713    //    -- Locked
1714    //       = by self
1715    //       = by other
1716    // * biased
1717    //    -- by Self
1718    //    -- by other
1719    // * neutral
1720    // * stack-locked
1721    //    -- by self
1722    //       = sp-proximity test hits
1723    //       = sp-proximity test generates false-negative
1724    //    -- by other
1725    //
1726
1727    Label IsInflated, DONE_LABEL;
1728
1729    // it's stack-locked, biased or neutral
1730    // TODO: optimize away redundant LDs of obj->mark and improve the markword triage
1731    // order to reduce the number of conditional branches in the most common cases.
1732    // Beware -- there's a subtle invariant that fetch of the markword
1733    // at [FETCH], below, will never observe a biased encoding (*101b).
1734    // If this invariant is not held we risk exclusion (safety) failure.
1735    if (UseBiasedLocking && !UseOptoBiasInlining) {
1736      biased_locking_enter(boxReg, objReg, tmpReg, scrReg, false, DONE_LABEL, NULL, counters);
1737    }
1738
1739#if INCLUDE_RTM_OPT
1740    if (UseRTMForStackLocks && use_rtm) {
1741      rtm_stack_locking(objReg, tmpReg, scrReg, cx2Reg,
1742                        stack_rtm_counters, method_data, profile_rtm,
1743                        DONE_LABEL, IsInflated);
1744    }
1745#endif // INCLUDE_RTM_OPT
1746
1747    movptr(tmpReg, Address(objReg, 0));          // [FETCH]
1748    testptr(tmpReg, markOopDesc::monitor_value); // inflated vs stack-locked|neutral|biased
1749    jccb(Assembler::notZero, IsInflated);
1750
1751    // Attempt stack-locking ...
1752    orptr (tmpReg, markOopDesc::unlocked_value);
1753    movptr(Address(boxReg, 0), tmpReg);          // Anticipate successful CAS
1754    if (os::is_MP()) {
1755      lock();
1756    }
1757    cmpxchgptr(boxReg, Address(objReg, 0));      // Updates tmpReg
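    // On success the mark word was still the unlocked value stored into the box
    // above (the displaced header) and now points at boxReg's BasicLock, i.e. the
    // object is stack-locked by this thread; the cmpxchg leaves ZF set for the
    // jcc below.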
1758    if (counters != NULL) {
1759      cond_inc32(Assembler::equal,
1760                 ExternalAddress((address)counters->fast_path_entry_count_addr()));
1761    }
1762    jcc(Assembler::equal, DONE_LABEL);           // Success
1763
1764    // Recursive locking.
1765    // The object is stack-locked: markword contains stack pointer to BasicLock.
1766    // Locked by current thread if difference with current SP is less than one page.
1767    subptr(tmpReg, rsp);
1768    // The next instruction sets ZFlag == 1 (Success) if the difference is less than one page.
1769    andptr(tmpReg, (int32_t) (NOT_LP64(0xFFFFF003) LP64_ONLY(7 - os::vm_page_size())) );
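    // Rough arithmetic (assuming 4K pages): the LP64 mask is 7 - 4096 =
    // 0xFFFFFFFFFFFFF007, so after the subtraction ZF can only be set when both the
    // high bits (difference non-negative and within one page) and the low alignment
    // bits are zero, i.e. the mark word is a stack pointer into our own frame.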
1770    movptr(Address(boxReg, 0), tmpReg);
1771    if (counters != NULL) {
1772      cond_inc32(Assembler::equal,
1773                 ExternalAddress((address)counters->fast_path_entry_count_addr()));
1774    }
1775    jmp(DONE_LABEL);
1776
1777    bind(IsInflated);
1778    // The object is inflated. tmpReg contains the ObjectMonitor address + markOopDesc::monitor_value
1779
1780#if INCLUDE_RTM_OPT
1781    // Use the same RTM locking code in 32- and 64-bit VM.
1782    if (use_rtm) {
1783      rtm_inflated_locking(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg,
1784                           rtm_counters, method_data, profile_rtm, DONE_LABEL);
1785    } else {
1786#endif // INCLUDE_RTM_OPT
1787
1788#ifndef _LP64
1789    // The object is inflated.
1790
1791    // boxReg refers to the on-stack BasicLock in the current frame.
1792    // We'd like to write:
1793    //   set box->_displaced_header = markOopDesc::unused_mark().  Any non-0 value suffices.
1794    // This is convenient but results a ST-before-CAS penalty.  The following CAS suffers
1795    // additional latency as we have another ST in the store buffer that must drain.
1796
1797    if (EmitSync & 8192) {
1798       movptr(Address(boxReg, 0), 3);            // results in ST-before-CAS penalty
1799       get_thread (scrReg);
1800       movptr(boxReg, tmpReg);                    // consider: LEA box, [tmp-2]
1801       movptr(tmpReg, NULL_WORD);                 // consider: xor vs mov
1802       if (os::is_MP()) {
1803         lock();
1804       }
1805       cmpxchgptr(scrReg, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
1806    } else
1807    if ((EmitSync & 128) == 0) {                      // avoid ST-before-CAS
1808       // register juggle because we need tmpReg for cmpxchgptr below
1809       movptr(scrReg, boxReg);
1810       movptr(boxReg, tmpReg);                   // consider: LEA box, [tmp-2]
1811
1812       // Using a prefetchw helps avoid later RTS->RTO upgrades and cache probes
1813       if ((EmitSync & 2048) && VM_Version::supports_3dnow_prefetch() && os::is_MP()) {
1814          // prefetchw [eax + Offset(_owner)-2]
1815          prefetchw(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
1816       }
1817
1818       if ((EmitSync & 64) == 0) {
1819         // Optimistic form: consider XORL tmpReg,tmpReg
1820         movptr(tmpReg, NULL_WORD);
1821       } else {
1822         // Can suffer RTS->RTO upgrades on shared or cold $ lines
1823         // Test-And-CAS instead of CAS
1824         movptr(tmpReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));   // rax, = m->_owner
1825         testptr(tmpReg, tmpReg);                   // Locked ?
1826         jccb  (Assembler::notZero, DONE_LABEL);
1827       }
1828
1829       // Appears unlocked - try to swing _owner from null to non-null.
1830       // Ideally, I'd manifest "Self" with get_thread and then attempt
1831       // to CAS the register containing Self into m->Owner.
1832       // But we don't have enough registers, so instead we can either try to CAS
1833       // rsp or the address of the box (in scr) into &m->owner.  If the CAS succeeds
1834       // we later store "Self" into m->Owner.  Transiently storing a stack address
1835       // (rsp or the address of the box) into  m->owner is harmless.
1836       // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
1837       if (os::is_MP()) {
1838         lock();
1839       }
1840       cmpxchgptr(scrReg, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
1841       movptr(Address(scrReg, 0), 3);          // box->_displaced_header = 3
1842       // If we weren't able to swing _owner from NULL to the BasicLock
1843       // then take the slow path.
1844       jccb  (Assembler::notZero, DONE_LABEL);
1845       // update _owner from BasicLock to thread
1846       get_thread (scrReg);                    // beware: clobbers ICCs
1847       movptr(Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), scrReg);
1848       xorptr(boxReg, boxReg);                 // set icc.ZFlag = 1 to indicate success
1849
1850       // If the CAS fails we can either retry or pass control to the slow-path.
1851       // We use the latter tactic.
1852       // Pass the CAS result in the icc.ZFlag into DONE_LABEL
1853       // If the CAS was successful ...
1854       //   Self has acquired the lock
1855       //   Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
1856       // Intentional fall-through into DONE_LABEL ...
1857    } else {
1858       movptr(Address(boxReg, 0), intptr_t(markOopDesc::unused_mark()));  // results in ST-before-CAS penalty
1859       movptr(boxReg, tmpReg);
1860
1861       // Using a prefetchw helps avoid later RTS->RTO upgrades and cache probes
1862       if ((EmitSync & 2048) && VM_Version::supports_3dnow_prefetch() && os::is_MP()) {
1863          // prefetchw [eax + Offset(_owner)-2]
1864          prefetchw(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
1865       }
1866
1867       if ((EmitSync & 64) == 0) {
1868         // Optimistic form
1869         xorptr  (tmpReg, tmpReg);
1870       } else {
1871         // Can suffer RTS->RTO upgrades on shared or cold $ lines
1872         movptr(tmpReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));   // rax, = m->_owner
1873         testptr(tmpReg, tmpReg);                   // Locked ?
1874         jccb  (Assembler::notZero, DONE_LABEL);
1875       }
1876
1877       // Appears unlocked - try to swing _owner from null to non-null.
1878       // Use either "Self" (in scr) or rsp as thread identity in _owner.
1879       // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
1880       get_thread (scrReg);
1881       if (os::is_MP()) {
1882         lock();
1883       }
1884       cmpxchgptr(scrReg, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
1885
1886       // If the CAS fails we can either retry or pass control to the slow-path.
1887       // We use the latter tactic.
1888       // Pass the CAS result in the icc.ZFlag into DONE_LABEL
1889       // If the CAS was successful ...
1890       //   Self has acquired the lock
1891       //   Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
1892       // Intentional fall-through into DONE_LABEL ...
1893    }
1894#else // _LP64
1895    // It's inflated
1896    movq(scrReg, tmpReg);
1897    xorq(tmpReg, tmpReg);
1898
1899    if (os::is_MP()) {
1900      lock();
1901    }
1902    cmpxchgptr(r15_thread, Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
1903    // Unconditionally set box->_displaced_header = markOopDesc::unused_mark().
1904    // Without cast to int32_t movptr will destroy r10 which is typically obj.
1905    movptr(Address(boxReg, 0), (int32_t)intptr_t(markOopDesc::unused_mark()));
1906    // Intentional fall-through into DONE_LABEL ...
1907    // Propagate ICC.ZF from CAS above into DONE_LABEL.
1908#endif // _LP64
1909#if INCLUDE_RTM_OPT
1910    } // use_rtm()
1911#endif
1912    // DONE_LABEL is a hot target - we'd really like to place it at the
1913    // start of cache line by padding with NOPs.
1914    // See the AMD and Intel software optimization manuals for the
1915    // most efficient "long" NOP encodings.
1916    // Unfortunately none of our alignment mechanisms suffice.
1917    bind(DONE_LABEL);
1918
1919    // At DONE_LABEL the icc ZFlag is set as follows ...
1920    // Fast_Unlock uses the same protocol.
1921    // ZFlag == 1 -> Success
1922    // ZFlag == 0 -> Failure - force control through the slow-path
1923  }
1924}
1925
1926// obj: object to unlock
1927// box: box address (displaced header location), killed.  Must be EAX.
1928// tmp: killed, cannot be obj nor box.
1929//
1930// Some commentary on balanced locking:
1931//
1932// Fast_Lock and Fast_Unlock are emitted only for provably balanced lock sites.
1933// Methods that don't have provably balanced locking are forced to run in the
1934// interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
1935// The interpreter provides two properties:
1936// I1:  At return-time the interpreter automatically and quietly unlocks any
1937//      objects acquired the current activation (frame).  Recall that the
1938//      interpreter maintains an on-stack list of locks currently held by
1939//      a frame.
1940// I2:  If a method attempts to unlock an object that is not held by
1941//      the frame, the interpreter throws IMSX.
1942//
1943// Let's say A(), which has provably balanced locking, acquires O and then calls B().
1944// B() doesn't have provably balanced locking so it runs in the interpreter.
1945// Control returns to A() and A() unlocks O.  By I1 and I2, above, we know that O
1946// is still locked by A().
1947//
1948// The only other source of unbalanced locking would be JNI.  The "Java Native Interface:
1949// Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
1950// should not be unlocked by "normal" java-level locking and vice-versa.  The specification
1951// doesn't specify what will occur if a program engages in such mixed-mode locking, however.
1952// Arguably, given that the spec legislates the JNI case as undefined, our implementation
1953// could reasonably *avoid* checking owner in Fast_Unlock().
1954// In the interest of performance we elide the m->Owner==Self check in unlock.
1955// A perfectly viable alternative is to elide the owner check except when
1956// Xcheck:jni is enabled.
1957
1958void MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg, bool use_rtm) {
1959  assert(boxReg == rax, "");
1960  assert_different_registers(objReg, boxReg, tmpReg);
1961
1962  if (EmitSync & 4) {
1963    // Disable - inhibit all inlining.  Force control through the slow-path
1964    cmpptr (rsp, 0);
1965  } else {
1966    Label DONE_LABEL, Stacked, CheckSucc;
1967
1968    // Critically, the biased locking test must have precedence over
1969    // and appear before the (box->dhw == 0) recursive stack-lock test.
1970    if (UseBiasedLocking && !UseOptoBiasInlining) {
1971       biased_locking_exit(objReg, tmpReg, DONE_LABEL);
1972    }
1973
1974#if INCLUDE_RTM_OPT
1975    if (UseRTMForStackLocks && use_rtm) {
1976      assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
1977      Label L_regular_unlock;
1978      movptr(tmpReg, Address(objReg, 0));           // fetch markword
1979      andptr(tmpReg, markOopDesc::biased_lock_mask_in_place); // look at 3 lock bits
1980      cmpptr(tmpReg, markOopDesc::unlocked_value);            // bits = 001 unlocked
1981      jccb(Assembler::notEqual, L_regular_unlock);  // if !HLE RegularLock
1982      xend();                                       // otherwise end...
1983      jmp(DONE_LABEL);                              // ... and we're done
1984      bind(L_regular_unlock);
1985    }
1986#endif
1987
1988    cmpptr(Address(boxReg, 0), (int32_t)NULL_WORD); // Examine the displaced header
1989    jcc   (Assembler::zero, DONE_LABEL);            // 0 indicates recursive stack-lock
1990    movptr(tmpReg, Address(objReg, 0));             // Examine the object's markword
1991    testptr(tmpReg, markOopDesc::monitor_value);    // Inflated?
1992    jccb  (Assembler::zero, Stacked);
1993
1994    // It's inflated.
1995#if INCLUDE_RTM_OPT
1996    if (use_rtm) {
1997      Label L_regular_inflated_unlock;
1998      int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
1999      movptr(boxReg, Address(tmpReg, owner_offset));
2000      testptr(boxReg, boxReg);
2001      jccb(Assembler::notZero, L_regular_inflated_unlock);
2002      xend();
2003      jmpb(DONE_LABEL);
2004      bind(L_regular_inflated_unlock);
2005    }
2006#endif
2007
2008    // Despite our balanced locking property we still check that m->_owner == Self
2009    // as java routines or native JNI code called by this thread might
2010    // have released the lock.
2011    // Refer to the comments in synchronizer.cpp for how we might encode extra
2012    // state in _succ so we can avoid fetching EntryList|cxq.
2013    //
2014    // I'd like to add more cases in fast_lock() and fast_unlock() --
2015    // such as recursive enter and exit -- but we have to be wary of
2016    // I$ bloat, T$ effects and BP$ effects.
2017    //
2018    // If there's no contention try a 1-0 exit.  That is, exit without
2019    // a costly MEMBAR or CAS.  See synchronizer.cpp for details on how
2020    // we detect and recover from the race that the 1-0 exit admits.
2021    //
2022    // Conceptually Fast_Unlock() must execute a STST|LDST "release" barrier
2023    // before it STs null into _owner, releasing the lock.  Updates
2024    // to data protected by the critical section must be visible before
2025    // we drop the lock (and thus before any other thread could acquire
2026    // the lock and observe the fields protected by the lock).
2027    // IA32's memory-model is SPO, so STs are ordered with respect to
2028    // each other and there's no need for an explicit barrier (fence).
2029    // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
2030#ifndef _LP64
2031    get_thread (boxReg);
2032    if ((EmitSync & 4096) && VM_Version::supports_3dnow_prefetch() && os::is_MP()) {
2033      // prefetchw [ebx + Offset(_owner)-2]
2034      prefetchw(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
2035    }
2036
2037    // Note that we could employ various encoding schemes to reduce
2038    // the number of loads below (currently 4) to just 2 or 3.
2039    // Refer to the comments in synchronizer.cpp.
2040    // In practice the chain of fetches doesn't seem to impact performance, however.
2041    xorptr(boxReg, boxReg);
2042    if ((EmitSync & 65536) == 0 && (EmitSync & 256)) {
2043       // Attempt to reduce branch density - AMD's branch predictor.
2044       orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
2045       orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
2046       orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
2047       jccb  (Assembler::notZero, DONE_LABEL);
2048       movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
2049       jmpb  (DONE_LABEL);
2050    } else {
2051       orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
2052       jccb  (Assembler::notZero, DONE_LABEL);
2053       movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
2054       orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
2055       jccb  (Assembler::notZero, CheckSucc);
2056       movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
2057       jmpb  (DONE_LABEL);
2058    }
2059
2060    // The following code fragment (EmitSync & 65536) improves the performance of
2061    // contended applications and contended synchronization microbenchmarks.
2062    // Unfortunately the emission of the code - even though not executed - causes regressions
2063    // in scimark and jetstream, evidently because of $ effects.  Replacing the code
2064    // with an equal number of never-executed NOPs results in the same regression.
2065    // We leave it off by default.
2066
2067    if ((EmitSync & 65536) != 0) {
2068       Label LSuccess, LGoSlowPath ;
2069
2070       bind  (CheckSucc);
2071
2072       // Optional pre-test ... it's safe to elide this
2073       cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD);
2074       jccb(Assembler::zero, LGoSlowPath);
2075
2076       // We have a classic Dekker-style idiom:
2077       //    ST m->_owner = 0 ; MEMBAR; LD m->_succ
2078       // There are a number of ways to implement the barrier:
2079       // (1) lock:andl &m->_owner, 0
2080       //     is fast, but masm doesn't currently support the "ANDL M,IMM32" form.
2081       //     LOCK: ANDL [ebx+Offset(_Owner)-2], 0
2082       //     Encodes as 81 31 OFF32 IMM32 or 83 63 OFF8 IMM8
2083       // (2) If supported, an explicit MFENCE is appealing.
2084       //     In older IA32 processors MFENCE is slower than lock:add or xchg
2085       //     particularly if the write-buffer is full as might be the case if
2086       //     if stores closely precede the fence or fence-equivalent instruction.
2087       //     See https://blogs.oracle.com/dave/entry/instruction_selection_for_volatile_fences
2088       //     as the situation has changed with Nehalem and Shanghai.
2089       // (3) In lieu of an explicit fence, use lock:addl to the top-of-stack
2090       //     The $lines underlying the top-of-stack should be in M-state.
2091       //     The locked add instruction is serializing, of course.
2092       // (4) Use xchg, which is serializing
2093       //     mov boxReg, 0; xchgl boxReg, [tmpReg + Offset(_owner)-2] also works
2094       // (5) ST m->_owner = 0 and then execute lock:orl &m->_succ, 0.
2095       //     The integer condition codes will tell us if succ was 0.
2096       //     Since _succ and _owner should reside in the same $line and
2097       //     we just stored into _owner, it's likely that the $line
2098       //     remains in M-state for the lock:orl.
2099       //
2100       // We currently use (3), although it's likely that switching to (2)
2101       // is correct for the future.
2102
2103       movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
2104       if (os::is_MP()) {
2105         lock(); addptr(Address(rsp, 0), 0);
2106       }
2107       // Ratify _succ remains non-null
2108       cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), 0);
2109       jccb  (Assembler::notZero, LSuccess);
2110
2111       xorptr(boxReg, boxReg);                  // box is really EAX
2112       if (os::is_MP()) { lock(); }
2113       cmpxchgptr(rsp, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
2114       // There's no successor so we tried to regrab the lock with the
2115       // placeholder value. If that didn't work, then another thread
2116       // grabbed the lock so we're done (and exit was a success).
2117       jccb  (Assembler::notEqual, LSuccess);
2118       // Since we're low on registers we installed rsp as a placeholder in _owner.
2119       // Now install Self over rsp.  This is safe as we're transitioning from
2120       // non-null to non-null.
2121       get_thread (boxReg);
2122       movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), boxReg);
2123       // Intentional fall-through into LGoSlowPath ...
2124
2125       bind  (LGoSlowPath);
2126       orptr(boxReg, 1);                      // set ICC.ZF=0 to indicate failure
2127       jmpb  (DONE_LABEL);
2128
2129       bind  (LSuccess);
2130       xorptr(boxReg, boxReg);                 // set ICC.ZF=1 to indicate success
2131       jmpb  (DONE_LABEL);
2132    }
2133
2134    bind (Stacked);
2135    // It's not inflated and it's not recursively stack-locked and it's not biased.
2136    // It must be stack-locked.
2137    // Try to reset the header to displaced header.
2138    // The "box" value on the stack is stable, so we can reload
2139    // and be assured we observe the same value as above.
2140    movptr(tmpReg, Address(boxReg, 0));
2141    if (os::is_MP()) {
2142      lock();
2143    }
2144    cmpxchgptr(tmpReg, Address(objReg, 0)); // Uses RAX which is box
2145    // Intentional fall-through into DONE_LABEL
2146
2147    // DONE_LABEL is a hot target - we'd really like to place it at the
2148    // start of cache line by padding with NOPs.
2149    // See the AMD and Intel software optimization manuals for the
2150    // most efficient "long" NOP encodings.
2151    // Unfortunately none of our alignment mechanisms suffice.
2152    if ((EmitSync & 65536) == 0) {
2153       bind (CheckSucc);
2154    }
2155#else // _LP64
2156    // It's inflated
2157    if (EmitSync & 1024) {
2158      // Emit code to check that _owner == Self
2159      // We could fold the _owner test into subsequent code more efficiently
2160      // than using a stand-alone check, but since _owner checking is off by
2161      // default we don't bother. We also might consider predicating the
2162      // _owner==Self check on Xcheck:jni or running on a debug build.
2163      movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
2164      xorptr(boxReg, r15_thread);
2165    } else {
2166      xorptr(boxReg, boxReg);
2167    }
2168    orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
2169    jccb  (Assembler::notZero, DONE_LABEL);
2170    movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
2171    orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
2172    jccb  (Assembler::notZero, CheckSucc);
2173    movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t)NULL_WORD);
2174    jmpb  (DONE_LABEL);
2175
2176    if ((EmitSync & 65536) == 0) {
2177      // Try to avoid passing control into the slow_path ...
2178      Label LSuccess, LGoSlowPath ;
2179      bind  (CheckSucc);
2180
2181      // The following optional optimization can be elided if necessary
2182      // Effectively: if (succ == null) goto SlowPath
2183      // The code reduces the window for a race, however,
2184      // and thus benefits performance.
2185      cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD);
2186      jccb  (Assembler::zero, LGoSlowPath);
2187
2188      if ((EmitSync & 16) && os::is_MP()) {
2189        orptr(boxReg, boxReg);
2190        xchgptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
2191      } else {
2192        movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t)NULL_WORD);
2193        if (os::is_MP()) {
2194          // Memory barrier/fence
2195          // Dekker pivot point -- fulcrum : ST Owner; MEMBAR; LD Succ
2196          // Instead of MFENCE we use a dummy locked add of 0 to the top-of-stack.
2197          // This is faster on Nehalem and AMD Shanghai/Barcelona.
2198          // See https://blogs.oracle.com/dave/entry/instruction_selection_for_volatile_fences
2199          // We might also restructure (ST Owner=0;barrier;LD _Succ) to
2200          // (mov box,0; xchgq box, &m->Owner; LD _succ) .
2201          lock(); addl(Address(rsp, 0), 0);
2202        }
2203      }
2204      cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD);
2205      jccb  (Assembler::notZero, LSuccess);
2206
2207      // Rare inopportune interleaving - race.
2208      // The successor vanished in the small window above.
2209      // The lock is contended -- (cxq|EntryList) != null -- and there's no apparent successor.
2210      // We need to ensure progress and succession.
2211      // Try to reacquire the lock.
2212      // If that fails then the new owner is responsible for succession and this
2213      // thread needs to take no further action and can exit via the fast path (success).
2214      // If the re-acquire succeeds then pass control into the slow path.
2215      // As implemented, this latter mode is horrible because we generate more
2216      // coherence traffic on the lock *and* artificially extend the critical section
2217      // length by virtue of passing control into the slow path.
2218
2219      // box is really RAX -- the following CMPXCHG depends on that binding
2220      // cmpxchg R,[M] is equivalent to rax = CAS(M,rax,R)
2221      movptr(boxReg, (int32_t)NULL_WORD);
2222      if (os::is_MP()) { lock(); }
2223      cmpxchgptr(r15_thread, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
2224      // There's no successor so we tried to regrab the lock.
2225      // If that didn't work, then another thread grabbed the
2226      // lock so we're done (and exit was a success).
2227      jccb  (Assembler::notEqual, LSuccess);
2228      // Intentional fall-through into slow-path
2229
2230      bind  (LGoSlowPath);
2231      orl   (boxReg, 1);                      // set ICC.ZF=0 to indicate failure
2232      jmpb  (DONE_LABEL);
2233
2234      bind  (LSuccess);
2235      testl (boxReg, 0);                      // set ICC.ZF=1 to indicate success
2236      jmpb  (DONE_LABEL);
2237    }
2238
2239    bind  (Stacked);
2240    movptr(tmpReg, Address (boxReg, 0));      // re-fetch
2241    if (os::is_MP()) { lock(); }
2242    cmpxchgptr(tmpReg, Address(objReg, 0)); // Uses RAX which is box
2243
2244    if (EmitSync & 65536) {
2245       bind (CheckSucc);
2246    }
2247#endif
2248    bind(DONE_LABEL);
2249  }
2250}
2251#endif // COMPILER2
2252
2253void MacroAssembler::c2bool(Register x) {
2254  // implements x == 0 ? 0 : 1
2255  // note: must only look at least-significant byte of x
2256  //       since C-style booleans are stored in one byte
2257  //       only! (was bug)
2258  andl(x, 0xFF);
2259  setb(Assembler::notZero, x);
2260}
2261
2262// Wouldn't need if AddressLiteral version had new name
2263void MacroAssembler::call(Label& L, relocInfo::relocType rtype) {
2264  Assembler::call(L, rtype);
2265}
2266
2267void MacroAssembler::call(Register entry) {
2268  Assembler::call(entry);
2269}
2270
2271void MacroAssembler::call(AddressLiteral entry) {
2272  if (reachable(entry)) {
2273    Assembler::call_literal(entry.target(), entry.rspec());
2274  } else {
2275    lea(rscratch1, entry);
2276    Assembler::call(rscratch1);
2277  }
2278}
2279
2280void MacroAssembler::ic_call(address entry) {
2281  RelocationHolder rh = virtual_call_Relocation::spec(pc());
2282  movptr(rax, (intptr_t)Universe::non_oop_word());
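  // rax carries the inline-cache value on x86; Universe::non_oop_word() is a value
  // that cannot match a real oop/Klass, so a freshly compiled call site always
  // misses the IC check and gets resolved on first use.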
2283  call(AddressLiteral(entry, rh));
2284}
2285
2286// Implementation of call_VM versions
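// The no-last_java_sp call_VM variants below all use the same call/jmp idiom:
// the call pushes the address of the following jmp as its return address, which
// call_VM_helper later recovers as last_Java_pc via last_Java_sp[-1] (see the
// comment in call_VM_helper); the ret(0) at the end of the out-of-line block
// then lands on that jmp, which simply skips over the block.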
2287
2288void MacroAssembler::call_VM(Register oop_result,
2289                             address entry_point,
2290                             bool check_exceptions) {
2291  Label C, E;
2292  call(C, relocInfo::none);
2293  jmp(E);
2294
2295  bind(C);
2296  call_VM_helper(oop_result, entry_point, 0, check_exceptions);
2297  ret(0);
2298
2299  bind(E);
2300}
2301
2302void MacroAssembler::call_VM(Register oop_result,
2303                             address entry_point,
2304                             Register arg_1,
2305                             bool check_exceptions) {
2306  Label C, E;
2307  call(C, relocInfo::none);
2308  jmp(E);
2309
2310  bind(C);
2311  pass_arg1(this, arg_1);
2312  call_VM_helper(oop_result, entry_point, 1, check_exceptions);
2313  ret(0);
2314
2315  bind(E);
2316}
2317
2318void MacroAssembler::call_VM(Register oop_result,
2319                             address entry_point,
2320                             Register arg_1,
2321                             Register arg_2,
2322                             bool check_exceptions) {
2323  Label C, E;
2324  call(C, relocInfo::none);
2325  jmp(E);
2326
2327  bind(C);
2328
2329  LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
2330
2331  pass_arg2(this, arg_2);
2332  pass_arg1(this, arg_1);
2333  call_VM_helper(oop_result, entry_point, 2, check_exceptions);
2334  ret(0);
2335
2336  bind(E);
2337}
2338
2339void MacroAssembler::call_VM(Register oop_result,
2340                             address entry_point,
2341                             Register arg_1,
2342                             Register arg_2,
2343                             Register arg_3,
2344                             bool check_exceptions) {
2345  Label C, E;
2346  call(C, relocInfo::none);
2347  jmp(E);
2348
2349  bind(C);
2350
2351  LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
2352  LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
2353  pass_arg3(this, arg_3);
2354
2355  LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
2356  pass_arg2(this, arg_2);
2357
2358  pass_arg1(this, arg_1);
2359  call_VM_helper(oop_result, entry_point, 3, check_exceptions);
2360  ret(0);
2361
2362  bind(E);
2363}
2364
2365void MacroAssembler::call_VM(Register oop_result,
2366                             Register last_java_sp,
2367                             address entry_point,
2368                             int number_of_arguments,
2369                             bool check_exceptions) {
2370  Register thread = LP64_ONLY(r15_thread) NOT_LP64(noreg);
2371  call_VM_base(oop_result, thread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
2372}
2373
2374void MacroAssembler::call_VM(Register oop_result,
2375                             Register last_java_sp,
2376                             address entry_point,
2377                             Register arg_1,
2378                             bool check_exceptions) {
2379  pass_arg1(this, arg_1);
2380  call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
2381}
2382
2383void MacroAssembler::call_VM(Register oop_result,
2384                             Register last_java_sp,
2385                             address entry_point,
2386                             Register arg_1,
2387                             Register arg_2,
2388                             bool check_exceptions) {
2389
2390  LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
2391  pass_arg2(this, arg_2);
2392  pass_arg1(this, arg_1);
2393  call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
2394}
2395
2396void MacroAssembler::call_VM(Register oop_result,
2397                             Register last_java_sp,
2398                             address entry_point,
2399                             Register arg_1,
2400                             Register arg_2,
2401                             Register arg_3,
2402                             bool check_exceptions) {
2403  LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
2404  LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
2405  pass_arg3(this, arg_3);
2406  LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
2407  pass_arg2(this, arg_2);
2408  pass_arg1(this, arg_1);
2409  call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
2410}
2411
2412void MacroAssembler::super_call_VM(Register oop_result,
2413                                   Register last_java_sp,
2414                                   address entry_point,
2415                                   int number_of_arguments,
2416                                   bool check_exceptions) {
2417  Register thread = LP64_ONLY(r15_thread) NOT_LP64(noreg);
2418  MacroAssembler::call_VM_base(oop_result, thread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
2419}
2420
2421void MacroAssembler::super_call_VM(Register oop_result,
2422                                   Register last_java_sp,
2423                                   address entry_point,
2424                                   Register arg_1,
2425                                   bool check_exceptions) {
2426  pass_arg1(this, arg_1);
2427  super_call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
2428}
2429
2430void MacroAssembler::super_call_VM(Register oop_result,
2431                                   Register last_java_sp,
2432                                   address entry_point,
2433                                   Register arg_1,
2434                                   Register arg_2,
2435                                   bool check_exceptions) {
2436
2437  LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
2438  pass_arg2(this, arg_2);
2439  pass_arg1(this, arg_1);
2440  super_call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
2441}
2442
2443void MacroAssembler::super_call_VM(Register oop_result,
2444                                   Register last_java_sp,
2445                                   address entry_point,
2446                                   Register arg_1,
2447                                   Register arg_2,
2448                                   Register arg_3,
2449                                   bool check_exceptions) {
2450  LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
2451  LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
2452  pass_arg3(this, arg_3);
2453  LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
2454  pass_arg2(this, arg_2);
2455  pass_arg1(this, arg_1);
2456  super_call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
2457}
2458
2459void MacroAssembler::call_VM_base(Register oop_result,
2460                                  Register java_thread,
2461                                  Register last_java_sp,
2462                                  address  entry_point,
2463                                  int      number_of_arguments,
2464                                  bool     check_exceptions) {
2465  // determine java_thread register
2466  if (!java_thread->is_valid()) {
2467#ifdef _LP64
2468    java_thread = r15_thread;
2469#else
2470    java_thread = rdi;
2471    get_thread(java_thread);
2472#endif // LP64
2473  }
2474  // determine last_java_sp register
2475  if (!last_java_sp->is_valid()) {
2476    last_java_sp = rsp;
2477  }
2478  // debugging support
2479  assert(number_of_arguments >= 0   , "cannot have negative number of arguments");
2480  LP64_ONLY(assert(java_thread == r15_thread, "unexpected register"));
2481#ifdef ASSERT
2482  // TraceBytecodes does not use r12 but saves it over the call, so don't verify
2483  // r12 is the heapbase.
2484  LP64_ONLY(if ((UseCompressedOops || UseCompressedClassPointers) && !TraceBytecodes) verify_heapbase("call_VM_base: heap base corrupted?");)
2485#endif // ASSERT
2486
2487  assert(java_thread != oop_result  , "cannot use the same register for java_thread & oop_result");
2488  assert(java_thread != last_java_sp, "cannot use the same register for java_thread & last_java_sp");
2489
2490  // push java thread (becomes first argument of C function)
2491
2492  NOT_LP64(push(java_thread); number_of_arguments++);
2493  LP64_ONLY(mov(c_rarg0, r15_thread));
2494
2495  // set last Java frame before call
2496  assert(last_java_sp != rbp, "can't use ebp/rbp");
2497
2498  // Only interpreter should have to set fp
2499  set_last_Java_frame(java_thread, last_java_sp, rbp, NULL);
2500
2501  // do the call, remove parameters
2502  MacroAssembler::call_VM_leaf_base(entry_point, number_of_arguments);
2503
2504  // restore the thread (cannot use the pushed argument since arguments
2505  // may be overwritten by C code generated by an optimizing compiler);
2506  // however can use the register value directly if it is callee saved.
2507  if (LP64_ONLY(true ||) java_thread == rdi || java_thread == rsi) {
2508    // rdi & rsi (also r15) are callee saved -> nothing to do
2509#ifdef ASSERT
2510    guarantee(java_thread != rax, "change this code");
2511    push(rax);
2512    { Label L;
2513      get_thread(rax);
2514      cmpptr(java_thread, rax);
2515      jcc(Assembler::equal, L);
2516      STOP("MacroAssembler::call_VM_base: rdi not callee saved?");
2517      bind(L);
2518    }
2519    pop(rax);
2520#endif
2521  } else {
2522    get_thread(java_thread);
2523  }
2524  // reset last Java frame
2525  // Only interpreter should have to clear fp
2526  reset_last_Java_frame(java_thread, true, false);
2527
2528#ifndef CC_INTERP
2529   // C++ interp handles this in the interpreter
2530  check_and_handle_popframe(java_thread);
2531  check_and_handle_earlyret(java_thread);
2532#endif /* CC_INTERP */
2533
2534  if (check_exceptions) {
2535    // check for pending exceptions (java_thread is set upon return)
2536    cmpptr(Address(java_thread, Thread::pending_exception_offset()), (int32_t) NULL_WORD);
2537#ifndef _LP64
2538    jump_cc(Assembler::notEqual,
2539            RuntimeAddress(StubRoutines::forward_exception_entry()));
2540#else
2541    // This used to conditionally jump to forward_exception however it is
2542    // possible if we relocate that the branch will not reach. So we must jump
2543    // around so we can always reach
2544
2545    Label ok;
2546    jcc(Assembler::equal, ok);
2547    jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
2548    bind(ok);
2549#endif // LP64
2550  }
2551
2552  // get oop result if there is one and reset the value in the thread
2553  if (oop_result->is_valid()) {
2554    get_vm_result(oop_result, java_thread);
2555  }
2556}
2557
2558void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions) {
2559
2560  // Calculating the value for last_Java_sp is
2561  // somewhat subtle. call_VM does an intermediate call
2562  // which places a return address on the stack just under the
2563  // stack pointer as the user finished with it. This allows
2564  // us to retrieve last_Java_pc from last_Java_sp[-1].
2565  // On 32bit we then have to push additional args on the stack to accomplish
2566  // the actual requested call. On 64bit call_VM can only use register args
2567  // so the only extra space is the return address that call_VM created.
2568  // This hopefully explains the calculations here.
2569
2570#ifdef _LP64
2571  // We've pushed one address, correct last_Java_sp
2572  lea(rax, Address(rsp, wordSize));
2573#else
2574  lea(rax, Address(rsp, (1 + number_of_arguments) * wordSize));
2575#endif // LP64
2576
2577  call_VM_base(oop_result, noreg, rax, entry_point, number_of_arguments, check_exceptions);
2578
2579}
2580
2581void MacroAssembler::call_VM_leaf(address entry_point, int number_of_arguments) {
2582  call_VM_leaf_base(entry_point, number_of_arguments);
2583}
2584
2585void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0) {
2586  pass_arg0(this, arg_0);
2587  call_VM_leaf(entry_point, 1);
2588}
2589
2590void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
2591
2592  LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
2593  pass_arg1(this, arg_1);
2594  pass_arg0(this, arg_0);
2595  call_VM_leaf(entry_point, 2);
2596}
2597
2598void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
2599  LP64_ONLY(assert(arg_0 != c_rarg2, "smashed arg"));
2600  LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
2601  pass_arg2(this, arg_2);
2602  LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
2603  pass_arg1(this, arg_1);
2604  pass_arg0(this, arg_0);
2605  call_VM_leaf(entry_point, 3);
2606}
2607
2608void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0) {
2609  pass_arg0(this, arg_0);
2610  MacroAssembler::call_VM_leaf_base(entry_point, 1);
2611}
2612
2613void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
2614
2615  LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
2616  pass_arg1(this, arg_1);
2617  pass_arg0(this, arg_0);
2618  MacroAssembler::call_VM_leaf_base(entry_point, 2);
2619}
2620
2621void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
2622  LP64_ONLY(assert(arg_0 != c_rarg2, "smashed arg"));
2623  LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
2624  pass_arg2(this, arg_2);
2625  LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
2626  pass_arg1(this, arg_1);
2627  pass_arg0(this, arg_0);
2628  MacroAssembler::call_VM_leaf_base(entry_point, 3);
2629}
2630
2631void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) {
2632  LP64_ONLY(assert(arg_0 != c_rarg3, "smashed arg"));
2633  LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
2634  LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
2635  pass_arg3(this, arg_3);
2636  LP64_ONLY(assert(arg_0 != c_rarg2, "smashed arg"));
2637  LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
2638  pass_arg2(this, arg_2);
2639  LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
2640  pass_arg1(this, arg_1);
2641  pass_arg0(this, arg_0);
2642  MacroAssembler::call_VM_leaf_base(entry_point, 4);
2643}
2644
2645void MacroAssembler::get_vm_result(Register oop_result, Register java_thread) {
2646  movptr(oop_result, Address(java_thread, JavaThread::vm_result_offset()));
2647  movptr(Address(java_thread, JavaThread::vm_result_offset()), NULL_WORD);
2648  verify_oop(oop_result, "broken oop in call_VM_base");
2649}
2650
2651void MacroAssembler::get_vm_result_2(Register metadata_result, Register java_thread) {
2652  movptr(metadata_result, Address(java_thread, JavaThread::vm_result_2_offset()));
2653  movptr(Address(java_thread, JavaThread::vm_result_2_offset()), NULL_WORD);
2654}
2655
2656void MacroAssembler::check_and_handle_earlyret(Register java_thread) {
2657}
2658
2659void MacroAssembler::check_and_handle_popframe(Register java_thread) {
2660}
2661
2662void MacroAssembler::cmp32(AddressLiteral src1, int32_t imm) {
2663  if (reachable(src1)) {
2664    cmpl(as_Address(src1), imm);
2665  } else {
2666    lea(rscratch1, src1);
2667    cmpl(Address(rscratch1, 0), imm);
2668  }
2669}
2670
2671void MacroAssembler::cmp32(Register src1, AddressLiteral src2) {
2672  assert(!src2.is_lval(), "use cmpptr");
2673  if (reachable(src2)) {
2674    cmpl(src1, as_Address(src2));
2675  } else {
2676    lea(rscratch1, src2);
2677    cmpl(src1, Address(rscratch1, 0));
2678  }
2679}
2680
2681void MacroAssembler::cmp32(Register src1, int32_t imm) {
2682  Assembler::cmpl(src1, imm);
2683}
2684
2685void MacroAssembler::cmp32(Register src1, Address src2) {
2686  Assembler::cmpl(src1, src2);
2687}
2688
2689void MacroAssembler::cmpsd2int(XMMRegister opr1, XMMRegister opr2, Register dst, bool unordered_is_less) {
2690  ucomisd(opr1, opr2);
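  // ucomisd only sets ZF/PF/CF: unordered (NaN) -> ZF=PF=CF=1, less -> CF=1,
  // equal -> ZF=1, greater -> all clear.  The parity checks below route the
  // unordered case to -1 or +1 before the below/above and equal tests run.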
2691
2692  Label L;
2693  if (unordered_is_less) {
2694    movl(dst, -1);
2695    jcc(Assembler::parity, L);
2696    jcc(Assembler::below , L);
2697    movl(dst, 0);
2698    jcc(Assembler::equal , L);
2699    increment(dst);
2700  } else { // unordered is greater
2701    movl(dst, 1);
2702    jcc(Assembler::parity, L);
2703    jcc(Assembler::above , L);
2704    movl(dst, 0);
2705    jcc(Assembler::equal , L);
2706    decrementl(dst);
2707  }
2708  bind(L);
2709}
2710
2711void MacroAssembler::cmpss2int(XMMRegister opr1, XMMRegister opr2, Register dst, bool unordered_is_less) {
2712  ucomiss(opr1, opr2);
2713
2714  Label L;
2715  if (unordered_is_less) {
2716    movl(dst, -1);
2717    jcc(Assembler::parity, L);
2718    jcc(Assembler::below , L);
2719    movl(dst, 0);
2720    jcc(Assembler::equal , L);
2721    increment(dst);
2722  } else { // unordered is greater
2723    movl(dst, 1);
2724    jcc(Assembler::parity, L);
2725    jcc(Assembler::above , L);
2726    movl(dst, 0);
2727    jcc(Assembler::equal , L);
2728    decrementl(dst);
2729  }
2730  bind(L);
2731}
2732
2733
2734void MacroAssembler::cmp8(AddressLiteral src1, int imm) {
2735  if (reachable(src1)) {
2736    cmpb(as_Address(src1), imm);
2737  } else {
2738    lea(rscratch1, src1);
2739    cmpb(Address(rscratch1, 0), imm);
2740  }
2741}
2742
2743void MacroAssembler::cmpptr(Register src1, AddressLiteral src2) {
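  // An is_lval() literal means the address itself is the comparand, so it is
  // materialized (with its relocation) rather than dereferenced; otherwise we
  // compare src1 against the word stored at the literal's target.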
2744#ifdef _LP64
2745  if (src2.is_lval()) {
2746    movptr(rscratch1, src2);
2747    Assembler::cmpq(src1, rscratch1);
2748  } else if (reachable(src2)) {
2749    cmpq(src1, as_Address(src2));
2750  } else {
2751    lea(rscratch1, src2);
2752    Assembler::cmpq(src1, Address(rscratch1, 0));
2753  }
2754#else
2755  if (src2.is_lval()) {
2756    cmp_literal32(src1, (int32_t) src2.target(), src2.rspec());
2757  } else {
2758    cmpl(src1, as_Address(src2));
2759  }
2760#endif // _LP64
2761}
2762
2763void MacroAssembler::cmpptr(Address src1, AddressLiteral src2) {
2764  assert(src2.is_lval(), "not a mem-mem compare");
2765#ifdef _LP64
2766  // moves src2's literal address
2767  movptr(rscratch1, src2);
2768  Assembler::cmpq(src1, rscratch1);
2769#else
2770  cmp_literal32(src1, (int32_t) src2.target(), src2.rspec());
2771#endif // _LP64
2772}
2773
2774void MacroAssembler::locked_cmpxchgptr(Register reg, AddressLiteral adr) {
2775  if (reachable(adr)) {
2776    if (os::is_MP())
2777      lock();
2778    cmpxchgptr(reg, as_Address(adr));
2779  } else {
2780    lea(rscratch1, adr);
2781    if (os::is_MP())
2782      lock();
2783    cmpxchgptr(reg, Address(rscratch1, 0));
2784  }
2785}
2786
2787void MacroAssembler::cmpxchgptr(Register reg, Address adr) {
2788  LP64_ONLY(cmpxchgq(reg, adr)) NOT_LP64(cmpxchgl(reg, adr));
2789}
2790
2791void MacroAssembler::comisd(XMMRegister dst, AddressLiteral src) {
2792  if (reachable(src)) {
2793    Assembler::comisd(dst, as_Address(src));
2794  } else {
2795    lea(rscratch1, src);
2796    Assembler::comisd(dst, Address(rscratch1, 0));
2797  }
2798}
2799
2800void MacroAssembler::comiss(XMMRegister dst, AddressLiteral src) {
2801  if (reachable(src)) {
2802    Assembler::comiss(dst, as_Address(src));
2803  } else {
2804    lea(rscratch1, src);
2805    Assembler::comiss(dst, Address(rscratch1, 0));
2806  }
2807}
2808
2809
2810void MacroAssembler::cond_inc32(Condition cond, AddressLiteral counter_addr) {
2811  Condition negated_cond = negate_condition(cond);
2812  Label L;
2813  jcc(negated_cond, L);
2814  pushf(); // Preserve flags
2815  atomic_incl(counter_addr);
2816  popf();
2817  bind(L);
2818}
2819
2820int MacroAssembler::corrected_idivl(Register reg) {
2821  // Full implementation of Java idiv and irem; checks for
2822  // special case as described in JVM spec., p.243 & p.271.
2823  // The function returns the (pc) offset of the idivl
2824  // instruction - may be needed for implicit exceptions.
2825  //
2826  //         normal case                           special case
2827  //
2828  // input : rax: dividend                          min_int
2829  //         reg: divisor   (may not be rax/rdx)    -1
2830  //
2831  // output: rax: quotient  (= rax idiv reg)        min_int
2832  //         rdx: remainder (= rax irem reg)        0
2833  assert(reg != rax && reg != rdx, "reg cannot be rax or rdx register");
2834  const int min_int = 0x80000000;
2835  Label normal_case, special_case;
2836
2837  // check for special case
2838  cmpl(rax, min_int);
2839  jcc(Assembler::notEqual, normal_case);
2840  xorl(rdx, rdx); // prepare rdx for possible special case (where remainder = 0)
2841  cmpl(reg, -1);
2842  jcc(Assembler::equal, special_case);
2843
2844  // handle normal case
2845  bind(normal_case);
2846  cdql();
2847  int idivl_offset = offset();
2848  idivl(reg);
2849
2850  // normal and special case exit
2851  bind(special_case);
2852
2853  return idivl_offset;
2854}
2855
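// Worked example for corrected_idivl above: the only overflowing case of
// 32-bit signed division is min_int / -1, whose true quotient 2^31 is not
// representable, so a hardware idiv would raise #DE. The code therefore skips
// idiv for exactly that operand pair and produces quotient = min_int and
// remainder = 0, matching JVM semantics (in Java,
// Integer.MIN_VALUE / -1 == Integer.MIN_VALUE and Integer.MIN_VALUE % -1 == 0).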
2856
2857
2858void MacroAssembler::decrementl(Register reg, int value) {
2859  if (value == min_jint) {subl(reg, value) ; return; }
2860  if (value <  0) { incrementl(reg, -value); return; }
2861  if (value == 0) {                        ; return; }
2862  if (value == 1 && UseIncDec) { decl(reg) ; return; }
2863  /* else */      { subl(reg, value)       ; return; }
2864}
2865
2866void MacroAssembler::decrementl(Address dst, int value) {
2867  if (value == min_jint) {subl(dst, value) ; return; }
2868  if (value <  0) { incrementl(dst, -value); return; }
2869  if (value == 0) {                        ; return; }
2870  if (value == 1 && UseIncDec) { decl(dst) ; return; }
2871  /* else */      { subl(dst, value)       ; return; }
2872}
2873
2874void MacroAssembler::division_with_shift (Register reg, int shift_value) {
2875  assert (shift_value > 0, "illegal shift value");
2876  Label _is_positive;
2877  testl (reg, reg);
2878  jcc (Assembler::positive, _is_positive);
2879  int offset = (1 << shift_value) - 1 ;
2880
2881  if (offset == 1) {
2882    incrementl(reg);
2883  } else {
2884    addl(reg, offset);
2885  }
2886
2887  bind (_is_positive);
2888  sarl(reg, shift_value);
2889}
2890
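// Worked example for division_with_shift above: an arithmetic right shift
// alone rounds toward negative infinity, while Java/C++ integer division
// truncates toward zero, so negative dividends need the (2^shift - 1) bias.
// For reg = -7, shift_value = 2: (-7 + 3) >> 2 == -1 (correct), whereas a
// plain -7 >> 2 == -2 would be wrong.
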
2891void MacroAssembler::divsd(XMMRegister dst, AddressLiteral src) {
2892  if (reachable(src)) {
2893    Assembler::divsd(dst, as_Address(src));
2894  } else {
2895    lea(rscratch1, src);
2896    Assembler::divsd(dst, Address(rscratch1, 0));
2897  }
2898}
2899
2900void MacroAssembler::divss(XMMRegister dst, AddressLiteral src) {
2901  if (reachable(src)) {
2902    Assembler::divss(dst, as_Address(src));
2903  } else {
2904    lea(rscratch1, src);
2905    Assembler::divss(dst, Address(rscratch1, 0));
2906  }
2907}
2908
2909// !defined(COMPILER2) is because of stupid core builds
2910#if !defined(_LP64) || defined(COMPILER1) || !defined(COMPILER2) || INCLUDE_JVMCI
2911void MacroAssembler::empty_FPU_stack() {
2912  if (VM_Version::supports_mmx()) {
2913    emms();
2914  } else {
2915    for (int i = 8; i-- > 0; ) ffree(i);
2916  }
2917}
2918#endif // !LP64 || C1 || !C2 || INCLUDE_JVMCI
2919
2920
2921// Defines obj, preserves var_size_in_bytes
2922void MacroAssembler::eden_allocate(Register obj,
2923                                   Register var_size_in_bytes,
2924                                   int con_size_in_bytes,
2925                                   Register t1,
2926                                   Label& slow_case) {
2927  assert(obj == rax, "obj must be in rax, for cmpxchg");
2928  assert_different_registers(obj, var_size_in_bytes, t1);
2929  if (!Universe::heap()->supports_inline_contig_alloc()) {
2930    jmp(slow_case);
2931  } else {
2932    Register end = t1;
2933    Label retry;
2934    bind(retry);
2935    ExternalAddress heap_top((address) Universe::heap()->top_addr());
2936    movptr(obj, heap_top);
2937    if (var_size_in_bytes == noreg) {
2938      lea(end, Address(obj, con_size_in_bytes));
2939    } else {
2940      lea(end, Address(obj, var_size_in_bytes, Address::times_1));
2941    }
2942    // if end < obj then we wrapped around => object too long => slow case
2943    cmpptr(end, obj);
2944    jcc(Assembler::below, slow_case);
2945    cmpptr(end, ExternalAddress((address) Universe::heap()->end_addr()));
2946    jcc(Assembler::above, slow_case);
2947    // Compare obj with the current top addr; if they are still equal, store end
2948    // (the new top) at the top addr. Sets ZF if they were equal, and clears it
2949    // otherwise. Use the lock prefix for atomicity on MPs.
2950    locked_cmpxchgptr(end, heap_top);
2951    jcc(Assembler::notEqual, retry);
2952  }
2953}
2954
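// The allocation loop in eden_allocate above is a bump-pointer CAS. A rough
// C-style sketch of the control flow (illustrative only; the field and helper
// names here are placeholders, not VM API):
//   do {
//     obj = *heap_top;
//     end = obj + size;
//     if (end < obj || end > heap_end) goto slow_case;  // overflow or heap full
//   } while (!CAS(heap_top, obj, end));                 // retry if another thread raced us
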
2955void MacroAssembler::enter() {
2956  push(rbp);
2957  mov(rbp, rsp);
2958}
2959
2960// A 5 byte nop that is safe for patching (see patch_verified_entry)
2961void MacroAssembler::fat_nop() {
2962  if (UseAddressNop) {
2963    addr_nop_5();
2964  } else {
2965    emit_int8(0x26); // es:
2966    emit_int8(0x2e); // cs:
2967    emit_int8(0x64); // fs:
2968    emit_int8(0x65); // gs:
2969    emit_int8((unsigned char)0x90);
2970  }
2971}
2972
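// The fallback in fat_nop above emits es/cs/fs/gs segment-override prefixes
// followed by nop, i.e. the byte sequence 0x26 0x2e 0x64 0x65 0x90. That is a
// single 5-byte instruction, so patch_verified_entry can later overwrite the
// same 5 bytes with a jump without splitting an instruction boundary.
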
2973void MacroAssembler::fcmp(Register tmp) {
2974  fcmp(tmp, 1, true, true);
2975}
2976
2977void MacroAssembler::fcmp(Register tmp, int index, bool pop_left, bool pop_right) {
2978  assert(!pop_right || pop_left, "usage error");
2979  if (VM_Version::supports_cmov()) {
2980    assert(tmp == noreg, "unneeded temp");
2981    if (pop_left) {
2982      fucomip(index);
2983    } else {
2984      fucomi(index);
2985    }
2986    if (pop_right) {
2987      fpop();
2988    }
2989  } else {
2990    assert(tmp != noreg, "need temp");
2991    if (pop_left) {
2992      if (pop_right) {
2993        fcompp();
2994      } else {
2995        fcomp(index);
2996      }
2997    } else {
2998      fcom(index);
2999    }
3000    // convert FPU condition into eflags condition via rax,
3001    save_rax(tmp);
3002    fwait(); fnstsw_ax();
3003    sahf();
3004    restore_rax(tmp);
3005  }
3006  // condition codes set as follows:
3007  //
3008  // CF (corresponds to C0) if x < y
3009  // PF (corresponds to C2) if unordered
3010  // ZF (corresponds to C3) if x = y
3011}
3012
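// In the non-cmov path of fcmp above, the x87 status word is moved into AH and
// sahf transfers it to EFLAGS, so the x87 condition codes land exactly as the
// trailing comment describes: C0 -> CF, C2 -> PF, C3 -> ZF. That matches the
// flag layout produced by ucomiss/ucomisd, so the same jcc conditions
// (below/equal/parity) work for both the x87 and SSE compare paths.
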
3013void MacroAssembler::fcmp2int(Register dst, bool unordered_is_less) {
3014  fcmp2int(dst, unordered_is_less, 1, true, true);
3015}
3016
3017void MacroAssembler::fcmp2int(Register dst, bool unordered_is_less, int index, bool pop_left, bool pop_right) {
3018  fcmp(VM_Version::supports_cmov() ? noreg : dst, index, pop_left, pop_right);
3019  Label L;
3020  if (unordered_is_less) {
3021    movl(dst, -1);
3022    jcc(Assembler::parity, L);
3023    jcc(Assembler::below , L);
3024    movl(dst, 0);
3025    jcc(Assembler::equal , L);
3026    increment(dst);
3027  } else { // unordered is greater
3028    movl(dst, 1);
3029    jcc(Assembler::parity, L);
3030    jcc(Assembler::above , L);
3031    movl(dst, 0);
3032    jcc(Assembler::equal , L);
3033    decrementl(dst);
3034  }
3035  bind(L);
3036}
3037
3038void MacroAssembler::fld_d(AddressLiteral src) {
3039  fld_d(as_Address(src));
3040}
3041
3042void MacroAssembler::fld_s(AddressLiteral src) {
3043  fld_s(as_Address(src));
3044}
3045
3046void MacroAssembler::fld_x(AddressLiteral src) {
3047  Assembler::fld_x(as_Address(src));
3048}
3049
3050void MacroAssembler::fldcw(AddressLiteral src) {
3051  Assembler::fldcw(as_Address(src));
3052}
3053
3054void MacroAssembler::mulpd(XMMRegister dst, AddressLiteral src) {
3055  if (reachable(src)) {
3056    Assembler::mulpd(dst, as_Address(src));
3057  } else {
3058    lea(rscratch1, src);
3059    Assembler::mulpd(dst, Address(rscratch1, 0));
3060  }
3061}
3062
3063void MacroAssembler::pow_exp_core_encoding() {
3064  // kills rax, rcx, rdx
3065  subptr(rsp,sizeof(jdouble));
3066  // computes 2^X. Stack: X ...
3067  // f2xm1 computes 2^X-1 but only operates on -1<=X<=1. Get int(X) and
3068  // keep it on the thread's stack to compute 2^int(X) later
3069  // then compute 2^(X-int(X)) as (2^(X-int(X)-1+1)
3070  // final result is obtained with: 2^X = 2^int(X) * 2^(X-int(X))
3071  fld_s(0);                 // Stack: X X ...
3072  frndint();                // Stack: int(X) X ...
3073  fsuba(1);                 // Stack: int(X) X-int(X) ...
3074  fistp_s(Address(rsp,0));  // move int(X) as integer to thread's stack. Stack: X-int(X) ...
3075  f2xm1();                  // Stack: 2^(X-int(X))-1 ...
3076  fld1();                   // Stack: 1 2^(X-int(X))-1 ...
3077  faddp(1);                 // Stack: 2^(X-int(X))
3078  // computes 2^(int(X)): add exponent bias (1023) to int(X), then
3079  // shift int(X)+1023 to exponent position.
3080  // Exponent is limited to 11 bits if int(X)+1023 does not fit in 11
3081  // bits, set result to NaN. 0x000 and 0x7FF are reserved exponent
3082  // values so detect them and set result to NaN.
3083  movl(rax,Address(rsp,0));
3084  movl(rcx, -2048); // 11 bit mask and valid NaN binary encoding
3085  addl(rax, 1023);
3086  movl(rdx,rax);
3087  shll(rax,20);
3088  // Check that 0 < int(X)+1023 < 2047. Otherwise set rax to NaN.
3089  addl(rdx,1);
3090  // Check that 1 < int(X)+1023+1 < 2048
3091  // in 3 steps:
3092  // 1- (int(X)+1023+1)&-2048 == 0 => 0 <= int(X)+1023+1 < 2048
3093  // 2- (int(X)+1023+1)&-2048 != 0
3094  // 3- (int(X)+1023+1)&-2048 != 1
3095  // Do 2- first because addl just updated the flags.
3096  cmov32(Assembler::equal,rax,rcx);
3097  cmpl(rdx,1);
3098  cmov32(Assembler::equal,rax,rcx);
3099  testl(rdx,rcx);
3100  cmov32(Assembler::notEqual,rax,rcx);
3101  movl(Address(rsp,4),rax);
3102  movl(Address(rsp,0),0);
3103  fmul_d(Address(rsp,0));   // Stack: 2^X ...
3104  addptr(rsp,sizeof(jdouble));
3105}
3106
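// Worked example for pow_exp_core_encoding above (illustrative): for
// int(X) = 3, rax = 3 + 1023 = 1026; shll(rax, 20) places 1026 in the
// exponent field (bits 20..30 of the high dword of an IEEE-754 double).
// With the low dword and the rest of the mantissa zero, the stored double is
// 2^(1026-1023) = 8.0, which then scales 2^(X-int(X)) via fmul_d.
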
3107void MacroAssembler::increase_precision() {
3108  subptr(rsp, BytesPerWord);
3109  fnstcw(Address(rsp, 0));
3110  movl(rax, Address(rsp, 0));
3111  orl(rax, 0x300);
3112  push(rax);
3113  fldcw(Address(rsp, 0));
3114  pop(rax);
3115}
3116
3117void MacroAssembler::restore_precision() {
3118  fldcw(Address(rsp, 0));
3119  addptr(rsp, BytesPerWord);
3120}
3121
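// increase_precision above sets both bits of the x87 precision-control field
// (bits 8-9 of the FPU control word, hence the orl of 0x300), selecting
// 64-bit (double-extended) significands for the intermediate pow/exp steps;
// restore_precision reloads the original control word saved on the stack.
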
3122void MacroAssembler::fast_pow() {
3123  // computes X^Y = 2^(Y * log2(X))
3124  // if fast computation is not possible, result is NaN. Requires
3125  // fallback from user of this macro.
3126  // increase precision for intermediate steps of the computation
3127  BLOCK_COMMENT("fast_pow {");
3128  increase_precision();
3129  fyl2x();                 // Stack: (Y*log2(X)) ...
3130  pow_exp_core_encoding(); // Stack: 2^(Y*log2(X)) = X^Y ...
3131  restore_precision();
3132  BLOCK_COMMENT("} fast_pow");
3133}
3134
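// fast_pow above relies on fyl2x, which computes ST(1) * log2(ST(0)) and pops:
// with X on top of Y it leaves Y*log2(X), and pow_exp_core_encoding then
// yields 2^(Y*log2(X)) = X^Y. The identity only holds for X > 0, which is why
// pow_or_exp below handles negative X separately.
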
3135void MacroAssembler::pow_or_exp(int num_fpu_regs_in_use) {
3136  // kills rax, rcx, rdx
3137  // pow and exp need 2 extra registers on the FPU stack.
3138  Label slow_case, done;
3139  Register tmp = noreg;
3140  if (!VM_Version::supports_cmov()) {
3141    // fcmp needs a temporary, so preserve rdx
3142    tmp = rdx;
3143  }
3144  Register tmp2 = rax;
3145  Register tmp3 = rcx;
3146
3147  // Stack: X Y
3148  Label x_negative, y_not_2;
3149
3150  static double two = 2.0;
3151  ExternalAddress two_addr((address)&two);
3152
3153  // the constant may be too far away on 64 bit
3154  lea(tmp2, two_addr);
3155  fld_d(Address(tmp2, 0));    // Stack: 2 X Y
3156  fcmp(tmp, 2, true, false);  // Stack: X Y
3157  jcc(Assembler::parity, y_not_2);
3158  jcc(Assembler::notEqual, y_not_2);
3159
3160  fxch(); fpop();             // Stack: X
3161  fmul(0);                    // Stack: X*X
3162
3163  jmp(done);
3164
3165  bind(y_not_2);
3166
3167  fldz();                     // Stack: 0 X Y
3168  fcmp(tmp, 1, true, false);  // Stack: X Y
3169  jcc(Assembler::above, x_negative);
3170
3171  // X >= 0
3172
3173  fld_s(1);                   // duplicate arguments for runtime call. Stack: Y X Y
3174  fld_s(1);                   // Stack: X Y X Y
3175  fast_pow();                 // Stack: X^Y X Y
3176  fcmp(tmp, 0, false, false); // Stack: X^Y X Y
3177  // X^Y not equal to itself: X^Y is NaN, go to the slow case.
3178  jcc(Assembler::parity, slow_case);
3179  // get rid of duplicate arguments. Stack: X^Y
3180  if (num_fpu_regs_in_use > 0) {
3181    fxch(); fpop();
3182    fxch(); fpop();
3183  } else {
3184    ffree(2);
3185    ffree(1);
3186  }
3187  jmp(done);
3188
3189  // X <= 0
3190  bind(x_negative);
3191
3192  fld_s(1);                   // Stack: Y X Y
3193  frndint();                  // Stack: int(Y) X Y
3194  fcmp(tmp, 2, false, false); // Stack: int(Y) X Y
3195  jcc(Assembler::notEqual, slow_case);
3196
3197  subptr(rsp, 8);
3198
3199  // For X^Y, when X < 0, Y has to be an integer and the final
3200  // result depends on whether it's odd or even. We just checked
3201  // that int(Y) == Y.  We move int(Y) to gp registers as a 64 bit
3202  // integer to test its parity. If int(Y) is huge and doesn't fit
3203  // in the 64 bit integer range, the integer indefinite value will
3204  // end up in the gp registers. Huge numbers are all even, the
3205  // integer indefinite number is even so it's fine.
3206
3207#ifdef ASSERT
3208  // Let's check we don't end up with an integer indefinite number
3209  // when not expected. First test for huge numbers: check whether
3210  // int(Y)+1 == int(Y) which is true for very large numbers and
3211  // those are all even. A 64 bit integer is guaranteed to not
3212  // overflow for numbers where y+1 != y (when precision is set to
3213  // double precision).
3214  Label y_not_huge;
3215
3216  fld1();                     // Stack: 1 int(Y) X Y
3217  fadd(1);                    // Stack: 1+int(Y) int(Y) X Y
3218
3219#ifdef _LP64
3220  // trip to memory to force the precision down from double extended
3221  // precision
3222  fstp_d(Address(rsp, 0));
3223  fld_d(Address(rsp, 0));
3224#endif
3225
3226  fcmp(tmp, 1, true, false);  // Stack: int(Y) X Y
3227#endif
3228
3229  // move int(Y) as 64 bit integer to thread's stack
3230  fistp_d(Address(rsp,0));    // Stack: X Y
3231
3232#ifdef ASSERT
3233  jcc(Assembler::notEqual, y_not_huge);
3234
3235  // Y is huge so we know it's even. It may not fit in a 64 bit
3236  // integer and we don't want the debug code below to see the
3237  // integer indefinite value so overwrite int(Y) on the thread's
3238  // stack with 0.
3239  movl(Address(rsp, 0), 0);
3240  movl(Address(rsp, 4), 0);
3241
3242  bind(y_not_huge);
3243#endif
3244
3245  fld_s(1);                   // duplicate arguments for runtime call. Stack: Y X Y
3246  fld_s(1);                   // Stack: X Y X Y
3247  fabs();                     // Stack: abs(X) Y X Y
3248  fast_pow();                 // Stack: abs(X)^Y X Y
3249  fcmp(tmp, 0, false, false); // Stack: abs(X)^Y X Y
3250  // abs(X)^Y not equal to itself: abs(X)^Y is NaN, go to the slow case.
3251
3252  pop(tmp2);
3253  NOT_LP64(pop(tmp3));
3254  jcc(Assembler::parity, slow_case);
3255
3256#ifdef ASSERT
3257  // Check that int(Y) is not integer indefinite value (int
3258  // overflow). Shouldn't happen because for values that would
3259  // overflow, 1+int(Y)==Y which was tested earlier.
3260#ifndef _LP64
3261  {
3262    Label integer;
3263    testl(tmp2, tmp2);
3264    jcc(Assembler::notZero, integer);
3265    cmpl(tmp3, 0x80000000);
3266    jcc(Assembler::notZero, integer);
3267    STOP("integer indefinite value shouldn't be seen here");
3268    bind(integer);
3269  }
3270#else
3271  {
3272    Label integer;
3273    mov(tmp3, tmp2); // preserve tmp2 for parity check below
3274    shlq(tmp3, 1);
3275    jcc(Assembler::carryClear, integer);
3276    jcc(Assembler::notZero, integer);
3277    STOP("integer indefinite value shouldn't be seen here");
3278    bind(integer);
3279  }
3280#endif
3281#endif
3282
3283  // get rid of duplicate arguments. Stack: X^Y
3284  if (num_fpu_regs_in_use > 0) {
3285    fxch(); fpop();
3286    fxch(); fpop();
3287  } else {
3288    ffree(2);
3289    ffree(1);
3290  }
3291
3292  testl(tmp2, 1);
3293  jcc(Assembler::zero, done); // X <= 0, Y even: X^Y = abs(X)^Y
3294  // X <= 0, Y odd:  X^Y = -abs(X)^Y
3295
3296  fchs();                     // Stack: -abs(X)^Y Y
3297  jmp(done);
3298
3299  // slow case: runtime call
3300  bind(slow_case);
3301
3302  fpop();                       // pop incorrect result or int(Y)
3303
3304  fp_runtime_fallback(CAST_FROM_FN_PTR(address, SharedRuntime::dpow), 2, num_fpu_regs_in_use);
3305
3306  // Come here with result in F-TOS
3307  bind(done);
3308}
3309
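// Example of the parity handling at the end of pow_or_exp above: for X < 0
// the result is computed as abs(X)^Y and the sign is fixed up from the low
// bit of int(Y), e.g. (-2)^3 = -(2^3) = -8 (Y odd, fchs applied) while
// (-2)^2 = 2^2 = 4 (Y even, no sign change).
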
3310void MacroAssembler::fpop() {
3311  ffree();
3312  fincstp();
3313}
3314
3315void MacroAssembler::load_float(Address src) {
3316  if (UseSSE >= 1) {
3317    movflt(xmm0, src);
3318  } else {
3319    LP64_ONLY(ShouldNotReachHere());
3320    NOT_LP64(fld_s(src));
3321  }
3322}
3323
3324void MacroAssembler::store_float(Address dst) {
3325  if (UseSSE >= 1) {
3326    movflt(dst, xmm0);
3327  } else {
3328    LP64_ONLY(ShouldNotReachHere());
3329    NOT_LP64(fstp_s(dst));
3330  }
3331}
3332
3333void MacroAssembler::load_double(Address src) {
3334  if (UseSSE >= 2) {
3335    movdbl(xmm0, src);
3336  } else {
3337    LP64_ONLY(ShouldNotReachHere());
3338    NOT_LP64(fld_d(src));
3339  }
3340}
3341
3342void MacroAssembler::store_double(Address dst) {
3343  if (UseSSE >= 2) {
3344    movdbl(dst, xmm0);
3345  } else {
3346    LP64_ONLY(ShouldNotReachHere());
3347    NOT_LP64(fstp_d(dst));
3348  }
3349}
3350
3351void MacroAssembler::fremr(Register tmp) {
3352  save_rax(tmp);
3353  { Label L;
3354    bind(L);
3355    fprem();
3356    fwait(); fnstsw_ax();
3357#ifdef _LP64
3358    testl(rax, 0x400);
3359    jcc(Assembler::notEqual, L);
3360#else
3361    sahf();
3362    jcc(Assembler::parity, L);
3363#endif // _LP64
3364  }
3365  restore_rax(tmp);
3366  // Result is in ST0.
3367  // Note: fxch & fpop to get rid of ST1
3368  // (otherwise FPU stack could overflow eventually)
3369  fxch(1);
3370  fpop();
3371}
3372
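// fremr above loops on fprem because fprem only produces a partial remainder:
// the x87 C2 status bit (mask 0x400 in the status word tested directly on
// LP64, or PF after sahf on 32-bit) stays set while the reduction is
// incomplete, and the loop repeats until it clears. The final fxch/fpop
// discards the divisor so only the remainder is left on the FPU stack.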
3373
3374void MacroAssembler::incrementl(AddressLiteral dst) {
3375  if (reachable(dst)) {
3376    incrementl(as_Address(dst));
3377  } else {
3378    lea(rscratch1, dst);
3379    incrementl(Address(rscratch1, 0));
3380  }
3381}
3382
3383void MacroAssembler::incrementl(ArrayAddress dst) {
3384  incrementl(as_Address(dst));
3385}
3386
3387void MacroAssembler::incrementl(Register reg, int value) {
3388  if (value == min_jint) {addl(reg, value) ; return; }
3389  if (value <  0) { decrementl(reg, -value); return; }
3390  if (value == 0) {                        ; return; }
3391  if (value == 1 && UseIncDec) { incl(reg) ; return; }
3392  /* else */      { addl(reg, value)       ; return; }
3393}
3394
3395void MacroAssembler::incrementl(Address dst, int value) {
3396  if (value == min_jint) {addl(dst, value) ; return; }
3397  if (value <  0) { decrementl(dst, -value); return; }
3398  if (value == 0) {                        ; return; }
3399  if (value == 1 && UseIncDec) { incl(dst) ; return; }
3400  /* else */      { addl(dst, value)       ; return; }
3401}
3402
3403void MacroAssembler::jump(AddressLiteral dst) {
3404  if (reachable(dst)) {
3405    jmp_literal(dst.target(), dst.rspec());
3406  } else {
3407    lea(rscratch1, dst);
3408    jmp(rscratch1);
3409  }
3410}
3411
3412void MacroAssembler::jump_cc(Condition cc, AddressLiteral dst) {
3413  if (reachable(dst)) {
3414    InstructionMark im(this);
3415    relocate(dst.reloc());
3416    const int short_size = 2;
3417    const int long_size = 6;
3418    int offs = (intptr_t)dst.target() - ((intptr_t)pc());
3419    if (dst.reloc() == relocInfo::none && is8bit(offs - short_size)) {
3420      // 0111 tttn #8-bit disp
3421      emit_int8(0x70 | cc);
3422      emit_int8((offs - short_size) & 0xFF);
3423    } else {
3424      // 0000 1111 1000 tttn #32-bit disp
3425      emit_int8(0x0F);
3426      emit_int8((unsigned char)(0x80 | cc));
3427      emit_int32(offs - long_size);
3428    }
3429  } else {
3430#ifdef ASSERT
3431    warning("reversing conditional branch");
3432#endif /* ASSERT */
3433    Label skip;
3434    jccb(reverse[cc], skip);
3435    lea(rscratch1, dst);
3436    Assembler::jmp(rscratch1);
3437    bind(skip);
3438  }
3439}
3440
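// Encoding note for jump_cc above: the short form is 0x70|cc followed by an
// 8-bit displacement (2 bytes), the near form is 0x0F, 0x80|cc followed by a
// 32-bit displacement (6 bytes). For example, cc = Assembler::equal (0x4)
// emits 0x74 disp8 or 0x0F 0x84 disp32. When the target is unreachable, the
// condition is reversed around an indirect jmp through rscratch1 instead.
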
3441void MacroAssembler::ldmxcsr(AddressLiteral src) {
3442  if (reachable(src)) {
3443    Assembler::ldmxcsr(as_Address(src));
3444  } else {
3445    lea(rscratch1, src);
3446    Assembler::ldmxcsr(Address(rscratch1, 0));
3447  }
3448}
3449
3450int MacroAssembler::load_signed_byte(Register dst, Address src) {
3451  int off;
3452  if (LP64_ONLY(true ||) VM_Version::is_P6()) {
3453    off = offset();
3454    movsbl(dst, src); // movsxb
3455  } else {
3456    off = load_unsigned_byte(dst, src);
3457    shll(dst, 24);
3458    sarl(dst, 24);
3459  }
3460  return off;
3461}
3462
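// On pre-P6 32-bit hardware the else branch of load_signed_byte above
// sign-extends manually: e.g. loading the byte 0xFF zero-extended gives
// 0x000000FF; shll by 24 makes 0xFF000000 and sarl by 24 yields 0xFFFFFFFF,
// i.e. -1, the same result movsbl would produce directly.
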
3463// Note: load_signed_short used to be called load_signed_word.
3464// Although the 'w' in x86 opcodes refers to the term "word" in the assembler
3465// manual, which means 16 bits, that usage is found nowhere in HotSpot code.
3466// The term "word" in HotSpot means a 32- or 64-bit machine word.
3467int MacroAssembler::load_signed_short(Register dst, Address src) {
3468  int off;
3469  if (LP64_ONLY(true ||) VM_Version::is_P6()) {
3470    // This is dubious to me since it seems safe to do a signed 16 => 64 bit
3471    // version, but this is what 64-bit has always done. This seems to imply
3472    // that users are only using 32 bits' worth.
3473    off = offset();
3474    movswl(dst, src); // movsxw
3475  } else {
3476    off = load_unsigned_short(dst, src);
3477    shll(dst, 16);
3478    sarl(dst, 16);
3479  }
3480  return off;
3481}
3482
3483int MacroAssembler::load_unsigned_byte(Register dst, Address src) {
3484  // According to Intel Doc. AP-526, "Zero-Extension of Short", p.16,
3485  // and "3.9 Partial Register Penalties", p. 22.
3486  int off;
3487  if (LP64_ONLY(true || ) VM_Version::is_P6() || src.uses(dst)) {
3488    off = offset();
3489    movzbl(dst, src); // movzxb
3490  } else {
3491    xorl(dst, dst);
3492    off = offset();
3493    movb(dst, src);
3494  }
3495  return off;
3496}
3497
3498// Note: load_unsigned_short used to be called load_unsigned_word.
3499int MacroAssembler::load_unsigned_short(Register dst, Address src) {
3500  // According to Intel Doc. AP-526, "Zero-Extension of Short", p.16,
3501  // and "3.9 Partial Register Penalties", p. 22.
3502  int off;
3503  if (LP64_ONLY(true ||) VM_Version::is_P6() || src.uses(dst)) {
3504    off = offset();
3505    movzwl(dst, src); // movzxw
3506  } else {
3507    xorl(dst, dst);
3508    off = offset();
3509    movw(dst, src);
3510  }
3511  return off;
3512}
3513
3514void MacroAssembler::load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed, Register dst2) {
3515  switch (size_in_bytes) {
3516#ifndef _LP64
3517  case  8:
3518    assert(dst2 != noreg, "second dest register required");
3519    movl(dst,  src);
3520    movl(dst2, src.plus_disp(BytesPerInt));
3521    break;
3522#else
3523  case  8:  movq(dst, src); break;
3524#endif
3525  case  4:  movl(dst, src); break;
3526  case  2:  is_signed ? load_signed_short(dst, src) : load_unsigned_short(dst, src); break;
3527  case  1:  is_signed ? load_signed_byte( dst, src) : load_unsigned_byte( dst, src); break;
3528  default:  ShouldNotReachHere();
3529  }
3530}
3531
3532void MacroAssembler::store_sized_value(Address dst, Register src, size_t size_in_bytes, Register src2) {
3533  switch (size_in_bytes) {
3534#ifndef _LP64
3535  case  8:
3536    assert(src2 != noreg, "second source register required");
3537    movl(dst,                        src);
3538    movl(dst.plus_disp(BytesPerInt), src2);
3539    break;
3540#else
3541  case  8:  movq(dst, src); break;
3542#endif
3543  case  4:  movl(dst, src); break;
3544  case  2:  movw(dst, src); break;
3545  case  1:  movb(dst, src); break;
3546  default:  ShouldNotReachHere();
3547  }
3548}
3549
3550void MacroAssembler::mov32(AddressLiteral dst, Register src) {
3551  if (reachable(dst)) {
3552    movl(as_Address(dst), src);
3553  } else {
3554    lea(rscratch1, dst);
3555    movl(Address(rscratch1, 0), src);
3556  }
3557}
3558
3559void MacroAssembler::mov32(Register dst, AddressLiteral src) {
3560  if (reachable(src)) {
3561    movl(dst, as_Address(src));
3562  } else {
3563    lea(rscratch1, src);
3564    movl(dst, Address(rscratch1, 0));
3565  }
3566}
3567
3568// C++ bool manipulation
3569
3570void MacroAssembler::movbool(Register dst, Address src) {
3571  if(sizeof(bool) == 1)
3572    movb(dst, src);
3573  else if(sizeof(bool) == 2)
3574    movw(dst, src);
3575  else if(sizeof(bool) == 4)
3576    movl(dst, src);
3577  else
3578    // unsupported
3579    ShouldNotReachHere();
3580}
3581
3582void MacroAssembler::movbool(Address dst, bool boolconst) {
3583  if(sizeof(bool) == 1)
3584    movb(dst, (int) boolconst);
3585  else if(sizeof(bool) == 2)
3586    movw(dst, (int) boolconst);
3587  else if(sizeof(bool) == 4)
3588    movl(dst, (int) boolconst);
3589  else
3590    // unsupported
3591    ShouldNotReachHere();
3592}
3593
3594void MacroAssembler::movbool(Address dst, Register src) {
3595  if(sizeof(bool) == 1)
3596    movb(dst, src);
3597  else if(sizeof(bool) == 2)
3598    movw(dst, src);
3599  else if(sizeof(bool) == 4)
3600    movl(dst, src);
3601  else
3602    // unsupported
3603    ShouldNotReachHere();
3604}
3605
3606void MacroAssembler::movbyte(ArrayAddress dst, int src) {
3607  movb(as_Address(dst), src);
3608}
3609
3610void MacroAssembler::movdl(XMMRegister dst, AddressLiteral src) {
3611  if (reachable(src)) {
3612    movdl(dst, as_Address(src));
3613  } else {
3614    lea(rscratch1, src);
3615    movdl(dst, Address(rscratch1, 0));
3616  }
3617}
3618
3619void MacroAssembler::movq(XMMRegister dst, AddressLiteral src) {
3620  if (reachable(src)) {
3621    movq(dst, as_Address(src));
3622  } else {
3623    lea(rscratch1, src);
3624    movq(dst, Address(rscratch1, 0));
3625  }
3626}
3627
3628void MacroAssembler::movdbl(XMMRegister dst, AddressLiteral src) {
3629  if (reachable(src)) {
3630    if (UseXmmLoadAndClearUpper) {
3631      movsd (dst, as_Address(src));
3632    } else {
3633      movlpd(dst, as_Address(src));
3634    }
3635  } else {
3636    lea(rscratch1, src);
3637    if (UseXmmLoadAndClearUpper) {
3638      movsd (dst, Address(rscratch1, 0));
3639    } else {
3640      movlpd(dst, Address(rscratch1, 0));
3641    }
3642  }
3643}
3644
3645void MacroAssembler::movflt(XMMRegister dst, AddressLiteral src) {
3646  if (reachable(src)) {
3647    movss(dst, as_Address(src));
3648  } else {
3649    lea(rscratch1, src);
3650    movss(dst, Address(rscratch1, 0));
3651  }
3652}
3653
3654void MacroAssembler::movptr(Register dst, Register src) {
3655  LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
3656}
3657
3658void MacroAssembler::movptr(Register dst, Address src) {
3659  LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
3660}
3661
3662// src should NEVER be a real pointer. Use AddressLiteral for true pointers
3663void MacroAssembler::movptr(Register dst, intptr_t src) {
3664  LP64_ONLY(mov64(dst, src)) NOT_LP64(movl(dst, src));
3665}
3666
3667void MacroAssembler::movptr(Address dst, Register src) {
3668  LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
3669}
3670
3671void MacroAssembler::movdqu(Address dst, XMMRegister src) {
3672  if (UseAVX > 2 && !VM_Version::supports_avx512vl() && (src->encoding() > 15)) {
3673    Assembler::vextractf32x4h(dst, src, 0);
3674  } else {
3675    Assembler::movdqu(dst, src);
3676  }
3677}
3678
3679void MacroAssembler::movdqu(XMMRegister dst, Address src) {
3680  if (UseAVX > 2 && !VM_Version::supports_avx512vl() && (dst->encoding() > 15)) {
3681    Assembler::vinsertf32x4h(dst, src, 0);
3682  } else {
3683    Assembler::movdqu(dst, src);
3684  }
3685}
3686
3687void MacroAssembler::movdqu(XMMRegister dst, XMMRegister src) {
3688  if (UseAVX > 2 && !VM_Version::supports_avx512vl()) {
3689    Assembler::evmovdqul(dst, src, Assembler::AVX_512bit);
3690  } else {
3691    Assembler::movdqu(dst, src);
3692  }
3693}
3694
3695void MacroAssembler::movdqu(XMMRegister dst, AddressLiteral src) {
3696  if (reachable(src)) {
3697    movdqu(dst, as_Address(src));
3698  } else {
3699    lea(rscratch1, src);
3700    movdqu(dst, Address(rscratch1, 0));
3701  }
3702}
3703
3704void MacroAssembler::vmovdqu(Address dst, XMMRegister src) {
3705  if (UseAVX > 2 && !VM_Version::supports_avx512vl() && (src->encoding() > 15)) {
3706    Assembler::vextractf64x4h(dst, src, 0);
3707  } else {
3708    Assembler::vmovdqu(dst, src);
3709  }
3710}
3711
3712void MacroAssembler::vmovdqu(XMMRegister dst, Address src) {
3713  if (UseAVX > 2 && !VM_Version::supports_avx512vl() && (dst->encoding() > 15)) {
3714    Assembler::vinsertf64x4h(dst, src, 0);
3715  } else {
3716    Assembler::vmovdqu(dst, src);
3717  }
3718}
3719
3720void MacroAssembler::vmovdqu(XMMRegister dst, XMMRegister src) {
3721  if (UseAVX > 2 && !VM_Version::supports_avx512vl()) {
3722    Assembler::evmovdqul(dst, src, Assembler::AVX_512bit);
3723  }
3724  else {
3725    Assembler::vmovdqu(dst, src);
3726  }
3727}
3728
3729void MacroAssembler::vmovdqu(XMMRegister dst, AddressLiteral src) {
3730  if (reachable(src)) {
3731    vmovdqu(dst, as_Address(src));
3732  }
3733  else {
3734    lea(rscratch1, src);
3735    vmovdqu(dst, Address(rscratch1, 0));
3736  }
3737}
3738
3739void MacroAssembler::movdqa(XMMRegister dst, AddressLiteral src) {
3740  if (reachable(src)) {
3741    Assembler::movdqa(dst, as_Address(src));
3742  } else {
3743    lea(rscratch1, src);
3744    Assembler::movdqa(dst, Address(rscratch1, 0));
3745  }
3746}
3747
3748void MacroAssembler::movsd(XMMRegister dst, AddressLiteral src) {
3749  if (reachable(src)) {
3750    Assembler::movsd(dst, as_Address(src));
3751  } else {
3752    lea(rscratch1, src);
3753    Assembler::movsd(dst, Address(rscratch1, 0));
3754  }
3755}
3756
3757void MacroAssembler::movss(XMMRegister dst, AddressLiteral src) {
3758  if (reachable(src)) {
3759    Assembler::movss(dst, as_Address(src));
3760  } else {
3761    lea(rscratch1, src);
3762    Assembler::movss(dst, Address(rscratch1, 0));
3763  }
3764}
3765
3766void MacroAssembler::mulsd(XMMRegister dst, AddressLiteral src) {
3767  if (reachable(src)) {
3768    Assembler::mulsd(dst, as_Address(src));
3769  } else {
3770    lea(rscratch1, src);
3771    Assembler::mulsd(dst, Address(rscratch1, 0));
3772  }
3773}
3774
3775void MacroAssembler::mulss(XMMRegister dst, AddressLiteral src) {
3776  if (reachable(src)) {
3777    Assembler::mulss(dst, as_Address(src));
3778  } else {
3779    lea(rscratch1, src);
3780    Assembler::mulss(dst, Address(rscratch1, 0));
3781  }
3782}
3783
3784void MacroAssembler::null_check(Register reg, int offset) {
3785  if (needs_explicit_null_check(offset)) {
3786    // provoke OS NULL exception if reg = NULL by
3787    // accessing M[reg] w/o changing any (non-CC) registers
3788    // NOTE: cmpl is plenty here to provoke a segv
3789    cmpptr(rax, Address(reg, 0));
3790    // Note: should probably use testl(rax, Address(reg, 0));
3791    //       may be shorter code (however, this version of
3792    //       testl needs to be implemented first)
3793  } else {
3794    // nothing to do, (later) access of M[reg + offset]
3795    // will provoke OS NULL exception if reg = NULL
3796  }
3797}
3798
3799void MacroAssembler::os_breakpoint() {
3800  // instead of directly emitting a breakpoint, call os:breakpoint for better debugability
3801  // (e.g., MSVC can't call ps() otherwise)
3802  call(RuntimeAddress(CAST_FROM_FN_PTR(address, os::breakpoint)));
3803}
3804
3805#ifdef _LP64
3806#define XSTATE_BV 0x200
3807#endif
3808
3809void MacroAssembler::pop_CPU_state() {
3810  pop_FPU_state();
3811  pop_IU_state();
3812}
3813
3814void MacroAssembler::pop_FPU_state() {
3815#ifndef _LP64
3816  frstor(Address(rsp, 0));
3817#else
3818  fxrstor(Address(rsp, 0));
3819#endif
3820  addptr(rsp, FPUStateSizeInWords * wordSize);
3821}
3822
3823void MacroAssembler::pop_IU_state() {
3824  popa();
3825  LP64_ONLY(addq(rsp, 8));
3826  popf();
3827}
3828
3829// Save Integer and Float state
3830// Warning: Stack must be 16 byte aligned (64bit)
3831void MacroAssembler::push_CPU_state() {
3832  push_IU_state();
3833  push_FPU_state();
3834}
3835
3836void MacroAssembler::push_FPU_state() {
3837  subptr(rsp, FPUStateSizeInWords * wordSize);
3838#ifndef _LP64
3839  fnsave(Address(rsp, 0));
3840  fwait();
3841#else
3842  fxsave(Address(rsp, 0));
3843#endif // LP64
3844}
3845
3846void MacroAssembler::push_IU_state() {
3847  // Push flags first because pusha kills them
3848  pushf();
3849  // Make sure rsp stays 16-byte aligned
3850  LP64_ONLY(subq(rsp, 8));
3851  pusha();
3852}
3853
3854void MacroAssembler::reset_last_Java_frame(Register java_thread, bool clear_fp, bool clear_pc) {
3855  // determine java_thread register
3856  if (!java_thread->is_valid()) {
3857    java_thread = rdi;
3858    get_thread(java_thread);
3859  }
3860  // we must set sp to zero to clear frame
3861  movptr(Address(java_thread, JavaThread::last_Java_sp_offset()), NULL_WORD);
3862  if (clear_fp) {
3863    movptr(Address(java_thread, JavaThread::last_Java_fp_offset()), NULL_WORD);
3864  }
3865
3866  if (clear_pc)
3867    movptr(Address(java_thread, JavaThread::last_Java_pc_offset()), NULL_WORD);
3868
3869}
3870
3871void MacroAssembler::restore_rax(Register tmp) {
3872  if (tmp == noreg) pop(rax);
3873  else if (tmp != rax) mov(rax, tmp);
3874}
3875
3876void MacroAssembler::round_to(Register reg, int modulus) {
3877  addptr(reg, modulus - 1);
3878  andptr(reg, -modulus);
3879}
3880
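// round_to above rounds reg up to the next multiple of modulus (a power of
// two): e.g. for reg = 23, modulus = 16, (23 + 15) & -16 == 32, while an
// already aligned value is left unchanged ((32 + 15) & -16 == 32).
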
3881void MacroAssembler::save_rax(Register tmp) {
3882  if (tmp == noreg) push(rax);
3883  else if (tmp != rax) mov(tmp, rax);
3884}
3885
3886// Write serialization page so VM thread can do a pseudo remote membar.
3887// We use the current thread pointer to calculate a thread specific
3888// offset to write to within the page. This minimizes bus traffic
3889// due to cache line collision.
3890void MacroAssembler::serialize_memory(Register thread, Register tmp) {
3891  movl(tmp, thread);
3892  shrl(tmp, os::get_serialize_page_shift_count());
3893  andl(tmp, (os::vm_page_size() - sizeof(int)));
3894
3895  Address index(noreg, tmp, Address::times_1);
3896  ExternalAddress page(os::get_memory_serialize_page());
3897
3898  // Size of store must match masking code above
3899  movl(as_Address(ArrayAddress(page, index)), tmp);
3900}
3901
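// In serialize_memory above, the thread pointer is shifted and masked down to
// an int-aligned, thread-specific offset inside the memory-serialize page, so
// concurrent threads tend to hit different cache lines. The store itself is
// the synchronization point: when the VM thread write-protects that page, any
// thread racing through this code traps and is thereby serialized.
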
3902// Calls to C land
3903//
3904// When entering C land, rbp and rsp of the last Java frame have to be recorded
3905// in the (thread-local) JavaThread object. When leaving C land, the last Java fp
3906// has to be reset to 0. This is required to allow proper stack traversal.
3907void MacroAssembler::set_last_Java_frame(Register java_thread,
3908                                         Register last_java_sp,
3909                                         Register last_java_fp,
3910                                         address  last_java_pc) {
3911  // determine java_thread register
3912  if (!java_thread->is_valid()) {
3913    java_thread = rdi;
3914    get_thread(java_thread);
3915  }
3916  // determine last_java_sp register
3917  if (!last_java_sp->is_valid()) {
3918    last_java_sp = rsp;
3919  }
3920
3921  // last_java_fp is optional
3922
3923  if (last_java_fp->is_valid()) {
3924    movptr(Address(java_thread, JavaThread::last_Java_fp_offset()), last_java_fp);
3925  }
3926
3927  // last_java_pc is optional
3928
3929  if (last_java_pc != NULL) {
3930    lea(Address(java_thread,
3931                 JavaThread::frame_anchor_offset() + JavaFrameAnchor::last_Java_pc_offset()),
3932        InternalAddress(last_java_pc));
3933
3934  }
3935  movptr(Address(java_thread, JavaThread::last_Java_sp_offset()), last_java_sp);
3936}
3937
3938void MacroAssembler::shlptr(Register dst, int imm8) {
3939  LP64_ONLY(shlq(dst, imm8)) NOT_LP64(shll(dst, imm8));
3940}
3941
3942void MacroAssembler::shrptr(Register dst, int imm8) {
3943  LP64_ONLY(shrq(dst, imm8)) NOT_LP64(shrl(dst, imm8));
3944}
3945
3946void MacroAssembler::sign_extend_byte(Register reg) {
3947  if (LP64_ONLY(true ||) (VM_Version::is_P6() && reg->has_byte_register())) {
3948    movsbl(reg, reg); // movsxb
3949  } else {
3950    shll(reg, 24);
3951    sarl(reg, 24);
3952  }
3953}
3954
3955void MacroAssembler::sign_extend_short(Register reg) {
3956  if (LP64_ONLY(true ||) VM_Version::is_P6()) {
3957    movswl(reg, reg); // movsxw
3958  } else {
3959    shll(reg, 16);
3960    sarl(reg, 16);
3961  }
3962}
3963
3964void MacroAssembler::testl(Register dst, AddressLiteral src) {
3965  assert(reachable(src), "Address should be reachable");
3966  testl(dst, as_Address(src));
3967}
3968
3969void MacroAssembler::pcmpeqb(XMMRegister dst, XMMRegister src) {
3970  int dst_enc = dst->encoding();
3971  int src_enc = src->encoding();
3972  if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
3973    Assembler::pcmpeqb(dst, src);
3974  } else if ((dst_enc < 16) && (src_enc < 16)) {
3975    Assembler::pcmpeqb(dst, src);
3976  } else if (src_enc < 16) {
3977    subptr(rsp, 64);
3978    evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
3979    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
3980    Assembler::pcmpeqb(xmm0, src);
3981    movdqu(dst, xmm0);
3982    evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
3983    addptr(rsp, 64);
3984  } else if (dst_enc < 16) {
3985    subptr(rsp, 64);
3986    evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
3987    evmovdqul(xmm0, src, Assembler::AVX_512bit);
3988    Assembler::pcmpeqb(dst, xmm0);
3989    evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
3990    addptr(rsp, 64);
3991  } else {
3992    subptr(rsp, 64);
3993    evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
3994    subptr(rsp, 64);
3995    evmovdqul(Address(rsp, 0), xmm1, Assembler::AVX_512bit);
3996    movdqu(xmm0, src);
3997    movdqu(xmm1, dst);
3998    Assembler::pcmpeqb(xmm1, xmm0);
3999    movdqu(dst, xmm1);
4000    evmovdqul(xmm1, Address(rsp, 0), Assembler::AVX_512bit);
4001    addptr(rsp, 64);
4002    evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
4003    addptr(rsp, 64);
4004  }
4005}
4006
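// The fallback paths in pcmpeqb above (and in the similar wrappers below) deal
// with EVEX-only registers: without AVX512VL/AVX512BW the legacy 128-bit
// encodings can only name xmm0-xmm15, so operands living in xmm16-xmm31 are
// funneled through xmm0 (and xmm1 when both operands are high). The current
// contents of those scratch registers are saved to a 64-byte (zmm-sized)
// stack slot with evmovdqul and restored afterwards, so the wrappers stay
// transparent to the caller.
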
4007void MacroAssembler::pcmpeqw(XMMRegister dst, XMMRegister src) {
4008  int dst_enc = dst->encoding();
4009  int src_enc = src->encoding();
4010  if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
4011    Assembler::pcmpeqw(dst, src);
4012  } else if ((dst_enc < 16) && (src_enc < 16)) {
4013    Assembler::pcmpeqw(dst, src);
4014  } else if (src_enc < 16) {
4015    subptr(rsp, 64);
4016    evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
4017    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4018    Assembler::pcmpeqw(xmm0, src);
4019    movdqu(dst, xmm0);
4020    evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
4021    addptr(rsp, 64);
4022  } else if (dst_enc < 16) {
4023    subptr(rsp, 64);
4024    evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
4025    evmovdqul(xmm0, src, Assembler::AVX_512bit);
4026    Assembler::pcmpeqw(dst, xmm0);
4027    evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
4028    addptr(rsp, 64);
4029  } else {
4030    subptr(rsp, 64);
4031    evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
4032    subptr(rsp, 64);
4033    evmovdqul(Address(rsp, 0), xmm1, Assembler::AVX_512bit);
4034    movdqu(xmm0, src);
4035    movdqu(xmm1, dst);
4036    Assembler::pcmpeqw(xmm1, xmm0);
4037    movdqu(dst, xmm1);
4038    evmovdqul(xmm1, Address(rsp, 0), Assembler::AVX_512bit);
4039    addptr(rsp, 64);
4040    evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
4041    addptr(rsp, 64);
4042  }
4043}
4044
4045void MacroAssembler::pcmpestri(XMMRegister dst, Address src, int imm8) {
4046  int dst_enc = dst->encoding();
4047  if (dst_enc < 16) {
4048    Assembler::pcmpestri(dst, src, imm8);
4049  } else {
4050    subptr(rsp, 64);
4051    evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
4052    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4053    Assembler::pcmpestri(xmm0, src, imm8);
4054    movdqu(dst, xmm0);
4055    evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
4056    addptr(rsp, 64);
4057  }
4058}
4059
4060void MacroAssembler::pcmpestri(XMMRegister dst, XMMRegister src, int imm8) {
4061  int dst_enc = dst->encoding();
4062  int src_enc = src->encoding();
4063  if ((dst_enc < 16) && (src_enc < 16)) {
4064    Assembler::pcmpestri(dst, src, imm8);
4065  } else if (src_enc < 16) {
4066    subptr(rsp, 64);
4067    evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
4068    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4069    Assembler::pcmpestri(xmm0, src, imm8);
4070    movdqu(dst, xmm0);
4071    evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
4072    addptr(rsp, 64);
4073  } else if (dst_enc < 16) {
4074    subptr(rsp, 64);
4075    evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
4076    evmovdqul(xmm0, src, Assembler::AVX_512bit);
4077    Assembler::pcmpestri(dst, xmm0, imm8);
4078    evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
4079    addptr(rsp, 64);
4080  } else {
4081    subptr(rsp, 64);
4082    evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
4083    subptr(rsp, 64);
4084    evmovdqul(Address(rsp, 0), xmm1, Assembler::AVX_512bit);
4085    movdqu(xmm0, src);
4086    movdqu(xmm1, dst);
4087    Assembler::pcmpestri(xmm1, xmm0, imm8);
4088    movdqu(dst, xmm1);
4089    evmovdqul(xmm1, Address(rsp, 0), Assembler::AVX_512bit);
4090    addptr(rsp, 64);
4091    evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
4092    addptr(rsp, 64);
4093  }
4094}
4095
4096void MacroAssembler::pmovzxbw(XMMRegister dst, XMMRegister src) {
4097  int dst_enc = dst->encoding();
4098  int src_enc = src->encoding();
4099  if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
4100    Assembler::pmovzxbw(dst, src);
4101  } else if ((dst_enc < 16) && (src_enc < 16)) {
4102    Assembler::pmovzxbw(dst, src);
4103  } else if (src_enc < 16) {
4104    subptr(rsp, 64);
4105    evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
4106    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4107    Assembler::pmovzxbw(xmm0, src);
4108    movdqu(dst, xmm0);
4109    evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
4110    addptr(rsp, 64);
4111  } else if (dst_enc < 16) {
4112    subptr(rsp, 64);
4113    evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
4114    evmovdqul(xmm0, src, Assembler::AVX_512bit);
4115    Assembler::pmovzxbw(dst, xmm0);
4116    evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
4117    addptr(rsp, 64);
4118  } else {
4119    subptr(rsp, 64);
4120    evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
4121    subptr(rsp, 64);
4122    evmovdqul(Address(rsp, 0), xmm1, Assembler::AVX_512bit);
4123    movdqu(xmm0, src);
4124    movdqu(xmm1, dst);
4125    Assembler::pmovzxbw(xmm1, xmm0);
4126    movdqu(dst, xmm1);
4127    evmovdqul(xmm1, Address(rsp, 0), Assembler::AVX_512bit);
4128    addptr(rsp, 64);
4129    evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
4130    addptr(rsp, 64);
4131  }
4132}
4133
4134void MacroAssembler::pmovzxbw(XMMRegister dst, Address src) {
4135  int dst_enc = dst->encoding();
4136  if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
4137    Assembler::pmovzxbw(dst, src);
4138  } else if (dst_enc < 16) {
4139    Assembler::pmovzxbw(dst, src);
4140  } else {
4141    subptr(rsp, 64);
4142    evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
4143    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4144    Assembler::pmovzxbw(xmm0, src);
4145    movdqu(dst, xmm0);
4146    evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
4147    addptr(rsp, 64);
4148  }
4149}
4150
4151void MacroAssembler::pmovmskb(Register dst, XMMRegister src) {
4152  int src_enc = src->encoding();
4153  if (src_enc < 16) {
4154    Assembler::pmovmskb(dst, src);
4155  } else {
4156    subptr(rsp, 64);
4157    evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
4158    evmovdqul(xmm0, src, Assembler::AVX_512bit);
4159    Assembler::pmovmskb(dst, xmm0);
4160    evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
4161    addptr(rsp, 64);
4162  }
4163}
4164
4165void MacroAssembler::ptest(XMMRegister dst, XMMRegister src) {
4166  int dst_enc = dst->encoding();
4167  int src_enc = src->encoding();
4168  if ((dst_enc < 16) && (src_enc < 16)) {
4169    Assembler::ptest(dst, src);
4170  } else if (src_enc < 16) {
4171    subptr(rsp, 64);
4172    evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
4173    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4174    Assembler::ptest(xmm0, src);
4175    evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
4176    addptr(rsp, 64);
4177  } else if (dst_enc < 16) {
4178    subptr(rsp, 64);
4179    evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
4180    evmovdqul(xmm0, src, Assembler::AVX_512bit);
4181    Assembler::ptest(dst, xmm0);
4182    evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
4183    addptr(rsp, 64);
4184  } else {
4185    subptr(rsp, 64);
4186    evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
4187    subptr(rsp, 64);
4188    evmovdqul(Address(rsp, 0), xmm1, Assembler::AVX_512bit);
4189    movdqu(xmm0, src);
4190    movdqu(xmm1, dst);
4191    Assembler::ptest(xmm1, xmm0);
4192    evmovdqul(xmm1, Address(rsp, 0), Assembler::AVX_512bit);
4193    addptr(rsp, 64);
4194    evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
4195    addptr(rsp, 64);
4196  }
4197}
4198
4199void MacroAssembler::sqrtsd(XMMRegister dst, AddressLiteral src) {
4200  if (reachable(src)) {
4201    Assembler::sqrtsd(dst, as_Address(src));
4202  } else {
4203    lea(rscratch1, src);
4204    Assembler::sqrtsd(dst, Address(rscratch1, 0));
4205  }
4206}
4207
4208void MacroAssembler::sqrtss(XMMRegister dst, AddressLiteral src) {
4209  if (reachable(src)) {
4210    Assembler::sqrtss(dst, as_Address(src));
4211  } else {
4212    lea(rscratch1, src);
4213    Assembler::sqrtss(dst, Address(rscratch1, 0));
4214  }
4215}
4216
4217void MacroAssembler::subsd(XMMRegister dst, AddressLiteral src) {
4218  if (reachable(src)) {
4219    Assembler::subsd(dst, as_Address(src));
4220  } else {
4221    lea(rscratch1, src);
4222    Assembler::subsd(dst, Address(rscratch1, 0));
4223  }
4224}
4225
4226void MacroAssembler::subss(XMMRegister dst, AddressLiteral src) {
4227  if (reachable(src)) {
4228    Assembler::subss(dst, as_Address(src));
4229  } else {
4230    lea(rscratch1, src);
4231    Assembler::subss(dst, Address(rscratch1, 0));
4232  }
4233}
4234
4235void MacroAssembler::ucomisd(XMMRegister dst, AddressLiteral src) {
4236  if (reachable(src)) {
4237    Assembler::ucomisd(dst, as_Address(src));
4238  } else {
4239    lea(rscratch1, src);
4240    Assembler::ucomisd(dst, Address(rscratch1, 0));
4241  }
4242}
4243
4244void MacroAssembler::ucomiss(XMMRegister dst, AddressLiteral src) {
4245  if (reachable(src)) {
4246    Assembler::ucomiss(dst, as_Address(src));
4247  } else {
4248    lea(rscratch1, src);
4249    Assembler::ucomiss(dst, Address(rscratch1, 0));
4250  }
4251}
4252
4253void MacroAssembler::xorpd(XMMRegister dst, AddressLiteral src) {
4254  // Used in sign-bit flipping with aligned address.
4255  assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
4256  if (reachable(src)) {
4257    Assembler::xorpd(dst, as_Address(src));
4258  } else {
4259    lea(rscratch1, src);
4260    Assembler::xorpd(dst, Address(rscratch1, 0));
4261  }
4262}
4263
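// xorpd/xorps with an AddressLiteral above are used for sign-bit flipping:
// xoring a double with the mask 0x8000000000000000 (or a float with
// 0x80000000) toggles only the sign bit, turning x into -x without touching
// exponent or mantissa. The 16-byte alignment assert exists because the
// legacy SSE encodings require an aligned memory operand.
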
4264void MacroAssembler::xorpd(XMMRegister dst, XMMRegister src) {
4265  if (UseAVX > 2 && !VM_Version::supports_avx512dq() && (dst->encoding() == src->encoding())) {
4266    Assembler::vpxor(dst, dst, src, Assembler::AVX_512bit);
4267  }
4268  else {
4269    Assembler::xorpd(dst, src);
4270  }
4271}
4272
4273void MacroAssembler::xorps(XMMRegister dst, XMMRegister src) {
4274  if (UseAVX > 2 && !VM_Version::supports_avx512dq() && (dst->encoding() == src->encoding())) {
4275    Assembler::vpxor(dst, dst, src, Assembler::AVX_512bit);
4276  } else {
4277    Assembler::xorps(dst, src);
4278  }
4279}
4280
4281void MacroAssembler::xorps(XMMRegister dst, AddressLiteral src) {
4282  // Used in sign-bit flipping with aligned address.
4283  assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
4284  if (reachable(src)) {
4285    Assembler::xorps(dst, as_Address(src));
4286  } else {
4287    lea(rscratch1, src);
4288    Assembler::xorps(dst, Address(rscratch1, 0));
4289  }
4290}
4291
4292void MacroAssembler::pshufb(XMMRegister dst, AddressLiteral src) {
4293  // Used in sign-bit flipping with aligned address.
4294  bool aligned_adr = (((intptr_t)src.target() & 15) == 0);
4295  assert((UseAVX > 0) || aligned_adr, "SSE mode requires address alignment 16 bytes");
4296  if (reachable(src)) {
4297    Assembler::pshufb(dst, as_Address(src));
4298  } else {
4299    lea(rscratch1, src);
4300    Assembler::pshufb(dst, Address(rscratch1, 0));
4301  }
4302}
4303
4304// AVX 3-operands instructions
4305
4306void MacroAssembler::vaddsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
4307  if (reachable(src)) {
4308    vaddsd(dst, nds, as_Address(src));
4309  } else {
4310    lea(rscratch1, src);
4311    vaddsd(dst, nds, Address(rscratch1, 0));
4312  }
4313}
4314
4315void MacroAssembler::vaddss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
4316  if (reachable(src)) {
4317    vaddss(dst, nds, as_Address(src));
4318  } else {
4319    lea(rscratch1, src);
4320    vaddss(dst, nds, Address(rscratch1, 0));
4321  }
4322}
4323
4324void MacroAssembler::vabsss(XMMRegister dst, XMMRegister nds, XMMRegister src, AddressLiteral negate_field, int vector_len) {
4325  int dst_enc = dst->encoding();
4326  int nds_enc = nds->encoding();
4327  int src_enc = src->encoding();
4328  if ((dst_enc < 16) && (nds_enc < 16)) {
4329    vandps(dst, nds, negate_field, vector_len);
4330  } else if ((src_enc < 16) && (dst_enc < 16)) {
4331    movss(src, nds);
4332    vandps(dst, src, negate_field, vector_len);
4333  } else if (src_enc < 16) {
4334    movss(src, nds);
4335    vandps(src, src, negate_field, vector_len);
4336    movss(dst, src);
4337  } else if (dst_enc < 16) {
4338    movdqu(src, xmm0);
4339    movss(xmm0, nds);
4340    vandps(dst, xmm0, negate_field, vector_len);
4341    movdqu(xmm0, src);
4342  } else if (nds_enc < 16) {
4343    movdqu(src, xmm0);
4344    vandps(xmm0, nds, negate_field, vector_len);
4345    movss(dst, xmm0);
4346    movdqu(xmm0, src);
4347  } else {
4348    movdqu(src, xmm0);
4349    movss(xmm0, nds);
4350    vandps(xmm0, xmm0, negate_field, vector_len);
4351    movss(dst, xmm0);
4352    movdqu(xmm0, src);
4353  }
4354}
4355
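// vabsss above (and vabssd below) compute a scalar absolute value by ANDing
// with negate_field, which callers presumably point at a mask that clears the
// sign bit (e.g. 0x7FFFFFFF for float, 0x7FFFFFFFFFFFFFFF for double), leaving
// |x|. The register shuffling mirrors the EVEX-only handling described
// earlier: operands in xmm16-xmm31 are staged through a low register (or xmm0)
// that the legacy/AVX1 vandps and vandpd encodings can address.
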
4356void MacroAssembler::vabssd(XMMRegister dst, XMMRegister nds, XMMRegister src, AddressLiteral negate_field, int vector_len) {
4357  int dst_enc = dst->encoding();
4358  int nds_enc = nds->encoding();
4359  int src_enc = src->encoding();
4360  if ((dst_enc < 16) && (nds_enc < 16)) {
4361    vandpd(dst, nds, negate_field, vector_len);
4362  } else if ((src_enc < 16) && (dst_enc < 16)) {
4363    movsd(src, nds);
4364    vandpd(dst, src, negate_field, vector_len);
4365  } else if (src_enc < 16) {
4366    movsd(src, nds);
4367    vandpd(src, src, negate_field, vector_len);
4368    movsd(dst, src);
4369  } else if (dst_enc < 16) {
4370    movdqu(src, xmm0);
4371    movsd(xmm0, nds);
4372    vandpd(dst, xmm0, negate_field, vector_len);
4373    movdqu(xmm0, src);
4374  } else if (nds_enc < 16) {
4375    movdqu(src, xmm0);
4376    vandpd(xmm0, nds, negate_field, vector_len);
4377    movsd(dst, xmm0);
4378    movdqu(xmm0, src);
4379  } else {
4380    movdqu(src, xmm0);
4381    movsd(xmm0, nds);
4382    vandpd(xmm0, xmm0, negate_field, vector_len);
4383    movsd(dst, xmm0);
4384    movdqu(xmm0, src);
4385  }
4386}
4387
4388void MacroAssembler::vpaddb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
4389  int dst_enc = dst->encoding();
4390  int nds_enc = nds->encoding();
4391  int src_enc = src->encoding();
4392  if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
4393    Assembler::vpaddb(dst, nds, src, vector_len);
4394  } else if ((dst_enc < 16) && (src_enc < 16)) {
4395    Assembler::vpaddb(dst, dst, src, vector_len);
4396  } else if ((dst_enc < 16) && (nds_enc < 16)) {
4397    // use nds as scratch for src
4398    evmovdqul(nds, src, Assembler::AVX_512bit);
4399    Assembler::vpaddb(dst, dst, nds, vector_len);
4400  } else if ((src_enc < 16) && (nds_enc < 16)) {
4401    // use nds as scratch for dst
4402    evmovdqul(nds, dst, Assembler::AVX_512bit);
4403    Assembler::vpaddb(nds, nds, src, vector_len);
4404    evmovdqul(dst, nds, Assembler::AVX_512bit);
4405  } else if (dst_enc < 16) {
4406    // use nds as scratch for xmm0 to hold src
4407    evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4408    evmovdqul(xmm0, src, Assembler::AVX_512bit);
4409    Assembler::vpaddb(dst, dst, xmm0, vector_len);
4410    evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4411  } else {
4412    // worst case scenario, all regs are in the upper bank
4413    subptr(rsp, 64);
4414    evmovdqul(Address(rsp, 0), xmm1, Assembler::AVX_512bit);
4415    evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4416    evmovdqul(xmm1, src, Assembler::AVX_512bit);
4417    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4418    Assembler::vpaddb(xmm0, xmm0, xmm1, vector_len);
4419    evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4420    evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4421    evmovdqul(xmm1, Address(rsp, 0), Assembler::AVX_512bit);
4422    addptr(rsp, 64);
4423  }
4424}
4425
4426void MacroAssembler::vpaddb(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
4427  int dst_enc = dst->encoding();
4428  int nds_enc = nds->encoding();
4429  if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
4430    Assembler::vpaddb(dst, nds, src, vector_len);
4431  } else if (dst_enc < 16) {
4432    Assembler::vpaddb(dst, dst, src, vector_len);
4433  } else if (nds_enc < 16) {
4434    // implies dst_enc in upper bank with src as scratch
4435    evmovdqul(nds, dst, Assembler::AVX_512bit);
4436    Assembler::vpaddb(nds, nds, src, vector_len);
4437    evmovdqul(dst, nds, Assembler::AVX_512bit);
4438  } else {
4439    // worst case scenario, all regs in upper bank
4440    evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4441    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4442    Assembler::vpaddb(xmm0, xmm0, src, vector_len);
4443    evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4444  }
4445}
4446
4447void MacroAssembler::vpaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
4448  int dst_enc = dst->encoding();
4449  int nds_enc = nds->encoding();
4450  int src_enc = src->encoding();
4451  if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
4452    Assembler::vpaddw(dst, nds, src, vector_len);
4453  } else if ((dst_enc < 16) && (src_enc < 16)) {
4454    Assembler::vpaddw(dst, dst, src, vector_len);
4455  } else if ((dst_enc < 16) && (nds_enc < 16)) {
4456    // use nds as scratch for src
4457    evmovdqul(nds, src, Assembler::AVX_512bit);
4458    Assembler::vpaddw(dst, dst, nds, vector_len);
4459  } else if ((src_enc < 16) && (nds_enc < 16)) {
4460    // use nds as scratch for dst
4461    evmovdqul(nds, dst, Assembler::AVX_512bit);
4462    Assembler::vpaddw(nds, nds, src, vector_len);
4463    evmovdqul(dst, nds, Assembler::AVX_512bit);
4464  } else if (dst_enc < 16) {
4465    // use nds as scratch for xmm0 to hold src
4466    evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4467    evmovdqul(xmm0, src, Assembler::AVX_512bit);
4468    Assembler::vpaddw(dst, dst, xmm0, vector_len);
4469    evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4470  } else {
4471    // worst case scenario, all regs are in the upper bank
4472    subptr(rsp, 64);
4473    evmovdqul(Address(rsp, 0), xmm1, Assembler::AVX_512bit);
4474    evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4475    evmovdqul(xmm1, src, Assembler::AVX_512bit);
4476    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4477    Assembler::vpaddw(xmm0, xmm0, xmm1, vector_len);
4478    evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4479    evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4480    evmovdqul(xmm1, Address(rsp, 0), Assembler::AVX_512bit);
4481    addptr(rsp, 64);
4482  }
4483}
4484
4485void MacroAssembler::vpaddw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
4486  int dst_enc = dst->encoding();
4487  int nds_enc = nds->encoding();
4488  if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
4489    Assembler::vpaddw(dst, nds, src, vector_len);
4490  } else if (dst_enc < 16) {
4491    Assembler::vpaddw(dst, dst, src, vector_len);
4492  } else if (nds_enc < 16) {
4493    // implies dst_enc in upper bank with src as scratch
4494    evmovdqul(nds, dst, Assembler::AVX_512bit);
4495    Assembler::vpaddw(nds, nds, src, vector_len);
4496    evmovdqul(dst, nds, Assembler::AVX_512bit);
4497  } else {
4498    // worst case scenario, all regs in upper bank
4499    evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4500    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4501    Assembler::vpaddw(xmm0, xmm0, src, vector_len);
        evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4502    evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4503  }
4504}
4505
4506void MacroAssembler::vpbroadcastw(XMMRegister dst, XMMRegister src) {
4507  int dst_enc = dst->encoding();
4508  int src_enc = src->encoding();
4509  if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
4510    Assembler::vpbroadcastw(dst, src);
4511  } else if ((dst_enc < 16) && (src_enc < 16)) {
4512    Assembler::vpbroadcastw(dst, src);
4513  } else if (src_enc < 16) {
4514    subptr(rsp, 64);
4515    evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
4516    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4517    Assembler::vpbroadcastw(xmm0, src);
4518    movdqu(dst, xmm0);
4519    evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
4520    addptr(rsp, 64);
4521  } else if (dst_enc < 16) {
4522    subptr(rsp, 64);
4523    evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
4524    evmovdqul(xmm0, src, Assembler::AVX_512bit);
4525    Assembler::vpbroadcastw(dst, xmm0);
4526    evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
4527    addptr(rsp, 64);
4528  } else {
4529    subptr(rsp, 64);
4530    evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
4531    subptr(rsp, 64);
4532    evmovdqul(Address(rsp, 0), xmm1, Assembler::AVX_512bit);
4533    movdqu(xmm0, src);
4534    movdqu(xmm1, dst);
4535    Assembler::vpbroadcastw(xmm1, xmm0);
4536    movdqu(dst, xmm1);
4537    evmovdqul(xmm1, Address(rsp, 0), Assembler::AVX_512bit);
4538    addptr(rsp, 64);
4539    evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
4540    addptr(rsp, 64);
4541  }
4542}
4543
4544void MacroAssembler::vpcmpeqb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
4545  int dst_enc = dst->encoding();
4546  int nds_enc = nds->encoding();
4547  int src_enc = src->encoding();
4548  assert(dst_enc == nds_enc, "");
4549  if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
4550    Assembler::vpcmpeqb(dst, nds, src, vector_len);
4551  } else if ((dst_enc < 16) && (src_enc < 16)) {
4552    Assembler::vpcmpeqb(dst, nds, src, vector_len);
4553  } else if (src_enc < 16) {
4554    subptr(rsp, 64);
4555    evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
4556    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4557    Assembler::vpcmpeqb(xmm0, xmm0, src, vector_len);
4558    movdqu(dst, xmm0);
4559    evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
4560    addptr(rsp, 64);
4561  } else if (dst_enc < 16) {
4562    subptr(rsp, 64);
4563    evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
4564    evmovdqul(xmm0, src, Assembler::AVX_512bit);
4565    Assembler::vpcmpeqb(dst, dst, xmm0, vector_len);
4566    evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
4567    addptr(rsp, 64);
4568  } else {
4569    subptr(rsp, 64);
4570    evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
4571    subptr(rsp, 64);
4572    evmovdqul(Address(rsp, 0), xmm1, Assembler::AVX_512bit);
4573    movdqu(xmm0, src);
4574    movdqu(xmm1, dst);
4575    Assembler::vpcmpeqb(xmm1, xmm1, xmm0, vector_len);
4576    movdqu(dst, xmm1);
4577    evmovdqul(xmm1, Address(rsp, 0), Assembler::AVX_512bit);
4578    addptr(rsp, 64);
4579    evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
4580    addptr(rsp, 64);
4581  }
4582}
4583
4584void MacroAssembler::vpcmpeqw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
4585  int dst_enc = dst->encoding();
4586  int nds_enc = nds->encoding();
4587  int src_enc = src->encoding();
4588  assert(dst_enc == nds_enc, "");
4589  if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
4590    Assembler::vpcmpeqw(dst, nds, src, vector_len);
4591  } else if ((dst_enc < 16) && (src_enc < 16)) {
4592    Assembler::vpcmpeqw(dst, nds, src, vector_len);
4593  } else if (src_enc < 16) {
4594    subptr(rsp, 64);
4595    evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
4596    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4597    Assembler::vpcmpeqw(xmm0, xmm0, src, vector_len);
4598    movdqu(dst, xmm0);
4599    evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
4600    addptr(rsp, 64);
4601  } else if (dst_enc < 16) {
4602    subptr(rsp, 64);
4603    evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
4604    evmovdqul(xmm0, src, Assembler::AVX_512bit);
4605    Assembler::vpcmpeqw(dst, dst, xmm0, vector_len);
4606    evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
4607    addptr(rsp, 64);
4608  } else {
4609    subptr(rsp, 64);
4610    evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
4611    subptr(rsp, 64);
4612    evmovdqul(Address(rsp, 0), xmm1, Assembler::AVX_512bit);
4613    movdqu(xmm0, src);
4614    movdqu(xmm1, dst);
4615    Assembler::vpcmpeqw(xmm1, xmm1, xmm0, vector_len);
4616    movdqu(dst, xmm1);
4617    evmovdqul(xmm1, Address(rsp, 0), Assembler::AVX_512bit);
4618    addptr(rsp, 64);
4619    evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
4620    addptr(rsp, 64);
4621  }
4622}
4623
4624void MacroAssembler::vpmovzxbw(XMMRegister dst, Address src, int vector_len) {
4625  int dst_enc = dst->encoding();
4626  if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
4627    Assembler::vpmovzxbw(dst, src, vector_len);
4628  } else if (dst_enc < 16) {
4629    Assembler::vpmovzxbw(dst, src, vector_len);
4630  } else {
4631    subptr(rsp, 64);
4632    evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
4633    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4634    Assembler::vpmovzxbw(xmm0, src, vector_len);
4635    movdqu(dst, xmm0);
4636    evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
4637    addptr(rsp, 64);
4638  }
4639}
4640
4641void MacroAssembler::vpmovmskb(Register dst, XMMRegister src) {
4642  int src_enc = src->encoding();
4643  if (src_enc < 16) {
4644    Assembler::vpmovmskb(dst, src);
4645  } else {
4646    subptr(rsp, 64);
4647    evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
4648    evmovdqul(xmm0, src, Assembler::AVX_512bit);
4649    Assembler::vpmovmskb(dst, xmm0);
4650    evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
4651    addptr(rsp, 64);
4652  }
4653}
4654
4655void MacroAssembler::vpmullw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
4656  int dst_enc = dst->encoding();
4657  int nds_enc = nds->encoding();
4658  int src_enc = src->encoding();
4659  if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
4660    Assembler::vpmullw(dst, nds, src, vector_len);
4661  } else if ((dst_enc < 16) && (src_enc < 16)) {
4662    Assembler::vpmullw(dst, dst, src, vector_len);
4663  } else if ((dst_enc < 16) && (nds_enc < 16)) {
4664    // use nds as scratch for src
4665    evmovdqul(nds, src, Assembler::AVX_512bit);
4666    Assembler::vpmullw(dst, dst, nds, vector_len);
4667  } else if ((src_enc < 16) && (nds_enc < 16)) {
4668    // use nds as scratch for dst
4669    evmovdqul(nds, dst, Assembler::AVX_512bit);
4670    Assembler::vpmullw(nds, nds, src, vector_len);
4671    evmovdqul(dst, nds, Assembler::AVX_512bit);
4672  } else if (dst_enc < 16) {
4673    // use nds as scratch for xmm0 to hold src
4674    evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4675    evmovdqul(xmm0, src, Assembler::AVX_512bit);
4676    Assembler::vpmullw(dst, dst, xmm0, vector_len);
4677    evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4678  } else {
4679    // worst case scenario, all regs are in the upper bank
4680    subptr(rsp, 64);
4681    evmovdqul(Address(rsp, 0), xmm1, Assembler::AVX_512bit);
4682    evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4683    evmovdqul(xmm1, src, Assembler::AVX_512bit);
4684    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4685    Assembler::vpmullw(xmm0, xmm0, xmm1, vector_len);
4686    evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4687    evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4688    evmovdqul(xmm1, Address(rsp, 0), Assembler::AVX_512bit);
4689    addptr(rsp, 64);
4690  }
4691}
4692
4693void MacroAssembler::vpmullw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
4694  int dst_enc = dst->encoding();
4695  int nds_enc = nds->encoding();
4696  if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
4697    Assembler::vpmullw(dst, nds, src, vector_len);
4698  } else if (dst_enc < 16) {
4699    Assembler::vpmullw(dst, dst, src, vector_len);
4700  } else if (nds_enc < 16) {
4701    // implies dst_enc is in the upper bank; use nds as scratch
4702    evmovdqul(nds, dst, Assembler::AVX_512bit);
4703    Assembler::vpmullw(nds, nds, src, vector_len);
4704    evmovdqul(dst, nds, Assembler::AVX_512bit);
4705  } else {
4706    // worst case scenario, all regs in upper bank
4707    evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4708    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4709    Assembler::vpmullw(xmm0, xmm0, src, vector_len);
        evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4710    evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4711  }
4712}
4713
4714void MacroAssembler::vpsubb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
4715  int dst_enc = dst->encoding();
4716  int nds_enc = nds->encoding();
4717  int src_enc = src->encoding();
4718  if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
4719    Assembler::vpsubb(dst, nds, src, vector_len);
4720  } else if ((dst_enc < 16) && (src_enc < 16)) {
4721    Assembler::vpsubb(dst, dst, src, vector_len);
4722  } else if ((dst_enc < 16) && (nds_enc < 16)) {
4723    // use nds as scratch for src
4724    evmovdqul(nds, src, Assembler::AVX_512bit);
4725    Assembler::vpsubb(dst, dst, nds, vector_len);
4726  } else if ((src_enc < 16) && (nds_enc < 16)) {
4727    // use nds as scratch for dst
4728    evmovdqul(nds, dst, Assembler::AVX_512bit);
4729    Assembler::vpsubb(nds, nds, src, vector_len);
4730    evmovdqul(dst, nds, Assembler::AVX_512bit);
4731  } else if (dst_enc < 16) {
4732    // use nds as scratch for xmm0 to hold src
4733    evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4734    evmovdqul(xmm0, src, Assembler::AVX_512bit);
4735    Assembler::vpsubb(dst, dst, xmm0, vector_len);
4736    evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4737  } else {
4738    // worst case scenario, all regs are in the upper bank
4739    subptr(rsp, 64);
4740    evmovdqul(Address(rsp, 0), xmm1, Assembler::AVX_512bit);
4741    evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4742    evmovdqul(xmm1, src, Assembler::AVX_512bit);
4743    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4744    Assembler::vpsubb(xmm0, xmm0, xmm1, vector_len);
4745    evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4746    evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4747    evmovdqul(xmm1, Address(rsp, 0), Assembler::AVX_512bit);
4748    addptr(rsp, 64);
4749  }
4750}
4751
4752void MacroAssembler::vpsubb(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
4753  int dst_enc = dst->encoding();
4754  int nds_enc = nds->encoding();
4755  if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
4756    Assembler::vpsubb(dst, nds, src, vector_len);
4757  } else if (dst_enc < 16) {
4758    Assembler::vpsubb(dst, dst, src, vector_len);
4759  } else if (nds_enc < 16) {
4760    // implies dst_enc is in the upper bank; use nds as scratch
4761    evmovdqul(nds, dst, Assembler::AVX_512bit);
4762    Assembler::vpsubb(nds, nds, src, vector_len);
4763    evmovdqul(dst, nds, Assembler::AVX_512bit);
4764  } else {
4765    // worst case scenario, all regs in upper bank
4766    evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4767    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4768    Assembler::vpsubb(xmm0, xmm0, src, vector_len);
        evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4769    evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4770  }
4771}
4772
4773void MacroAssembler::vpsubw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
4774  int dst_enc = dst->encoding();
4775  int nds_enc = nds->encoding();
4776  int src_enc = src->encoding();
4777  if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
4778    Assembler::vpsubw(dst, nds, src, vector_len);
4779  } else if ((dst_enc < 16) && (src_enc < 16)) {
4780    Assembler::vpsubw(dst, dst, src, vector_len);
4781  } else if ((dst_enc < 16) && (nds_enc < 16)) {
4782    // use nds as scratch for src
4783    evmovdqul(nds, src, Assembler::AVX_512bit);
4784    Assembler::vpsubw(dst, dst, nds, vector_len);
4785  } else if ((src_enc < 16) && (nds_enc < 16)) {
4786    // use nds as scratch for dst
4787    evmovdqul(nds, dst, Assembler::AVX_512bit);
4788    Assembler::vpsubw(nds, nds, src, vector_len);
4789    evmovdqul(dst, nds, Assembler::AVX_512bit);
4790  } else if (dst_enc < 16) {
4791    // use nds as scratch for xmm0 to hold src
4792    evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4793    evmovdqul(xmm0, src, Assembler::AVX_512bit);
4794    Assembler::vpsubw(dst, dst, xmm0, vector_len);
4795    evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4796  } else {
4797    // worst case scenario, all regs are in the upper bank
4798    subptr(rsp, 64);
4799    evmovdqul(Address(rsp, 0), xmm1, Assembler::AVX_512bit);
4800    evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4801    evmovdqul(xmm1, src, Assembler::AVX_512bit);
4802    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4803    Assembler::vpsubw(xmm0, xmm0, xmm1, vector_len);
4804    evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4805    evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4806    evmovdqul(xmm1, Address(rsp, 0), Assembler::AVX_512bit);
4807    addptr(rsp, 64);
4808  }
4809}
4810
4811void MacroAssembler::vpsubw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
4812  int dst_enc = dst->encoding();
4813  int nds_enc = nds->encoding();
4814  if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
4815    Assembler::vpsubw(dst, nds, src, vector_len);
4816  } else if (dst_enc < 16) {
4817    Assembler::vpsubw(dst, dst, src, vector_len);
4818  } else if (nds_enc < 16) {
4819    // implies dst_enc is in the upper bank; use nds as scratch
4820    evmovdqul(nds, dst, Assembler::AVX_512bit);
4821    Assembler::vpsubw(nds, nds, src, vector_len);
4822    evmovdqul(dst, nds, Assembler::AVX_512bit);
4823  } else {
4824    // worst case scenario, all regs in upper bank
4825    evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4826    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4827    Assembler::vpsubw(xmm0, xmm0, src, vector_len);
        evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4828    evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4829  }
4830}
4831
4832void MacroAssembler::vpsraw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
4833  int dst_enc = dst->encoding();
4834  int nds_enc = nds->encoding();
4835  int shift_enc = shift->encoding();
4836  if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
4837    Assembler::vpsraw(dst, nds, shift, vector_len);
4838  } else if ((dst_enc < 16) && (shift_enc < 16)) {
4839    Assembler::vpsraw(dst, dst, shift, vector_len);
4840  } else if ((dst_enc < 16) && (nds_enc < 16)) {
4841    // use nds_enc as scratch with shift
4842    evmovdqul(nds, shift, Assembler::AVX_512bit);
4843    Assembler::vpsraw(dst, dst, nds, vector_len);
4844  } else if ((shift_enc < 16) && (nds_enc < 16)) {
4845    // use nds as scratch with dst
4846    evmovdqul(nds, dst, Assembler::AVX_512bit);
4847    Assembler::vpsraw(nds, nds, shift, vector_len);
4848    evmovdqul(dst, nds, Assembler::AVX_512bit);
4849  } else if (dst_enc < 16) {
4850    // use nds to save a copy of xmm0 and hold shift
4851    evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4852    evmovdqul(xmm0, shift, Assembler::AVX_512bit);
4853    Assembler::vpsraw(dst, dst, xmm0, vector_len);
4854    evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4855  } else if (nds_enc < 16) {
4856    // use nds and dst as temps
4857    evmovdqul(nds, dst, Assembler::AVX_512bit);
4858    evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4859    evmovdqul(xmm0, shift, Assembler::AVX_512bit);
4860    Assembler::vpsraw(nds, nds, xmm0, vector_len);
4861    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4862    evmovdqul(dst, nds, Assembler::AVX_512bit);
4863  } else {
4864    // worst case scenario, all regs are in the upper bank
4865    subptr(rsp, 64);
4866    evmovdqul(Address(rsp, 0), xmm1, Assembler::AVX_512bit);
4867    evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4868    evmovdqul(xmm1, shift, Assembler::AVX_512bit);
4869    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4870    Assembler::vpsraw(xmm0, xmm0, xmm1, vector_len);
4871    evmovdqul(xmm1, dst, Assembler::AVX_512bit);
4872    evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4873    evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4874    evmovdqul(xmm1, Address(rsp, 0), Assembler::AVX_512bit);
4875    addptr(rsp, 64);
4876  }
4877}
4878
4879void MacroAssembler::vpsraw(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
4880  int dst_enc = dst->encoding();
4881  int nds_enc = nds->encoding();
4882  if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
4883    Assembler::vpsraw(dst, nds, shift, vector_len);
4884  } else if (dst_enc < 16) {
4885    Assembler::vpsraw(dst, dst, shift, vector_len);
4886  } else if (nds_enc < 16) {
4887    // use nds as scratch
4888    evmovdqul(nds, dst, Assembler::AVX_512bit);
4889    Assembler::vpsraw(nds, nds, shift, vector_len);
4890    evmovdqul(dst, nds, Assembler::AVX_512bit);
4891  } else {
4892    // use nds as scratch for xmm0
4893    evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4894    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4895    Assembler::vpsraw(xmm0, xmm0, shift, vector_len);
        evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4896    evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4897  }
4898}
4899
4900void MacroAssembler::vpsrlw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
4901  int dst_enc = dst->encoding();
4902  int nds_enc = nds->encoding();
4903  int shift_enc = shift->encoding();
4904  if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
4905    Assembler::vpsrlw(dst, nds, shift, vector_len);
4906  } else if ((dst_enc < 16) && (shift_enc < 16)) {
4907    Assembler::vpsrlw(dst, dst, shift, vector_len);
4908  } else if ((dst_enc < 16) && (nds_enc < 16)) {
4909    // use nds_enc as scratch with shift
4910    evmovdqul(nds, shift, Assembler::AVX_512bit);
4911    Assembler::vpsrlw(dst, dst, nds, vector_len);
4912  } else if ((shift_enc < 16) && (nds_enc < 16)) {
4913    // use nds as scratch with dst
4914    evmovdqul(nds, dst, Assembler::AVX_512bit);
4915    Assembler::vpsrlw(nds, nds, shift, vector_len);
4916    evmovdqul(dst, nds, Assembler::AVX_512bit);
4917  } else if (dst_enc < 16) {
4918    // use nds to save a copy of xmm0 and hold shift
4919    evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4920    evmovdqul(xmm0, shift, Assembler::AVX_512bit);
4921    Assembler::vpsrlw(dst, dst, xmm0, vector_len);
4922    evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4923  } else if (nds_enc < 16) {
4924    // use nds and dst as temps
4925    evmovdqul(nds, dst, Assembler::AVX_512bit);
4926    evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4927    evmovdqul(xmm0, shift, Assembler::AVX_512bit);
4928    Assembler::vpsrlw(nds, nds, xmm0, vector_len);
4929    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4930    evmovdqul(dst, nds, Assembler::AVX_512bit);
4931  } else {
4932    // worst case scenario, all regs are in the upper bank
4933    subptr(rsp, 64);
4934    evmovdqul(Address(rsp, 0), xmm1, Assembler::AVX_512bit);
4935    evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4936    evmovdqul(xmm1, shift, Assembler::AVX_512bit);
4937    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4938    Assembler::vpsrlw(xmm0, xmm0, xmm1, vector_len);
4939    evmovdqul(xmm1, dst, Assembler::AVX_512bit);
4940    evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4941    evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4942    evmovdqul(xmm1, Address(rsp, 0), Assembler::AVX_512bit);
4943    addptr(rsp, 64);
4944  }
4945}
4946
4947void MacroAssembler::vpsrlw(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
4948  int dst_enc = dst->encoding();
4949  int nds_enc = nds->encoding();
4950  if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
4951    Assembler::vpsrlw(dst, nds, shift, vector_len);
4952  } else if (dst_enc < 16) {
4953    Assembler::vpsrlw(dst, dst, shift, vector_len);
4954  } else if (nds_enc < 16) {
4955    // use nds as scratch
4956    evmovdqul(nds, dst, Assembler::AVX_512bit);
4957    Assembler::vpsrlw(nds, nds, shift, vector_len);
4958    evmovdqul(dst, nds, Assembler::AVX_512bit);
4959  } else {
4960    // use nds as scratch for xmm0
4961    evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4962    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4963    Assembler::vpsrlw(xmm0, xmm0, shift, vector_len);
        evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4964    evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4965  }
4966}
4967
4968void MacroAssembler::vpsllw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
4969  int dst_enc = dst->encoding();
4970  int nds_enc = nds->encoding();
4971  int shift_enc = shift->encoding();
4972  if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
4973    Assembler::vpsllw(dst, nds, shift, vector_len);
4974  } else if ((dst_enc < 16) && (shift_enc < 16)) {
4975    Assembler::vpsllw(dst, dst, shift, vector_len);
4976  } else if ((dst_enc < 16) && (nds_enc < 16)) {
4977    // use nds_enc as scratch with shift
4978    evmovdqul(nds, shift, Assembler::AVX_512bit);
4979    Assembler::vpsllw(dst, dst, nds, vector_len);
4980  } else if ((shift_enc < 16) && (nds_enc < 16)) {
4981    // use nds as scratch with dst
4982    evmovdqul(nds, dst, Assembler::AVX_512bit);
4983    Assembler::vpsllw(nds, nds, shift, vector_len);
4984    evmovdqul(dst, nds, Assembler::AVX_512bit);
4985  } else if (dst_enc < 16) {
4986    // use nds to save a copy of xmm0 and hold shift
4987    evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4988    evmovdqul(xmm0, shift, Assembler::AVX_512bit);
4989    Assembler::vpsllw(dst, dst, xmm0, vector_len);
4990    evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4991  } else if (nds_enc < 16) {
4992    // use nds and dst as temps
4993    evmovdqul(nds, dst, Assembler::AVX_512bit);
4994    evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4995    evmovdqul(xmm0, shift, Assembler::AVX_512bit);
4996    Assembler::vpsllw(nds, nds, xmm0, vector_len);
4997    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4998    evmovdqul(dst, nds, Assembler::AVX_512bit);
4999  } else {
5000    // worst case scenario, all regs are in the upper bank
5001    subptr(rsp, 64);
5002    evmovdqul(Address(rsp, 0), xmm1, Assembler::AVX_512bit);
5003    evmovdqul(nds, xmm0, Assembler::AVX_512bit);
5004    evmovdqul(xmm1, shift, Assembler::AVX_512bit);
5005    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
5006    Assembler::vpsllw(xmm0, xmm0, xmm1, vector_len);
5007    evmovdqul(xmm1, dst, Assembler::AVX_512bit);
5008    evmovdqul(dst, xmm0, Assembler::AVX_512bit);
5009    evmovdqul(xmm0, nds, Assembler::AVX_512bit);
5010    evmovdqul(xmm1, Address(rsp, 0), Assembler::AVX_512bit);
5011    addptr(rsp, 64);
5012  }
5013}
5014
5015void MacroAssembler::vpsllw(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
5016  int dst_enc = dst->encoding();
5017  int nds_enc = nds->encoding();
5018  if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
5019    Assembler::vpsllw(dst, nds, shift, vector_len);
5020  } else if (dst_enc < 16) {
5021    Assembler::vpsllw(dst, dst, shift, vector_len);
5022  } else if (nds_enc < 16) {
5023    // use nds as scratch
5024    evmovdqul(nds, dst, Assembler::AVX_512bit);
5025    Assembler::vpsllw(nds, nds, shift, vector_len);
5026    evmovdqul(dst, nds, Assembler::AVX_512bit);
5027  } else {
5028    // use nds as scratch for xmm0
5029    evmovdqul(nds, xmm0, Assembler::AVX_512bit);
5030    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
5031    Assembler::vpsllw(xmm0, xmm0, shift, vector_len);
        evmovdqul(dst, xmm0, Assembler::AVX_512bit);
5032    evmovdqul(xmm0, nds, Assembler::AVX_512bit);
5033  }
5034}
5035
5036void MacroAssembler::vptest(XMMRegister dst, XMMRegister src) {
5037  int dst_enc = dst->encoding();
5038  int src_enc = src->encoding();
5039  if ((dst_enc < 16) && (src_enc < 16)) {
5040    Assembler::vptest(dst, src);
5041  } else if (src_enc < 16) {
5042    subptr(rsp, 64);
5043    evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
5044    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
5045    Assembler::vptest(xmm0, src);
5046    evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
5047    addptr(rsp, 64);
5048  } else if (dst_enc < 16) {
5049    subptr(rsp, 64);
5050    evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
5051    evmovdqul(xmm0, src, Assembler::AVX_512bit);
5052    Assembler::vptest(dst, xmm0);
5053    evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
5054    addptr(rsp, 64);
5055  } else {
5056    subptr(rsp, 64);
5057    evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
5058    subptr(rsp, 64);
5059    evmovdqul(Address(rsp, 0), xmm1, Assembler::AVX_512bit);
5060    movdqu(xmm0, src);
5061    movdqu(xmm1, dst);
5062    Assembler::vptest(xmm1, xmm0);
5063    evmovdqul(xmm1, Address(rsp, 0), Assembler::AVX_512bit);
5064    addptr(rsp, 64);
5065    evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
5066    addptr(rsp, 64);
5067  }
5068}
5069
5070// This instruction exists within macros, ergo we cannot control its input
5071// when emitted through those patterns.
5072void MacroAssembler::punpcklbw(XMMRegister dst, XMMRegister src) {
5073  if (VM_Version::supports_avx512nobw()) {
5074    int dst_enc = dst->encoding();
5075    int src_enc = src->encoding();
5076    if (dst_enc == src_enc) {
5077      if (dst_enc < 16) {
5078        Assembler::punpcklbw(dst, src);
5079      } else {
5080        subptr(rsp, 64);
5081        evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
5082        evmovdqul(xmm0, dst, Assembler::AVX_512bit);
5083        Assembler::punpcklbw(xmm0, xmm0);
5084        evmovdqul(dst, xmm0, Assembler::AVX_512bit);
5085        evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
5086        addptr(rsp, 64);
5087      }
5088    } else {
5089      if ((src_enc < 16) && (dst_enc < 16)) {
5090        Assembler::punpcklbw(dst, src);
5091      } else if (src_enc < 16) {
5092        subptr(rsp, 64);
5093        evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
5094        evmovdqul(xmm0, dst, Assembler::AVX_512bit);
5095        Assembler::punpcklbw(xmm0, src);
5096        evmovdqul(dst, xmm0, Assembler::AVX_512bit);
5097        evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
5098        addptr(rsp, 64);
5099      } else if (dst_enc < 16) {
5100        subptr(rsp, 64);
5101        evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
5102        evmovdqul(xmm0, src, Assembler::AVX_512bit);
5103        Assembler::punpcklbw(dst, xmm0);
5104        evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
5105        addptr(rsp, 64);
5106      } else {
5107        subptr(rsp, 64);
5108        evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
5109        subptr(rsp, 64);
5110        evmovdqul(Address(rsp, 0), xmm1, Assembler::AVX_512bit);
5111        evmovdqul(xmm0, dst, Assembler::AVX_512bit);
5112        evmovdqul(xmm1, src, Assembler::AVX_512bit);
5113        Assembler::punpcklbw(xmm0, xmm1);
5114        evmovdqul(dst, xmm0, Assembler::AVX_512bit);
5115        evmovdqul(xmm1, Address(rsp, 0), Assembler::AVX_512bit);
5116        addptr(rsp, 64);
5117        evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
5118        addptr(rsp, 64);
5119      }
5120    }
5121  } else {
5122    Assembler::punpcklbw(dst, src);
5123  }
5124}
5125
5126// This instruction exists within macros, ergo we cannot control its input
5127// when emitted through those patterns.
5128void MacroAssembler::pshuflw(XMMRegister dst, XMMRegister src, int mode) {
5129  if (VM_Version::supports_avx512nobw()) {
5130    int dst_enc = dst->encoding();
5131    int src_enc = src->encoding();
5132    if (dst_enc == src_enc) {
5133      if (dst_enc < 16) {
5134        Assembler::pshuflw(dst, src, mode);
5135      } else {
5136        subptr(rsp, 64);
5137        evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
5138        evmovdqul(xmm0, dst, Assembler::AVX_512bit);
5139        Assembler::pshuflw(xmm0, xmm0, mode);
5140        evmovdqul(dst, xmm0, Assembler::AVX_512bit);
5141        evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
5142        addptr(rsp, 64);
5143      }
5144    } else {
5145      if ((src_enc < 16) && (dst_enc < 16)) {
5146        Assembler::pshuflw(dst, src, mode);
5147      } else if (src_enc < 16) {
5148        subptr(rsp, 64);
5149        evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
5150        evmovdqul(xmm0, dst, Assembler::AVX_512bit);
5151        Assembler::pshuflw(xmm0, src, mode);
5152        evmovdqul(dst, xmm0, Assembler::AVX_512bit);
5153        evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
5154        addptr(rsp, 64);
5155      } else if (dst_enc < 16) {
5156        subptr(rsp, 64);
5157        evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
5158        evmovdqul(xmm0, src, Assembler::AVX_512bit);
5159        Assembler::pshuflw(dst, xmm0, mode);
5160        evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
5161        addptr(rsp, 64);
5162      } else {
5163        subptr(rsp, 64);
5164        evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
5165        subptr(rsp, 64);
5166        evmovdqul(Address(rsp, 0), xmm1, Assembler::AVX_512bit);
5167        evmovdqul(xmm0, dst, Assembler::AVX_512bit);
5168        evmovdqul(xmm1, src, Assembler::AVX_512bit);
5169        Assembler::pshuflw(xmm0, xmm1, mode);
5170        evmovdqul(dst, xmm0, Assembler::AVX_512bit);
5171        evmovdqul(xmm1, Address(rsp, 0), Assembler::AVX_512bit);
5172        addptr(rsp, 64);
5173        evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
5174        addptr(rsp, 64);
5175      }
5176    }
5177  } else {
5178    Assembler::pshuflw(dst, src, mode);
5179  }
5180}
5181
5182void MacroAssembler::vandpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len) {
5183  if (reachable(src)) {
5184    vandpd(dst, nds, as_Address(src), vector_len);
5185  } else {
5186    lea(rscratch1, src);
5187    vandpd(dst, nds, Address(rscratch1, 0), vector_len);
5188  }
5189}
5190
5191void MacroAssembler::vandps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len) {
5192  if (reachable(src)) {
5193    vandps(dst, nds, as_Address(src), vector_len);
5194  } else {
5195    lea(rscratch1, src);
5196    vandps(dst, nds, Address(rscratch1, 0), vector_len);
5197  }
5198}
5199
5200void MacroAssembler::vdivsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
5201  if (reachable(src)) {
5202    vdivsd(dst, nds, as_Address(src));
5203  } else {
5204    lea(rscratch1, src);
5205    vdivsd(dst, nds, Address(rscratch1, 0));
5206  }
5207}
5208
5209void MacroAssembler::vdivss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
5210  if (reachable(src)) {
5211    vdivss(dst, nds, as_Address(src));
5212  } else {
5213    lea(rscratch1, src);
5214    vdivss(dst, nds, Address(rscratch1, 0));
5215  }
5216}
5217
5218void MacroAssembler::vmulsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
5219  if (reachable(src)) {
5220    vmulsd(dst, nds, as_Address(src));
5221  } else {
5222    lea(rscratch1, src);
5223    vmulsd(dst, nds, Address(rscratch1, 0));
5224  }
5225}
5226
5227void MacroAssembler::vmulss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
5228  if (reachable(src)) {
5229    vmulss(dst, nds, as_Address(src));
5230  } else {
5231    lea(rscratch1, src);
5232    vmulss(dst, nds, Address(rscratch1, 0));
5233  }
5234}
5235
5236void MacroAssembler::vsubsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
5237  if (reachable(src)) {
5238    vsubsd(dst, nds, as_Address(src));
5239  } else {
5240    lea(rscratch1, src);
5241    vsubsd(dst, nds, Address(rscratch1, 0));
5242  }
5243}
5244
5245void MacroAssembler::vsubss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
5246  if (reachable(src)) {
5247    vsubss(dst, nds, as_Address(src));
5248  } else {
5249    lea(rscratch1, src);
5250    vsubss(dst, nds, Address(rscratch1, 0));
5251  }
5252}
5253
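    // Negate a float by xoring its sign bit; 'src' is expected to point at a
    // sign-flip mask.  On AVX-512 parts without VL support, upper-bank operands
    // are first copied into lower-bank registers (spilling xmm0 when dst itself
    // is in the upper bank) so that the 128-bit vxorps encoding can be used.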
5254void MacroAssembler::vnegatess(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
5255  int nds_enc = nds->encoding();
5256  int dst_enc = dst->encoding();
5257  bool dst_upper_bank = (dst_enc > 15);
5258  bool nds_upper_bank = (nds_enc > 15);
5259  if (VM_Version::supports_avx512novl() &&
5260      (nds_upper_bank || dst_upper_bank)) {
5261    if (dst_upper_bank) {
5262      subptr(rsp, 64);
5263      evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
5264      movflt(xmm0, nds);
5265      vxorps(xmm0, xmm0, src, Assembler::AVX_128bit);
5266      movflt(dst, xmm0);
5267      evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
5268      addptr(rsp, 64);
5269    } else {
5270      movflt(dst, nds);
5271      vxorps(dst, dst, src, Assembler::AVX_128bit);
5272    }
5273  } else {
5274    vxorps(dst, nds, src, Assembler::AVX_128bit);
5275  }
5276}
5277
5278void MacroAssembler::vnegatesd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
5279  int nds_enc = nds->encoding();
5280  int dst_enc = dst->encoding();
5281  bool dst_upper_bank = (dst_enc > 15);
5282  bool nds_upper_bank = (nds_enc > 15);
5283  if (VM_Version::supports_avx512novl() &&
5284      (nds_upper_bank || dst_upper_bank)) {
5285    if (dst_upper_bank) {
5286      subptr(rsp, 64);
5287      evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
5288      movdbl(xmm0, nds);
5289      vxorpd(xmm0, xmm0, src, Assembler::AVX_128bit);
5290      movdbl(dst, xmm0);
5291      evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
5292      addptr(rsp, 64);
5293    } else {
5294      movdbl(dst, nds);
5295      vxorpd(dst, dst, src, Assembler::AVX_128bit);
5296    }
5297  } else {
5298    vxorpd(dst, nds, src, Assembler::AVX_128bit);
5299  }
5300}
5301
5302void MacroAssembler::vxorpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len) {
5303  if (reachable(src)) {
5304    vxorpd(dst, nds, as_Address(src), vector_len);
5305  } else {
5306    lea(rscratch1, src);
5307    vxorpd(dst, nds, Address(rscratch1, 0), vector_len);
5308  }
5309}
5310
5311void MacroAssembler::vxorps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len) {
5312  if (reachable(src)) {
5313    vxorps(dst, nds, as_Address(src), vector_len);
5314  } else {
5315    lea(rscratch1, src);
5316    vxorps(dst, nds, Address(rscratch1, 0), vector_len);
5317  }
5318}
5319
5320
5321//////////////////////////////////////////////////////////////////////////////////
5322#if INCLUDE_ALL_GCS
5323
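    // G1 pre-barrier (SATB): while concurrent marking is active, record the value
    // about to be overwritten into the thread-local SATB mark queue, calling into
    // the runtime when the queue is full.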
5324void MacroAssembler::g1_write_barrier_pre(Register obj,
5325                                          Register pre_val,
5326                                          Register thread,
5327                                          Register tmp,
5328                                          bool tosca_live,
5329                                          bool expand_call) {
5330
5331  // If expand_call is true then we expand the call_VM_leaf macro
5332  // directly to skip generating the check by
5333  // InterpreterMacroAssembler::call_VM_leaf_base that checks _last_sp.
5334
5335#ifdef _LP64
5336  assert(thread == r15_thread, "must be");
5337#endif // _LP64
5338
5339  Label done;
5340  Label runtime;
5341
5342  assert(pre_val != noreg, "check this code");
5343
5344  if (obj != noreg) {
5345    assert_different_registers(obj, pre_val, tmp);
5346    assert(pre_val != rax, "check this code");
5347  }
5348
5349  Address in_progress(thread, in_bytes(JavaThread::satb_mark_queue_offset() +
5350                                       SATBMarkQueue::byte_offset_of_active()));
5351  Address index(thread, in_bytes(JavaThread::satb_mark_queue_offset() +
5352                                       SATBMarkQueue::byte_offset_of_index()));
5353  Address buffer(thread, in_bytes(JavaThread::satb_mark_queue_offset() +
5354                                       SATBMarkQueue::byte_offset_of_buf()));
5355
5356
5357  // Is marking active?
5358  if (in_bytes(SATBMarkQueue::byte_width_of_active()) == 4) {
5359    cmpl(in_progress, 0);
5360  } else {
5361    assert(in_bytes(SATBMarkQueue::byte_width_of_active()) == 1, "Assumption");
5362    cmpb(in_progress, 0);
5363  }
5364  jcc(Assembler::equal, done);
5365
5366  // Do we need to load the previous value?
5367  if (obj != noreg) {
5368    load_heap_oop(pre_val, Address(obj, 0));
5369  }
5370
5371  // Is the previous value null?
5372  cmpptr(pre_val, (int32_t) NULL_WORD);
5373  jcc(Assembler::equal, done);
5374
5375  // Can we store original value in the thread's buffer?
5376  // Is index == 0?
5377  // (The index field is typed as size_t.)
5378
5379  movptr(tmp, index);                   // tmp := *index_adr
5380  cmpptr(tmp, 0);                       // tmp == 0?
5381  jcc(Assembler::equal, runtime);       // If yes, goto runtime
5382
5383  subptr(tmp, wordSize);                // tmp := tmp - wordSize
5384  movptr(index, tmp);                   // *index_adr := tmp
5385  addptr(tmp, buffer);                  // tmp := tmp + *buffer_adr
5386
5387  // Record the previous value
5388  movptr(Address(tmp, 0), pre_val);
5389  jmp(done);
5390
5391  bind(runtime);
5392  // save the live input values
5393  if(tosca_live) push(rax);
5394
5395  if (obj != noreg && obj != rax)
5396    push(obj);
5397
5398  if (pre_val != rax)
5399    push(pre_val);
5400
5401  // Calling the runtime using the regular call_VM_leaf mechanism generates
5402  // code (generated by InterpreterMacroAssembler::call_VM_leaf_base)
5403  // that checks that *(ebp+frame::interpreter_frame_last_sp) == NULL.
5404  //
5405  // If we are generating the pre-barrier without a frame (e.g. in the
5406  // intrinsified Reference.get() routine) then ebp might be pointing to
5407  // the caller frame and so this check will most likely fail at runtime.
5408  //
5409  // Expanding the call directly bypasses the generation of the check.
5410  // So when we do not have a full interpreter frame on the stack
5411  // expand_call should be passed true.
5412
5413  NOT_LP64( push(thread); )
5414
5415  if (expand_call) {
5416    LP64_ONLY( assert(pre_val != c_rarg1, "smashed arg"); )
5417    pass_arg1(this, thread);
5418    pass_arg0(this, pre_val);
5419    MacroAssembler::call_VM_leaf_base(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_pre), 2);
5420  } else {
5421    call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_pre), pre_val, thread);
5422  }
5423
5424  NOT_LP64( pop(thread); )
5425
5426  // save the live input values
5427  if (pre_val != rax)
5428    pop(pre_val);
5429
5430  if (obj != noreg && obj != rax)
5431    pop(obj);
5432
5433  if(tosca_live) pop(rax);
5434
5435  bind(done);
5436}
5437
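    // G1 post-barrier: for a store that crosses heap regions with a non-NULL new
    // value, dirty the corresponding card and enqueue it on the thread-local
    // dirty card queue, calling into the runtime when the queue is full.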
5438void MacroAssembler::g1_write_barrier_post(Register store_addr,
5439                                           Register new_val,
5440                                           Register thread,
5441                                           Register tmp,
5442                                           Register tmp2) {
5443#ifdef _LP64
5444  assert(thread == r15_thread, "must be");
5445#endif // _LP64
5446
5447  Address queue_index(thread, in_bytes(JavaThread::dirty_card_queue_offset() +
5448                                       DirtyCardQueue::byte_offset_of_index()));
5449  Address buffer(thread, in_bytes(JavaThread::dirty_card_queue_offset() +
5450                                       DirtyCardQueue::byte_offset_of_buf()));
5451
5452  CardTableModRefBS* ct =
5453    barrier_set_cast<CardTableModRefBS>(Universe::heap()->barrier_set());
5454  assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
5455
5456  Label done;
5457  Label runtime;
5458
5459  // Does store cross heap regions?
5460
5461  movptr(tmp, store_addr);
5462  xorptr(tmp, new_val);
5463  shrptr(tmp, HeapRegion::LogOfHRGrainBytes);
5464  jcc(Assembler::equal, done);
5465
5466  // crosses regions, storing NULL?
5467
5468  cmpptr(new_val, (int32_t) NULL_WORD);
5469  jcc(Assembler::equal, done);
5470
5471  // storing region crossing non-NULL, is card already dirty?
5472
5473  const Register card_addr = tmp;
5474  const Register cardtable = tmp2;
5475
5476  movptr(card_addr, store_addr);
5477  shrptr(card_addr, CardTableModRefBS::card_shift);
5478  // Do not use ExternalAddress to load 'byte_map_base', since 'byte_map_base' is NOT
5479  // a valid address and therefore is not properly handled by the relocation code.
5480  movptr(cardtable, (intptr_t)ct->byte_map_base);
5481  addptr(card_addr, cardtable);
5482
5483  cmpb(Address(card_addr, 0), (int)G1SATBCardTableModRefBS::g1_young_card_val());
5484  jcc(Assembler::equal, done);
5485
5486  membar(Assembler::Membar_mask_bits(Assembler::StoreLoad));
5487  cmpb(Address(card_addr, 0), (int)CardTableModRefBS::dirty_card_val());
5488  jcc(Assembler::equal, done);
5489
5490
5491  // storing a region crossing, non-NULL oop, card is clean.
5492  // dirty card and log.
5493
5494  movb(Address(card_addr, 0), (int)CardTableModRefBS::dirty_card_val());
5495
5496  cmpl(queue_index, 0);
5497  jcc(Assembler::equal, runtime);
5498  subl(queue_index, wordSize);
5499  movptr(tmp2, buffer);
5500#ifdef _LP64
5501  movslq(rscratch1, queue_index);
5502  addq(tmp2, rscratch1);
5503  movq(Address(tmp2, 0), card_addr);
5504#else
5505  addl(tmp2, queue_index);
5506  movl(Address(tmp2, 0), card_addr);
5507#endif
5508  jmp(done);
5509
5510  bind(runtime);
5511  // save the live input values
5512  push(store_addr);
5513  push(new_val);
5514#ifdef _LP64
5515  call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_post), card_addr, r15_thread);
5516#else
5517  push(thread);
5518  call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_post), card_addr, thread);
5519  pop(thread);
5520#endif
5521  pop(new_val);
5522  pop(store_addr);
5523
5524  bind(done);
5525}
5526
5527#endif // INCLUDE_ALL_GCS
5528//////////////////////////////////////////////////////////////////////////////////
5529
5530
5531void MacroAssembler::store_check(Register obj, Address dst) {
5532  store_check(obj);
5533}
5534
5535void MacroAssembler::store_check(Register obj) {
5536  // Does a store check for the oop in register obj. The content of
5537  // register obj is destroyed afterwards.
5538  BarrierSet* bs = Universe::heap()->barrier_set();
5539  assert(bs->kind() == BarrierSet::CardTableForRS ||
5540         bs->kind() == BarrierSet::CardTableExtension,
5541         "Wrong barrier set kind");
5542
5543  CardTableModRefBS* ct = barrier_set_cast<CardTableModRefBS>(bs);
5544  assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
5545
5546  shrptr(obj, CardTableModRefBS::card_shift);
5547
5548  Address card_addr;
5549
5550  // The calculation for byte_map_base is as follows:
5551  // byte_map_base = _byte_map - (uintptr_t(low_bound) >> card_shift);
5552  // So this essentially converts an address to a displacement and it will
5553  // never need to be relocated. On 64bit however the value may be too
5554  // large for a 32bit displacement.
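      // For example (card_shift is typically 9): the card byte covering address A
      // lives at byte_map_base + (A >> 9).  obj has already been shifted above, so
      // the byte is addressed as [disp + obj*1] whenever disp fits in 32 bits.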
5555  intptr_t disp = (intptr_t) ct->byte_map_base;
5556  if (is_simm32(disp)) {
5557    card_addr = Address(noreg, obj, Address::times_1, disp);
5558  } else {
5559    // By doing it as an ExternalAddress 'disp' could be converted to a rip-relative
5560    // displacement and done in a single instruction given favorable mapping and a
5561    // smarter version of as_Address. However, 'ExternalAddress' generates a relocation
5562    // entry and that entry is not properly handled by the relocation code.
5563    AddressLiteral cardtable((address)ct->byte_map_base, relocInfo::none);
5564    Address index(noreg, obj, Address::times_1);
5565    card_addr = as_Address(ArrayAddress(cardtable, index));
5566  }
5567
5568  int dirty = CardTableModRefBS::dirty_card_val();
5569  if (UseCondCardMark) {
5570    Label L_already_dirty;
5571    if (UseConcMarkSweepGC) {
5572      membar(Assembler::StoreLoad);
5573    }
5574    cmpb(card_addr, dirty);
5575    jcc(Assembler::equal, L_already_dirty);
5576    movb(card_addr, dirty);
5577    bind(L_already_dirty);
5578  } else {
5579    movb(card_addr, dirty);
5580  }
5581}
5582
5583void MacroAssembler::subptr(Register dst, int32_t imm32) {
5584  LP64_ONLY(subq(dst, imm32)) NOT_LP64(subl(dst, imm32));
5585}
5586
5587// Force generation of a 4 byte immediate value even if it fits into 8bit
5588void MacroAssembler::subptr_imm32(Register dst, int32_t imm32) {
5589  LP64_ONLY(subq_imm32(dst, imm32)) NOT_LP64(subl_imm32(dst, imm32));
5590}
5591
5592void MacroAssembler::subptr(Register dst, Register src) {
5593  LP64_ONLY(subq(dst, src)) NOT_LP64(subl(dst, src));
5594}
5595
5596// C++ bool manipulation
5597void MacroAssembler::testbool(Register dst) {
5598  if(sizeof(bool) == 1)
5599    testb(dst, 0xff);
5600  else if(sizeof(bool) == 2) {
5601    // testw implementation needed for two byte bools
5602    ShouldNotReachHere();
5603  } else if(sizeof(bool) == 4)
5604    testl(dst, dst);
5605  else
5606    // unsupported
5607    ShouldNotReachHere();
5608}
5609
5610void MacroAssembler::testptr(Register dst, Register src) {
5611  LP64_ONLY(testq(dst, src)) NOT_LP64(testl(dst, src));
5612}
5613
5614// Defines obj, preserves var_size_in_bytes, okay for t2 == var_size_in_bytes.
5615void MacroAssembler::tlab_allocate(Register obj,
5616                                   Register var_size_in_bytes,
5617                                   int con_size_in_bytes,
5618                                   Register t1,
5619                                   Register t2,
5620                                   Label& slow_case) {
5621  assert_different_registers(obj, t1, t2);
5622  assert_different_registers(obj, var_size_in_bytes, t1);
5623  Register end = t2;
5624  Register thread = NOT_LP64(t1) LP64_ONLY(r15_thread);
5625
5626  verify_tlab();
5627
5628  NOT_LP64(get_thread(thread));
5629
5630  movptr(obj, Address(thread, JavaThread::tlab_top_offset()));
5631  if (var_size_in_bytes == noreg) {
5632    lea(end, Address(obj, con_size_in_bytes));
5633  } else {
5634    lea(end, Address(obj, var_size_in_bytes, Address::times_1));
5635  }
5636  cmpptr(end, Address(thread, JavaThread::tlab_end_offset()));
5637  jcc(Assembler::above, slow_case);
5638
5639  // update the tlab top pointer
5640  movptr(Address(thread, JavaThread::tlab_top_offset()), end);
5641
5642  // recover var_size_in_bytes if necessary
5643  if (var_size_in_bytes == end) {
5644    subptr(var_size_in_bytes, obj);
5645  }
5646  verify_tlab();
5647}
5648
5649// Preserves rbx, and rdx.
5650Register MacroAssembler::tlab_refill(Label& retry,
5651                                     Label& try_eden,
5652                                     Label& slow_case) {
5653  Register top = rax;
5654  Register t1  = rcx;
5655  Register t2  = rsi;
5656  Register thread_reg = NOT_LP64(rdi) LP64_ONLY(r15_thread);
5657  assert_different_registers(top, thread_reg, t1, t2, /* preserve: */ rbx, rdx);
5658  Label do_refill, discard_tlab;
5659
5660  if (!Universe::heap()->supports_inline_contig_alloc()) {
5661    // No allocation in the shared eden.
5662    jmp(slow_case);
5663  }
5664
5665  NOT_LP64(get_thread(thread_reg));
5666
5667  movptr(top, Address(thread_reg, in_bytes(JavaThread::tlab_top_offset())));
5668  movptr(t1,  Address(thread_reg, in_bytes(JavaThread::tlab_end_offset())));
5669
5670  // calculate amount of free space
5671  subptr(t1, top);
5672  shrptr(t1, LogHeapWordSize);
5673
5674  // Retain tlab and allocate object in shared space if
5675  // the amount free in the tlab is too large to discard.
5676  cmpptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_refill_waste_limit_offset())));
5677  jcc(Assembler::lessEqual, discard_tlab);
5678
5679  // Retain
5680  // %%% yuck as movptr...
5681  movptr(t2, (int32_t) ThreadLocalAllocBuffer::refill_waste_limit_increment());
5682  addptr(Address(thread_reg, in_bytes(JavaThread::tlab_refill_waste_limit_offset())), t2);
5683  if (TLABStats) {
5684    // increment number of slow_allocations
5685    addl(Address(thread_reg, in_bytes(JavaThread::tlab_slow_allocations_offset())), 1);
5686  }
5687  jmp(try_eden);
5688
5689  bind(discard_tlab);
5690  if (TLABStats) {
5691    // increment number of refills
5692    addl(Address(thread_reg, in_bytes(JavaThread::tlab_number_of_refills_offset())), 1);
5693    // accumulate wastage -- t1 is amount free in tlab
5694    addl(Address(thread_reg, in_bytes(JavaThread::tlab_fast_refill_waste_offset())), t1);
5695  }
5696
5697  // if tlab is currently allocated (top or end != null) then
5698  // fill [top, end + alignment_reserve) with array object
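      // (The leftover space is formatted as an int[] filler object so the heap
      //  stays parseable; the hash stamped into the mark word below is a dummy.)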
5699  testptr(top, top);
5700  jcc(Assembler::zero, do_refill);
5701
5702  // set up the mark word
5703  movptr(Address(top, oopDesc::mark_offset_in_bytes()), (intptr_t)markOopDesc::prototype()->copy_set_hash(0x2));
5704  // set the length to the remaining space
5705  subptr(t1, typeArrayOopDesc::header_size(T_INT));
5706  addptr(t1, (int32_t)ThreadLocalAllocBuffer::alignment_reserve());
5707  shlptr(t1, log2_intptr(HeapWordSize/sizeof(jint)));
5708  movl(Address(top, arrayOopDesc::length_offset_in_bytes()), t1);
5709  // set klass to intArrayKlass
5710  // dubious reloc why not an oop reloc?
5711  movptr(t1, ExternalAddress((address)Universe::intArrayKlassObj_addr()));
5712  // store klass last.  Concurrent GCs assume the length is valid if the
5713  // klass field is not null.
5714  store_klass(top, t1);
5715
5716  movptr(t1, top);
5717  subptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_start_offset())));
5718  incr_allocated_bytes(thread_reg, t1, 0);
5719
5720  // refill the tlab with an eden allocation
5721  bind(do_refill);
5722  movptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_size_offset())));
5723  shlptr(t1, LogHeapWordSize);
5724  // allocate new tlab, address returned in top
5725  eden_allocate(top, t1, 0, t2, slow_case);
5726
5727  // Check that t1 was preserved in eden_allocate.
5728#ifdef ASSERT
5729  if (UseTLAB) {
5730    Label ok;
5731    Register tsize = rsi;
5732    assert_different_registers(tsize, thread_reg, t1);
5733    push(tsize);
5734    movptr(tsize, Address(thread_reg, in_bytes(JavaThread::tlab_size_offset())));
5735    shlptr(tsize, LogHeapWordSize);
5736    cmpptr(t1, tsize);
5737    jcc(Assembler::equal, ok);
5738    STOP("assert(t1 != tlab size)");
5739    should_not_reach_here();
5740
5741    bind(ok);
5742    pop(tsize);
5743  }
5744#endif
5745  movptr(Address(thread_reg, in_bytes(JavaThread::tlab_start_offset())), top);
5746  movptr(Address(thread_reg, in_bytes(JavaThread::tlab_top_offset())), top);
5747  addptr(top, t1);
5748  subptr(top, (int32_t)ThreadLocalAllocBuffer::alignment_reserve_in_bytes());
5749  movptr(Address(thread_reg, in_bytes(JavaThread::tlab_end_offset())), top);
5750  verify_tlab();
5751  jmp(retry);
5752
5753  return thread_reg; // for use by caller
5754}
5755
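    // Bump the thread-local allocated-bytes counter by either a register amount or
    // a constant.  On 32-bit the 64-bit counter is updated as two halves, with an
    // adcl to propagate the carry into the upper word.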
5756void MacroAssembler::incr_allocated_bytes(Register thread,
5757                                          Register var_size_in_bytes,
5758                                          int con_size_in_bytes,
5759                                          Register t1) {
5760  if (!thread->is_valid()) {
5761#ifdef _LP64
5762    thread = r15_thread;
5763#else
5764    assert(t1->is_valid(), "need temp reg");
5765    thread = t1;
5766    get_thread(thread);
5767#endif
5768  }
5769
5770#ifdef _LP64
5771  if (var_size_in_bytes->is_valid()) {
5772    addq(Address(thread, in_bytes(JavaThread::allocated_bytes_offset())), var_size_in_bytes);
5773  } else {
5774    addq(Address(thread, in_bytes(JavaThread::allocated_bytes_offset())), con_size_in_bytes);
5775  }
5776#else
5777  if (var_size_in_bytes->is_valid()) {
5778    addl(Address(thread, in_bytes(JavaThread::allocated_bytes_offset())), var_size_in_bytes);
5779  } else {
5780    addl(Address(thread, in_bytes(JavaThread::allocated_bytes_offset())), con_size_in_bytes);
5781  }
5782  adcl(Address(thread, in_bytes(JavaThread::allocated_bytes_offset())+4), 0);
5783#endif
5784}
5785
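    // Spill all GPRs and XMM/YMM/ZMM state, pull nb_args double arguments off the
    // x87 stack, call runtime_entry with them, and return with the result on the
    // x87 stack (F-TOS); other live x87 registers are preserved around the call.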
5786void MacroAssembler::fp_runtime_fallback(address runtime_entry, int nb_args, int num_fpu_regs_in_use) {
5787  pusha();
5788
5789  // if we are coming from c1, xmm registers may be live
5790  int num_xmm_regs = LP64_ONLY(16) NOT_LP64(8);
5791  if (UseAVX > 2) {
5792    num_xmm_regs = LP64_ONLY(32) NOT_LP64(8);
5793  }
5794
5795  if (UseSSE == 1)  {
5796    subptr(rsp, sizeof(jdouble)*8);
5797    for (int n = 0; n < 8; n++) {
5798      movflt(Address(rsp, n*sizeof(jdouble)), as_XMMRegister(n));
5799    }
5800  } else if (UseSSE >= 2)  {
5801    if (UseAVX > 2) {
5802      push(rbx);
5803      movl(rbx, 0xffff);
5804      kmovwl(k1, rbx);
5805      pop(rbx);
5806    }
5807#ifdef COMPILER2
5808    if (MaxVectorSize > 16) {
5809      if(UseAVX > 2) {
5810        // Save upper half of ZMM registers
5811        subptr(rsp, 32*num_xmm_regs);
5812        for (int n = 0; n < num_xmm_regs; n++) {
5813          vextractf64x4h(Address(rsp, n*32), as_XMMRegister(n), 1);
5814        }
5815      }
5816      assert(UseAVX > 0, "256 bit vectors are supported only with AVX");
5817      // Save upper half of YMM registers
5818      subptr(rsp, 16*num_xmm_regs);
5819      for (int n = 0; n < num_xmm_regs; n++) {
5820        vextractf128h(Address(rsp, n*16), as_XMMRegister(n));
5821      }
5822    }
5823#endif
5824    // Save whole 128bit (16 bytes) XMM registers
5825    subptr(rsp, 16*num_xmm_regs);
5826#ifdef _LP64
5827    if (VM_Version::supports_evex()) {
5828      for (int n = 0; n < num_xmm_regs; n++) {
5829        vextractf32x4h(Address(rsp, n*16), as_XMMRegister(n), 0);
5830      }
5831    } else {
5832      for (int n = 0; n < num_xmm_regs; n++) {
5833        movdqu(Address(rsp, n*16), as_XMMRegister(n));
5834      }
5835    }
5836#else
5837    for (int n = 0; n < num_xmm_regs; n++) {
5838      movdqu(Address(rsp, n*16), as_XMMRegister(n));
5839    }
5840#endif
5841  }
5842
5843  // Preserve registers across runtime call
5844  int incoming_argument_and_return_value_offset = -1;
5845  if (num_fpu_regs_in_use > 1) {
5846    // Must preserve all other FPU regs (could alternatively convert
5847    // SharedRuntime::dsin, dcos etc. into assembly routines known not to trash
5848    // FPU state, but cannot trust the C compiler)
5849    NEEDS_CLEANUP;
5850    // NOTE that in this case we also push the incoming argument(s) to
5851    // the stack and restore them later; we also use this stack slot to
5852    // hold the return value from dsin, dcos etc.
5853    for (int i = 0; i < num_fpu_regs_in_use; i++) {
5854      subptr(rsp, sizeof(jdouble));
5855      fstp_d(Address(rsp, 0));
5856    }
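        // The spilled registers now occupy rsp+0 .. rsp+(num_fpu_regs_in_use-1)*8;
        // the incoming argument(s), which were on top of the FPU stack, end up at
        // the highest offsets, which is what the offset computed below records.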
5857    incoming_argument_and_return_value_offset = sizeof(jdouble)*(num_fpu_regs_in_use-1);
5858    for (int i = nb_args-1; i >= 0; i--) {
5859      fld_d(Address(rsp, incoming_argument_and_return_value_offset-i*sizeof(jdouble)));
5860    }
5861  }
5862
5863  subptr(rsp, nb_args*sizeof(jdouble));
5864  for (int i = 0; i < nb_args; i++) {
5865    fstp_d(Address(rsp, i*sizeof(jdouble)));
5866  }
5867
5868#ifdef _LP64
5869  if (nb_args > 0) {
5870    movdbl(xmm0, Address(rsp, 0));
5871  }
5872  if (nb_args > 1) {
5873    movdbl(xmm1, Address(rsp, sizeof(jdouble)));
5874  }
5875  assert(nb_args <= 2, "unsupported number of args");
5876#endif // _LP64
5877
5878  // NOTE: we must not use call_VM_leaf here because that requires a
5879  // complete interpreter frame in debug mode -- same bug as 4387334
5880  // MacroAssembler::call_VM_leaf_base is perfectly safe and will
5881  // do proper 64bit abi
5882
5883  NEEDS_CLEANUP;
5884  // Need to add stack banging before this runtime call if it needs to
5885  // be taken; however, there is no generic stack banging routine at
5886  // the MacroAssembler level
5887
5888  MacroAssembler::call_VM_leaf_base(runtime_entry, 0);
5889
5890#ifdef _LP64
5891  movsd(Address(rsp, 0), xmm0);
5892  fld_d(Address(rsp, 0));
5893#endif // _LP64
5894  addptr(rsp, sizeof(jdouble)*nb_args);
5895  if (num_fpu_regs_in_use > 1) {
5896    // Must save return value to stack and then restore entire FPU
5897    // stack except incoming arguments
5898    fstp_d(Address(rsp, incoming_argument_and_return_value_offset));
5899    for (int i = 0; i < num_fpu_regs_in_use - nb_args; i++) {
5900      fld_d(Address(rsp, 0));
5901      addptr(rsp, sizeof(jdouble));
5902    }
5903    fld_d(Address(rsp, (nb_args-1)*sizeof(jdouble)));
5904    addptr(rsp, sizeof(jdouble)*nb_args);
5905  }
5906
5907  if (UseSSE == 1)  {
5908    for (int n = 0; n < 8; n++) {
5909      movflt(as_XMMRegister(n), Address(rsp, n*sizeof(jdouble)));
5910    }
5911    addptr(rsp, sizeof(jdouble)*8);
5912  } else if (UseSSE >= 2)  {
5913    // Restore whole 128bit (16 bytes) XMM registers
5914#ifdef _LP64
5915    if (VM_Version::supports_evex()) {
5916      for (int n = 0; n < num_xmm_regs; n++) {
5917        vinsertf32x4h(as_XMMRegister(n), Address(rsp, n*16), 0);
5918      }
5919    } else {
5920      for (int n = 0; n < num_xmm_regs; n++) {
5921        movdqu(as_XMMRegister(n), Address(rsp, n*16));
5922      }
5923    }
5924#else
5925    for (int n = 0; n < num_xmm_regs; n++) {
5926      movdqu(as_XMMRegister(n), Address(rsp, n*16));
5927    }
5928#endif
5929    addptr(rsp, 16*num_xmm_regs);
5930
5931#ifdef COMPILER2
5932    if (MaxVectorSize > 16) {
5933      // Restore upper half of YMM registers.
5934      for (int n = 0; n < num_xmm_regs; n++) {
5935        vinsertf128h(as_XMMRegister(n), Address(rsp, n*16));
5936      }
5937      addptr(rsp, 16*num_xmm_regs);
5938      if (UseAVX > 2) {
5939        for (int n = 0; n < num_xmm_regs; n++) {
5940          vinsertf64x4h(as_XMMRegister(n), Address(rsp, n*32), 1);
5941        }
5942        addptr(rsp, 32*num_xmm_regs);
5943      }
5944    }
5945#endif
5946  }
5947  popa();
5948}
5949
5950static const double     pi_4 =  0.7853981633974483;
5951
5952void MacroAssembler::trigfunc(char trig, int num_fpu_regs_in_use) {
5953  // A hand-coded argument reduction for values with |x| in (pi/4, pi/2)
5954  // was attempted in this code; unfortunately it appears that the
5955  // switch to 80-bit precision and back causes this to be
5956  // unprofitable compared with simply performing a runtime call if
5957  // the argument is out of the (-pi/4, pi/4) range.
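  // Roughly, the strategy below (an illustrative sketch, not emitted code):
  //   if (fabs(x) <= pi/4)  st(0) = fsin/fcos/ftan(x);                // fast x87 path
  //   else                  st(0) = SharedRuntime::dsin/dcos/dtan(x); // runtime call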
5958
5959  Register tmp = noreg;
5960  if (!VM_Version::supports_cmov()) {
5961    // fcmp needs a temporary so preserve rbx,
5962    tmp = rbx;
5963    push(tmp);
5964  }
5965
5966  Label slow_case, done;
5967
5968  ExternalAddress pi4_adr = (address)&pi_4;
5969  if (reachable(pi4_adr)) {
5970    // x ?<= pi/4
5971    fld_d(pi4_adr);
5972    fld_s(1);                // Stack:  X  PI/4  X
5973    fabs();                  // Stack: |X| PI/4  X
5974    fcmp(tmp);
5975    jcc(Assembler::above, slow_case);
5976
5977    // fastest case: -pi/4 <= x <= pi/4
5978    switch(trig) {
5979    case 's':
5980      fsin();
5981      break;
5982    case 'c':
5983      fcos();
5984      break;
5985    case 't':
5986      ftan();
5987      break;
5988    default:
5989      assert(false, "bad intrinsic");
5990      break;
5991    }
5992    jmp(done);
5993  }
5994
5995  // slow case: runtime call
5996  bind(slow_case);
5997
5998  switch(trig) {
5999  case 's':
6000    {
6001      fp_runtime_fallback(CAST_FROM_FN_PTR(address, SharedRuntime::dsin), 1, num_fpu_regs_in_use);
6002    }
6003    break;
6004  case 'c':
6005    {
6006      fp_runtime_fallback(CAST_FROM_FN_PTR(address, SharedRuntime::dcos), 1, num_fpu_regs_in_use);
6007    }
6008    break;
6009  case 't':
6010    {
6011      fp_runtime_fallback(CAST_FROM_FN_PTR(address, SharedRuntime::dtan), 1, num_fpu_regs_in_use);
6012    }
6013    break;
6014  default:
6015    assert(false, "bad intrinsic");
6016    break;
6017  }
6018
6019  // Come here with result in F-TOS
6020  bind(done);
6021
6022  if (tmp != noreg) {
6023    pop(tmp);
6024  }
6025}
6026
6027
6028// Look up the method for a megamorphic invokeinterface call.
6029// The target method is determined by <intf_klass, itable_index>.
6030// The receiver klass is in recv_klass.
6031// On success, the result will be in method_result, and execution falls through.
6032// On failure, execution transfers to the given label.
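// Roughly equivalent C sketch of the scan below (illustrative only; accessor
// names approximate the itableOffsetEntry/itableMethodEntry layout):
//   for (ioe = start_of_itable(recv_klass); ; ioe++) {
//     if (ioe->interface() == intf_klass)  break;                 // hit
//     if (ioe->interface() == NULL)        goto L_no_such_interface;
//   }
//   method_result = *(Method**)((address)recv_klass + ioe->offset()
//                               + itable_index * wordSize + itentry_off);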
6033void MacroAssembler::lookup_interface_method(Register recv_klass,
6034                                             Register intf_klass,
6035                                             RegisterOrConstant itable_index,
6036                                             Register method_result,
6037                                             Register scan_temp,
6038                                             Label& L_no_such_interface) {
6039  assert_different_registers(recv_klass, intf_klass, method_result, scan_temp);
6040  assert(itable_index.is_constant() || itable_index.as_register() == method_result,
6041         "caller must use same register for non-constant itable index as for method");
6042
6043  // Compute start of first itableOffsetEntry (which is at the end of the vtable)
6044  int vtable_base = InstanceKlass::vtable_start_offset() * wordSize;
6045  int itentry_off = itableMethodEntry::method_offset_in_bytes();
6046  int scan_step   = itableOffsetEntry::size() * wordSize;
6047  int vte_size    = vtableEntry::size() * wordSize;
6048  Address::ScaleFactor times_vte_scale = Address::times_ptr;
6049  assert(vte_size == wordSize, "else adjust times_vte_scale");
6050
6051  movl(scan_temp, Address(recv_klass, InstanceKlass::vtable_length_offset() * wordSize));
6052
6053  // %%% Could store the aligned, prescaled offset in the klassoop.
6054  lea(scan_temp, Address(recv_klass, scan_temp, times_vte_scale, vtable_base));
6055  if (HeapWordsPerLong > 1) {
6056    // Round up to align_object_offset boundary
6057    // see code for InstanceKlass::start_of_itable!
6058    round_to(scan_temp, BytesPerLong);
6059  }
6060
6061  // Adjust recv_klass by scaled itable_index, so we can free itable_index.
6062  assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
6063  lea(recv_klass, Address(recv_klass, itable_index, Address::times_ptr, itentry_off));
6064
6065  // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) {
6066  //   if (scan->interface() == intf) {
6067  //     result = (klass + scan->offset() + itable_index);
6068  //   }
6069  // }
6070  Label search, found_method;
6071
6072  for (int peel = 1; peel >= 0; peel--) {
6073    movptr(method_result, Address(scan_temp, itableOffsetEntry::interface_offset_in_bytes()));
6074    cmpptr(intf_klass, method_result);
6075
6076    if (peel) {
6077      jccb(Assembler::equal, found_method);
6078    } else {
6079      jccb(Assembler::notEqual, search);
6080      // (invert the test to fall through to found_method...)
6081    }
6082
6083    if (!peel)  break;
6084
6085    bind(search);
6086
6087    // Check that the previous entry is non-null.  A null entry means that
6088    // the receiver class doesn't implement the interface, and wasn't the
6089    // same as when the caller was compiled.
6090    testptr(method_result, method_result);
6091    jcc(Assembler::zero, L_no_such_interface);
6092    addptr(scan_temp, scan_step);
6093  }
6094
6095  bind(found_method);
6096
6097  // Got a hit.
6098  movl(scan_temp, Address(scan_temp, itableOffsetEntry::offset_offset_in_bytes()));
6099  movptr(method_result, Address(recv_klass, scan_temp, Address::times_1));
6100}
6101
6102
6103// virtual method calling
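// Illustrative formula for the single load below (not emitted code):
//   method_result = *(Method**)((address)recv_klass
//                               + InstanceKlass::vtable_start_offset() * wordSize
//                               + vtable_index * wordSize
//                               + vtableEntry::method_offset_in_bytes());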
6104void MacroAssembler::lookup_virtual_method(Register recv_klass,
6105                                           RegisterOrConstant vtable_index,
6106                                           Register method_result) {
6107  const int base = InstanceKlass::vtable_start_offset() * wordSize;
6108  assert(vtableEntry::size() * wordSize == wordSize, "else adjust the scaling in the code below");
6109  Address vtable_entry_addr(recv_klass,
6110                            vtable_index, Address::times_ptr,
6111                            base + vtableEntry::method_offset_in_bytes());
6112  movptr(method_result, vtable_entry_addr);
6113}
6114
6115
6116void MacroAssembler::check_klass_subtype(Register sub_klass,
6117                           Register super_klass,
6118                           Register temp_reg,
6119                           Label& L_success) {
6120  Label L_failure;
6121  check_klass_subtype_fast_path(sub_klass, super_klass, temp_reg,        &L_success, &L_failure, NULL);
6122  check_klass_subtype_slow_path(sub_klass, super_klass, temp_reg, noreg, &L_success, NULL);
6123  bind(L_failure);
6124}
6125
6126
6127void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
6128                                                   Register super_klass,
6129                                                   Register temp_reg,
6130                                                   Label* L_success,
6131                                                   Label* L_failure,
6132                                                   Label* L_slow_path,
6133                                        RegisterOrConstant super_check_offset) {
6134  assert_different_registers(sub_klass, super_klass, temp_reg);
6135  bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
6136  if (super_check_offset.is_register()) {
6137    assert_different_registers(sub_klass, super_klass,
6138                               super_check_offset.as_register());
6139  } else if (must_load_sco) {
6140    assert(temp_reg != noreg, "supply either a temp or a register offset");
6141  }
6142
6143  Label L_fallthrough;
6144  int label_nulls = 0;
6145  if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
6146  if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
6147  if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; }
6148  assert(label_nulls <= 1, "at most one NULL in the batch");
6149
6150  int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
6151  int sco_offset = in_bytes(Klass::super_check_offset_offset());
6152  Address super_check_offset_addr(super_klass, sco_offset);
6153
6154  // Hacked jcc, which "knows" that L_fallthrough, at least, is in
6155  // range of a jccb.  If this routine grows larger, reconsider at
6156  // least some of these.
6157#define local_jcc(assembler_cond, label)                                \
6158  if (&(label) == &L_fallthrough)  jccb(assembler_cond, label);         \
6159  else                             jcc( assembler_cond, label) /*omit semi*/
6160
6161  // Hacked jmp, which may only be used just before L_fallthrough.
6162#define final_jmp(label)                                                \
6163  if (&(label) == &L_fallthrough) { /*do nothing*/ }                    \
6164  else                            jmp(label)                /*omit semi*/
6165
6166  // If the pointers are equal, we are done (e.g., String[] elements).
6167  // This self-check enables sharing of secondary supertype arrays among
6168  // non-primary types such as array-of-interface.  Otherwise, each such
6169  // type would need its own customized secondary supertype array (SSA).
6170  // We move this check to the front of the fast path because many
6171  // type checks are in fact trivially successful in this manner,
6172  // so we get a nicely predicted branch right at the start of the check.
6173  cmpptr(sub_klass, super_klass);
6174  local_jcc(Assembler::equal, *L_success);
6175
6176  // Check the supertype display:
6177  if (must_load_sco) {
6178    // Positive movl does right thing on LP64.
6179    movl(temp_reg, super_check_offset_addr);
6180    super_check_offset = RegisterOrConstant(temp_reg);
6181  }
6182  Address super_check_addr(sub_klass, super_check_offset, Address::times_1, 0);
6183  cmpptr(super_klass, super_check_addr); // load displayed supertype
6184
6185  // This check has worked decisively for primary supers.
6186  // Secondary supers are sought in the super_cache ('super_cache_addr').
6187  // (Secondary supers are interfaces and very deeply nested subtypes.)
6188  // This works in the same check above because of a tricky aliasing
6189  // between the super_cache and the primary super display elements.
6190  // (The 'super_check_addr' can address either, as the case requires.)
6191  // Note that the cache is updated below if it does not help us find
6192  // what we need immediately.
6193  // So if it was a primary super, we can just fail immediately.
6194  // Otherwise, it's the slow path for us (no success at this point).
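  // Illustrative decision sketch for the branches below (not emitted code):
  //   if (displayed supertype at super_check_offset == super_klass)  goto success;
  //   else if (super_check_offset != secondary_super_cache_offset)   goto failure;
  //   else                                                           goto slow_path;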
6195
6196  if (super_check_offset.is_register()) {
6197    local_jcc(Assembler::equal, *L_success);
6198    cmpl(super_check_offset.as_register(), sc_offset);
6199    if (L_failure == &L_fallthrough) {
6200      local_jcc(Assembler::equal, *L_slow_path);
6201    } else {
6202      local_jcc(Assembler::notEqual, *L_failure);
6203      final_jmp(*L_slow_path);
6204    }
6205  } else if (super_check_offset.as_constant() == sc_offset) {
6206    // Need a slow path; fast failure is impossible.
6207    if (L_slow_path == &L_fallthrough) {
6208      local_jcc(Assembler::equal, *L_success);
6209    } else {
6210      local_jcc(Assembler::notEqual, *L_slow_path);
6211      final_jmp(*L_success);
6212    }
6213  } else {
6214    // No slow path; it's a fast decision.
6215    if (L_failure == &L_fallthrough) {
6216      local_jcc(Assembler::equal, *L_success);
6217    } else {
6218      local_jcc(Assembler::notEqual, *L_failure);
6219      final_jmp(*L_success);
6220    }
6221  }
6222
6223  bind(L_fallthrough);
6224
6225#undef local_jcc
6226#undef final_jmp
6227}
6228
6229
6230void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
6231                                                   Register super_klass,
6232                                                   Register temp_reg,
6233                                                   Register temp2_reg,
6234                                                   Label* L_success,
6235                                                   Label* L_failure,
6236                                                   bool set_cond_codes) {
6237  assert_different_registers(sub_klass, super_klass, temp_reg);
6238  if (temp2_reg != noreg)
6239    assert_different_registers(sub_klass, super_klass, temp_reg, temp2_reg);
6240#define IS_A_TEMP(reg) ((reg) == temp_reg || (reg) == temp2_reg)
6241
6242  Label L_fallthrough;
6243  int label_nulls = 0;
6244  if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
6245  if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
6246  assert(label_nulls <= 1, "at most one NULL in the batch");
6247
6248  // a couple of useful fields in sub_klass:
6249  int ss_offset = in_bytes(Klass::secondary_supers_offset());
6250  int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
6251  Address secondary_supers_addr(sub_klass, ss_offset);
6252  Address super_cache_addr(     sub_klass, sc_offset);
6253
6254  // Do a linear scan of the secondary super-klass chain.
6255  // This code is rarely used, so simplicity is a virtue here.
6256  // The repne_scan instruction uses fixed registers, which we must spill.
6257  // Don't worry too much about pre-existing connections with the input regs.
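  // Conceptually (an illustrative C sketch with approximate names, not the
  // emitted code):
  //   Array<Klass*>* ss = sub_klass->secondary_supers();
  //   for (int i = 0; i < ss->length(); i++) {
  //     if (ss->at(i) == super_klass) {
  //       sub_klass->set_secondary_super_cache(super_klass);  // remember the hit
  //       goto success;
  //     }
  //   }
  //   goto failure;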
6258
6259  assert(sub_klass != rax, "killed reg"); // killed by mov(rax, super)
6260  assert(sub_klass != rcx, "killed reg"); // killed by lea(rcx, &pst_counter)
6261
6262  // Get super_klass value into rax (even if it was in rdi or rcx).
6263  bool pushed_rax = false, pushed_rcx = false, pushed_rdi = false;
6264  if (super_klass != rax || UseCompressedOops) {
6265    if (!IS_A_TEMP(rax)) { push(rax); pushed_rax = true; }
6266    mov(rax, super_klass);
6267  }
6268  if (!IS_A_TEMP(rcx)) { push(rcx); pushed_rcx = true; }
6269  if (!IS_A_TEMP(rdi)) { push(rdi); pushed_rdi = true; }
6270
6271#ifndef PRODUCT
6272  int* pst_counter = &SharedRuntime::_partial_subtype_ctr;
6273  ExternalAddress pst_counter_addr((address) pst_counter);
6274  NOT_LP64(  incrementl(pst_counter_addr) );
6275  LP64_ONLY( lea(rcx, pst_counter_addr) );
6276  LP64_ONLY( incrementl(Address(rcx, 0)) );
6277#endif //PRODUCT
6278
6279  // We will consult the secondary-super array.
6280  movptr(rdi, secondary_supers_addr);
6281  // Load the array length.  (Positive movl does right thing on LP64.)
6282  movl(rcx, Address(rdi, Array<Klass*>::length_offset_in_bytes()));
6283  // Skip to start of data.
6284  addptr(rdi, Array<Klass*>::base_offset_in_bytes());
6285
6286  // Scan RCX words at [RDI] for an occurrence of RAX.
6287  // Set NZ/Z based on last compare.
6288  // The Z flag will not be set by 'repne' if RCX == 0, since 'repne' itself
6289  // does not change flags; only the repeated scas instruction sets them.
6290  // Set Z = 0 (not equal) before 'repne' to indicate that the class was not found.
6291
6292  testptr(rax, rax); // Set Z = 0
6293  repne_scan();
6294
6295  // Unspill the temp. registers:
6296  if (pushed_rdi)  pop(rdi);
6297  if (pushed_rcx)  pop(rcx);
6298  if (pushed_rax)  pop(rax);
6299
6300  if (set_cond_codes) {
6301    // Special hack for the AD files:  rdi is guaranteed non-zero.
6302    assert(!pushed_rdi, "rdi must be left non-NULL");
6303    // Also, the condition codes are properly set Z/NZ on success/failure.
6304  }
6305
6306  if (L_failure == &L_fallthrough)
6307        jccb(Assembler::notEqual, *L_failure);
6308  else  jcc(Assembler::notEqual, *L_failure);
6309
6310  // Success.  Cache the super we found and proceed in triumph.
6311  movptr(super_cache_addr, super_klass);
6312
6313  if (L_success != &L_fallthrough) {
6314    jmp(*L_success);
6315  }
6316
6317#undef IS_A_TEMP
6318
6319  bind(L_fallthrough);
6320}
6321
6322
6323void MacroAssembler::cmov32(Condition cc, Register dst, Address src) {
6324  if (VM_Version::supports_cmov()) {
6325    cmovl(cc, dst, src);
6326  } else {
6327    Label L;
6328    jccb(negate_condition(cc), L);
6329    movl(dst, src);
6330    bind(L);
6331  }
6332}
6333
6334void MacroAssembler::cmov32(Condition cc, Register dst, Register src) {
6335  if (VM_Version::supports_cmov()) {
6336    cmovl(cc, dst, src);
6337  } else {
6338    Label L;
6339    jccb(negate_condition(cc), L);
6340    movl(dst, src);
6341    bind(L);
6342  }
6343}
6344
6345void MacroAssembler::verify_oop(Register reg, const char* s) {
6346  if (!VerifyOops) return;
6347
6348  // Pass register number to verify_oop_subroutine
6349  const char* b = NULL;
6350  {
6351    ResourceMark rm;
6352    stringStream ss;
6353    ss.print("verify_oop: %s: %s", reg->name(), s);
6354    b = code_string(ss.as_string());
6355  }
6356  BLOCK_COMMENT("verify_oop {");
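  // Stack handed to the verify_oop stub below, from the top of stack down
  // (illustrative): [message string] [oop to verify] [saved rax] [saved r10, LP64 only]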
6357#ifdef _LP64
6358  push(rscratch1);                    // save r10, trashed by movptr()
6359#endif
6360  push(rax);                          // save rax,
6361  push(reg);                          // pass register argument
6362  ExternalAddress buffer((address) b);
6363  // avoid using pushptr, as it modifies scratch registers
6364  // and our contract is not to modify anything
6365  movptr(rax, buffer.addr());
6366  push(rax);
6367  // call indirectly to solve generation ordering problem
6368  movptr(rax, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
6369  call(rax);
6370  // Caller pops the arguments (oop, message) and restores rax, r10
6371  BLOCK_COMMENT("} verify_oop");
6372}
6373
6374
6375RegisterOrConstant MacroAssembler::delayed_value_impl(intptr_t* delayed_value_addr,
6376                                                      Register tmp,
6377                                                      int offset) {
6378  intptr_t value = *delayed_value_addr;
6379  if (value != 0)
6380    return RegisterOrConstant(value + offset);
6381
6382  // load indirectly to solve generation ordering problem
6383  movptr(tmp, ExternalAddress((address) delayed_value_addr));
6384
6385#ifdef ASSERT
6386  { Label L;
6387    testptr(tmp, tmp);
6388    if (WizardMode) {
6389      const char* buf = NULL;
6390      {
6391        ResourceMark rm;
6392        stringStream ss;
6393        ss.print("DelayedValue=" INTPTR_FORMAT, delayed_value_addr[1]);
6394        buf = code_string(ss.as_string());
6395      }
6396      jcc(Assembler::notZero, L);
6397      STOP(buf);
6398    } else {
6399      jccb(Assembler::notZero, L);
6400      hlt();
6401    }
6402    bind(L);
6403  }
6404#endif
6405
6406  if (offset != 0)
6407    addptr(tmp, offset);
6408
6409  return RegisterOrConstant(tmp);
6410}
6411
6412
6413Address MacroAssembler::argument_address(RegisterOrConstant arg_slot,
6414                                         int extra_slot_offset) {
6415  // cf. TemplateTable::prepare_invoke(), if (load_receiver).
6416  int stackElementSize = Interpreter::stackElementSize;
6417  int offset = Interpreter::expr_offset_in_bytes(extra_slot_offset+0);
6418#ifdef ASSERT
6419  int offset1 = Interpreter::expr_offset_in_bytes(extra_slot_offset+1);
6420  assert(offset1 - offset == stackElementSize, "correct arithmetic");
6421#endif
6422  Register             scale_reg    = noreg;
6423  Address::ScaleFactor scale_factor = Address::no_scale;
6424  if (arg_slot.is_constant()) {
6425    offset += arg_slot.as_constant() * stackElementSize;
6426  } else {
6427    scale_reg    = arg_slot.as_register();
6428    scale_factor = Address::times(stackElementSize);
6429  }
6430  offset += wordSize;           // return PC is on stack
6431  return Address(rsp, scale_reg, scale_factor, offset);
6432}
6433
6434
6435void MacroAssembler::verify_oop_addr(Address addr, const char* s) {
6436  if (!VerifyOops) return;
6437
6438  // Address adjust(addr.base(), addr.index(), addr.scale(), addr.disp() + BytesPerWord);
6439  // Pass register number to verify_oop_subroutine
6440  const char* b = NULL;
6441  {
6442    ResourceMark rm;
6443    stringStream ss;
6444    ss.print("verify_oop_addr: %s", s);
6445    b = code_string(ss.as_string());
6446  }
6447#ifdef _LP64
6448  push(rscratch1);                    // save r10, trashed by movptr()
6449#endif
6450  push(rax);                          // save rax,
6451  // addr may contain rsp so we will have to adjust it based on the push
6452  // we just did (and on 64 bit we do two pushes)
6453  // NOTE: the 64-bit code used to have a bug here: it did movq(addr, rax), which
6454  // stores rax into addr -- the reverse of what was intended.
6455  if (addr.uses(rsp)) {
6456    lea(rax, addr);
6457    pushptr(Address(rax, LP64_ONLY(2 *) BytesPerWord));
6458  } else {
6459    pushptr(addr);
6460  }
6461
6462  ExternalAddress buffer((address) b);
6463  // pass msg argument
6464  // avoid using pushptr, as it modifies scratch registers
6465  // and our contract is not to modify anything
6466  movptr(rax, buffer.addr());
6467  push(rax);
6468
6469  // call indirectly to solve generation ordering problem
6470  movptr(rax, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
6471  call(rax);
6472  // Caller pops the arguments (addr, message) and restores rax, r10.
6473}
6474
6475void MacroAssembler::verify_tlab() {
6476#ifdef ASSERT
6477  if (UseTLAB && VerifyOops) {
6478    Label next, ok;
6479    Register t1 = rsi;
6480    Register thread_reg = NOT_LP64(rbx) LP64_ONLY(r15_thread);
6481
6482    push(t1);
6483    NOT_LP64(push(thread_reg));
6484    NOT_LP64(get_thread(thread_reg));
6485
6486    movptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_top_offset())));
6487    cmpptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_start_offset())));
6488    jcc(Assembler::aboveEqual, next);
6489    STOP("assert(top >= start)");
6490    should_not_reach_here();
6491
6492    bind(next);
6493    movptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_end_offset())));
6494    cmpptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_top_offset())));
6495    jcc(Assembler::aboveEqual, ok);
6496    STOP("assert(top <= end)");
6497    should_not_reach_here();
6498
6499    bind(ok);
6500    NOT_LP64(pop(thread_reg));
6501    pop(t1);
6502  }
6503#endif
6504}
6505
6506class ControlWord {
6507 public:
6508  int32_t _value;
6509
6510  int  rounding_control() const        { return  (_value >> 10) & 3      ; }
6511  int  precision_control() const       { return  (_value >>  8) & 3      ; }
6512  bool precision() const               { return ((_value >>  5) & 1) != 0; }
6513  bool underflow() const               { return ((_value >>  4) & 1) != 0; }
6514  bool overflow() const                { return ((_value >>  3) & 1) != 0; }
6515  bool zero_divide() const             { return ((_value >>  2) & 1) != 0; }
6516  bool denormalized() const            { return ((_value >>  1) & 1) != 0; }
6517  bool invalid() const                 { return ((_value >>  0) & 1) != 0; }
6518
6519  void print() const {
6520    // rounding control
6521    const char* rc;
6522    switch (rounding_control()) {
6523      case 0: rc = "round near"; break;
6524      case 1: rc = "round down"; break;
6525      case 2: rc = "round up  "; break;
6526      case 3: rc = "chop      "; break;
6527    };
6528    // precision control
6529    const char* pc;
6530    switch (precision_control()) {
6531      case 0: pc = "24 bits "; break;
6532      case 1: pc = "reserved"; break;
6533      case 2: pc = "53 bits "; break;
6534      case 3: pc = "64 bits "; break;
6535    };
6536    // flags
6537    char f[9];
6538    f[0] = ' ';
6539    f[1] = ' ';
6540    f[2] = (precision   ()) ? 'P' : 'p';
6541    f[3] = (underflow   ()) ? 'U' : 'u';
6542    f[4] = (overflow    ()) ? 'O' : 'o';
6543    f[5] = (zero_divide ()) ? 'Z' : 'z';
6544    f[6] = (denormalized()) ? 'D' : 'd';
6545    f[7] = (invalid     ()) ? 'I' : 'i';
6546    f[8] = '\x0';
6547    // output
6548    printf("%04x  masks = %s, %s, %s", _value & 0xFFFF, f, rc, pc);
6549  }
6550
6551};
6552
6553class StatusWord {
6554 public:
6555  int32_t _value;
6556
6557  bool busy() const                    { return ((_value >> 15) & 1) != 0; }
6558  bool C3() const                      { return ((_value >> 14) & 1) != 0; }
6559  bool C2() const                      { return ((_value >> 10) & 1) != 0; }
6560  bool C1() const                      { return ((_value >>  9) & 1) != 0; }
6561  bool C0() const                      { return ((_value >>  8) & 1) != 0; }
6562  int  top() const                     { return  (_value >> 11) & 7      ; }
6563  bool error_status() const            { return ((_value >>  7) & 1) != 0; }
6564  bool stack_fault() const             { return ((_value >>  6) & 1) != 0; }
6565  bool precision() const               { return ((_value >>  5) & 1) != 0; }
6566  bool underflow() const               { return ((_value >>  4) & 1) != 0; }
6567  bool overflow() const                { return ((_value >>  3) & 1) != 0; }
6568  bool zero_divide() const             { return ((_value >>  2) & 1) != 0; }
6569  bool denormalized() const            { return ((_value >>  1) & 1) != 0; }
6570  bool invalid() const                 { return ((_value >>  0) & 1) != 0; }
6571
6572  void print() const {
6573    // condition codes
6574    char c[5];
6575    c[0] = (C3()) ? '3' : '-';
6576    c[1] = (C2()) ? '2' : '-';
6577    c[2] = (C1()) ? '1' : '-';
6578    c[3] = (C0()) ? '0' : '-';
6579    c[4] = '\x0';
6580    // flags
6581    char f[9];
6582    f[0] = (error_status()) ? 'E' : '-';
6583    f[1] = (stack_fault ()) ? 'S' : '-';
6584    f[2] = (precision   ()) ? 'P' : '-';
6585    f[3] = (underflow   ()) ? 'U' : '-';
6586    f[4] = (overflow    ()) ? 'O' : '-';
6587    f[5] = (zero_divide ()) ? 'Z' : '-';
6588    f[6] = (denormalized()) ? 'D' : '-';
6589    f[7] = (invalid     ()) ? 'I' : '-';
6590    f[8] = '\x0';
6591    // output
6592    printf("%04x  flags = %s, cc =  %s, top = %d", _value & 0xFFFF, f, c, top());
6593  }
6594
6595};
6596
6597class TagWord {
6598 public:
6599  int32_t _value;
6600
6601  int tag_at(int i) const              { return (_value >> (i*2)) & 3; }
6602
6603  void print() const {
6604    printf("%04x", _value & 0xFFFF);
6605  }
6606
6607};
6608
6609class FPU_Register {
6610 public:
6611  int32_t _m0;
6612  int32_t _m1;
6613  int16_t _ex;
6614
6615  bool is_indefinite() const           {
6616    return _ex == -1 && _m1 == (int32_t)0xC0000000 && _m0 == 0;
6617  }
6618
6619  void print() const {
6620    char  sign = (_ex < 0) ? '-' : '+';
6621    const char* kind = (_ex == 0x7FFF || _ex == (int16_t)-1) ? "NaN" : "   ";
6622    printf("%c%04hx.%08x%08x  %s", sign, _ex, _m1, _m0, kind);
6623  };
6624
6625};
6626
6627class FPU_State {
6628 public:
6629  enum {
6630    register_size       = 10,
6631    number_of_registers =  8,
6632    register_mask       =  7
6633  };
6634
6635  ControlWord  _control_word;
6636  StatusWord   _status_word;
6637  TagWord      _tag_word;
6638  int32_t      _error_offset;
6639  int32_t      _error_selector;
6640  int32_t      _data_offset;
6641  int32_t      _data_selector;
6642  int8_t       _register[register_size * number_of_registers];
6643
6644  int tag_for_st(int i) const          { return _tag_word.tag_at((_status_word.top() + i) & register_mask); }
6645  FPU_Register* st(int i) const        { return (FPU_Register*)&_register[register_size * i]; }
6646
6647  const char* tag_as_string(int tag) const {
6648    switch (tag) {
6649      case 0: return "valid";
6650      case 1: return "zero";
6651      case 2: return "special";
6652      case 3: return "empty";
6653    }
6654    ShouldNotReachHere();
6655    return NULL;
6656  }
6657
6658  void print() const {
6659    // print computation registers
6660    { int t = _status_word.top();
6661      for (int i = 0; i < number_of_registers; i++) {
6662        int j = (i - t) & register_mask;
6663        printf("%c r%d = ST%d = ", (j == 0 ? '*' : ' '), i, j);
6664        st(j)->print();
6665        printf(" %s\n", tag_as_string(_tag_word.tag_at(i)));
6666      }
6667    }
6668    printf("\n");
6669    // print control registers
6670    printf("ctrl = "); _control_word.print(); printf("\n");
6671    printf("stat = "); _status_word .print(); printf("\n");
6672    printf("tags = "); _tag_word    .print(); printf("\n");
6673  }
6674
6675};
6676
6677class Flag_Register {
6678 public:
6679  int32_t _value;
6680
6681  bool overflow() const                { return ((_value >> 11) & 1) != 0; }
6682  bool direction() const               { return ((_value >> 10) & 1) != 0; }
6683  bool sign() const                    { return ((_value >>  7) & 1) != 0; }
6684  bool zero() const                    { return ((_value >>  6) & 1) != 0; }
6685  bool auxiliary_carry() const         { return ((_value >>  4) & 1) != 0; }
6686  bool parity() const                  { return ((_value >>  2) & 1) != 0; }
6687  bool carry() const                   { return ((_value >>  0) & 1) != 0; }
6688
6689  void print() const {
6690    // flags
6691    char f[8];
6692    f[0] = (overflow       ()) ? 'O' : '-';
6693    f[1] = (direction      ()) ? 'D' : '-';
6694    f[2] = (sign           ()) ? 'S' : '-';
6695    f[3] = (zero           ()) ? 'Z' : '-';
6696    f[4] = (auxiliary_carry()) ? 'A' : '-';
6697    f[5] = (parity         ()) ? 'P' : '-';
6698    f[6] = (carry          ()) ? 'C' : '-';
6699    f[7] = '\x0';
6700    // output
6701    printf("%08x  flags = %s", _value, f);
6702  }
6703
6704};
6705
6706class IU_Register {
6707 public:
6708  int32_t _value;
6709
6710  void print() const {
6711    printf("%08x  %11d", _value, _value);
6712  }
6713
6714};
6715
6716class IU_State {
6717 public:
6718  Flag_Register _eflags;
6719  IU_Register   _rdi;
6720  IU_Register   _rsi;
6721  IU_Register   _rbp;
6722  IU_Register   _rsp;
6723  IU_Register   _rbx;
6724  IU_Register   _rdx;
6725  IU_Register   _rcx;
6726  IU_Register   _rax;
6727
6728  void print() const {
6729    // computation registers
6730    printf("rax,  = "); _rax.print(); printf("\n");
6731    printf("rbx,  = "); _rbx.print(); printf("\n");
6732    printf("rcx  = "); _rcx.print(); printf("\n");
6733    printf("rdx  = "); _rdx.print(); printf("\n");
6734    printf("rdi  = "); _rdi.print(); printf("\n");
6735    printf("rsi  = "); _rsi.print(); printf("\n");
6736    printf("rbp,  = "); _rbp.print(); printf("\n");
6737    printf("rsp  = "); _rsp.print(); printf("\n");
6738    printf("\n");
6739    // control registers
6740    printf("flgs = "); _eflags.print(); printf("\n");
6741  }
6742};
6743
6744
6745class CPU_State {
6746 public:
6747  FPU_State _fpu_state;
6748  IU_State  _iu_state;
6749
6750  void print() const {
6751    printf("--------------------------------------------------\n");
6752    _iu_state .print();
6753    printf("\n");
6754    _fpu_state.print();
6755    printf("--------------------------------------------------\n");
6756  }
6757
6758};
6759
6760
6761static void _print_CPU_state(CPU_State* state) {
6762  state->print();
6763};
6764
6765
6766void MacroAssembler::print_CPU_state() {
6767  push_CPU_state();
6768  push(rsp);                // pass CPU state
6769  call(RuntimeAddress(CAST_FROM_FN_PTR(address, _print_CPU_state)));
6770  addptr(rsp, wordSize);       // discard argument
6771  pop_CPU_state();
6772}
6773
6774
6775static bool _verify_FPU(int stack_depth, char* s, CPU_State* state) {
6776  static int counter = 0;
6777  FPU_State* fs = &state->_fpu_state;
6778  counter++;
6779  // For leaf calls, only verify that the top few elements remain empty.
6780  // We only need 1 empty at the top for C2 code.
6781  if( stack_depth < 0 ) {
6782    if( fs->tag_for_st(7) != 3 ) {
6783      printf("FPR7 not empty\n");
6784      state->print();
6785      assert(false, "error");
6786      return false;
6787    }
6788    return true;                // All other stack states do not matter
6789  }
6790
6791  assert((fs->_control_word._value & 0xffff) == StubRoutines::_fpu_cntrl_wrd_std,
6792         "bad FPU control word");
6793
6794  // compute stack depth
6795  int i = 0;
6796  while (i < FPU_State::number_of_registers && fs->tag_for_st(i)  < 3) i++;
6797  int d = i;
6798  while (i < FPU_State::number_of_registers && fs->tag_for_st(i) == 3) i++;
6799  // verify findings
6800  if (i != FPU_State::number_of_registers) {
6801    // stack not contiguous
6802    printf("%s: stack not contiguous at ST%d\n", s, i);
6803    state->print();
6804    assert(false, "error");
6805    return false;
6806  }
6807  // check if computed stack depth corresponds to expected stack depth
6808  if (stack_depth < 0) {
6809    // expected stack depth is -stack_depth or less
6810    if (d > -stack_depth) {
6811      // too many elements on the stack
6812      printf("%s: <= %d stack elements expected but found %d\n", s, -stack_depth, d);
6813      state->print();
6814      assert(false, "error");
6815      return false;
6816    }
6817  } else {
6818    // expected stack depth is stack_depth
6819    if (d != stack_depth) {
6820      // wrong stack depth
6821      printf("%s: %d stack elements expected but found %d\n", s, stack_depth, d);
6822      state->print();
6823      assert(false, "error");
6824      return false;
6825    }
6826  }
6827  // everything is cool
6828  return true;
6829}
6830
6831
6832void MacroAssembler::verify_FPU(int stack_depth, const char* s) {
6833  if (!VerifyFPU) return;
6834  push_CPU_state();
6835  push(rsp);                // pass CPU state
6836  ExternalAddress msg((address) s);
6837  // pass message string s
6838  pushptr(msg.addr());
6839  push(stack_depth);        // pass stack depth
6840  call(RuntimeAddress(CAST_FROM_FN_PTR(address, _verify_FPU)));
6841  addptr(rsp, 3 * wordSize);   // discard arguments
6842  // check for error
6843  { Label L;
6844    testl(rax, rax);
6845    jcc(Assembler::notZero, L);
6846    int3();                  // break if error condition
6847    bind(L);
6848  }
6849  pop_CPU_state();
6850}
6851
6852void MacroAssembler::restore_cpu_control_state_after_jni() {
6853  // Either restore the MXCSR register after returning from the JNI Call
6854  // or verify that it wasn't changed (with -Xcheck:jni flag).
6855  if (VM_Version::supports_sse()) {
6856    if (RestoreMXCSROnJNICalls) {
6857      ldmxcsr(ExternalAddress(StubRoutines::addr_mxcsr_std()));
6858    } else if (CheckJNICalls) {
6859      call(RuntimeAddress(StubRoutines::x86::verify_mxcsr_entry()));
6860    }
6861  }
6862  if (VM_Version::supports_avx()) {
6863    // Clear upper bits of YMM registers to avoid SSE <-> AVX transition penalty.
6864    vzeroupper();
6865  }
6866
6867#ifndef _LP64
6868  // Either restore the x87 floating-point control word after returning
6869  // from the JNI call or verify that it wasn't changed.
6870  if (CheckJNICalls) {
6871    call(RuntimeAddress(StubRoutines::x86::verify_fpu_cntrl_wrd_entry()));
6872  }
6873#endif // _LP64
6874}
6875
6876
6877void MacroAssembler::load_klass(Register dst, Register src) {
6878#ifdef _LP64
6879  if (UseCompressedClassPointers) {
6880    movl(dst, Address(src, oopDesc::klass_offset_in_bytes()));
6881    decode_klass_not_null(dst);
6882  } else
6883#endif
6884    movptr(dst, Address(src, oopDesc::klass_offset_in_bytes()));
6885}
6886
6887void MacroAssembler::load_prototype_header(Register dst, Register src) {
6888  load_klass(dst, src);
6889  movptr(dst, Address(dst, Klass::prototype_header_offset()));
6890}
6891
6892void MacroAssembler::store_klass(Register dst, Register src) {
6893#ifdef _LP64
6894  if (UseCompressedClassPointers) {
6895    encode_klass_not_null(src);
6896    movl(Address(dst, oopDesc::klass_offset_in_bytes()), src);
6897  } else
6898#endif
6899    movptr(Address(dst, oopDesc::klass_offset_in_bytes()), src);
6900}
6901
6902void MacroAssembler::load_heap_oop(Register dst, Address src) {
6903#ifdef _LP64
6904  // FIXME: Must change all places where we try to load the klass.
6905  if (UseCompressedOops) {
6906    movl(dst, src);
6907    decode_heap_oop(dst);
6908  } else
6909#endif
6910    movptr(dst, src);
6911}
6912
6913// Doesn't do verification; generates fixed-size code
6914void MacroAssembler::load_heap_oop_not_null(Register dst, Address src) {
6915#ifdef _LP64
6916  if (UseCompressedOops) {
6917    movl(dst, src);
6918    decode_heap_oop_not_null(dst);
6919  } else
6920#endif
6921    movptr(dst, src);
6922}
6923
6924void MacroAssembler::store_heap_oop(Address dst, Register src) {
6925#ifdef _LP64
6926  if (UseCompressedOops) {
6927    assert(!dst.uses(src), "not enough registers");
6928    encode_heap_oop(src);
6929    movl(dst, src);
6930  } else
6931#endif
6932    movptr(dst, src);
6933}
6934
6935void MacroAssembler::cmp_heap_oop(Register src1, Address src2, Register tmp) {
6936  assert_different_registers(src1, tmp);
6937#ifdef _LP64
6938  if (UseCompressedOops) {
6939    bool did_push = false;
6940    if (tmp == noreg) {
6941      tmp = rax;
6942      push(tmp);
6943      did_push = true;
6944      assert(!src2.uses(rsp), "can't push");
6945    }
6946    load_heap_oop(tmp, src2);
6947    cmpptr(src1, tmp);
6948    if (did_push)  pop(tmp);
6949  } else
6950#endif
6951    cmpptr(src1, src2);
6952}
6953
6954// Used for storing NULLs.
6955void MacroAssembler::store_heap_oop_null(Address dst) {
6956#ifdef _LP64
6957  if (UseCompressedOops) {
6958    movl(dst, (int32_t)NULL_WORD);
6959  } else {
6960    movslq(dst, (int32_t)NULL_WORD);
6961  }
6962#else
6963  movl(dst, (int32_t)NULL_WORD);
6964#endif
6965}
6966
6967#ifdef _LP64
6968void MacroAssembler::store_klass_gap(Register dst, Register src) {
6969  if (UseCompressedClassPointers) {
6970    // Store to klass gap in destination
6971    movl(Address(dst, oopDesc::klass_gap_offset_in_bytes()), src);
6972  }
6973}
6974
6975#ifdef ASSERT
6976void MacroAssembler::verify_heapbase(const char* msg) {
6977  assert (UseCompressedOops, "should be compressed");
6978  assert (Universe::heap() != NULL, "java heap should be initialized");
6979  if (CheckCompressedOops) {
6980    Label ok;
6981    push(rscratch1); // cmpptr trashes rscratch1
6982    cmpptr(r12_heapbase, ExternalAddress((address)Universe::narrow_ptrs_base_addr()));
6983    jcc(Assembler::equal, ok);
6984    STOP(msg);
6985    bind(ok);
6986    pop(rscratch1);
6987  }
6988}
6989#endif
6990
6991// Algorithm must match oop.inline.hpp encode_heap_oop.
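// Illustrative encode/decode formulas, assuming a non-NULL narrow_oop_base and a
// non-zero shift (simpler heap layouts just drop the corresponding term):
//   narrow = (obj == NULL) ? 0 : (uint32_t)(((uintptr_t)obj - (uintptr_t)base) >> shift);
//   obj    = (narrow == 0) ? NULL : (oop)((uintptr_t)base + ((uintptr_t)narrow << shift));
// where base = Universe::narrow_oop_base() and shift = LogMinObjAlignmentInBytes.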
6992void MacroAssembler::encode_heap_oop(Register r) {
6993#ifdef ASSERT
6994  verify_heapbase("MacroAssembler::encode_heap_oop: heap base corrupted?");
6995#endif
6996  verify_oop(r, "broken oop in encode_heap_oop");
6997  if (Universe::narrow_oop_base() == NULL) {
6998    if (Universe::narrow_oop_shift() != 0) {
6999      assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
7000      shrq(r, LogMinObjAlignmentInBytes);
7001    }
7002    return;
7003  }
7004  testq(r, r);
7005  cmovq(Assembler::equal, r, r12_heapbase);
7006  subq(r, r12_heapbase);
7007  shrq(r, LogMinObjAlignmentInBytes);
7008}
7009
7010void MacroAssembler::encode_heap_oop_not_null(Register r) {
7011#ifdef ASSERT
7012  verify_heapbase("MacroAssembler::encode_heap_oop_not_null: heap base corrupted?");
7013  if (CheckCompressedOops) {
7014    Label ok;
7015    testq(r, r);
7016    jcc(Assembler::notEqual, ok);
7017    STOP("null oop passed to encode_heap_oop_not_null");
7018    bind(ok);
7019  }
7020#endif
7021  verify_oop(r, "broken oop in encode_heap_oop_not_null");
7022  if (Universe::narrow_oop_base() != NULL) {
7023    subq(r, r12_heapbase);
7024  }
7025  if (Universe::narrow_oop_shift() != 0) {
7026    assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
7027    shrq(r, LogMinObjAlignmentInBytes);
7028  }
7029}
7030
7031void MacroAssembler::encode_heap_oop_not_null(Register dst, Register src) {
7032#ifdef ASSERT
7033  verify_heapbase("MacroAssembler::encode_heap_oop_not_null2: heap base corrupted?");
7034  if (CheckCompressedOops) {
7035    Label ok;
7036    testq(src, src);
7037    jcc(Assembler::notEqual, ok);
7038    STOP("null oop passed to encode_heap_oop_not_null2");
7039    bind(ok);
7040  }
7041#endif
7042  verify_oop(src, "broken oop in encode_heap_oop_not_null2");
7043  if (dst != src) {
7044    movq(dst, src);
7045  }
7046  if (Universe::narrow_oop_base() != NULL) {
7047    subq(dst, r12_heapbase);
7048  }
7049  if (Universe::narrow_oop_shift() != 0) {
7050    assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
7051    shrq(dst, LogMinObjAlignmentInBytes);
7052  }
7053}
7054
7055void  MacroAssembler::decode_heap_oop(Register r) {
7056#ifdef ASSERT
7057  verify_heapbase("MacroAssembler::decode_heap_oop: heap base corrupted?");
7058#endif
7059  if (Universe::narrow_oop_base() == NULL) {
7060    if (Universe::narrow_oop_shift() != 0) {
7061      assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
7062      shlq(r, LogMinObjAlignmentInBytes);
7063    }
7064  } else {
7065    Label done;
7066    shlq(r, LogMinObjAlignmentInBytes);
7067    jccb(Assembler::equal, done);
7068    addq(r, r12_heapbase);
7069    bind(done);
7070  }
7071  verify_oop(r, "broken oop in decode_heap_oop");
7072}
7073
7074void  MacroAssembler::decode_heap_oop_not_null(Register r) {
7075  // Note: it will change flags
7076  assert (UseCompressedOops, "should only be used for compressed headers");
7077  assert (Universe::heap() != NULL, "java heap should be initialized");
7078  // Cannot assert, unverified entry point counts instructions (see .ad file)
7079  // vtableStubs also counts instructions in pd_code_size_limit.
7080  // Also do not verify_oop as this is called by verify_oop.
7081  if (Universe::narrow_oop_shift() != 0) {
7082    assert(LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
7083    shlq(r, LogMinObjAlignmentInBytes);
7084    if (Universe::narrow_oop_base() != NULL) {
7085      addq(r, r12_heapbase);
7086    }
7087  } else {
7088    assert (Universe::narrow_oop_base() == NULL, "sanity");
7089  }
7090}
7091
7092void  MacroAssembler::decode_heap_oop_not_null(Register dst, Register src) {
7093  // Note: it will change flags
7094  assert (UseCompressedOops, "should only be used for compressed headers");
7095  assert (Universe::heap() != NULL, "java heap should be initialized");
7096  // Cannot assert, unverified entry point counts instructions (see .ad file)
7097  // vtableStubs also counts instructions in pd_code_size_limit.
7098  // Also do not verify_oop as this is called by verify_oop.
7099  if (Universe::narrow_oop_shift() != 0) {
7100    assert(LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
7101    if (LogMinObjAlignmentInBytes == Address::times_8) {
7102      leaq(dst, Address(r12_heapbase, src, Address::times_8, 0));
7103    } else {
7104      if (dst != src) {
7105        movq(dst, src);
7106      }
7107      shlq(dst, LogMinObjAlignmentInBytes);
7108      if (Universe::narrow_oop_base() != NULL) {
7109        addq(dst, r12_heapbase);
7110      }
7111    }
7112  } else {
7113    assert (Universe::narrow_oop_base() == NULL, "sanity");
7114    if (dst != src) {
7115      movq(dst, src);
7116    }
7117  }
7118}
7119
7120void MacroAssembler::encode_klass_not_null(Register r) {
7121  if (Universe::narrow_klass_base() != NULL) {
7122    // Use r12 as a scratch register in which to temporarily load the narrow_klass_base.
7123    assert(r != r12_heapbase, "Encoding a klass in r12");
7124    mov64(r12_heapbase, (int64_t)Universe::narrow_klass_base());
7125    subq(r, r12_heapbase);
7126  }
7127  if (Universe::narrow_klass_shift() != 0) {
7128    assert (LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
7129    shrq(r, LogKlassAlignmentInBytes);
7130  }
7131  if (Universe::narrow_klass_base() != NULL) {
7132    reinit_heapbase();
7133  }
7134}
7135
7136void MacroAssembler::encode_klass_not_null(Register dst, Register src) {
7137  if (dst == src) {
7138    encode_klass_not_null(src);
7139  } else {
7140    if (Universe::narrow_klass_base() != NULL) {
7141      mov64(dst, (int64_t)Universe::narrow_klass_base());
7142      negq(dst);
7143      addq(dst, src);
7144    } else {
7145      movptr(dst, src);
7146    }
7147    if (Universe::narrow_klass_shift() != 0) {
7148      assert (LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
7149      shrq(dst, LogKlassAlignmentInBytes);
7150    }
7151  }
7152}
7153
7154// Function instr_size_for_decode_klass_not_null() counts the instructions
7155// generated by decode_klass_not_null(register r) and reinit_heapbase(),
7156// when (Universe::heap() != NULL).  Hence, if the instructions they
7157// generate change, then this method needs to be updated.
7158int MacroAssembler::instr_size_for_decode_klass_not_null() {
7159  assert (UseCompressedClassPointers, "only for compressed klass ptrs");
7160  if (Universe::narrow_klass_base() != NULL) {
7161    // mov64 + addq + shlq? + mov64  (for reinit_heapbase()).
7162    return (Universe::narrow_klass_shift() == 0 ? 20 : 24);
7163  } else {
7164    // longest load decode klass function, mov64, leaq
7165    return 16;
7166  }
7167}
7168
7169// !!! If the instructions that get generated here change then function
7170// instr_size_for_decode_klass_not_null() needs to get updated.
7171void  MacroAssembler::decode_klass_not_null(Register r) {
7172  // Note: it will change flags
7173  assert (UseCompressedClassPointers, "should only be used for compressed headers");
7174  assert(r != r12_heapbase, "Decoding a klass in r12");
7175  // Cannot assert, unverified entry point counts instructions (see .ad file)
7176  // vtableStubs also counts instructions in pd_code_size_limit.
7177  // Also do not verify_oop as this is called by verify_oop.
7178  if (Universe::narrow_klass_shift() != 0) {
7179    assert(LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
7180    shlq(r, LogKlassAlignmentInBytes);
7181  }
7182  // Use r12 as a scratch register in which to temporarily load the narrow_klass_base.
7183  if (Universe::narrow_klass_base() != NULL) {
7184    mov64(r12_heapbase, (int64_t)Universe::narrow_klass_base());
7185    addq(r, r12_heapbase);
7186    reinit_heapbase();
7187  }
7188}
7189
7190void  MacroAssembler::decode_klass_not_null(Register dst, Register src) {
7191  // Note: it will change flags
7192  assert (UseCompressedClassPointers, "should only be used for compressed headers");
7193  if (dst == src) {
7194    decode_klass_not_null(dst);
7195  } else {
7196    // Cannot assert, unverified entry point counts instructions (see .ad file)
7197    // vtableStubs also counts instructions in pd_code_size_limit.
7198    // Also do not verify_oop as this is called by verify_oop.
7199    mov64(dst, (int64_t)Universe::narrow_klass_base());
7200    if (Universe::narrow_klass_shift() != 0) {
7201      assert(LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
7202      assert(LogKlassAlignmentInBytes == Address::times_8, "klass not aligned on 64bits?");
7203      leaq(dst, Address(dst, src, Address::times_8, 0));
7204    } else {
7205      addq(dst, src);
7206    }
7207  }
7208}
7209
7210void  MacroAssembler::set_narrow_oop(Register dst, jobject obj) {
7211  assert (UseCompressedOops, "should only be used for compressed headers");
7212  assert (Universe::heap() != NULL, "java heap should be initialized");
7213  assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
7214  int oop_index = oop_recorder()->find_index(obj);
7215  RelocationHolder rspec = oop_Relocation::spec(oop_index);
7216  mov_narrow_oop(dst, oop_index, rspec);
7217}
7218
7219void  MacroAssembler::set_narrow_oop(Address dst, jobject obj) {
7220  assert (UseCompressedOops, "should only be used for compressed headers");
7221  assert (Universe::heap() != NULL, "java heap should be initialized");
7222  assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
7223  int oop_index = oop_recorder()->find_index(obj);
7224  RelocationHolder rspec = oop_Relocation::spec(oop_index);
7225  mov_narrow_oop(dst, oop_index, rspec);
7226}
7227
7228void  MacroAssembler::set_narrow_klass(Register dst, Klass* k) {
7229  assert (UseCompressedClassPointers, "should only be used for compressed headers");
7230  assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
7231  int klass_index = oop_recorder()->find_index(k);
7232  RelocationHolder rspec = metadata_Relocation::spec(klass_index);
7233  mov_narrow_oop(dst, Klass::encode_klass(k), rspec);
7234}
7235
7236void  MacroAssembler::set_narrow_klass(Address dst, Klass* k) {
7237  assert (UseCompressedClassPointers, "should only be used for compressed headers");
7238  assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
7239  int klass_index = oop_recorder()->find_index(k);
7240  RelocationHolder rspec = metadata_Relocation::spec(klass_index);
7241  mov_narrow_oop(dst, Klass::encode_klass(k), rspec);
7242}
7243
7244void  MacroAssembler::cmp_narrow_oop(Register dst, jobject obj) {
7245  assert (UseCompressedOops, "should only be used for compressed headers");
7246  assert (Universe::heap() != NULL, "java heap should be initialized");
7247  assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
7248  int oop_index = oop_recorder()->find_index(obj);
7249  RelocationHolder rspec = oop_Relocation::spec(oop_index);
7250  Assembler::cmp_narrow_oop(dst, oop_index, rspec);
7251}
7252
7253void  MacroAssembler::cmp_narrow_oop(Address dst, jobject obj) {
7254  assert (UseCompressedOops, "should only be used for compressed headers");
7255  assert (Universe::heap() != NULL, "java heap should be initialized");
7256  assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
7257  int oop_index = oop_recorder()->find_index(obj);
7258  RelocationHolder rspec = oop_Relocation::spec(oop_index);
7259  Assembler::cmp_narrow_oop(dst, oop_index, rspec);
7260}
7261
7262void  MacroAssembler::cmp_narrow_klass(Register dst, Klass* k) {
7263  assert (UseCompressedClassPointers, "should only be used for compressed headers");
7264  assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
7265  int klass_index = oop_recorder()->find_index(k);
7266  RelocationHolder rspec = metadata_Relocation::spec(klass_index);
7267  Assembler::cmp_narrow_oop(dst, Klass::encode_klass(k), rspec);
7268}
7269
7270void  MacroAssembler::cmp_narrow_klass(Address dst, Klass* k) {
7271  assert (UseCompressedClassPointers, "should only be used for compressed headers");
7272  assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
7273  int klass_index = oop_recorder()->find_index(k);
7274  RelocationHolder rspec = metadata_Relocation::spec(klass_index);
7275  Assembler::cmp_narrow_oop(dst, Klass::encode_klass(k), rspec);
7276}
7277
7278void MacroAssembler::reinit_heapbase() {
7279  if (UseCompressedOops || UseCompressedClassPointers) {
7280    if (Universe::heap() != NULL) {
7281      if (Universe::narrow_oop_base() == NULL) {
7282        MacroAssembler::xorptr(r12_heapbase, r12_heapbase);
7283      } else {
7284        mov64(r12_heapbase, (int64_t)Universe::narrow_ptrs_base());
7285      }
7286    } else {
7287      movptr(r12_heapbase, ExternalAddress((address)Universe::narrow_ptrs_base_addr()));
7288    }
7289  }
7290}
7291
7292#endif // _LP64
7293
7294
7295// C2 compiled method's prolog code.
7296void MacroAssembler::verified_entry(int framesize, int stack_bang_size, bool fp_mode_24b) {
7297
7298  // WARNING: Initial instruction MUST be 5 bytes or longer so that
7299  // NativeJump::patch_verified_entry will be able to patch out the entry
7300  // code safely. The push to verify stack depth is ok at 5 bytes,
7301  // the frame allocation can be either 3 or 6 bytes. So if we don't do
7302  // stack bang then we must use the 6 byte frame allocation even if
7303  // we have no frame. :-(
7304  assert(stack_bang_size >= framesize || stack_bang_size <= 0, "stack bang size incorrect");
7305
7306  assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
7307  // Remove word for return addr
7308  framesize -= wordSize;
7309  stack_bang_size -= wordSize;
7310
7311  // Calls to C2R adapters often do not accept exceptional returns.
7312  // We require that their callers must bang for them.  But be careful, because
7313  // some VM calls (such as call site linkage) can use several kilobytes of
7314  // stack.  But the stack safety zone should account for that.
7315  // See bugs 4446381, 4468289, 4497237.
7316  if (stack_bang_size > 0) {
7317    generate_stack_overflow_check(stack_bang_size);
7318
7319    // We always push rbp, so that on return to interpreter rbp, will be
7320    // restored correctly and we can correct the stack.
7321    push(rbp);
7322    // Save caller's stack pointer into RBP if the frame pointer is preserved.
7323    if (PreserveFramePointer) {
7324      mov(rbp, rsp);
7325    }
7326    // Remove word for ebp
7327    framesize -= wordSize;
7328
7329    // Create frame
7330    if (framesize) {
7331      subptr(rsp, framesize);
7332    }
7333  } else {
7334    // Create frame (force generation of a 4 byte immediate value)
7335    subptr_imm32(rsp, framesize);
7336
7337    // Save RBP register now.
7338    framesize -= wordSize;
7339    movptr(Address(rsp, framesize), rbp);
7340    // Save caller's stack pointer into RBP if the frame pointer is preserved.
7341    if (PreserveFramePointer) {
7342      movptr(rbp, rsp);
7343      if (framesize > 0) {
7344        addptr(rbp, framesize);
7345      }
7346    }
7347  }
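  // At this point the frame has this shape (illustrative; the stack grows down):
  //   [ return address ]                   <- pushed by the caller's call
  //   [ saved rbp      ]
  //   [ framesize worth of spills/locals ] <- rsp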
7348
7349  if (VerifyStackAtCalls) { // Majik cookie to verify stack depth
7350    framesize -= wordSize;
7351    movptr(Address(rsp, framesize), (int32_t)0xbadb100d);
7352  }
7353
7354#ifndef _LP64
7355  // If method sets FPU control word do it now
7356  if (fp_mode_24b) {
7357    fldcw(ExternalAddress(StubRoutines::addr_fpu_cntrl_wrd_24()));
7358  }
7359  if (UseSSE >= 2 && VerifyFPU) {
7360    verify_FPU(0, "FPU stack must be clean on entry");
7361  }
7362#endif
7363
7364#ifdef ASSERT
7365  if (VerifyStackAtCalls) {
7366    Label L;
7367    push(rax);
7368    mov(rax, rsp);
7369    andptr(rax, StackAlignmentInBytes-1);
7370    cmpptr(rax, StackAlignmentInBytes-wordSize);
7371    pop(rax);
7372    jcc(Assembler::equal, L);
7373    STOP("Stack is not properly aligned!");
7374    bind(L);
7375  }
7376#endif
7377
7378}
7379
7380void MacroAssembler::clear_mem(Register base, Register cnt, Register tmp) {
7381  // cnt - number of qwords (8-byte words).
7382  // base - start address, qword aligned.
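  // Semantically this is just memset(base, 0, cnt*8) (illustrative), implemented
  // with 'rep stosb' when UseFastStosb (byte count) and 'rep stos' otherwise
  // (qword count on 64-bit, dword count on 32-bit after the shift below).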
7383  assert(base==rdi, "base register must be edi for rep stos");
7384  assert(tmp==rax,   "tmp register must be eax for rep stos");
7385  assert(cnt==rcx,   "cnt register must be ecx for rep stos");
7386
7387  xorptr(tmp, tmp);
7388  if (UseFastStosb) {
7389    shlptr(cnt,3); // convert to number of bytes
7390    rep_stosb();
7391  } else {
7392    NOT_LP64(shlptr(cnt,1);) // convert to number of dwords for 32-bit VM
7393    rep_stos();
7394  }
7395}
7396
7397#ifdef COMPILER2
7398
7399// IndexOf for constant substrings with size >= 8 chars
7400// which don't need to be loaded through the stack.
7401void MacroAssembler::string_indexofC8(Register str1, Register str2,
7402                                      Register cnt1, Register cnt2,
7403                                      int int_cnt2,  Register result,
7404                                      XMMRegister vec, Register tmp,
7405                                      int ae) {
7406  ShortBranchVerifier sbv(this);
7407  assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
7408  assert(UseSSE >= 4, "SSE4 must be enabled for SSE4.2 intrinsics to be available");
7409  assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
7410
7411  // This method uses the pcmpestri instruction with bound registers
7412  //   inputs:
7413  //     xmm - substring
7414  //     rax - substring length (elements count)
7415  //     mem - scanned string
7416  //     rdx - string length (elements count)
7417  //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
7418  //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
7419  //   outputs:
7420  //     rcx - matched index in string
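  //
  // Roughly, the stub computes the following (illustrative scalar sketch only;
  // the real code below works on 16-byte vectors and handles the LL/UU/UL
  // element widths via scale1/scale2):
  //
  //   for (int i = 0; i + cnt2 <= cnt1; i++) {
  //     int j = 0;
  //     while (j < cnt2 && str1[i + j] == str2[j]) j++;
  //     if (j == cnt2) return i;   // match found at index i
  //   }
  //   return -1;                   // not found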
7421  assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
7422  int mode   = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
7423  int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
7424  Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
7425  Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
7426
7427  Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR,
7428        RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR,
7429        MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE;
7430
7431  // Note, inline_string_indexOf() generates checks:
7432  // if (substr.count > string.count) return -1;
7433  // if (substr.count == 0) return 0;
7434  assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars");
7435
7436  // Load substring.
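  // For UL the substring is Latin-1 while the string is UTF-16, so pmovzxbw
  // zero-extends the substring bytes to 16-bit chars before comparing.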
7437  if (ae == StrIntrinsicNode::UL) {
7438    pmovzxbw(vec, Address(str2, 0));
7439  } else {
7440    movdqu(vec, Address(str2, 0));
7441  }
7442  movl(cnt2, int_cnt2);
7443  movptr(result, str1); // string addr
7444
7445  if (int_cnt2 > stride) {
7446    jmpb(SCAN_TO_SUBSTR);
7447
7448    // Reload substr for rescan; this code
7449    // is executed only for large substrings (> 8 chars)
7450    bind(RELOAD_SUBSTR);
7451    if (ae == StrIntrinsicNode::UL) {
7452      pmovzxbw(vec, Address(str2, 0));
7453    } else {
7454      movdqu(vec, Address(str2, 0));
7455    }
7456    negptr(cnt2); // Jumped here with negative cnt2, convert to positive
7457
7458    bind(RELOAD_STR);
7459    // We came here after the beginning of the substring was
7460    // matched but the rest of it was not so we need to search
7461    // again. Start from the next element after the previous match.
7462
7463    // cnt2 is the number of substring elements remaining and
7464    // cnt1 is the number of string elements remaining when the compare failed.
7465    // Restored cnt1 = cnt1 - cnt2 + int_cnt2
7466    subl(cnt1, cnt2);
7467    addl(cnt1, int_cnt2);
7468    movl(cnt2, int_cnt2); // Now restore cnt2
7469
7470    decrementl(cnt1);     // Shift to next element
7471    cmpl(cnt1, cnt2);
7472    jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
7473
7474    addptr(result, (1<<scale1));
7475
7476  } // (int_cnt2 > 8)
7477
7478  // Scan string for start of substr in 16-byte vectors
7479  bind(SCAN_TO_SUBSTR);
7480  pcmpestri(vec, Address(result, 0), mode);
7481  jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
7482  subl(cnt1, stride);
7483  jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
7484  cmpl(cnt1, cnt2);
7485  jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
7486  addptr(result, 16);
7487  jmpb(SCAN_TO_SUBSTR);
7488
7489  // Found a potential substr
7490  bind(FOUND_CANDIDATE);
7491  // Matched whole vector if first element matched (tmp(rcx) == 0).
7492  if (int_cnt2 == stride) {
7493    jccb(Assembler::overflow, RET_FOUND);    // OF == 1
7494  } else { // int_cnt2 > 8
7495    jccb(Assembler::overflow, FOUND_SUBSTR);
7496  }
7497  // After pcmpestri tmp(rcx) contains matched element index
7498  // Compute start addr of substr
7499  lea(result, Address(result, tmp, scale1));
7500
7501  // Make sure string is still long enough
7502  subl(cnt1, tmp);
7503  cmpl(cnt1, cnt2);
7504  if (int_cnt2 == stride) {
7505    jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
7506  } else { // int_cnt2 > 8
7507    jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD);
7508  }
7509  // Left less than substring.
7510
7511  bind(RET_NOT_FOUND);
7512  movl(result, -1);
7513  jmpb(EXIT);
7514
7515  if (int_cnt2 > stride) {
7516    // This code is optimized for the case when whole substring
7517    // is matched if its head is matched.
7518    bind(MATCH_SUBSTR_HEAD);
7519    pcmpestri(vec, Address(result, 0), mode);
7520    // Reload only the string if it does not match
7521    jccb(Assembler::noOverflow, RELOAD_STR); // OF == 0
7522
7523    Label CONT_SCAN_SUBSTR;
7524    // Compare the rest of substring (> 8 chars).
7525    bind(FOUND_SUBSTR);
7526    // First 8 chars are already matched.
7527    negptr(cnt2);
7528    addptr(cnt2, stride);
7529
7530    bind(SCAN_SUBSTR);
7531    subl(cnt1, stride);
7532    cmpl(cnt2, -stride); // Do not read beyond substring
7533    jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR);
7534    // Back-up strings to avoid reading beyond substring:
7535    // cnt1 = cnt1 - cnt2 + 8
7536    addl(cnt1, cnt2); // cnt2 is negative
7537    addl(cnt1, stride);
7538    movl(cnt2, stride); negptr(cnt2);
7539    bind(CONT_SCAN_SUBSTR);
7540    if (int_cnt2 < (int)G) {
7541      int tail_off1 = int_cnt2<<scale1;
7542      int tail_off2 = int_cnt2<<scale2;
7543      if (ae == StrIntrinsicNode::UL) {
7544        pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2));
7545      } else {
7546        movdqu(vec, Address(str2, cnt2, scale2, tail_off2));
7547      }
7548      pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode);
7549    } else {
7550      // calculate index in register to avoid integer overflow (int_cnt2*2)
7551      movl(tmp, int_cnt2);
7552      addptr(tmp, cnt2);
7553      if (ae == StrIntrinsicNode::UL) {
7554        pmovzxbw(vec, Address(str2, tmp, scale2, 0));
7555      } else {
7556        movdqu(vec, Address(str2, tmp, scale2, 0));
7557      }
7558      pcmpestri(vec, Address(result, tmp, scale1, 0), mode);
7559    }
7560    // Need to reload string pointers if the whole vector did not match
7561    jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
7562    addptr(cnt2, stride);
7563    jcc(Assembler::negative, SCAN_SUBSTR);
7564    // Fall through if found full substring
7565
7566  } // (int_cnt2 > 8)
7567
7568  bind(RET_FOUND);
7569  // Found result if we matched full small substring.
7570  // Compute substr offset
7571  subptr(result, str1);
7572  if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
7573    shrl(result, 1); // index
7574  }
7575  bind(EXIT);
7576
7577} // string_indexofC8
7578
7579// Small strings are loaded through the stack if they cross a page boundary.
7580void MacroAssembler::string_indexof(Register str1, Register str2,
7581                                    Register cnt1, Register cnt2,
7582                                    int int_cnt2,  Register result,
7583                                    XMMRegister vec, Register tmp,
7584                                    int ae) {
7585  ShortBranchVerifier sbv(this);
7586  assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
7587  assert(UseSSE >= 4, "SSE4 must be enabled for SSE4.2 intrinsics to be available");
7588  assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
7589
7590  //
7591  // int_cnt2 is the length of a small (< 8 chars) constant substring
7592  // or (-1) for a non-constant substring, in which case its length
7593  // is in the cnt2 register.
7594  //
7595  // Note, inline_string_indexOf() generates checks:
7596  // if (substr.count > string.count) return -1;
7597  // if (substr.count == 0) return 0;
7598  //
7599  int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
7600  assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0");
7601  // This method uses the pcmpestri instruction with bound registers
7602  //   inputs:
7603  //     xmm - substring
7604  //     rax - substring length (elements count)
7605  //     mem - scanned string
7606  //     rdx - string length (elements count)
7607  //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
7608  //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
7609  //   outputs:
7610  //     rcx - matched index in string
7611  assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
7612  int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
7613  Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
7614  Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
7615
7616  Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR,
7617        RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR,
7618        FOUND_CANDIDATE;
7619
7620  { //========================================================
7621    // We don't know where these strings are located
7622    // and we can't read beyond them. Load them through stack.
7623    Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR;
7624
7625    movptr(tmp, rsp); // save old SP
7626
7627    if (int_cnt2 > 0) {     // small (< 8 chars) constant substring
7628      if (int_cnt2 == (1>>scale2)) { // One byte
7629        assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding");
7630        load_unsigned_byte(result, Address(str2, 0));
7631        movdl(vec, result); // move 32 bits
7632      } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) {  // Three bytes
7633        // Not enough header space in 32-bit VM: 12+3 = 15.
7634        movl(result, Address(str2, -1));
7635        shrl(result, 8);
7636        movdl(vec, result); // move 32 bits
7637      } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) {  // One char
7638        load_unsigned_short(result, Address(str2, 0));
7639        movdl(vec, result); // move 32 bits
7640      } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars
7641        movdl(vec, Address(str2, 0)); // move 32 bits
7642      } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars
7643        movq(vec, Address(str2, 0));  // move 64 bits
7644      } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 = {2, ..., 7})
7645        // Array header size is 12 bytes in 32-bit VM
7646        // + 6 bytes for 3 chars == 18 bytes,
7647        // enough space to load vec and shift.
7648        assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity");
7649        if (ae == StrIntrinsicNode::UL) {
7650          int tail_off = int_cnt2-8;
7651          pmovzxbw(vec, Address(str2, tail_off));
7652          psrldq(vec, -2*tail_off);
7653        }
7654        else {
7655          int tail_off = int_cnt2*(1<<scale2);
7656          movdqu(vec, Address(str2, tail_off-16));
7657          psrldq(vec, 16-tail_off);
7658        }
7659      }
7660    } else { // not constant substring
7661      cmpl(cnt2, stride);
7662      jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough
7663
7664      // We can read beyond the string if str2+16 does not cross a page boundary
7665      // since heaps are aligned and mapped by pages.
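      // The check below is equivalent to (sketch):
      //   (str2 & (page_size - 1)) <= page_size - 16
      // i.e. a 16-byte load starting at str2 cannot touch the next page.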
7666      assert(os::vm_page_size() < (int)G, "default page should be small");
7667      movl(result, str2); // We need only low 32 bits
7668      andl(result, (os::vm_page_size()-1));
7669      cmpl(result, (os::vm_page_size()-16));
7670      jccb(Assembler::belowEqual, CHECK_STR);
7671
7672      // Move small strings to the stack to allow loading 16 bytes into vec.
7673      subptr(rsp, 16);
7674      int stk_offset = wordSize-(1<<scale2);
7675      push(cnt2);
7676
7677      bind(COPY_SUBSTR);
7678      if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) {
7679        load_unsigned_byte(result, Address(str2, cnt2, scale2, -1));
7680        movb(Address(rsp, cnt2, scale2, stk_offset), result);
7681      } else if (ae == StrIntrinsicNode::UU) {
7682        load_unsigned_short(result, Address(str2, cnt2, scale2, -2));
7683        movw(Address(rsp, cnt2, scale2, stk_offset), result);
7684      }
7685      decrement(cnt2);
7686      jccb(Assembler::notZero, COPY_SUBSTR);
7687
7688      pop(cnt2);
7689      movptr(str2, rsp);  // New substring address
7690    } // non constant
7691
7692    bind(CHECK_STR);
7693    cmpl(cnt1, stride);
7694    jccb(Assembler::aboveEqual, BIG_STRINGS);
7695
7696    // Check cross page boundary.
7697    movl(result, str1); // We need only low 32 bits
7698    andl(result, (os::vm_page_size()-1));
7699    cmpl(result, (os::vm_page_size()-16));
7700    jccb(Assembler::belowEqual, BIG_STRINGS);
7701
7702    subptr(rsp, 16);
7703    int stk_offset = -(1<<scale1);
7704    if (int_cnt2 < 0) { // not constant
7705      push(cnt2);
7706      stk_offset += wordSize;
7707    }
7708    movl(cnt2, cnt1);
7709
7710    bind(COPY_STR);
7711    if (ae == StrIntrinsicNode::LL) {
7712      load_unsigned_byte(result, Address(str1, cnt2, scale1, -1));
7713      movb(Address(rsp, cnt2, scale1, stk_offset), result);
7714    } else {
7715      load_unsigned_short(result, Address(str1, cnt2, scale1, -2));
7716      movw(Address(rsp, cnt2, scale1, stk_offset), result);
7717    }
7718    decrement(cnt2);
7719    jccb(Assembler::notZero, COPY_STR);
7720
7721    if (int_cnt2 < 0) { // not constant
7722      pop(cnt2);
7723    }
7724    movptr(str1, rsp);  // New string address
7725
7726    bind(BIG_STRINGS);
7727    // Load substring.
7728    if (int_cnt2 < 0) { // -1
7729      if (ae == StrIntrinsicNode::UL) {
7730        pmovzxbw(vec, Address(str2, 0));
7731      } else {
7732        movdqu(vec, Address(str2, 0));
7733      }
7734      push(cnt2);       // substr count
7735      push(str2);       // substr addr
7736      push(str1);       // string addr
7737    } else {
7738      // Small (< 8 chars) constant substrings are loaded already.
7739      movl(cnt2, int_cnt2);
7740    }
7741    push(tmp);  // original SP
7742
7743  } // Finished loading
7744
7745  //========================================================
7746  // Start search
7747  //
7748
7749  movptr(result, str1); // string addr
7750
7751  if (int_cnt2  < 0) {  // Only for non constant substring
7752    jmpb(SCAN_TO_SUBSTR);
7753
7754    // SP saved at sp+0
7755    // String saved at sp+1*wordSize
7756    // Substr saved at sp+2*wordSize
7757    // Substr count saved at sp+3*wordSize
7758
7759    // Reload substr for rescan; this code
7760    // is executed only for large substrings (> 8 chars)
7761    bind(RELOAD_SUBSTR);
7762    movptr(str2, Address(rsp, 2*wordSize));
7763    movl(cnt2, Address(rsp, 3*wordSize));
7764    if (ae == StrIntrinsicNode::UL) {
7765      pmovzxbw(vec, Address(str2, 0));
7766    } else {
7767      movdqu(vec, Address(str2, 0));
7768    }
7769    // We came here after the beginning of the substring was
7770    // matched but the rest of it was not so we need to search
7771    // again. Start from the next element after the previous match.
7772    subptr(str1, result); // Restore counter
7773    if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
7774      shrl(str1, 1);
7775    }
7776    addl(cnt1, str1);
7777    decrementl(cnt1);   // Shift to next element
7778    cmpl(cnt1, cnt2);
7779    jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
7780
7781    addptr(result, (1<<scale1));
7782  } // non constant
7783
7784  // Scan string for start of substr in 16-byte vectors
7785  bind(SCAN_TO_SUBSTR);
7786  assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
7787  pcmpestri(vec, Address(result, 0), mode);
7788  jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
7789  subl(cnt1, stride);
7790  jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
7791  cmpl(cnt1, cnt2);
7792  jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
7793  addptr(result, 16);
7794
7795  bind(ADJUST_STR);
7796  cmpl(cnt1, stride); // Do not read beyond string
7797  jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
7798  // Back-up string to avoid reading beyond string.
7799  lea(result, Address(result, cnt1, scale1, -16));
7800  movl(cnt1, stride);
7801  jmpb(SCAN_TO_SUBSTR);
7802
7803  // Found a potential substr
7804  bind(FOUND_CANDIDATE);
7805  // After pcmpestri tmp(rcx) contains matched element index
7806
7807  // Make sure string is still long enough
7808  subl(cnt1, tmp);
7809  cmpl(cnt1, cnt2);
7810  jccb(Assembler::greaterEqual, FOUND_SUBSTR);
7811  // Left less than substring.
7812
7813  bind(RET_NOT_FOUND);
7814  movl(result, -1);
7815  jmpb(CLEANUP);
7816
7817  bind(FOUND_SUBSTR);
7818  // Compute start addr of substr
7819  lea(result, Address(result, tmp, scale1));
7820  if (int_cnt2 > 0) { // Constant substring
7821    // Repeat search for small substring (< 8 chars)
7822    // from new point without reloading substring.
7823    // Have to check that we don't read beyond string.
7824    cmpl(tmp, stride-int_cnt2);
7825    jccb(Assembler::greater, ADJUST_STR);
7826    // Fall through if matched whole substring.
7827  } else { // non constant
7828    assert(int_cnt2 == -1, "should be != 0");
7829
7830    addl(tmp, cnt2);
7831    // Found result if we matched whole substring.
7832    cmpl(tmp, stride);
7833    jccb(Assembler::lessEqual, RET_FOUND);
7834
7835    // Repeat search for small substring (<= 8 chars)
7836    // from new point 'str1' without reloading substring.
7837    cmpl(cnt2, stride);
7838    // Have to check that we don't read beyond string.
7839    jccb(Assembler::lessEqual, ADJUST_STR);
7840
7841    Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG;
7842    // Compare the rest of substring (> 8 chars).
7843    movptr(str1, result);
7844
7845    cmpl(tmp, cnt2);
7846    // First 8 chars are already matched.
7847    jccb(Assembler::equal, CHECK_NEXT);
7848
7849    bind(SCAN_SUBSTR);
7850    pcmpestri(vec, Address(str1, 0), mode);
7851    // Need to reload string pointers if the whole vector did not match
7852    jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
7853
7854    bind(CHECK_NEXT);
7855    subl(cnt2, stride);
7856    jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring
7857    addptr(str1, 16);
7858    if (ae == StrIntrinsicNode::UL) {
7859      addptr(str2, 8);
7860    } else {
7861      addptr(str2, 16);
7862    }
7863    subl(cnt1, stride);
7864    cmpl(cnt2, stride); // Do not read beyond substring
7865    jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR);
7866    // Back-up strings to avoid reading beyond substring.
7867
7868    if (ae == StrIntrinsicNode::UL) {
7869      lea(str2, Address(str2, cnt2, scale2, -8));
7870      lea(str1, Address(str1, cnt2, scale1, -16));
7871    } else {
7872      lea(str2, Address(str2, cnt2, scale2, -16));
7873      lea(str1, Address(str1, cnt2, scale1, -16));
7874    }
7875    subl(cnt1, cnt2);
7876    movl(cnt2, stride);
7877    addl(cnt1, stride);
7878    bind(CONT_SCAN_SUBSTR);
7879    if (ae == StrIntrinsicNode::UL) {
7880      pmovzxbw(vec, Address(str2, 0));
7881    } else {
7882      movdqu(vec, Address(str2, 0));
7883    }
7884    jmpb(SCAN_SUBSTR);
7885
7886    bind(RET_FOUND_LONG);
7887    movptr(str1, Address(rsp, wordSize));
7888  } // non constant
7889
7890  bind(RET_FOUND);
7891  // Compute substr offset
7892  subptr(result, str1);
7893  if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
7894    shrl(result, 1); // index
7895  }
7896  bind(CLEANUP);
7897  pop(rsp); // restore SP
7898
7899} // string_indexof
7900
7901void MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result,
7902                                         XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
7903  ShortBranchVerifier sbv(this);
7904  assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
7905  assert(UseSSE >= 4, "SSE4 must be enabled for SSE4.2 intrinsics to be available");
7906
7907  int stride = 8;
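  // Scalar equivalent of this stub (illustrative sketch; str1 is a UTF-16
  // char array, cnt1 its length in chars, ch the char searched for):
  //
  //   for (int i = 0; i < cnt1; i++) {
  //     if (str1[i] == ch) return i;
  //   }
  //   return -1;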
7908
7909  Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP,
7910        SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP,
7911        RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT,
7912        FOUND_SEQ_CHAR, DONE_LABEL;
7913
7914  movptr(result, str1);
7915  if (UseAVX >= 2) {
7916    cmpl(cnt1, stride);
7917    jccb(Assembler::less, SCAN_TO_CHAR_LOOP);
7918    cmpl(cnt1, 2*stride);
7919    jccb(Assembler::less, SCAN_TO_8_CHAR_INIT);
7920    movdl(vec1, ch);
7921    vpbroadcastw(vec1, vec1);
7922    vpxor(vec2, vec2);
7923    movl(tmp, cnt1);
7924    andl(tmp, 0xFFFFFFF0);  //vector count (in chars)
7925    andl(cnt1,0x0000000F);  //tail count (in chars)
7926
7927    bind(SCAN_TO_16_CHAR_LOOP);
7928    vmovdqu(vec3, Address(result, 0));
7929    vpcmpeqw(vec3, vec3, vec1, 1);
7930    vptest(vec2, vec3);
7931    jcc(Assembler::carryClear, FOUND_CHAR);
7932    addptr(result, 32);
7933    subl(tmp, 2*stride);
7934    jccb(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);
7935    jmp(SCAN_TO_8_CHAR);
7936    bind(SCAN_TO_8_CHAR_INIT);
7937    movdl(vec1, ch);
7938    pshuflw(vec1, vec1, 0x00);
7939    pshufd(vec1, vec1, 0);
7940    pxor(vec2, vec2);
7941  }
7942  bind(SCAN_TO_8_CHAR);
7943  cmpl(cnt1, stride);
7944  if (UseAVX >= 2) {
7945    jccb(Assembler::less, SCAN_TO_CHAR);
7946  } else {
7947    jccb(Assembler::less, SCAN_TO_CHAR_LOOP);
7948    movdl(vec1, ch);
7949    pshuflw(vec1, vec1, 0x00);
7950    pshufd(vec1, vec1, 0);
7951    pxor(vec2, vec2);
7952  }
7953  movl(tmp, cnt1);
7954  andl(tmp, 0xFFFFFFF8);  //vector count (in chars)
7955  andl(cnt1,0x00000007);  //tail count (in chars)
7956
7957  bind(SCAN_TO_8_CHAR_LOOP);
7958  movdqu(vec3, Address(result, 0));
7959  pcmpeqw(vec3, vec1);
7960  ptest(vec2, vec3);
7961  jcc(Assembler::carryClear, FOUND_CHAR);
7962  addptr(result, 16);
7963  subl(tmp, stride);
7964  jccb(Assembler::notZero, SCAN_TO_8_CHAR_LOOP);
7965  bind(SCAN_TO_CHAR);
7966  testl(cnt1, cnt1);
7967  jcc(Assembler::zero, RET_NOT_FOUND);
7968  bind(SCAN_TO_CHAR_LOOP);
7969  load_unsigned_short(tmp, Address(result, 0));
7970  cmpl(ch, tmp);
7971  jccb(Assembler::equal, FOUND_SEQ_CHAR);
7972  addptr(result, 2);
7973  subl(cnt1, 1);
7974  jccb(Assembler::zero, RET_NOT_FOUND);
7975  jmp(SCAN_TO_CHAR_LOOP);
7976
7977  bind(RET_NOT_FOUND);
7978  movl(result, -1);
7979  jmpb(DONE_LABEL);
7980
7981  bind(FOUND_CHAR);
7982  if (UseAVX >= 2) {
7983    vpmovmskb(tmp, vec3);
7984  } else {
7985    pmovmskb(tmp, vec3);
7986  }
7987  bsfl(ch, tmp);
7988  addl(result, ch);
7989
7990  bind(FOUND_SEQ_CHAR);
7991  subptr(result, str1);
7992  shrl(result, 1);
7993
7994  bind(DONE_LABEL);
7995} // string_indexof_char
7996
7997// helper function for string_compare
7998void MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2,
7999                                        Address::ScaleFactor scale, Address::ScaleFactor scale1,
8000                                        Address::ScaleFactor scale2, Register index, int ae) {
8001  if (ae == StrIntrinsicNode::LL) {
8002    load_unsigned_byte(elem1, Address(str1, index, scale, 0));
8003    load_unsigned_byte(elem2, Address(str2, index, scale, 0));
8004  } else if (ae == StrIntrinsicNode::UU) {
8005    load_unsigned_short(elem1, Address(str1, index, scale, 0));
8006    load_unsigned_short(elem2, Address(str2, index, scale, 0));
8007  } else {
8008    load_unsigned_byte(elem1, Address(str1, index, scale1, 0));
8009    load_unsigned_short(elem2, Address(str2, index, scale2, 0));
8010  }
8011}
8012
8013// Compare strings, used for char[] and byte[].
8014void MacroAssembler::string_compare(Register str1, Register str2,
8015                                    Register cnt1, Register cnt2, Register result,
8016                                    XMMRegister vec1, int ae) {
8017  ShortBranchVerifier sbv(this);
8018  Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL;
8019  int stride, stride2, adr_stride, adr_stride1, adr_stride2;
8020  Address::ScaleFactor scale, scale1, scale2;
8021
8022  if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) {
8023    shrl(cnt2, 1);
8024  }
8025  // Compute the minimum of the string lengths and push the
8026  // difference of the string lengths onto the stack.
8027  // Select the minimum length with a conditional move.
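  // The overall result follows the usual compareTo contract (illustrative
  // sketch, ignoring the per-encoding element widths; len1 and len2 denote
  // the two string lengths):
  //
  //   for (int i = 0; i < min(len1, len2); i++) {
  //     if (str1[i] != str2[i]) return str1[i] - str2[i];
  //   }
  //   return len1 - len2;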
8028  movl(result, cnt1);
8029  subl(cnt1, cnt2);
8030  push(cnt1);
8031  cmov32(Assembler::lessEqual, cnt2, result);
8032
8033  // Is the minimum length zero?
8034  testl(cnt2, cnt2);
8035  jcc(Assembler::zero, LENGTH_DIFF_LABEL);
8036  if (ae == StrIntrinsicNode::LL) {
8037    // Load first bytes
8038    load_unsigned_byte(result, Address(str1, 0));
8039    load_unsigned_byte(cnt1, Address(str2, 0));
8040  } else if (ae == StrIntrinsicNode::UU) {
8041    // Load first characters
8042    load_unsigned_short(result, Address(str1, 0));
8043    load_unsigned_short(cnt1, Address(str2, 0));
8044  } else {
8045    load_unsigned_byte(result, Address(str1, 0));
8046    load_unsigned_short(cnt1, Address(str2, 0));
8047  }
8048  subl(result, cnt1);
8049  jcc(Assembler::notZero,  POP_LABEL);
8050
8051  if (ae == StrIntrinsicNode::UU) {
8052    // Divide length by 2 to get number of chars
8053    shrl(cnt2, 1);
8054  }
8055  cmpl(cnt2, 1);
8056  jcc(Assembler::equal, LENGTH_DIFF_LABEL);
8057
8058  // Check if the strings start at the same location and set up scale and stride
8059  if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
8060    cmpptr(str1, str2);
8061    jcc(Assembler::equal, LENGTH_DIFF_LABEL);
8062    if (ae == StrIntrinsicNode::LL) {
8063      scale = Address::times_1;
8064      stride = 16;
8065    } else {
8066      scale = Address::times_2;
8067      stride = 8;
8068    }
8069  } else {
8070    scale = Address::no_scale;  // not used
8071    scale1 = Address::times_1;
8072    scale2 = Address::times_2;
8073    stride = 8;
8074  }
8075
8076  if (UseAVX >= 2 && UseSSE42Intrinsics) {
8077    assert(UseSSE >= 4, "SSE4 must be enabled for SSE4.2 intrinsics to be available");
8078    Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR;
8079    Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR;
8080    Label COMPARE_TAIL_LONG;
8081    int pcmpmask = 0x19;
8082    if (ae == StrIntrinsicNode::LL) {
8083      pcmpmask &= ~0x01;
8084    }
8085
8086    // Set up to compare 16-char (32-byte) vectors,
8087    // starting from the first character again because it has an aligned address.
8088    if (ae == StrIntrinsicNode::LL) {
8089      stride2 = 32;
8090    } else {
8091      stride2 = 16;
8092    }
8093    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
8094      adr_stride = stride << scale;
8095    } else {
8096      adr_stride1 = 8;  //stride << scale1;
8097      adr_stride2 = 16; //stride << scale2;
8098    }
8099
8100    assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
8101    // rax and rdx are used by pcmpestri as element counters
8102    movl(result, cnt2);
8103    andl(cnt2, ~(stride2-1));   // cnt2 holds the vector count
8104    jcc(Assembler::zero, COMPARE_TAIL_LONG);
8105
8106    // fast path : compare first 2 8-char vectors.
8107    bind(COMPARE_16_CHARS);
8108    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
8109      movdqu(vec1, Address(str1, 0));
8110    } else {
8111      pmovzxbw(vec1, Address(str1, 0));
8112    }
8113    pcmpestri(vec1, Address(str2, 0), pcmpmask);
8114    jccb(Assembler::below, COMPARE_INDEX_CHAR);
8115
8116    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
8117      movdqu(vec1, Address(str1, adr_stride));
8118      pcmpestri(vec1, Address(str2, adr_stride), pcmpmask);
8119    } else {
8120      pmovzxbw(vec1, Address(str1, adr_stride1));
8121      pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask);
8122    }
8123    jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS);
8124    addl(cnt1, stride);
8125
8126    // Compare the characters at index in cnt1
8127    bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character
8128    load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
8129    subl(result, cnt2);
8130    jmp(POP_LABEL);
8131
8132    // Setup the registers to start vector comparison loop
8133    bind(COMPARE_WIDE_VECTORS);
8134    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
8135      lea(str1, Address(str1, result, scale));
8136      lea(str2, Address(str2, result, scale));
8137    } else {
8138      lea(str1, Address(str1, result, scale1));
8139      lea(str2, Address(str2, result, scale2));
8140    }
8141    subl(result, stride2);
8142    subl(cnt2, stride2);
8143    jccb(Assembler::zero, COMPARE_WIDE_TAIL);
8144    negptr(result);
8145
8146    // In a loop, compare 16 chars (32 bytes) at a time using (vpxor+vptest)
8147    bind(COMPARE_WIDE_VECTORS_LOOP);
8148    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
8149      vmovdqu(vec1, Address(str1, result, scale));
8150      vpxor(vec1, Address(str2, result, scale));
8151    } else {
8152      vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit);
8153      vpxor(vec1, Address(str2, result, scale2));
8154    }
8155    vptest(vec1, vec1);
8156    jccb(Assembler::notZero, VECTOR_NOT_EQUAL);
8157    addptr(result, stride2);
8158    subl(cnt2, stride2);
8159    jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP);
8160    // clean upper bits of YMM registers
8161    vpxor(vec1, vec1);
8162
8163    // compare wide vectors tail
8164    bind(COMPARE_WIDE_TAIL);
8165    testptr(result, result);
8166    jccb(Assembler::zero, LENGTH_DIFF_LABEL);
8167
8168    movl(result, stride2);
8169    movl(cnt2, result);
8170    negptr(result);
8171    jmpb(COMPARE_WIDE_VECTORS_LOOP);
8172
8173    // Identify the mismatching (higher or lower) 16 bytes in the 32-byte vectors.
8174    bind(VECTOR_NOT_EQUAL);
8175    // clean upper bits of YMM registers
8176    vpxor(vec1, vec1);
8177    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
8178      lea(str1, Address(str1, result, scale));
8179      lea(str2, Address(str2, result, scale));
8180    } else {
8181      lea(str1, Address(str1, result, scale1));
8182      lea(str2, Address(str2, result, scale2));
8183    }
8184    jmp(COMPARE_16_CHARS);
8185
8186    // Compare tail chars, length between 1 and 15 chars
8187    bind(COMPARE_TAIL_LONG);
8188    movl(cnt2, result);
8189    cmpl(cnt2, stride);
8190    jccb(Assembler::less, COMPARE_SMALL_STR);
8191
8192    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
8193      movdqu(vec1, Address(str1, 0));
8194    } else {
8195      pmovzxbw(vec1, Address(str1, 0));
8196    }
8197    pcmpestri(vec1, Address(str2, 0), pcmpmask);
8198    jcc(Assembler::below, COMPARE_INDEX_CHAR);
8199    subptr(cnt2, stride);
8200    jccb(Assembler::zero, LENGTH_DIFF_LABEL);
8201    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
8202      lea(str1, Address(str1, result, scale));
8203      lea(str2, Address(str2, result, scale));
8204    } else {
8205      lea(str1, Address(str1, result, scale1));
8206      lea(str2, Address(str2, result, scale2));
8207    }
8208    negptr(cnt2);
8209    jmpb(WHILE_HEAD_LABEL);
8210
8211    bind(COMPARE_SMALL_STR);
8212  } else if (UseSSE42Intrinsics) {
8213    assert(UseSSE >= 4, "SSE4 must be enabled for SSE4.2 intrinsics to be available");
8214    Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
8215    int pcmpmask = 0x19;
8216    // Set up to compare 8-char (16-byte) vectors,
8217    // starting from the first character again because it has an aligned address.
8218    movl(result, cnt2);
8219    andl(cnt2, ~(stride - 1));   // cnt2 holds the vector count
8220    if (ae == StrIntrinsicNode::LL) {
8221      pcmpmask &= ~0x01;
8222    }
8223    jccb(Assembler::zero, COMPARE_TAIL);
8224    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
8225      lea(str1, Address(str1, result, scale));
8226      lea(str2, Address(str2, result, scale));
8227    } else {
8228      lea(str1, Address(str1, result, scale1));
8229      lea(str2, Address(str2, result, scale2));
8230    }
8231    negptr(result);
8232
8233    // pcmpestri
8234    //   inputs:
8235    //     vec1- substring
8236    //     rax - negative string length (elements count)
8237    //     mem - scanned string
8238    //     rdx - string length (elements count)
8239    //     pcmpmask - cmp mode: 11000 (string compare with negated result)
8240    //               + 00 (unsigned bytes) or  + 01 (unsigned shorts)
8241    //   outputs:
8242    //     rcx - first mismatched element index
8243    assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
8244
8245    bind(COMPARE_WIDE_VECTORS);
8246    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
8247      movdqu(vec1, Address(str1, result, scale));
8248      pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
8249    } else {
8250      pmovzxbw(vec1, Address(str1, result, scale1));
8251      pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
8252    }
8253    // After pcmpestri cnt1(rcx) contains mismatched element index
8254
8255    jccb(Assembler::below, VECTOR_NOT_EQUAL);  // CF==1
8256    addptr(result, stride);
8257    subptr(cnt2, stride);
8258    jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
8259
8260    // compare wide vectors tail
8261    testptr(result, result);
8262    jccb(Assembler::zero, LENGTH_DIFF_LABEL);
8263
8264    movl(cnt2, stride);
8265    movl(result, stride);
8266    negptr(result);
8267    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
8268      movdqu(vec1, Address(str1, result, scale));
8269      pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
8270    } else {
8271      pmovzxbw(vec1, Address(str1, result, scale1));
8272      pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
8273    }
8274    jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL);
8275
8276    // Mismatched characters in the vectors
8277    bind(VECTOR_NOT_EQUAL);
8278    addptr(cnt1, result);
8279    load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
8280    subl(result, cnt2);
8281    jmpb(POP_LABEL);
8282
8283    bind(COMPARE_TAIL); // limit is zero
8284    movl(cnt2, result);
8285    // Fallthru to tail compare
8286  }
8287  // Shift str2 and str1 to the end of the arrays, negate min
8288  if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
8289    lea(str1, Address(str1, cnt2, scale));
8290    lea(str2, Address(str2, cnt2, scale));
8291  } else {
8292    lea(str1, Address(str1, cnt2, scale1));
8293    lea(str2, Address(str2, cnt2, scale2));
8294  }
8295  decrementl(cnt2);  // first character was compared already
8296  negptr(cnt2);
8297
8298  // Compare the rest of the elements
8299  bind(WHILE_HEAD_LABEL);
8300  load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae);
8301  subl(result, cnt1);
8302  jccb(Assembler::notZero, POP_LABEL);
8303  increment(cnt2);
8304  jccb(Assembler::notZero, WHILE_HEAD_LABEL);
8305
8306  // Strings are equal up to min length.  Return the length difference.
8307  bind(LENGTH_DIFF_LABEL);
8308  pop(result);
8309  if (ae == StrIntrinsicNode::UU) {
8310    // Divide diff by 2 to get number of chars
8311    sarl(result, 1);
8312  }
8313  jmpb(DONE_LABEL);
8314
8315  // Discard the stored length difference
8316  bind(POP_LABEL);
8317  pop(cnt1);
8318
8319  // That's it
8320  bind(DONE_LABEL);
8321  if(ae == StrIntrinsicNode::UL) {
8322    negl(result);
8323  }
8324}
8325
8326// Search for a non-ASCII character (negative byte value) in a byte array;
8327// return true if one is found and false otherwise.
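// Scalar equivalent (illustrative sketch):
//   for (int i = 0; i < len; i++) {
//     if ((ary1[i] & 0x80) != 0) return true;
//   }
//   return false;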
8328void MacroAssembler::has_negatives(Register ary1, Register len,
8329                                   Register result, Register tmp1,
8330                                   XMMRegister vec1, XMMRegister vec2) {
8331
8332  // rsi: byte array
8333  // rcx: len
8334  // rax: result
8335  ShortBranchVerifier sbv(this);
8336  assert_different_registers(ary1, len, result, tmp1);
8337  assert_different_registers(vec1, vec2);
8338  Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE;
8339
8340  // len == 0
8341  testl(len, len);
8342  jcc(Assembler::zero, FALSE_LABEL);
8343
8344  movl(result, len); // copy
8345
8346  if (UseAVX >= 2 && UseSSE >= 2) {
8347    // With AVX2, use 32-byte vector compare
8348    Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
8349
8350    // Compare 32-byte vectors
8351    andl(result, 0x0000001f);  //   tail count (in bytes)
8352    andl(len, 0xffffffe0);   // vector count (in bytes)
8353    jccb(Assembler::zero, COMPARE_TAIL);
8354
8355    lea(ary1, Address(ary1, len, Address::times_1));
8356    negptr(len);
8357
8358    movl(tmp1, 0x80808080);   // create mask to test for negative bytes in the vector
8359    movdl(vec2, tmp1);
8360    vpbroadcastd(vec2, vec2);
8361
8362    bind(COMPARE_WIDE_VECTORS);
8363    vmovdqu(vec1, Address(ary1, len, Address::times_1));
8364    vptest(vec1, vec2);
8365    jccb(Assembler::notZero, TRUE_LABEL);
8366    addptr(len, 32);
8367    jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
8368
8369    testl(result, result);
8370    jccb(Assembler::zero, FALSE_LABEL);
8371
8372    vmovdqu(vec1, Address(ary1, result, Address::times_1, -32));
8373    vptest(vec1, vec2);
8374    jccb(Assembler::notZero, TRUE_LABEL);
8375    jmpb(FALSE_LABEL);
8376
8377    bind(COMPARE_TAIL); // len is zero
8378    movl(len, result);
8379    // Fallthru to tail compare
8380  } else if (UseSSE42Intrinsics) {
8381    assert(UseSSE >= 4, "SSE4 must be  for SSE4.2 intrinsics to be available");
8382    // With SSE4.2, use double quad vector compare
8383    Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
8384
8385    // Compare 16-byte vectors
8386    andl(result, 0x0000000f);  //   tail count (in bytes)
8387    andl(len, 0xfffffff0);   // vector count (in bytes)
8388    jccb(Assembler::zero, COMPARE_TAIL);
8389
8390    lea(ary1, Address(ary1, len, Address::times_1));
8391    negptr(len);
8392
8393    movl(tmp1, 0x80808080);
8394    movdl(vec2, tmp1);
8395    pshufd(vec2, vec2, 0);
8396
8397    bind(COMPARE_WIDE_VECTORS);
8398    movdqu(vec1, Address(ary1, len, Address::times_1));
8399    ptest(vec1, vec2);
8400    jccb(Assembler::notZero, TRUE_LABEL);
8401    addptr(len, 16);
8402    jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
8403
8404    testl(result, result);
8405    jccb(Assembler::zero, FALSE_LABEL);
8406
8407    movdqu(vec1, Address(ary1, result, Address::times_1, -16));
8408    ptest(vec1, vec2);
8409    jccb(Assembler::notZero, TRUE_LABEL);
8410    jmpb(FALSE_LABEL);
8411
8412    bind(COMPARE_TAIL); // len is zero
8413    movl(len, result);
8414    // Fallthru to tail compare
8415  }
8416
8417  // Compare 4-byte vectors
8418  andl(len, 0xfffffffc); // vector count (in bytes)
8419  jccb(Assembler::zero, COMPARE_CHAR);
8420
8421  lea(ary1, Address(ary1, len, Address::times_1));
8422  negptr(len);
8423
8424  bind(COMPARE_VECTORS);
8425  movl(tmp1, Address(ary1, len, Address::times_1));
8426  andl(tmp1, 0x80808080);
8427  jccb(Assembler::notZero, TRUE_LABEL);
8428  addptr(len, 4);
8429  jcc(Assembler::notZero, COMPARE_VECTORS);
8430
8431  // Compare trailing char (final 2 bytes), if any
8432  bind(COMPARE_CHAR);
8433  testl(result, 0x2);   // tail  char
8434  jccb(Assembler::zero, COMPARE_BYTE);
8435  load_unsigned_short(tmp1, Address(ary1, 0));
8436  andl(tmp1, 0x00008080);
8437  jccb(Assembler::notZero, TRUE_LABEL);
8438  subptr(result, 2);
8439  lea(ary1, Address(ary1, 2));
8440
8441  bind(COMPARE_BYTE);
8442  testl(result, 0x1);   // tail  byte
8443  jccb(Assembler::zero, FALSE_LABEL);
8444  load_unsigned_byte(tmp1, Address(ary1, 0));
8445  andl(tmp1, 0x00000080);
8446  jccb(Assembler::notEqual, TRUE_LABEL);
8447  jmpb(FALSE_LABEL);
8448
8449  bind(TRUE_LABEL);
8450  movl(result, 1);   // return true
8451  jmpb(DONE);
8452
8453  bind(FALSE_LABEL);
8454  xorl(result, result); // return false
8455
8456  // That's it
8457  bind(DONE);
8458  if (UseAVX >= 2 && UseSSE >= 2) {
8459    // clean upper bits of YMM registers
8460    vpxor(vec1, vec1);
8461    vpxor(vec2, vec2);
8462  }
8463}
8464
8465// Compare char[] or byte[] arrays, or substrings of them, aligned to 4 bytes.
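// Scalar equivalent for the whole-array case (illustrative sketch; for the
// substring case the caller supplies the base addresses and the limit):
//   if (ary1 == ary2) return true;
//   if (ary1 == NULL || ary2 == NULL) return false;
//   if (ary1.length != ary2.length) return false;
//   for (int i = 0; i < ary1.length; i++) {
//     if (ary1[i] != ary2[i]) return false;
//   }
//   return true;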
8466void MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2,
8467                                   Register limit, Register result, Register chr,
8468                                   XMMRegister vec1, XMMRegister vec2, bool is_char) {
8469  ShortBranchVerifier sbv(this);
8470  Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE;
8471
8472  int length_offset  = arrayOopDesc::length_offset_in_bytes();
8473  int base_offset    = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE);
8474
8475  if (is_array_equ) {
8476    // Check the input args
8477    cmpptr(ary1, ary2);
8478    jcc(Assembler::equal, TRUE_LABEL);
8479
8480    // Need additional checks for arrays_equals.
8481    testptr(ary1, ary1);
8482    jcc(Assembler::zero, FALSE_LABEL);
8483    testptr(ary2, ary2);
8484    jcc(Assembler::zero, FALSE_LABEL);
8485
8486    // Check the lengths
8487    movl(limit, Address(ary1, length_offset));
8488    cmpl(limit, Address(ary2, length_offset));
8489    jcc(Assembler::notEqual, FALSE_LABEL);
8490  }
8491
8492  // count == 0
8493  testl(limit, limit);
8494  jcc(Assembler::zero, TRUE_LABEL);
8495
8496  if (is_array_equ) {
8497    // Load array address
8498    lea(ary1, Address(ary1, base_offset));
8499    lea(ary2, Address(ary2, base_offset));
8500  }
8501
8502  if (is_array_equ && is_char) {
8503    // arrays_equals when used for char[].
8504    shll(limit, 1);      // byte count != 0
8505  }
8506  movl(result, limit); // copy
8507
8508  if (UseAVX >= 2) {
8509    // With AVX2, use 32-byte vector compare
8510    Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
8511
8512    // Compare 32-byte vectors
8513    andl(result, 0x0000001f);  //   tail count (in bytes)
8514    andl(limit, 0xffffffe0);   // vector count (in bytes)
8515    jccb(Assembler::zero, COMPARE_TAIL);
8516
8517    lea(ary1, Address(ary1, limit, Address::times_1));
8518    lea(ary2, Address(ary2, limit, Address::times_1));
8519    negptr(limit);
8520
8521    bind(COMPARE_WIDE_VECTORS);
8522    vmovdqu(vec1, Address(ary1, limit, Address::times_1));
8523    vmovdqu(vec2, Address(ary2, limit, Address::times_1));
8524    vpxor(vec1, vec2);
8525
8526    vptest(vec1, vec1);
8527    jccb(Assembler::notZero, FALSE_LABEL);
8528    addptr(limit, 32);
8529    jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
8530
8531    testl(result, result);
8532    jccb(Assembler::zero, TRUE_LABEL);
8533
8534    vmovdqu(vec1, Address(ary1, result, Address::times_1, -32));
8535    vmovdqu(vec2, Address(ary2, result, Address::times_1, -32));
8536    vpxor(vec1, vec2);
8537
8538    vptest(vec1, vec1);
8539    jccb(Assembler::notZero, FALSE_LABEL);
8540    jmpb(TRUE_LABEL);
8541
8542    bind(COMPARE_TAIL); // limit is zero
8543    movl(limit, result);
8544    // Fallthru to tail compare
8545  } else if (UseSSE42Intrinsics) {
8546    assert(UseSSE >= 4, "SSE4 must be enabled for SSE4.2 intrinsics to be available");
8547    // With SSE4.2, use double quad vector compare
8548    Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
8549
8550    // Compare 16-byte vectors
8551    andl(result, 0x0000000f);  //   tail count (in bytes)
8552    andl(limit, 0xfffffff0);   // vector count (in bytes)
8553    jccb(Assembler::zero, COMPARE_TAIL);
8554
8555    lea(ary1, Address(ary1, limit, Address::times_1));
8556    lea(ary2, Address(ary2, limit, Address::times_1));
8557    negptr(limit);
8558
8559    bind(COMPARE_WIDE_VECTORS);
8560    movdqu(vec1, Address(ary1, limit, Address::times_1));
8561    movdqu(vec2, Address(ary2, limit, Address::times_1));
8562    pxor(vec1, vec2);
8563
8564    ptest(vec1, vec1);
8565    jccb(Assembler::notZero, FALSE_LABEL);
8566    addptr(limit, 16);
8567    jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
8568
8569    testl(result, result);
8570    jccb(Assembler::zero, TRUE_LABEL);
8571
8572    movdqu(vec1, Address(ary1, result, Address::times_1, -16));
8573    movdqu(vec2, Address(ary2, result, Address::times_1, -16));
8574    pxor(vec1, vec2);
8575
8576    ptest(vec1, vec1);
8577    jccb(Assembler::notZero, FALSE_LABEL);
8578    jmpb(TRUE_LABEL);
8579
8580    bind(COMPARE_TAIL); // limit is zero
8581    movl(limit, result);
8582    // Fallthru to tail compare
8583  }
8584
8585  // Compare 4-byte vectors
8586  andl(limit, 0xfffffffc); // vector count (in bytes)
8587  jccb(Assembler::zero, COMPARE_CHAR);
8588
8589  lea(ary1, Address(ary1, limit, Address::times_1));
8590  lea(ary2, Address(ary2, limit, Address::times_1));
8591  negptr(limit);
8592
8593  bind(COMPARE_VECTORS);
8594  movl(chr, Address(ary1, limit, Address::times_1));
8595  cmpl(chr, Address(ary2, limit, Address::times_1));
8596  jccb(Assembler::notEqual, FALSE_LABEL);
8597  addptr(limit, 4);
8598  jcc(Assembler::notZero, COMPARE_VECTORS);
8599
8600  // Compare trailing char (final 2 bytes), if any
8601  bind(COMPARE_CHAR);
8602  testl(result, 0x2);   // tail  char
8603  jccb(Assembler::zero, COMPARE_BYTE);
8604  load_unsigned_short(chr, Address(ary1, 0));
8605  load_unsigned_short(limit, Address(ary2, 0));
8606  cmpl(chr, limit);
8607  jccb(Assembler::notEqual, FALSE_LABEL);
8608
8609  if (is_array_equ && is_char) {
8610    bind(COMPARE_BYTE);
8611  } else {
8612    lea(ary1, Address(ary1, 2));
8613    lea(ary2, Address(ary2, 2));
8614
8615    bind(COMPARE_BYTE);
8616    testl(result, 0x1);   // tail  byte
8617    jccb(Assembler::zero, TRUE_LABEL);
8618    load_unsigned_byte(chr, Address(ary1, 0));
8619    load_unsigned_byte(limit, Address(ary2, 0));
8620    cmpl(chr, limit);
8621    jccb(Assembler::notEqual, FALSE_LABEL);
8622  }
8623  bind(TRUE_LABEL);
8624  movl(result, 1);   // return true
8625  jmpb(DONE);
8626
8627  bind(FALSE_LABEL);
8628  xorl(result, result); // return false
8629
8630  // That's it
8631  bind(DONE);
8632  if (UseAVX >= 2) {
8633    // clean upper bits of YMM registers
8634    vpxor(vec1, vec1);
8635    vpxor(vec2, vec2);
8636  }
8637}
8638
8639#endif
8640
8641void MacroAssembler::generate_fill(BasicType t, bool aligned,
8642                                   Register to, Register value, Register count,
8643                                   Register rtmp, XMMRegister xtmp) {
8644  ShortBranchVerifier sbv(this);
8645  assert_different_registers(to, value, count, rtmp);
8646  Label L_exit, L_skip_align1, L_skip_align2, L_fill_byte;
8647  Label L_fill_2_bytes, L_fill_4_bytes;
8648
8649  int shift = -1;
8650  switch (t) {
8651    case T_BYTE:
8652      shift = 2;
8653      break;
8654    case T_SHORT:
8655      shift = 1;
8656      break;
8657    case T_INT:
8658      shift = 0;
8659      break;
8660    default: ShouldNotReachHere();
8661  }
8662
8663  if (t == T_BYTE) {
8664    andl(value, 0xff);
8665    movl(rtmp, value);
8666    shll(rtmp, 8);
8667    orl(value, rtmp);
8668  }
8669  if (t == T_SHORT) {
8670    andl(value, 0xffff);
8671  }
8672  if (t == T_BYTE || t == T_SHORT) {
8673    movl(rtmp, value);
8674    shll(rtmp, 16);
8675    orl(value, rtmp);
8676  }
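  // At this point 'value' holds the fill pattern replicated across 32 bits:
  // e.g. a T_BYTE value of 0x41 becomes 0x41414141 and a T_SHORT value of
  // 0x1234 becomes 0x12341234 (equivalent to multiplying by 0x01010101 or
  // 0x00010001 respectively).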
8677
8678  cmpl(count, 2<<shift); // Short arrays (< 8 bytes) fill by element
8679  jcc(Assembler::below, L_fill_4_bytes); // use unsigned cmp
8680  if (!UseUnalignedLoadStores && !aligned && (t == T_BYTE || t == T_SHORT)) {
8681    // align the fill destination address to a 4-byte boundary
8682    if (t == T_BYTE) {
8683      // One byte misalignment happens only for byte arrays
8684      testptr(to, 1);
8685      jccb(Assembler::zero, L_skip_align1);
8686      movb(Address(to, 0), value);
8687      increment(to);
8688      decrement(count);
8689      BIND(L_skip_align1);
8690    }
8691    // Two bytes misalignment happens only for byte and short (char) arrays
8692    testptr(to, 2);
8693    jccb(Assembler::zero, L_skip_align2);
8694    movw(Address(to, 0), value);
8695    addptr(to, 2);
8696    subl(count, 1<<(shift-1));
8697    BIND(L_skip_align2);
8698  }
8699  if (UseSSE < 2) {
8700    Label L_fill_32_bytes_loop, L_check_fill_8_bytes, L_fill_8_bytes_loop, L_fill_8_bytes;
8701    // Fill 32-byte chunks
8702    subl(count, 8 << shift);
8703    jcc(Assembler::less, L_check_fill_8_bytes);
8704    align(16);
8705
8706    BIND(L_fill_32_bytes_loop);
8707
8708    for (int i = 0; i < 32; i += 4) {
8709      movl(Address(to, i), value);
8710    }
8711
8712    addptr(to, 32);
8713    subl(count, 8 << shift);
8714    jcc(Assembler::greaterEqual, L_fill_32_bytes_loop);
8715    BIND(L_check_fill_8_bytes);
8716    addl(count, 8 << shift);
8717    jccb(Assembler::zero, L_exit);
8718    jmpb(L_fill_8_bytes);
8719
8720    //
8721    // length is too short, just fill qwords
8722    //
8723    BIND(L_fill_8_bytes_loop);
8724    movl(Address(to, 0), value);
8725    movl(Address(to, 4), value);
8726    addptr(to, 8);
8727    BIND(L_fill_8_bytes);
8728    subl(count, 1 << (shift + 1));
8729    jcc(Assembler::greaterEqual, L_fill_8_bytes_loop);
8730    // fall through to fill 4 bytes
8731  } else {
8732    Label L_fill_32_bytes;
8733    if (!UseUnalignedLoadStores) {
8734      // align to 8 bytes; we know we are 4-byte aligned to start
8735      testptr(to, 4);
8736      jccb(Assembler::zero, L_fill_32_bytes);
8737      movl(Address(to, 0), value);
8738      addptr(to, 4);
8739      subl(count, 1<<shift);
8740    }
8741    BIND(L_fill_32_bytes);
8742    {
8743      assert( UseSSE >= 2, "supported cpu only" );
8744      Label L_fill_32_bytes_loop, L_check_fill_8_bytes, L_fill_8_bytes_loop, L_fill_8_bytes;
8745      if (UseAVX > 2) {
8746        movl(rtmp, 0xffff);
8747        kmovwl(k1, rtmp);
8748      }
8749      movdl(xtmp, value);
8750      if (UseAVX > 2 && UseUnalignedLoadStores) {
8751        // Fill 64-byte chunks
8752        Label L_fill_64_bytes_loop, L_check_fill_32_bytes;
8753        evpbroadcastd(xtmp, xtmp, Assembler::AVX_512bit);
8754
8755        subl(count, 16 << shift);
8756        jcc(Assembler::less, L_check_fill_32_bytes);
8757        align(16);
8758
8759        BIND(L_fill_64_bytes_loop);
8760        evmovdqul(Address(to, 0), xtmp, Assembler::AVX_512bit);
8761        addptr(to, 64);
8762        subl(count, 16 << shift);
8763        jcc(Assembler::greaterEqual, L_fill_64_bytes_loop);
8764
8765        BIND(L_check_fill_32_bytes);
8766        addl(count, 8 << shift);
8767        jccb(Assembler::less, L_check_fill_8_bytes);
8768        vmovdqu(Address(to, 0), xtmp);
8769        addptr(to, 32);
8770        subl(count, 8 << shift);
8771
8772        BIND(L_check_fill_8_bytes);
8773      } else if (UseAVX == 2 && UseUnalignedLoadStores) {
8774        // Fill 64-byte chunks
8775        Label L_fill_64_bytes_loop, L_check_fill_32_bytes;
8776        vpbroadcastd(xtmp, xtmp);
8777
8778        subl(count, 16 << shift);
8779        jcc(Assembler::less, L_check_fill_32_bytes);
8780        align(16);
8781
8782        BIND(L_fill_64_bytes_loop);
8783        vmovdqu(Address(to, 0), xtmp);
8784        vmovdqu(Address(to, 32), xtmp);
8785        addptr(to, 64);
8786        subl(count, 16 << shift);
8787        jcc(Assembler::greaterEqual, L_fill_64_bytes_loop);
8788
8789        BIND(L_check_fill_32_bytes);
8790        addl(count, 8 << shift);
8791        jccb(Assembler::less, L_check_fill_8_bytes);
8792        vmovdqu(Address(to, 0), xtmp);
8793        addptr(to, 32);
8794        subl(count, 8 << shift);
8795
8796        BIND(L_check_fill_8_bytes);
8797        // clean upper bits of YMM registers
8798        movdl(xtmp, value);
8799        pshufd(xtmp, xtmp, 0);
8800      } else {
8801        // Fill 32-byte chunks
8802        pshufd(xtmp, xtmp, 0);
8803
8804        subl(count, 8 << shift);
8805        jcc(Assembler::less, L_check_fill_8_bytes);
8806        align(16);
8807
8808        BIND(L_fill_32_bytes_loop);
8809
8810        if (UseUnalignedLoadStores) {
8811          movdqu(Address(to, 0), xtmp);
8812          movdqu(Address(to, 16), xtmp);
8813        } else {
8814          movq(Address(to, 0), xtmp);
8815          movq(Address(to, 8), xtmp);
8816          movq(Address(to, 16), xtmp);
8817          movq(Address(to, 24), xtmp);
8818        }
8819
8820        addptr(to, 32);
8821        subl(count, 8 << shift);
8822        jcc(Assembler::greaterEqual, L_fill_32_bytes_loop);
8823
8824        BIND(L_check_fill_8_bytes);
8825      }
8826      addl(count, 8 << shift);
8827      jccb(Assembler::zero, L_exit);
8828      jmpb(L_fill_8_bytes);
8829
8830      //
8831      // length is too short, just fill qwords
8832      //
8833      BIND(L_fill_8_bytes_loop);
8834      movq(Address(to, 0), xtmp);
8835      addptr(to, 8);
8836      BIND(L_fill_8_bytes);
8837      subl(count, 1 << (shift + 1));
8838      jcc(Assembler::greaterEqual, L_fill_8_bytes_loop);
8839    }
8840  }
8841  // fill trailing 4 bytes
8842  BIND(L_fill_4_bytes);
8843  testl(count, 1<<shift);
8844  jccb(Assembler::zero, L_fill_2_bytes);
8845  movl(Address(to, 0), value);
8846  if (t == T_BYTE || t == T_SHORT) {
8847    addptr(to, 4);
8848    BIND(L_fill_2_bytes);
8849    // fill trailing 2 bytes
8850    testl(count, 1<<(shift-1));
8851    jccb(Assembler::zero, L_fill_byte);
8852    movw(Address(to, 0), value);
8853    if (t == T_BYTE) {
8854      addptr(to, 2);
8855      BIND(L_fill_byte);
8856      // fill trailing byte
8857      testl(count, 1);
8858      jccb(Assembler::zero, L_exit);
8859      movb(Address(to, 0), value);
8860    } else {
8861      BIND(L_fill_byte);
8862    }
8863  } else {
8864    BIND(L_fill_2_bytes);
8865  }
8866  BIND(L_exit);
8867}
8868
8869// encode char[] to byte[] in ISO_8859_1
8870void MacroAssembler::encode_iso_array(Register src, Register dst, Register len,
8871                                      XMMRegister tmp1Reg, XMMRegister tmp2Reg,
8872                                      XMMRegister tmp3Reg, XMMRegister tmp4Reg,
8873                                      Register tmp5, Register result) {
8874  // rsi: src
8875  // rdi: dst
8876  // rdx: len
8877  // rcx: tmp5
8878  // rax: result
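  // Scalar equivalent (illustrative sketch): copy chars that fit in
  // ISO-8859-1 and stop at the first one that does not; the return value is
  // the number of chars actually encoded.
  //
  //   int i = 0;
  //   for (; i < len; i++) {
  //     jchar c = src[i];
  //     if ((c & 0xff00) != 0) break;   // not representable in ISO-8859-1
  //     dst[i] = (jbyte)c;
  //   }
  //   return i;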
8879  ShortBranchVerifier sbv(this);
8880  assert_different_registers(src, dst, len, tmp5, result);
8881  Label L_done, L_copy_1_char, L_copy_1_char_exit;
8882
8883  // set result
8884  xorl(result, result);
8885  // check for zero length
8886  testl(len, len);
8887  jcc(Assembler::zero, L_done);
8888  movl(result, len);
8889
8890  // Setup pointers
8891  lea(src, Address(src, len, Address::times_2)); // char[]
8892  lea(dst, Address(dst, len, Address::times_1)); // byte[]
8893  negptr(len);
8894
8895  if (UseSSE42Intrinsics || UseAVX >= 2) {
8896    assert(UseSSE42Intrinsics ? UseSSE >= 4 : true, "SSE4 must be enabled for SSE4.2 intrinsics to be available");
8897    Label L_chars_8_check, L_copy_8_chars, L_copy_8_chars_exit;
8898    Label L_chars_16_check, L_copy_16_chars, L_copy_16_chars_exit;
8899
8900    if (UseAVX >= 2) {
8901      Label L_chars_32_check, L_copy_32_chars, L_copy_32_chars_exit;
8902      movl(tmp5, 0xff00ff00);   // create mask to test for Unicode chars in vector
8903      movdl(tmp1Reg, tmp5);
8904      vpbroadcastd(tmp1Reg, tmp1Reg);
8905      jmpb(L_chars_32_check);
8906
8907      bind(L_copy_32_chars);
8908      vmovdqu(tmp3Reg, Address(src, len, Address::times_2, -64));
8909      vmovdqu(tmp4Reg, Address(src, len, Address::times_2, -32));
8910      vpor(tmp2Reg, tmp3Reg, tmp4Reg, /* vector_len */ 1);
8911      vptest(tmp2Reg, tmp1Reg);       // check for Unicode chars in the vector
8912      jccb(Assembler::notZero, L_copy_32_chars_exit);
8913      vpackuswb(tmp3Reg, tmp3Reg, tmp4Reg, /* vector_len */ 1);
8914      vpermq(tmp4Reg, tmp3Reg, 0xD8, /* vector_len */ 1);
8915      vmovdqu(Address(dst, len, Address::times_1, -32), tmp4Reg);
8916
8917      bind(L_chars_32_check);
8918      addptr(len, 32);
8919      jccb(Assembler::lessEqual, L_copy_32_chars);
8920
8921      bind(L_copy_32_chars_exit);
8922      subptr(len, 16);
8923      jccb(Assembler::greater, L_copy_16_chars_exit);
8924
8925    } else if (UseSSE42Intrinsics) {
8926      movl(tmp5, 0xff00ff00);   // create mask to test for Unicode chars in vector
8927      movdl(tmp1Reg, tmp5);
8928      pshufd(tmp1Reg, tmp1Reg, 0);
8929      jmpb(L_chars_16_check);
8930    }
8931
8932    bind(L_copy_16_chars);
8933    if (UseAVX >= 2) {
8934      vmovdqu(tmp2Reg, Address(src, len, Address::times_2, -32));
8935      vptest(tmp2Reg, tmp1Reg);
8936      jccb(Assembler::notZero, L_copy_16_chars_exit);
8937      vpackuswb(tmp2Reg, tmp2Reg, tmp1Reg, /* vector_len */ 1);
8938      vpermq(tmp3Reg, tmp2Reg, 0xD8, /* vector_len */ 1);
8939    } else {
8940      if (UseAVX > 0) {
8941        movdqu(tmp3Reg, Address(src, len, Address::times_2, -32));
8942        movdqu(tmp4Reg, Address(src, len, Address::times_2, -16));
8943        vpor(tmp2Reg, tmp3Reg, tmp4Reg, /* vector_len */ 0);
8944      } else {
8945        movdqu(tmp3Reg, Address(src, len, Address::times_2, -32));
8946        por(tmp2Reg, tmp3Reg);
8947        movdqu(tmp4Reg, Address(src, len, Address::times_2, -16));
8948        por(tmp2Reg, tmp4Reg);
8949      }
8950      ptest(tmp2Reg, tmp1Reg);       // check for Unicode chars in  vector
8951      jccb(Assembler::notZero, L_copy_16_chars_exit);
8952      packuswb(tmp3Reg, tmp4Reg);
8953    }
8954    movdqu(Address(dst, len, Address::times_1, -16), tmp3Reg);
8955
8956    bind(L_chars_16_check);
8957    addptr(len, 16);
8958    jccb(Assembler::lessEqual, L_copy_16_chars);
8959
8960    bind(L_copy_16_chars_exit);
8961    if (UseAVX >= 2) {
8962      // clean upper bits of YMM registers
8963      vpxor(tmp2Reg, tmp2Reg);
8964      vpxor(tmp3Reg, tmp3Reg);
8965      vpxor(tmp4Reg, tmp4Reg);
8966      movdl(tmp1Reg, tmp5);
8967      pshufd(tmp1Reg, tmp1Reg, 0);
8968    }
8969    subptr(len, 8);
8970    jccb(Assembler::greater, L_copy_8_chars_exit);
8971
8972    bind(L_copy_8_chars);
8973    movdqu(tmp3Reg, Address(src, len, Address::times_2, -16));
8974    ptest(tmp3Reg, tmp1Reg);
8975    jccb(Assembler::notZero, L_copy_8_chars_exit);
8976    packuswb(tmp3Reg, tmp1Reg);
8977    movq(Address(dst, len, Address::times_1, -8), tmp3Reg);
8978    addptr(len, 8);
8979    jccb(Assembler::lessEqual, L_copy_8_chars);
8980
8981    bind(L_copy_8_chars_exit);
8982    subptr(len, 8);
8983    jccb(Assembler::zero, L_done);
8984  }
8985
8986  bind(L_copy_1_char);
8987  load_unsigned_short(tmp5, Address(src, len, Address::times_2, 0));
8988  testl(tmp5, 0xff00);      // check if Unicode char
8989  jccb(Assembler::notZero, L_copy_1_char_exit);
8990  movb(Address(dst, len, Address::times_1, 0), tmp5);
8991  addptr(len, 1);
8992  jccb(Assembler::less, L_copy_1_char);
8993
8994  bind(L_copy_1_char_exit);
8995  addptr(result, len); // len is the negative count of unprocessed elements
8996  bind(L_done);
8997}
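// Illustrative only (not part of the VM): a scalar C sketch of the contract the
// stub above implements -- encode leading chars that fit in ISO-8859-1 and return
// how many were written before the first char above 0xFF (or len if all fit).
//
//   static int encode_iso_array_ref(const jchar* src, jbyte* dst, int len) {
//     int i = 0;
//     for (; i < len; i++) {
//       jchar c = src[i];
//       if (c > 0xFF) break;          // non-Latin-1 char stops the encoding
//       dst[i] = (jbyte)c;
//     }
//     return i;                       // number of chars actually encoded
//   }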
8998
8999#ifdef _LP64
9000/**
9001 * Helper for multiply_to_len().
9002 */
9003void MacroAssembler::add2_with_carry(Register dest_hi, Register dest_lo, Register src1, Register src2) {
9004  addq(dest_lo, src1);
9005  adcq(dest_hi, 0);
9006  addq(dest_lo, src2);
9007  adcq(dest_hi, 0);
9008}
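// Illustrative only (not part of the VM): the same computation in C, assuming the
// compiler provides unsigned __int128.
//
//   static inline void add2_with_carry_ref(uint64_t* dest_hi, uint64_t* dest_lo,
//                                          uint64_t src1, uint64_t src2) {
//     unsigned __int128 acc = ((unsigned __int128)*dest_hi << 64) | *dest_lo;
//     acc += src1;
//     acc += src2;
//     *dest_lo = (uint64_t)acc;
//     *dest_hi = (uint64_t)(acc >> 64);
//   }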
9009
9010/**
9011 * Multiply 64 bit by 64 bit first loop.
9012 */
9013void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
9014                                           Register y, Register y_idx, Register z,
9015                                           Register carry, Register product,
9016                                           Register idx, Register kdx) {
9017  //
9018  //  jlong carry, x[], y[], z[];
9019  //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
9020  //    huge_128 product = y[idx] * x[xstart] + carry;
9021  //    z[kdx] = (jlong)product;
9022  //    carry  = (jlong)(product >>> 64);
9023  //  }
9024  //  z[xstart] = carry;
9025  //
9026
9027  Label L_first_loop, L_first_loop_exit;
9028  Label L_one_x, L_one_y, L_multiply;
9029
9030  decrementl(xstart);
9031  jcc(Assembler::negative, L_one_x);
9032
9033  movq(x_xstart, Address(x, xstart, Address::times_4,  0));
9034  rorq(x_xstart, 32); // convert big-endian to little-endian
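  // Editorial note: BigInteger magnitudes are int[] arrays with the most
  // significant 32-bit word first, so the 64-bit load above places the more
  // significant word in the low half; rotating by 32 swaps the halves so the
  // pair reads as one little-endian 64-bit value.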
9035
9036  bind(L_first_loop);
9037  decrementl(idx);
9038  jcc(Assembler::negative, L_first_loop_exit);
9039  decrementl(idx);
9040  jcc(Assembler::negative, L_one_y);
9041  movq(y_idx, Address(y, idx, Address::times_4,  0));
9042  rorq(y_idx, 32); // convert big-endian to little-endian
9043  bind(L_multiply);
9044  movq(product, x_xstart);
9045  mulq(y_idx); // product(rax) * y_idx -> rdx:rax
9046  addq(product, carry);
9047  adcq(rdx, 0);
9048  subl(kdx, 2);
9049  movl(Address(z, kdx, Address::times_4,  4), product);
9050  shrq(product, 32);
9051  movl(Address(z, kdx, Address::times_4,  0), product);
9052  movq(carry, rdx);
9053  jmp(L_first_loop);
9054
9055  bind(L_one_y);
9056  movl(y_idx, Address(y,  0));
9057  jmp(L_multiply);
9058
9059  bind(L_one_x);
9060  movl(x_xstart, Address(x,  0));
9061  jmp(L_first_loop);
9062
9063  bind(L_first_loop_exit);
9064}
9065
9066/**
9067 * Multiply 64 bit by 64 bit and add 128 bit.
9068 */
9069void MacroAssembler::multiply_add_128_x_128(Register x_xstart, Register y, Register z,
9070                                            Register yz_idx, Register idx,
9071                                            Register carry, Register product, int offset) {
9072  //     huge_128 product = (y[idx] * x_xstart) + z[kdx] + carry;
9073  //     z[kdx] = (jlong)product;
9074
9075  movq(yz_idx, Address(y, idx, Address::times_4,  offset));
9076  rorq(yz_idx, 32); // convert big-endian to little-endian
9077  movq(product, x_xstart);
9078  mulq(yz_idx);     // product(rax) * yz_idx -> rdx:product(rax)
9079  movq(yz_idx, Address(z, idx, Address::times_4,  offset));
9080  rorq(yz_idx, 32); // convert big-endian to little-endian
9081
9082  add2_with_carry(rdx, product, carry, yz_idx);
9083
9084  movl(Address(z, idx, Address::times_4,  offset+4), product);
9085  shrq(product, 32);
9086  movl(Address(z, idx, Address::times_4,  offset), product);
9087
9088}
9089
9090/**
9091 * Multiply 128 bit by 128 bit. Unrolled inner loop.
9092 */
9093void MacroAssembler::multiply_128_x_128_loop(Register x_xstart, Register y, Register z,
9094                                             Register yz_idx, Register idx, Register jdx,
9095                                             Register carry, Register product,
9096                                             Register carry2) {
9097  //   jlong carry, x[], y[], z[];
9098  //   int kdx = ystart+1;
9099  //   for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
9100  //     huge_128 product = (y[idx+1] * x_xstart) + z[kdx+idx+1] + carry;
9101  //     z[kdx+idx+1] = (jlong)product;
9102  //     jlong carry2  = (jlong)(product >>> 64);
9103  //     product = (y[idx] * x_xstart) + z[kdx+idx] + carry2;
9104  //     z[kdx+idx] = (jlong)product;
9105  //     carry  = (jlong)(product >>> 64);
9106  //   }
9107  //   idx += 2;
9108  //   if (idx > 0) {
9109  //     product = (y[idx] * x_xstart) + z[kdx+idx] + carry;
9110  //     z[kdx+idx] = (jlong)product;
9111  //     carry  = (jlong)(product >>> 64);
9112  //   }
9113  //
9114
9115  Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
9116
9117  movl(jdx, idx);
9118  andl(jdx, 0xFFFFFFFC);
9119  shrl(jdx, 2);
9120
9121  bind(L_third_loop);
9122  subl(jdx, 1);
9123  jcc(Assembler::negative, L_third_loop_exit);
9124  subl(idx, 4);
9125
9126  multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product, 8);
9127  movq(carry2, rdx);
9128
9129  multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry2, product, 0);
9130  movq(carry, rdx);
9131  jmp(L_third_loop);
9132
9133  bind (L_third_loop_exit);
9134
9135  andl (idx, 0x3);
9136  jcc(Assembler::zero, L_post_third_loop_done);
9137
9138  Label L_check_1;
9139  subl(idx, 2);
9140  jcc(Assembler::negative, L_check_1);
9141
9142  multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product, 0);
9143  movq(carry, rdx);
9144
9145  bind (L_check_1);
9146  addl (idx, 0x2);
9147  andl (idx, 0x1);
9148  subl(idx, 1);
9149  jcc(Assembler::negative, L_post_third_loop_done);
9150
9151  movl(yz_idx, Address(y, idx, Address::times_4,  0));
9152  movq(product, x_xstart);
9153  mulq(yz_idx); // product(rax) * yz_idx -> rdx:product(rax)
9154  movl(yz_idx, Address(z, idx, Address::times_4,  0));
9155
9156  add2_with_carry(rdx, product, yz_idx, carry);
9157
9158  movl(Address(z, idx, Address::times_4,  0), product);
9159  shrq(product, 32);
9160
9161  shlq(rdx, 32);
9162  orq(product, rdx);
9163  movq(carry, product);
9164
9165  bind(L_post_third_loop_done);
9166}
9167
9168/**
9169 * Multiply 128 bit by 128 bit using BMI2. Unrolled inner loop.
9170 *
9171 */
9172void MacroAssembler::multiply_128_x_128_bmi2_loop(Register y, Register z,
9173                                                  Register carry, Register carry2,
9174                                                  Register idx, Register jdx,
9175                                                  Register yz_idx1, Register yz_idx2,
9176                                                  Register tmp, Register tmp3, Register tmp4) {
9177  assert(UseBMI2Instructions, "should be used only when BMI2 is available");
9178
9179  //   jlong carry, x[], y[], z[];
9180  //   int kdx = ystart+1;
9181  //   for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
9182  //     huge_128 tmp3 = (y[idx+1] * rdx) + z[kdx+idx+1] + carry;
9183  //     jlong carry2  = (jlong)(tmp3 >>> 64);
9184  //     huge_128 tmp4 = (y[idx]   * rdx) + z[kdx+idx] + carry2;
9185  //     carry  = (jlong)(tmp4 >>> 64);
9186  //     z[kdx+idx+1] = (jlong)tmp3;
9187  //     z[kdx+idx] = (jlong)tmp4;
9188  //   }
9189  //   idx += 2;
9190  //   if (idx > 0) {
9191  //     yz_idx1 = (y[idx] * rdx) + z[kdx+idx] + carry;
9192  //     z[kdx+idx] = (jlong)yz_idx1;
9193  //     carry  = (jlong)(yz_idx1 >>> 64);
9194  //   }
9195  //
9196
9197  Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
9198
9199  movl(jdx, idx);
9200  andl(jdx, 0xFFFFFFFC);
9201  shrl(jdx, 2);
9202
9203  bind(L_third_loop);
9204  subl(jdx, 1);
9205  jcc(Assembler::negative, L_third_loop_exit);
9206  subl(idx, 4);
9207
9208  movq(yz_idx1,  Address(y, idx, Address::times_4,  8));
9209  rorxq(yz_idx1, yz_idx1, 32); // convert big-endian to little-endian
9210  movq(yz_idx2, Address(y, idx, Address::times_4,  0));
9211  rorxq(yz_idx2, yz_idx2, 32);
9212
9213  mulxq(tmp4, tmp3, yz_idx1);  //  yz_idx1 * rdx -> tmp4:tmp3
9214  mulxq(carry2, tmp, yz_idx2); //  yz_idx2 * rdx -> carry2:tmp
9215
9216  movq(yz_idx1,  Address(z, idx, Address::times_4,  8));
9217  rorxq(yz_idx1, yz_idx1, 32);
9218  movq(yz_idx2, Address(z, idx, Address::times_4,  0));
9219  rorxq(yz_idx2, yz_idx2, 32);
9220
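  // Editorial note: ADCX consumes and produces only CF while ADOX consumes and
  // produces only OF, so the two addition chains below (folding in the previous
  // carry and the 128 bits reloaded from z) can be interleaved without either
  // chain clobbering the other's carry flag.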
9221  if (VM_Version::supports_adx()) {
9222    adcxq(tmp3, carry);
9223    adoxq(tmp3, yz_idx1);
9224
9225    adcxq(tmp4, tmp);
9226    adoxq(tmp4, yz_idx2);
9227
9228    movl(carry, 0); // does not affect flags
9229    adcxq(carry2, carry);
9230    adoxq(carry2, carry);
9231  } else {
9232    add2_with_carry(tmp4, tmp3, carry, yz_idx1);
9233    add2_with_carry(carry2, tmp4, tmp, yz_idx2);
9234  }
9235  movq(carry, carry2);
9236
9237  movl(Address(z, idx, Address::times_4, 12), tmp3);
9238  shrq(tmp3, 32);
9239  movl(Address(z, idx, Address::times_4,  8), tmp3);
9240
9241  movl(Address(z, idx, Address::times_4,  4), tmp4);
9242  shrq(tmp4, 32);
9243  movl(Address(z, idx, Address::times_4,  0), tmp4);
9244
9245  jmp(L_third_loop);
9246
9247  bind (L_third_loop_exit);
9248
9249  andl (idx, 0x3);
9250  jcc(Assembler::zero, L_post_third_loop_done);
9251
9252  Label L_check_1;
9253  subl(idx, 2);
9254  jcc(Assembler::negative, L_check_1);
9255
9256  movq(yz_idx1, Address(y, idx, Address::times_4,  0));
9257  rorxq(yz_idx1, yz_idx1, 32);
9258  mulxq(tmp4, tmp3, yz_idx1); //  yz_idx1 * rdx -> tmp4:tmp3
9259  movq(yz_idx2, Address(z, idx, Address::times_4,  0));
9260  rorxq(yz_idx2, yz_idx2, 32);
9261
9262  add2_with_carry(tmp4, tmp3, carry, yz_idx2);
9263
9264  movl(Address(z, idx, Address::times_4,  4), tmp3);
9265  shrq(tmp3, 32);
9266  movl(Address(z, idx, Address::times_4,  0), tmp3);
9267  movq(carry, tmp4);
9268
9269  bind (L_check_1);
9270  addl (idx, 0x2);
9271  andl (idx, 0x1);
9272  subl(idx, 1);
9273  jcc(Assembler::negative, L_post_third_loop_done);
9274  movl(tmp4, Address(y, idx, Address::times_4,  0));
9275  mulxq(carry2, tmp3, tmp4);  //  tmp4 * rdx -> carry2:tmp3
9276  movl(tmp4, Address(z, idx, Address::times_4,  0));
9277
9278  add2_with_carry(carry2, tmp3, tmp4, carry);
9279
9280  movl(Address(z, idx, Address::times_4,  0), tmp3);
9281  shrq(tmp3, 32);
9282
9283  shlq(carry2, 32);
9284  orq(tmp3, carry2);
9285  movq(carry, tmp3);
9286
9287  bind(L_post_third_loop_done);
9288}
9289
9290/**
9291 * Code for BigInteger::multiplyToLen() intrinsic.
9292 *
9293 * rdi: x
9294 * rax: xlen
9295 * rsi: y
9296 * rcx: ylen
9297 * r8:  z
9298 * r11: zlen
9299 * r12: tmp1
9300 * r13: tmp2
9301 * r14: tmp3
9302 * r15: tmp4
9303 * rbx: tmp5
9304 *
9305 */
9306void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Register ylen, Register z, Register zlen,
9307                                     Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5) {
9308  ShortBranchVerifier sbv(this);
9309  assert_different_registers(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, rdx);
9310
9311  push(tmp1);
9312  push(tmp2);
9313  push(tmp3);
9314  push(tmp4);
9315  push(tmp5);
9316
9317  push(xlen);
9318  push(zlen);
9319
9320  const Register idx = tmp1;
9321  const Register kdx = tmp2;
9322  const Register xstart = tmp3;
9323
9324  const Register y_idx = tmp4;
9325  const Register carry = tmp5;
9326  const Register product  = xlen;
9327  const Register x_xstart = zlen;  // reuse register
9328
9329  // First Loop.
9330  //
9331  //  final static long LONG_MASK = 0xffffffffL;
9332  //  int xstart = xlen - 1;
9333  //  int ystart = ylen - 1;
9334  //  long carry = 0;
9335  //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
9336  //    long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
9337  //    z[kdx] = (int)product;
9338  //    carry = product >>> 32;
9339  //  }
9340  //  z[xstart] = (int)carry;
9341  //
9342
9343  movl(idx, ylen);      // idx = ylen;
9344  movl(kdx, zlen);      // kdx = xlen+ylen;
9345  xorq(carry, carry);   // carry = 0;
9346
9347  Label L_done;
9348
9349  movl(xstart, xlen);
9350  decrementl(xstart);
9351  jcc(Assembler::negative, L_done);
9352
9353  multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, carry, product, idx, kdx);
9354
9355  Label L_second_loop;
9356  testl(kdx, kdx);
9357  jcc(Assembler::zero, L_second_loop);
9358
9359  Label L_carry;
9360  subl(kdx, 1);
9361  jcc(Assembler::zero, L_carry);
9362
9363  movl(Address(z, kdx, Address::times_4,  0), carry);
9364  shrq(carry, 32);
9365  subl(kdx, 1);
9366
9367  bind(L_carry);
9368  movl(Address(z, kdx, Address::times_4,  0), carry);
9369
9370  // Second and third (nested) loops.
9371  //
9372  // for (int i = xstart-1; i >= 0; i--) { // Second loop
9373  //   carry = 0;
9374  //   for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
9375  //     long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
9376  //                    (z[k] & LONG_MASK) + carry;
9377  //     z[k] = (int)product;
9378  //     carry = product >>> 32;
9379  //   }
9380  //   z[i] = (int)carry;
9381  // }
9382  //
9383  // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = rdx
9384
9385  const Register jdx = tmp1;
9386
9387  bind(L_second_loop);
9388  xorl(carry, carry);    // carry = 0;
9389  movl(jdx, ylen);       // j = ystart+1
9390
9391  subl(xstart, 1);       // i = xstart-1;
9392  jcc(Assembler::negative, L_done);
9393
9394  push (z);
9395
9396  Label L_last_x;
9397  lea(z, Address(z, xstart, Address::times_4, 4)); // z = z + k - j
9398  subl(xstart, 1);       // i = xstart-1;
9399  jcc(Assembler::negative, L_last_x);
9400
9401  if (UseBMI2Instructions) {
9402    movq(rdx,  Address(x, xstart, Address::times_4,  0));
9403    rorxq(rdx, rdx, 32); // convert big-endian to little-endian
9404  } else {
9405    movq(x_xstart, Address(x, xstart, Address::times_4,  0));
9406    rorq(x_xstart, 32);  // convert big-endian to little-endian
9407  }
9408
9409  Label L_third_loop_prologue;
9410  bind(L_third_loop_prologue);
9411
9412  push (x);
9413  push (xstart);
9414  push (ylen);
9415
9416
9417  if (UseBMI2Instructions) {
9418    multiply_128_x_128_bmi2_loop(y, z, carry, x, jdx, ylen, product, tmp2, x_xstart, tmp3, tmp4);
9419  } else { // !UseBMI2Instructions
9420    multiply_128_x_128_loop(x_xstart, y, z, y_idx, jdx, ylen, carry, product, x);
9421  }
9422
9423  pop(ylen);
9424  pop(xlen);
9425  pop(x);
9426  pop(z);
9427
9428  movl(tmp3, xlen);
9429  addl(tmp3, 1);
9430  movl(Address(z, tmp3, Address::times_4,  0), carry);
9431  subl(tmp3, 1);
9432  jccb(Assembler::negative, L_done);
9433
9434  shrq(carry, 32);
9435  movl(Address(z, tmp3, Address::times_4,  0), carry);
9436  jmp(L_second_loop);
9437
9438  // The following infrequently-executed code has been moved outside the loops.
9439  bind(L_last_x);
9440  if (UseBMI2Instructions) {
9441    movl(rdx, Address(x,  0));
9442  } else {
9443    movl(x_xstart, Address(x,  0));
9444  }
9445  jmp(L_third_loop_prologue);
9446
9447  bind(L_done);
9448
9449  pop(zlen);
9450  pop(xlen);
9451
9452  pop(tmp5);
9453  pop(tmp4);
9454  pop(tmp3);
9455  pop(tmp2);
9456  pop(tmp1);
9457}
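// Illustrative only (not part of the VM): a compact C sketch of the computation
// the loops above implement -- schoolbook multiplication over 32-bit words stored
// most significant first, assuming z[] has room for xlen + ylen words.
//
//   static void multiply_to_len_ref(const uint32_t* x, int xlen,
//                                   const uint32_t* y, int ylen, uint32_t* z) {
//     for (int i = 0; i < xlen + ylen; i++) z[i] = 0;
//     for (int i = xlen - 1; i >= 0; i--) {
//       uint64_t carry = 0;
//       for (int j = ylen - 1, k = ylen + i; j >= 0; j--, k--) {
//         uint64_t product = (uint64_t)y[j] * x[i] + z[k] + carry;
//         z[k]  = (uint32_t)product;
//         carry = product >> 32;
//       }
//       z[i] = (uint32_t)carry;
//     }
//   }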
9458
9459// Helper functions for square_to_len()
9460
9461/**
9462 * Store the squares of x[], right shifted one bit (divided by 2) into z[]
9463 * Preserves x and z and modifies the rest of the registers.
9464 */
9465
9466void MacroAssembler::square_rshift(Register x, Register xlen, Register z, Register tmp1, Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
9467  // Perform square and right shift by 1
9468  // Handle odd xlen case first, then for even xlen do the following
9469  // jlong carry = 0;
9470  // for (int j=0, i=0; j < xlen; j+=2, i+=4) {
9471  //     huge_128 product = x[j:j+1] * x[j:j+1];
9472  //     z[i:i+1] = (carry << 63) | (jlong)(product >>> 65);
9473  //     z[i+2:i+3] = (jlong)(product >>> 1);
9474  //     carry = (jlong)product;
9475  // }
9476
9477  xorq(tmp5, tmp5);     // carry
9478  xorq(rdxReg, rdxReg);
9479  xorl(tmp1, tmp1);     // index for x
9480  xorl(tmp4, tmp4);     // index for z
9481
9482  Label L_first_loop, L_first_loop_exit;
9483
9484  testl(xlen, 1);
9485  jccb(Assembler::zero, L_first_loop); //jump if xlen is even
9486
9487  // Square and right shift by 1 the odd element using 32 bit multiply
9488  movl(raxReg, Address(x, tmp1, Address::times_4, 0));
9489  imulq(raxReg, raxReg);
9490  shrq(raxReg, 1);
9491  adcq(tmp5, 0);
9492  movq(Address(z, tmp4, Address::times_4, 0), raxReg);
9493  incrementl(tmp1);
9494  addl(tmp4, 2);
9495
9496  // Square and  right shift by 1 the rest using 64 bit multiply
9497  bind(L_first_loop);
9498  cmpptr(tmp1, xlen);
9499  jccb(Assembler::equal, L_first_loop_exit);
9500
9501  // Square
9502  movq(raxReg, Address(x, tmp1, Address::times_4,  0));
9503  rorq(raxReg, 32);    // convert big-endian to little-endian
9504  mulq(raxReg);        // 64-bit multiply rax * rax -> rdx:rax
9505
9506  // Right shift by 1 and save carry
9507  shrq(tmp5, 1);       // rdx:rax:tmp5 = (tmp5:rdx:rax) >>> 1
9508  rcrq(rdxReg, 1);
9509  rcrq(raxReg, 1);
9510  adcq(tmp5, 0);
9511
9512  // Store result in z
9513  movq(Address(z, tmp4, Address::times_4, 0), rdxReg);
9514  movq(Address(z, tmp4, Address::times_4, 8), raxReg);
9515
9516  // Update indices for x and z
9517  addl(tmp1, 2);
9518  addl(tmp4, 4);
9519  jmp(L_first_loop);
9520
9521  bind(L_first_loop_exit);
9522}
9523
9524
9525/**
9526 * Perform the following multiply add operation using BMI2 instructions
9527 * carry:sum = sum + op1*op2 + carry
9528 * op2 should be in rdx
9529 * op2 is preserved, all other registers are modified
9530 */
9531void MacroAssembler::multiply_add_64_bmi2(Register sum, Register op1, Register op2, Register carry, Register tmp2) {
9532  // assert op2 is rdx
9533  mulxq(tmp2, op1, op1);  //  op1 * op2 -> tmp2:op1
9534  addq(sum, carry);
9535  adcq(tmp2, 0);
9536  addq(sum, op1);
9537  adcq(tmp2, 0);
9538  movq(carry, tmp2);
9539}
9540
9541/**
9542 * Perform the following multiply add operation:
9543 * carry:sum = sum + op1*op2 + carry
9544 * Preserves op1 and op2, and modifies the rest of the registers.
9545 */
9546void MacroAssembler::multiply_add_64(Register sum, Register op1, Register op2, Register carry, Register rdxReg, Register raxReg) {
9547  // rdx:rax = op1 * op2
9548  movq(raxReg, op2);
9549  mulq(op1);
9550
9551  //  rdx:rax = sum + carry + rdx:rax
9552  addq(sum, carry);
9553  adcq(rdxReg, 0);
9554  addq(sum, raxReg);
9555  adcq(rdxReg, 0);
9556
9557  // carry:sum = rdx:sum
9558  movq(carry, rdxReg);
9559}
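// Illustrative only (not part of the VM): both multiply_add_64 variants above
// compute the same thing; a C sketch assuming unsigned __int128 support.
//
//   static inline void multiply_add_64_ref(uint64_t* sum, uint64_t op1,
//                                          uint64_t op2, uint64_t* carry) {
//     unsigned __int128 t = (unsigned __int128)op1 * op2 + *sum + *carry;
//     *sum   = (uint64_t)t;
//     *carry = (uint64_t)(t >> 64);
//   }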
9560
9561/**
9562 * Add 64 bit long carry into z[] with carry propagation.
9563 * Preserves the z and carry register values and modifies the rest of the registers.
9564 *
9565 */
9566void MacroAssembler::add_one_64(Register z, Register zlen, Register carry, Register tmp1) {
9567  Label L_fourth_loop, L_fourth_loop_exit;
9568
9569  movl(tmp1, 1);
9570  subl(zlen, 2);
9571  addq(Address(z, zlen, Address::times_4, 0), carry);
9572
9573  bind(L_fourth_loop);
9574  jccb(Assembler::carryClear, L_fourth_loop_exit);
9575  subl(zlen, 2);
9576  jccb(Assembler::negative, L_fourth_loop_exit);
9577  addq(Address(z, zlen, Address::times_4, 0), tmp1);
9578  jmp(L_fourth_loop);
9579  bind(L_fourth_loop_exit);
9580}
9581
9582/**
9583 * Shift z[] left by 1 bit.
9584 * Preserves the x, len, z and zlen registers and modifies the rest of the registers.
9585 *
9586 */
9587void MacroAssembler::lshift_by_1(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2, Register tmp3, Register tmp4) {
9588
9589  Label L_fifth_loop, L_fifth_loop_exit;
9590
9591  // Fifth loop
9592  // Perform primitiveLeftShift(z, zlen, 1)
9593
9594  const Register prev_carry = tmp1;
9595  const Register new_carry = tmp4;
9596  const Register value = tmp2;
9597  const Register zidx = tmp3;
9598
9599  // int zidx, carry;
9600  // long value;
9601  // carry = 0;
9602  // for (zidx = zlen-2; zidx >=0; zidx -= 2) {
9603  //    (carry:value)  = (z[i] << 1) | carry ;
9604  //    z[i] = value;
9605  // }
9606
9607  movl(zidx, zlen);
9608  xorl(prev_carry, prev_carry); // clear carry flag and prev_carry register
9609
9610  bind(L_fifth_loop);
9611  decl(zidx);  // Use decl to preserve carry flag
9612  decl(zidx);
9613  jccb(Assembler::negative, L_fifth_loop_exit);
9614
9615  if (UseBMI2Instructions) {
9616     movq(value, Address(z, zidx, Address::times_4, 0));
9617     rclq(value, 1);
9618     rorxq(value, value, 32);
9619     movq(Address(z, zidx, Address::times_4,  0), value);  // Store back in big endian form
9620  }
9621  else {
9622    // clear new_carry
9623    xorl(new_carry, new_carry);
9624
9625    // Shift z[i] by 1, or in previous carry and save new carry
9626    movq(value, Address(z, zidx, Address::times_4, 0));
9627    shlq(value, 1);
9628    adcl(new_carry, 0);
9629
9630    orq(value, prev_carry);
9631    rorq(value, 0x20);
9632    movq(Address(z, zidx, Address::times_4,  0), value);  // Store back in big endian form
9633
9634    // Set previous carry = new carry
9635    movl(prev_carry, new_carry);
9636  }
9637  jmp(L_fifth_loop);
9638
9639  bind(L_fifth_loop_exit);
9640}
9641
9642
9643/**
9644 * Code for BigInteger::squareToLen() intrinsic
9645 *
9646 * rdi: x
9647 * rsi: len
9648 * r8:  z
9649 * rcx: zlen
9650 * r12: tmp1
9651 * r13: tmp2
9652 * r14: tmp3
9653 * r15: tmp4
9654 * rbx: tmp5
9655 *
9656 */
9657void MacroAssembler::square_to_len(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
9658
9659  Label L_second_loop, L_second_loop_exit, L_third_loop, L_third_loop_exit, fifth_loop, fifth_loop_exit, L_last_x, L_multiply;
9660  push(tmp1);
9661  push(tmp2);
9662  push(tmp3);
9663  push(tmp4);
9664  push(tmp5);
9665
9666  // First loop
9667  // Store the squares, right shifted one bit (i.e., divided by 2).
9668  square_rshift(x, len, z, tmp1, tmp3, tmp4, tmp5, rdxReg, raxReg);
9669
9670  // Add in off-diagonal sums.
9671  //
9672  // Second, third (nested) and fourth loops.
9673  // zlen +=2;
9674  // for (int xidx=len-2,zidx=zlen-4; xidx > 0; xidx-=2,zidx-=4) {
9675  //    carry = 0;
9676  //    long op2 = x[xidx:xidx+1];
9677  //    for (int j=xidx-2,k=zidx; j >= 0; j-=2) {
9678  //       k -= 2;
9679  //       long op1 = x[j:j+1];
9680  //       long sum = z[k:k+1];
9681  //       carry:sum = multiply_add_64(sum, op1, op2, carry, tmp_regs);
9682  //       z[k:k+1] = sum;
9683  //    }
9684  //    add_one_64(z, k, carry, tmp_regs);
9685  // }
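  // Editorial note: together the phases implement the identity
  //   x^2 = sum_i x_i^2 * B^(2i)  +  2 * sum_{i<j} x_i * x_j * B^(i+j)   (B = 2^64)
  // The first loop stored the diagonal squares pre-shifted right by one bit, the
  // nested loops below add the (undoubled) off-diagonal products, and the final
  // left shift by one plus OR-ing in the low bit of the last square restores the
  // factor of two.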
9686
9687  const Register carry = tmp5;
9688  const Register sum = tmp3;
9689  const Register op1 = tmp4;
9690  Register op2 = tmp2;
9691
9692  push(zlen);
9693  push(len);
9694  addl(zlen,2);
9695  bind(L_second_loop);
9696  xorq(carry, carry);
9697  subl(zlen, 4);
9698  subl(len, 2);
9699  push(zlen);
9700  push(len);
9701  cmpl(len, 0);
9702  jccb(Assembler::lessEqual, L_second_loop_exit);
9703
9704  // Multiply an array by one 64 bit long.
9705  if (UseBMI2Instructions) {
9706    op2 = rdxReg;
9707    movq(op2, Address(x, len, Address::times_4,  0));
9708    rorxq(op2, op2, 32);
9709  }
9710  else {
9711    movq(op2, Address(x, len, Address::times_4,  0));
9712    rorq(op2, 32);
9713  }
9714
9715  bind(L_third_loop);
9716  decrementl(len);
9717  jccb(Assembler::negative, L_third_loop_exit);
9718  decrementl(len);
9719  jccb(Assembler::negative, L_last_x);
9720
9721  movq(op1, Address(x, len, Address::times_4,  0));
9722  rorq(op1, 32);
9723
9724  bind(L_multiply);
9725  subl(zlen, 2);
9726  movq(sum, Address(z, zlen, Address::times_4,  0));
9727
9728  // Multiply 64 bit by 64 bit and add 64 bits lower half and upper 64 bits as carry.
9729  if (UseBMI2Instructions) {
9730    multiply_add_64_bmi2(sum, op1, op2, carry, tmp2);
9731  }
9732  else {
9733    multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
9734  }
9735
9736  movq(Address(z, zlen, Address::times_4, 0), sum);
9737
9738  jmp(L_third_loop);
9739  bind(L_third_loop_exit);
9740
9741  // Fourth loop
9742  // Add 64 bit long carry into z with carry propagation.
9743  // Uses the offset-adjusted zlen.
9744  add_one_64(z, zlen, carry, tmp1);
9745
9746  pop(len);
9747  pop(zlen);
9748  jmp(L_second_loop);
9749
9750  // The following infrequently-executed code has been moved outside the loops.
9751  bind(L_last_x);
9752  movl(op1, Address(x, 0));
9753  jmp(L_multiply);
9754
9755  bind(L_second_loop_exit);
9756  pop(len);
9757  pop(zlen);
9758  pop(len);
9759  pop(zlen);
9760
9761  // Fifth loop
9762  // Shift z left 1 bit.
9763  lshift_by_1(x, len, z, zlen, tmp1, tmp2, tmp3, tmp4);
9764
9765  // z[zlen-1] |= x[len-1] & 1;
9766  movl(tmp3, Address(x, len, Address::times_4, -4));
9767  andl(tmp3, 1);
9768  orl(Address(z, zlen, Address::times_4,  -4), tmp3);
9769
9770  pop(tmp5);
9771  pop(tmp4);
9772  pop(tmp3);
9773  pop(tmp2);
9774  pop(tmp1);
9775}
9776
9777/**
9778 * Helper function for mul_add()
9779 * Multiply the in[] by int k and add to out[] starting at offset offs using
9780 * 128 bit by 32 bit multiply and return the carry in tmp5.
9781 * Only a length of in[] that is a multiple of 4 ints is operated on in this function.
9782 * k is in rdxReg when BMI2 instructions are used; otherwise it is in tmp2.
9783 * This function preserves the out, in and k registers.
9784 * len and offset point to the appropriate indices in "in" and "out" respectively.
9785 * tmp5 has the carry.
9786 * other registers are temporary and are modified.
9787 *
9788 */
9789void MacroAssembler::mul_add_128_x_32_loop(Register out, Register in,
9790  Register offset, Register len, Register tmp1, Register tmp2, Register tmp3,
9791  Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
9792
9793  Label L_first_loop, L_first_loop_exit;
9794
9795  movl(tmp1, len);
9796  shrl(tmp1, 2);
9797
9798  bind(L_first_loop);
9799  subl(tmp1, 1);
9800  jccb(Assembler::negative, L_first_loop_exit);
9801
9802  subl(len, 4);
9803  subl(offset, 4);
9804
9805  Register op2 = tmp2;
9806  const Register sum = tmp3;
9807  const Register op1 = tmp4;
9808  const Register carry = tmp5;
9809
9810  if (UseBMI2Instructions) {
9811    op2 = rdxReg;
9812  }
9813
9814  movq(op1, Address(in, len, Address::times_4,  8));
9815  rorq(op1, 32);
9816  movq(sum, Address(out, offset, Address::times_4,  8));
9817  rorq(sum, 32);
9818  if (UseBMI2Instructions) {
9819    multiply_add_64_bmi2(sum, op1, op2, carry, raxReg);
9820  }
9821  else {
9822    multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
9823  }
9824  // Store back in big endian from little endian
9825  rorq(sum, 0x20);
9826  movq(Address(out, offset, Address::times_4,  8), sum);
9827
9828  movq(op1, Address(in, len, Address::times_4,  0));
9829  rorq(op1, 32);
9830  movq(sum, Address(out, offset, Address::times_4,  0));
9831  rorq(sum, 32);
9832  if (UseBMI2Instructions) {
9833    multiply_add_64_bmi2(sum, op1, op2, carry, raxReg);
9834  }
9835  else {
9836    multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
9837  }
9838  // Store back in big endian from little endian
9839  rorq(sum, 0x20);
9840  movq(Address(out, offset, Address::times_4,  0), sum);
9841
9842  jmp(L_first_loop);
9843  bind(L_first_loop_exit);
9844}
9845
9846/**
9847 * Code for BigInteger::mulAdd() intrinsic
9848 *
9849 * rdi: out
9850 * rsi: in
9851 * r11: offs (out.length - offset)
9852 * rcx: len
9853 * r8:  k
9854 * r12: tmp1
9855 * r13: tmp2
9856 * r14: tmp3
9857 * r15: tmp4
9858 * rbx: tmp5
9859 * Multiply the in[] by word k and add to out[], return the carry in rax
9860 */
9861void MacroAssembler::mul_add(Register out, Register in, Register offs,
9862   Register len, Register k, Register tmp1, Register tmp2, Register tmp3,
9863   Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
9864
9865  Label L_carry, L_last_in, L_done;
9866
9867// carry = 0;
9868// for (int j=len-1; j >= 0; j--) {
9869//    long product = (in[j] & LONG_MASK) * kLong +
9870//                   (out[offs] & LONG_MASK) + carry;
9871//    out[offs--] = (int)product;
9872//    carry = product >>> 32;
9873// }
9874//
9875  push(tmp1);
9876  push(tmp2);
9877  push(tmp3);
9878  push(tmp4);
9879  push(tmp5);
9880
9881  Register op2 = tmp2;
9882  const Register sum = tmp3;
9883  const Register op1 = tmp4;
9884  const Register carry =  tmp5;
9885
9886  if (UseBMI2Instructions) {
9887    op2 = rdxReg;
9888    movl(op2, k);
9889  }
9890  else {
9891    movl(op2, k);
9892  }
9893
9894  xorq(carry, carry);
9895
9896  // First loop
9897
9898  // Multiply in[] by k in a 4-way unrolled loop using 128 bit by 32 bit multiply.
9899  // The carry is in tmp5.
9900  mul_add_128_x_32_loop(out, in, offs, len, tmp1, tmp2, tmp3, tmp4, tmp5, rdxReg, raxReg);
9901
9902  // Multiply the trailing in[] entry using 64 bit by 32 bit, if any.
9903  decrementl(len);
9904  jccb(Assembler::negative, L_carry);
9905  decrementl(len);
9906  jccb(Assembler::negative, L_last_in);
9907
9908  movq(op1, Address(in, len, Address::times_4,  0));
9909  rorq(op1, 32);
9910
9911  subl(offs, 2);
9912  movq(sum, Address(out, offs, Address::times_4,  0));
9913  rorq(sum, 32);
9914
9915  if (UseBMI2Instructions) {
9916    multiply_add_64_bmi2(sum, op1, op2, carry, raxReg);
9917  }
9918  else {
9919    multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
9920  }
9921
9922  // Store back in big endian from little endian
9923  rorq(sum, 0x20);
9924  movq(Address(out, offs, Address::times_4,  0), sum);
9925
9926  testl(len, len);
9927  jccb(Assembler::zero, L_carry);
9928
9929  // Multiply the last in[] entry, if any.
9930  bind(L_last_in);
9931  movl(op1, Address(in, 0));
9932  movl(sum, Address(out, offs, Address::times_4,  -4));
9933
9934  movl(raxReg, k);
9935  mull(op1); //tmp4 * eax -> edx:eax
9936  addl(sum, carry);
9937  adcl(rdxReg, 0);
9938  addl(sum, raxReg);
9939  adcl(rdxReg, 0);
9940  movl(carry, rdxReg);
9941
9942  movl(Address(out, offs, Address::times_4,  -4), sum);
9943
9944  bind(L_carry);
9945  // Return tmp5/carry as the carry in rax.
9946  movl(rax, carry);
9947
9948  bind(L_done);
9949  pop(tmp5);
9950  pop(tmp4);
9951  pop(tmp3);
9952  pop(tmp2);
9953  pop(tmp1);
9954}
9955#endif
9956
9957/**
9958 * Emits code to update CRC-32 with a byte value according to constants in table
9959 *
9960 * @param [in,out]crc   Register containing the crc.
9961 * @param [in]val       Register containing the byte to fold into the CRC.
9962 * @param [in]table     Register containing the table of crc constants.
9963 *
9964 * uint32_t crc;
9965 * val = crc_table[(val ^ crc) & 0xFF];
9966 * crc = val ^ (crc >> 8);
9967 *
9968 */
9969void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
9970  xorl(val, crc);
9971  andl(val, 0xFF);
9972  shrl(crc, 8); // unsigned shift
9973  xorl(crc, Address(table, val, Address::times_4, 0));
9974}
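// Illustrative only (not part of the VM): the same byte-at-a-time fold written as
// a plain C loop over a buffer, assuming a standard 256-entry CRC-32 table. Note
// that the caller (kernel_crc32) applies the ~crc pre- and post-conditioning.
//
//   static uint32_t crc32_bytes_ref(uint32_t crc, const uint8_t* buf, size_t len,
//                                   const uint32_t table[256]) {
//     for (size_t i = 0; i < len; i++) {
//       crc = table[(buf[i] ^ crc) & 0xFF] ^ (crc >> 8);
//     }
//     return crc;
//   }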
9975
9976/**
9977 * Fold 128-bit data chunk
9978 */
9979void MacroAssembler::fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, Register buf, int offset) {
9980  if (UseAVX > 0) {
9981    vpclmulhdq(xtmp, xK, xcrc); // [123:64]
9982    vpclmulldq(xcrc, xK, xcrc); // [63:0]
9983    vpxor(xcrc, xcrc, Address(buf, offset), 0 /* vector_len */);
9984    pxor(xcrc, xtmp);
9985  } else {
9986    movdqa(xtmp, xcrc);
9987    pclmulhdq(xtmp, xK);   // [123:64]
9988    pclmulldq(xcrc, xK);   // [63:0]
9989    pxor(xcrc, xtmp);
9990    movdqu(xtmp, Address(buf, offset));
9991    pxor(xcrc, xtmp);
9992  }
9993}
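// Editorial note: the fold step carry-less-multiplies the two 64-bit halves of the
// running 128-bit CRC state by precomputed fold constants and XORs in the next
// 16 bytes of input, so the state stays 128 bits wide while the buffer is consumed
// 16 bytes at a time (kernel_crc32 runs four such lanes in parallel in its
// 512-bit loop).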
9994
9995void MacroAssembler::fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, XMMRegister xbuf) {
9996  if (UseAVX > 0) {
9997    vpclmulhdq(xtmp, xK, xcrc);
9998    vpclmulldq(xcrc, xK, xcrc);
9999    pxor(xcrc, xbuf);
10000    pxor(xcrc, xtmp);
10001  } else {
10002    movdqa(xtmp, xcrc);
10003    pclmulhdq(xtmp, xK);
10004    pclmulldq(xcrc, xK);
10005    pxor(xcrc, xbuf);
10006    pxor(xcrc, xtmp);
10007  }
10008}
10009
10010/**
10011 * 8-bit folds to compute 32-bit CRC
10012 *
10013 * uint64_t xcrc;
10014 * timesXtoThe32[xcrc & 0xFF] ^ (xcrc >> 8);
10015 */
10016void MacroAssembler::fold_8bit_crc32(XMMRegister xcrc, Register table, XMMRegister xtmp, Register tmp) {
10017  movdl(tmp, xcrc);
10018  andl(tmp, 0xFF);
10019  movdl(xtmp, Address(table, tmp, Address::times_4, 0));
10020  psrldq(xcrc, 1); // unsigned shift one byte
10021  pxor(xcrc, xtmp);
10022}
10023
10024/**
10025 * uint32_t crc;
10026 * timesXtoThe32[crc & 0xFF] ^ (crc >> 8);
10027 */
10028void MacroAssembler::fold_8bit_crc32(Register crc, Register table, Register tmp) {
10029  movl(tmp, crc);
10030  andl(tmp, 0xFF);
10031  shrl(crc, 8);
10032  xorl(crc, Address(table, tmp, Address::times_4, 0));
10033}
10034
10035/**
10036 * @param crc   register containing existing CRC (32-bit)
10037 * @param buf   register pointing to input byte buffer (byte*)
10038 * @param len   register containing number of bytes
10039 * @param table register that will contain address of CRC table
10040 * @param tmp   scratch register
10041 */
10042void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len, Register table, Register tmp) {
10043  assert_different_registers(crc, buf, len, table, tmp, rax);
10044
10045  Label L_tail, L_tail_restore, L_tail_loop, L_exit, L_align_loop, L_aligned;
10046  Label L_fold_tail, L_fold_128b, L_fold_512b, L_fold_512b_loop, L_fold_tail_loop;
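  // Overall flow (editorial summary): update byte-by-byte until the buffer is
  // 16-byte aligned, fold 512 bits per iteration across four 128-bit lanes while
  // enough data remains, collapse to a single 128-bit lane, reduce that to
  // 32 bits via carry-less multiplies and eight 8-bit folds, then finish the
  // remaining tail bytes with the table-driven loop.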
10047
10048  // For EVEX with VL and BW, provide a standard mask; VL = 128 will guide the merge
10049  // context for the registers used, where all instructions below use 128-bit mode.
10050  // On EVEX without VL and BW, these instructions will all be AVX.
10051  if (VM_Version::supports_avx512vlbw()) {
10052    movl(tmp, 0xffff);
10053    kmovwl(k1, tmp);
10054  }
10055
10056  lea(table, ExternalAddress(StubRoutines::crc_table_addr()));
10057  notl(crc); // ~crc
10058  cmpl(len, 16);
10059  jcc(Assembler::less, L_tail);
10060
10061  // Align buffer to 16 bytes
10062  movl(tmp, buf);
10063  andl(tmp, 0xF);
10064  jccb(Assembler::zero, L_aligned);
10065  subl(tmp,  16);
10066  addl(len, tmp);
10067
10068  align(4);
10069  BIND(L_align_loop);
10070  movsbl(rax, Address(buf, 0)); // load byte with sign extension
10071  update_byte_crc32(crc, rax, table);
10072  increment(buf);
10073  incrementl(tmp);
10074  jccb(Assembler::less, L_align_loop);
10075
10076  BIND(L_aligned);
10077  movl(tmp, len); // save
10078  shrl(len, 4);
10079  jcc(Assembler::zero, L_tail_restore);
10080
10081  // Fold crc into first bytes of vector
10082  movdqa(xmm1, Address(buf, 0));
10083  movdl(rax, xmm1);
10084  xorl(crc, rax);
10085  pinsrd(xmm1, crc, 0);
10086  addptr(buf, 16);
10087  subl(len, 4); // len > 0
10088  jcc(Assembler::less, L_fold_tail);
10089
10090  movdqa(xmm2, Address(buf,  0));
10091  movdqa(xmm3, Address(buf, 16));
10092  movdqa(xmm4, Address(buf, 32));
10093  addptr(buf, 48);
10094  subl(len, 3);
10095  jcc(Assembler::lessEqual, L_fold_512b);
10096
10097  // Fold total 512 bits of polynomial on each iteration,
10098  // 128 bits per each of 4 parallel streams.
10099  movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 32));
10100
10101  align(32);
10102  BIND(L_fold_512b_loop);
10103  fold_128bit_crc32(xmm1, xmm0, xmm5, buf,  0);
10104  fold_128bit_crc32(xmm2, xmm0, xmm5, buf, 16);
10105  fold_128bit_crc32(xmm3, xmm0, xmm5, buf, 32);
10106  fold_128bit_crc32(xmm4, xmm0, xmm5, buf, 48);
10107  addptr(buf, 64);
10108  subl(len, 4);
10109  jcc(Assembler::greater, L_fold_512b_loop);
10110
10111  // Fold 512 bits to 128 bits.
10112  BIND(L_fold_512b);
10113  movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 16));
10114  fold_128bit_crc32(xmm1, xmm0, xmm5, xmm2);
10115  fold_128bit_crc32(xmm1, xmm0, xmm5, xmm3);
10116  fold_128bit_crc32(xmm1, xmm0, xmm5, xmm4);
10117
10118  // Fold the rest of 128 bits data chunks
10119  BIND(L_fold_tail);
10120  addl(len, 3);
10121  jccb(Assembler::lessEqual, L_fold_128b);
10122  movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 16));
10123
10124  BIND(L_fold_tail_loop);
10125  fold_128bit_crc32(xmm1, xmm0, xmm5, buf,  0);
10126  addptr(buf, 16);
10127  decrementl(len);
10128  jccb(Assembler::greater, L_fold_tail_loop);
10129
10130  // Fold 128 bits in xmm1 down into 32 bits in crc register.
10131  BIND(L_fold_128b);
10132  movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr()));
10133  if (UseAVX > 0) {
10134    vpclmulqdq(xmm2, xmm0, xmm1, 0x1);
10135    vpand(xmm3, xmm0, xmm2, 0 /* vector_len */);
10136    vpclmulqdq(xmm0, xmm0, xmm3, 0x1);
10137  } else {
10138    movdqa(xmm2, xmm0);
10139    pclmulqdq(xmm2, xmm1, 0x1);
10140    movdqa(xmm3, xmm0);
10141    pand(xmm3, xmm2);
10142    pclmulqdq(xmm0, xmm3, 0x1);
10143  }
10144  psrldq(xmm1, 8);
10145  psrldq(xmm2, 4);
10146  pxor(xmm0, xmm1);
10147  pxor(xmm0, xmm2);
10148
10149  // 8 8-bit folds to compute 32-bit CRC.
10150  for (int j = 0; j < 4; j++) {
10151    fold_8bit_crc32(xmm0, table, xmm1, rax);
10152  }
10153  movdl(crc, xmm0); // mov 32 bits to general register
10154  for (int j = 0; j < 4; j++) {
10155    fold_8bit_crc32(crc, table, rax);
10156  }
10157
10158  BIND(L_tail_restore);
10159  movl(len, tmp); // restore
10160  BIND(L_tail);
10161  andl(len, 0xf);
10162  jccb(Assembler::zero, L_exit);
10163
10164  // Fold the rest of bytes
10165  align(4);
10166  BIND(L_tail_loop);
10167  movsbl(rax, Address(buf, 0)); // load byte with sign extension
10168  update_byte_crc32(crc, rax, table);
10169  increment(buf);
10170  decrementl(len);
10171  jccb(Assembler::greater, L_tail_loop);
10172
10173  BIND(L_exit);
10174  notl(crc); // ~c
10175}
10176
10177#ifdef _LP64
10178// S. Gueron / Information Processing Letters 112 (2012) 184
10179// Algorithm 4: Computing carry-less multiplication using a precomputed lookup table.
10180// Input: A 32 bit value B = [byte3, byte2, byte1, byte0].
10181// Output: the 64-bit carry-less product of B * CONST
10182void MacroAssembler::crc32c_ipl_alg4(Register in, uint32_t n,
10183                                     Register tmp1, Register tmp2, Register tmp3) {
10184  lea(tmp3, ExternalAddress(StubRoutines::crc32c_table_addr()));
10185  if (n > 0) {
10186    addq(tmp3, n * 256 * 8);
10187  }
10188  //    Q1 = TABLEExt[n][B & 0xFF];
10189  movl(tmp1, in);
10190  andl(tmp1, 0x000000FF);
10191  shll(tmp1, 3);
10192  addq(tmp1, tmp3);
10193  movq(tmp1, Address(tmp1, 0));
10194
10195  //    Q2 = TABLEExt[n][B >> 8 & 0xFF];
10196  movl(tmp2, in);
10197  shrl(tmp2, 8);
10198  andl(tmp2, 0x000000FF);
10199  shll(tmp2, 3);
10200  addq(tmp2, tmp3);
10201  movq(tmp2, Address(tmp2, 0));
10202
10203  shlq(tmp2, 8);
10204  xorq(tmp1, tmp2);
10205
10206  //    Q3 = TABLEExt[n][B >> 16 & 0xFF];
10207  movl(tmp2, in);
10208  shrl(tmp2, 16);
10209  andl(tmp2, 0x000000FF);
10210  shll(tmp2, 3);
10211  addq(tmp2, tmp3);
10212  movq(tmp2, Address(tmp2, 0));
10213
10214  shlq(tmp2, 16);
10215  xorq(tmp1, tmp2);
10216
10217  //    Q4 = TABLEExt[n][B >> 24 & 0xFF];
10218  shrl(in, 24);
10219  andl(in, 0x000000FF);
10220  shll(in, 3);
10221  addq(in, tmp3);
10222  movq(in, Address(in, 0));
10223
10224  shlq(in, 24);
10225  xorq(in, tmp1);
10226  //    return Q1 ^ Q2 << 8 ^ Q3 << 16 ^ Q4 << 24;
10227}
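// Illustrative only (not part of the VM): Algorithm 4 in plain C -- a 64-bit
// carry-less product of a 32-bit value and a fixed constant assembled from four
// byte-indexed lookups, assuming table[] holds the 256 precomputed 64-bit products
// for the selected chunk n.
//
//   static uint64_t clmul32_by_table_ref(uint32_t b, const uint64_t table[256]) {
//     uint64_t q1 = table[(b >>  0) & 0xFF];
//     uint64_t q2 = table[(b >>  8) & 0xFF];
//     uint64_t q3 = table[(b >> 16) & 0xFF];
//     uint64_t q4 = table[(b >> 24) & 0xFF];
//     return q1 ^ (q2 << 8) ^ (q3 << 16) ^ (q4 << 24);
//   }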
10228
10229void MacroAssembler::crc32c_pclmulqdq(XMMRegister w_xtmp1,
10230                                      Register in_out,
10231                                      uint32_t const_or_pre_comp_const_index, bool is_pclmulqdq_supported,
10232                                      XMMRegister w_xtmp2,
10233                                      Register tmp1,
10234                                      Register n_tmp2, Register n_tmp3) {
10235  if (is_pclmulqdq_supported) {
10236    movdl(w_xtmp1, in_out); // modified blindly
10237
10238    movl(tmp1, const_or_pre_comp_const_index);
10239    movdl(w_xtmp2, tmp1);
10240    pclmulqdq(w_xtmp1, w_xtmp2, 0);
10241
10242    movdq(in_out, w_xtmp1);
10243  } else {
10244    crc32c_ipl_alg4(in_out, const_or_pre_comp_const_index, tmp1, n_tmp2, n_tmp3);
10245  }
10246}
10247
10248// Recombination Alternative 2: No bit-reflections
10249// T1 = (CRC_A * U1) << 1
10250// T2 = (CRC_B * U2) << 1
10251// C1 = T1 >> 32
10252// C2 = T2 >> 32
10253// T1 = T1 & 0xFFFFFFFF
10254// T2 = T2 & 0xFFFFFFFF
10255// T1 = CRC32(0, T1)
10256// T2 = CRC32(0, T2)
10257// C1 = C1 ^ T1
10258// C2 = C2 ^ T2
10259// CRC = C1 ^ C2 ^ CRC_C
10260void MacroAssembler::crc32c_rec_alt2(uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported, Register in_out, Register in1, Register in2,
10261                                     XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
10262                                     Register tmp1, Register tmp2,
10263                                     Register n_tmp3) {
10264  crc32c_pclmulqdq(w_xtmp1, in_out, const_or_pre_comp_const_index_u1, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3);
10265  crc32c_pclmulqdq(w_xtmp2, in1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3);
10266  shlq(in_out, 1);
10267  movl(tmp1, in_out);
10268  shrq(in_out, 32);
10269  xorl(tmp2, tmp2);
10270  crc32(tmp2, tmp1, 4);
10271  xorl(in_out, tmp2); // we don't care about upper 32 bit contents here
10272  shlq(in1, 1);
10273  movl(tmp1, in1);
10274  shrq(in1, 32);
10275  xorl(tmp2, tmp2);
10276  crc32(tmp2, tmp1, 4);
10277  xorl(in1, tmp2);
10278  xorl(in_out, in1);
10279  xorl(in_out, in2);
10280}
10281
10282// Set N to a predefined value
10283// Subtract it from the length of the buffer
10284// Execute in a loop:
10285// CRC_A = 0xFFFFFFFF, CRC_B = 0, CRC_C = 0
10286// for i = 1 to N do
10287//  CRC_A = CRC32(CRC_A, A[i])
10288//  CRC_B = CRC32(CRC_B, B[i])
10289//  CRC_C = CRC32(CRC_C, C[i])
10290// end for
10291// Recombine
10292void MacroAssembler::crc32c_proc_chunk(uint32_t size, uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported,
10293                                       Register in_out1, Register in_out2, Register in_out3,
10294                                       Register tmp1, Register tmp2, Register tmp3,
10295                                       XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
10296                                       Register tmp4, Register tmp5,
10297                                       Register n_tmp6) {
10298  Label L_processPartitions;
10299  Label L_processPartition;
10300  Label L_exit;
10301
10302  bind(L_processPartitions);
10303  cmpl(in_out1, 3 * size);
10304  jcc(Assembler::less, L_exit);
10305    xorl(tmp1, tmp1);
10306    xorl(tmp2, tmp2);
10307    movq(tmp3, in_out2);
10308    addq(tmp3, size);
10309
10310    bind(L_processPartition);
10311      crc32(in_out3, Address(in_out2, 0), 8);
10312      crc32(tmp1, Address(in_out2, size), 8);
10313      crc32(tmp2, Address(in_out2, size * 2), 8);
10314      addq(in_out2, 8);
10315      cmpq(in_out2, tmp3);
10316      jcc(Assembler::less, L_processPartition);
10317    crc32c_rec_alt2(const_or_pre_comp_const_index_u1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, in_out3, tmp1, tmp2,
10318            w_xtmp1, w_xtmp2, w_xtmp3,
10319            tmp4, tmp5,
10320            n_tmp6);
10321    addq(in_out2, 2 * size);
10322    subl(in_out1, 3 * size);
10323    jmp(L_processPartitions);
10324
10325  bind(L_exit);
10326}
10327#else
10328void MacroAssembler::crc32c_ipl_alg4(Register in_out, uint32_t n,
10329                                     Register tmp1, Register tmp2, Register tmp3,
10330                                     XMMRegister xtmp1, XMMRegister xtmp2) {
10331  lea(tmp3, ExternalAddress(StubRoutines::crc32c_table_addr()));
10332  if (n > 0) {
10333    addl(tmp3, n * 256 * 8);
10334  }
10335  //    Q1 = TABLEExt[n][B & 0xFF];
10336  movl(tmp1, in_out);
10337  andl(tmp1, 0x000000FF);
10338  shll(tmp1, 3);
10339  addl(tmp1, tmp3);
10340  movq(xtmp1, Address(tmp1, 0));
10341
10342  //    Q2 = TABLEExt[n][B >> 8 & 0xFF];
10343  movl(tmp2, in_out);
10344  shrl(tmp2, 8);
10345  andl(tmp2, 0x000000FF);
10346  shll(tmp2, 3);
10347  addl(tmp2, tmp3);
10348  movq(xtmp2, Address(tmp2, 0));
10349
10350  psllq(xtmp2, 8);
10351  pxor(xtmp1, xtmp2);
10352
10353  //    Q3 = TABLEExt[n][B >> 16 & 0xFF];
10354  movl(tmp2, in_out);
10355  shrl(tmp2, 16);
10356  andl(tmp2, 0x000000FF);
10357  shll(tmp2, 3);
10358  addl(tmp2, tmp3);
10359  movq(xtmp2, Address(tmp2, 0));
10360
10361  psllq(xtmp2, 16);
10362  pxor(xtmp1, xtmp2);
10363
10364  //    Q4 = TABLEExt[n][B >> 24 & 0xFF];
10365  shrl(in_out, 24);
10366  andl(in_out, 0x000000FF);
10367  shll(in_out, 3);
10368  addl(in_out, tmp3);
10369  movq(xtmp2, Address(in_out, 0));
10370
10371  psllq(xtmp2, 24);
10372  pxor(xtmp1, xtmp2); // Result in CXMM
10373  //    return Q1 ^ Q2 << 8 ^ Q3 << 16 ^ Q4 << 24;
10374}
10375
10376void MacroAssembler::crc32c_pclmulqdq(XMMRegister w_xtmp1,
10377                                      Register in_out,
10378                                      uint32_t const_or_pre_comp_const_index, bool is_pclmulqdq_supported,
10379                                      XMMRegister w_xtmp2,
10380                                      Register tmp1,
10381                                      Register n_tmp2, Register n_tmp3) {
10382  if (is_pclmulqdq_supported) {
10383    movdl(w_xtmp1, in_out);
10384
10385    movl(tmp1, const_or_pre_comp_const_index);
10386    movdl(w_xtmp2, tmp1);
10387    pclmulqdq(w_xtmp1, w_xtmp2, 0);
10388    // Keep result in XMM since GPR is 32 bit in length
10389  } else {
10390    crc32c_ipl_alg4(in_out, const_or_pre_comp_const_index, tmp1, n_tmp2, n_tmp3, w_xtmp1, w_xtmp2);
10391  }
10392}
10393
10394void MacroAssembler::crc32c_rec_alt2(uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported, Register in_out, Register in1, Register in2,
10395                                     XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
10396                                     Register tmp1, Register tmp2,
10397                                     Register n_tmp3) {
10398  crc32c_pclmulqdq(w_xtmp1, in_out, const_or_pre_comp_const_index_u1, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3);
10399  crc32c_pclmulqdq(w_xtmp2, in1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3);
10400
10401  psllq(w_xtmp1, 1);
10402  movdl(tmp1, w_xtmp1);
10403  psrlq(w_xtmp1, 32);
10404  movdl(in_out, w_xtmp1);
10405
10406  xorl(tmp2, tmp2);
10407  crc32(tmp2, tmp1, 4);
10408  xorl(in_out, tmp2);
10409
10410  psllq(w_xtmp2, 1);
10411  movdl(tmp1, w_xtmp2);
10412  psrlq(w_xtmp2, 32);
10413  movdl(in1, w_xtmp2);
10414
10415  xorl(tmp2, tmp2);
10416  crc32(tmp2, tmp1, 4);
10417  xorl(in1, tmp2);
10418  xorl(in_out, in1);
10419  xorl(in_out, in2);
10420}
10421
10422void MacroAssembler::crc32c_proc_chunk(uint32_t size, uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported,
10423                                       Register in_out1, Register in_out2, Register in_out3,
10424                                       Register tmp1, Register tmp2, Register tmp3,
10425                                       XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
10426                                       Register tmp4, Register tmp5,
10427                                       Register n_tmp6) {
10428  Label L_processPartitions;
10429  Label L_processPartition;
10430  Label L_exit;
10431
10432  bind(L_processPartitions);
10433  cmpl(in_out1, 3 * size);
10434  jcc(Assembler::less, L_exit);
10435    xorl(tmp1, tmp1);
10436    xorl(tmp2, tmp2);
10437    movl(tmp3, in_out2);
10438    addl(tmp3, size);
10439
10440    bind(L_processPartition);
10441      crc32(in_out3, Address(in_out2, 0), 4);
10442      crc32(tmp1, Address(in_out2, size), 4);
10443      crc32(tmp2, Address(in_out2, size*2), 4);
10444      crc32(in_out3, Address(in_out2, 0+4), 4);
10445      crc32(tmp1, Address(in_out2, size+4), 4);
10446      crc32(tmp2, Address(in_out2, size*2+4), 4);
10447      addl(in_out2, 8);
10448      cmpl(in_out2, tmp3);
10449      jcc(Assembler::less, L_processPartition);
10450
10451        push(tmp3);
10452        push(in_out1);
10453        push(in_out2);
10454        tmp4 = tmp3;
10455        tmp5 = in_out1;
10456        n_tmp6 = in_out2;
10457
10458      crc32c_rec_alt2(const_or_pre_comp_const_index_u1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, in_out3, tmp1, tmp2,
10459            w_xtmp1, w_xtmp2, w_xtmp3,
10460            tmp4, tmp5,
10461            n_tmp6);
10462
10463        pop(in_out2);
10464        pop(in_out1);
10465        pop(tmp3);
10466
10467    addl(in_out2, 2 * size);
10468    subl(in_out1, 3 * size);
10469    jmp(L_processPartitions);
10470
10471  bind(L_exit);
10472}
10473#endif //LP64
10474
10475#ifdef _LP64
10476// Algorithm 2: Pipelined usage of the CRC32 instruction.
10477// Input: A buffer I of L bytes.
10478// Output: the CRC32C value of the buffer.
10479// Notations:
10480// Write L = 24N + r, with N = floor (L/24).
10481// r = L mod 24 (0 <= r < 24).
10482// Consider I as the concatenation A|B|C|R, where A, B and C each consist of
10483// N quadwords, and R consists of r bytes.
10484// A[j] = I [8j+7:8j], j= 0, 1, ..., N-1
10485// B[j] = I [8N + 8j+7:8N + 8j], j= 0, 1, ..., N-1
10486// C[j] = I [16N + 8j+7:16N + 8j], j= 0, 1, ..., N-1
10487// if r > 0, R[j] = I [24N + j], j= 0, 1, ..., r-1
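// Editorial note: the three streams A, B and C are CRC'd independently inside one
// loop, which gives the hardware three crc32q dependency chains to pipeline; the
// partial CRCs are then recombined with crc32c_rec_alt2, and the trailing r bytes
// are finished word-by-word and byte-by-byte below.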
10488void MacroAssembler::crc32c_ipl_alg2_alt2(Register in_out, Register in1, Register in2,
10489                                          Register tmp1, Register tmp2, Register tmp3,
10490                                          Register tmp4, Register tmp5, Register tmp6,
10491                                          XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
10492                                          bool is_pclmulqdq_supported) {
10493  uint32_t const_or_pre_comp_const_index[CRC32C_NUM_PRECOMPUTED_CONSTANTS];
10494  Label L_wordByWord;
10495  Label L_byteByByteProlog;
10496  Label L_byteByByte;
10497  Label L_exit;
10498
10499  if (is_pclmulqdq_supported ) {
10500    const_or_pre_comp_const_index[1] = *(uint32_t *)StubRoutines::_crc32c_table_addr;
10501    const_or_pre_comp_const_index[0] = *((uint32_t *)StubRoutines::_crc32c_table_addr+1);
10502
10503    const_or_pre_comp_const_index[3] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 2);
10504    const_or_pre_comp_const_index[2] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 3);
10505
10506    const_or_pre_comp_const_index[5] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 4);
10507    const_or_pre_comp_const_index[4] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 5);
10508    assert((CRC32C_NUM_PRECOMPUTED_CONSTANTS - 1 ) == 5, "Checking whether you declared all of the constants based on the number of \"chunks\"");
10509  } else {
10510    const_or_pre_comp_const_index[0] = 1;
10511    const_or_pre_comp_const_index[1] = 0;
10512
10513    const_or_pre_comp_const_index[2] = 3;
10514    const_or_pre_comp_const_index[3] = 2;
10515
10516    const_or_pre_comp_const_index[4] = 5;
10517    const_or_pre_comp_const_index[5] = 4;
10518   }
10519  crc32c_proc_chunk(CRC32C_HIGH, const_or_pre_comp_const_index[0], const_or_pre_comp_const_index[1], is_pclmulqdq_supported,
10520                    in2, in1, in_out,
10521                    tmp1, tmp2, tmp3,
10522                    w_xtmp1, w_xtmp2, w_xtmp3,
10523                    tmp4, tmp5,
10524                    tmp6);
10525  crc32c_proc_chunk(CRC32C_MIDDLE, const_or_pre_comp_const_index[2], const_or_pre_comp_const_index[3], is_pclmulqdq_supported,
10526                    in2, in1, in_out,
10527                    tmp1, tmp2, tmp3,
10528                    w_xtmp1, w_xtmp2, w_xtmp3,
10529                    tmp4, tmp5,
10530                    tmp6);
10531  crc32c_proc_chunk(CRC32C_LOW, const_or_pre_comp_const_index[4], const_or_pre_comp_const_index[5], is_pclmulqdq_supported,
10532                    in2, in1, in_out,
10533                    tmp1, tmp2, tmp3,
10534                    w_xtmp1, w_xtmp2, w_xtmp3,
10535                    tmp4, tmp5,
10536                    tmp6);
10537  movl(tmp1, in2);
10538  andl(tmp1, 0x00000007);
10539  negl(tmp1);
10540  addl(tmp1, in2);
10541  addq(tmp1, in1);
10542
10543  BIND(L_wordByWord);
10544  cmpq(in1, tmp1);
10545  jcc(Assembler::greaterEqual, L_byteByByteProlog);
10546    crc32(in_out, Address(in1, 0), 4);
10547    addq(in1, 4);
10548    jmp(L_wordByWord);
10549
10550  BIND(L_byteByByteProlog);
10551  andl(in2, 0x00000007);
10552  movl(tmp2, 1);
10553
10554  BIND(L_byteByByte);
10555  cmpl(tmp2, in2);
10556  jccb(Assembler::greater, L_exit);
10557    crc32(in_out, Address(in1, 0), 1);
10558    incq(in1);
10559    incl(tmp2);
10560    jmp(L_byteByByte);
10561
10562  BIND(L_exit);
10563}
10564#else
10565void MacroAssembler::crc32c_ipl_alg2_alt2(Register in_out, Register in1, Register in2,
10566                                          Register tmp1, Register  tmp2, Register tmp3,
10567                                          Register tmp4, Register  tmp5, Register tmp6,
10568                                          XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
10569                                          bool is_pclmulqdq_supported) {
10570  uint32_t const_or_pre_comp_const_index[CRC32C_NUM_PRECOMPUTED_CONSTANTS];
10571  Label L_wordByWord;
10572  Label L_byteByByteProlog;
10573  Label L_byteByByte;
10574  Label L_exit;
10575
10576  if (is_pclmulqdq_supported) {
10577    const_or_pre_comp_const_index[1] = *(uint32_t *)StubRoutines::_crc32c_table_addr;
10578    const_or_pre_comp_const_index[0] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 1);
10579
10580    const_or_pre_comp_const_index[3] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 2);
10581    const_or_pre_comp_const_index[2] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 3);
10582
10583    const_or_pre_comp_const_index[5] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 4);
10584    const_or_pre_comp_const_index[4] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 5);
10585  } else {
10586    const_or_pre_comp_const_index[0] = 1;
10587    const_or_pre_comp_const_index[1] = 0;
10588
10589    const_or_pre_comp_const_index[2] = 3;
10590    const_or_pre_comp_const_index[3] = 2;
10591
10592    const_or_pre_comp_const_index[4] = 5;
10593    const_or_pre_comp_const_index[5] = 4;
10594  }
10595  crc32c_proc_chunk(CRC32C_HIGH, const_or_pre_comp_const_index[0], const_or_pre_comp_const_index[1], is_pclmulqdq_supported,
10596                    in2, in1, in_out,
10597                    tmp1, tmp2, tmp3,
10598                    w_xtmp1, w_xtmp2, w_xtmp3,
10599                    tmp4, tmp5,
10600                    tmp6);
10601  crc32c_proc_chunk(CRC32C_MIDDLE, const_or_pre_comp_const_index[2], const_or_pre_comp_const_index[3], is_pclmulqdq_supported,
10602                    in2, in1, in_out,
10603                    tmp1, tmp2, tmp3,
10604                    w_xtmp1, w_xtmp2, w_xtmp3,
10605                    tmp4, tmp5,
10606                    tmp6);
10607  crc32c_proc_chunk(CRC32C_LOW, const_or_pre_comp_const_index[4], const_or_pre_comp_const_index[5], is_pclmulqdq_supported,
10608                    in2, in1, in_out,
10609                    tmp1, tmp2, tmp3,
10610                    w_xtmp1, w_xtmp2, w_xtmp3,
10611                    tmp4, tmp5,
10612                    tmp6);
10613  movl(tmp1, in2);
10614  andl(tmp1, 0x00000007);
10615  negl(tmp1);
10616  addl(tmp1, in2);
10617  addl(tmp1, in1);
10618
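  // Same tail handling as the LP64 version: fold the 8-byte-aligned
  // remainder 4 bytes at a time, then the last (in2 & 7) bytes one at a
  // time, using 32-bit address arithmetic.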
10619  BIND(L_wordByWord);
10620  cmpl(in1, tmp1);
10621  jcc(Assembler::greaterEqual, L_byteByByteProlog);
10622    crc32(in_out, Address(in1, 0), 4);
10623    addl(in1, 4);
10624    jmp(L_wordByWord);
10625
10626  BIND(L_byteByByteProlog);
10627  andl(in2, 0x00000007);
10628  movl(tmp2, 1);
10629
10630  BIND(L_byteByByte);
10631  cmpl(tmp2, in2);
10632  jccb(Assembler::greater, L_exit);
10633    movb(tmp1, Address(in1, 0));
10634    crc32(in_out, tmp1, 1);
10635    incl(in1);
10636    incl(tmp2);
10637    jmp(L_byteByByte);
10638
10639  BIND(L_exit);
10640}
10641#endif // LP64
10642#undef BIND
10643#undef BLOCK_COMMENT
10644
10645
10646// Compress char[] array to byte[].
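// On success every char fits in one byte: len bytes are written to dst and
// result is set to the original length.  If a char >= 0x100 is encountered
// the routine bails out and result is 0 (dst may have been partially
// written).  The length is pushed at entry and popped (or discarded) on the
// two exit paths.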
10647void MacroAssembler::char_array_compress(Register src, Register dst, Register len,
10648                                         XMMRegister tmp1Reg, XMMRegister tmp2Reg,
10649                                         XMMRegister tmp3Reg, XMMRegister tmp4Reg,
10650                                         Register tmp5, Register result) {
10651  Label copy_chars_loop, return_length, return_zero, done;
10652
10653  // rsi: src
10654  // rdi: dst
10655  // rdx: len
10656  // rcx: tmp5
10657  // rax: result
10658
10659  // rsi holds start addr of source char[] to be compressed
10660  // rdi holds start addr of destination byte[]
10661  // rdx holds length
10662
10663  assert(len != result, "");
10664
10665  // save length for return
10666  push(len);
10667
10668  if (UseSSE42Intrinsics) {
10669    assert(UseSSE >= 4, "SSE4 must be enabled for SSE4.2 intrinsics to be available");
10670    Label copy_32_loop, copy_16, copy_tail;
10671
10672    movl(result, len);
10673    movl(tmp5, 0xff00ff00);   // create mask to test for chars that do not fit in one byte (>= 0x100)
10674
10675    // vectored compression
10676    andl(len, 0xfffffff0);    // vector count (in chars)
10677    andl(result, 0x0000000f);    // tail count (in chars)
10678    testl(len, len);
10679    jccb(Assembler::zero, copy_16);
10680
10681    // compress 16 chars per iter
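    // Each iteration reads 32 bytes (two xmm loads of 8 chars each) and ORs
    // them into tmp4Reg, which accumulates across iterations, so the ptest
    // against the 0xFF00-per-char mask flags any char that does not fit in a
    // byte; packuswb then narrows the 16 words to 16 bytes.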
10682    movdl(tmp1Reg, tmp5);
10683    pshufd(tmp1Reg, tmp1Reg, 0);   // broadcast the 0xFF00-per-char mask to all lanes of tmp1Reg
10684    pxor(tmp4Reg, tmp4Reg);
10685
10686    lea(src, Address(src, len, Address::times_2));
10687    lea(dst, Address(dst, len, Address::times_1));
10688    negptr(len);
10689
10690    bind(copy_32_loop);
10691    movdqu(tmp2Reg, Address(src, len, Address::times_2));     // load 1st 8 characters
10692    por(tmp4Reg, tmp2Reg);
10693    movdqu(tmp3Reg, Address(src, len, Address::times_2, 16)); // load next 8 characters
10694    por(tmp4Reg, tmp3Reg);
10695    ptest(tmp4Reg, tmp1Reg);       // any char >= 0x100 seen so far?
10696    jcc(Assembler::notZero, return_zero);
10697    packuswb(tmp2Reg, tmp3Reg);    // all chars fit in one byte; compress each to 1 byte
10698    movdqu(Address(dst, len, Address::times_1), tmp2Reg);
10699    addptr(len, 16);
10700    jcc(Assembler::notZero, copy_32_loop);
10701
10702    // compress next vector of 8 chars (if any)
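    // result still holds the 0-15 char tail count from the 16-char pass;
    // peel off one more vector of 8 chars (16 input bytes) with the same
    // mask test, then fall through to the scalar loop for the final 0-7
    // chars.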
10703    bind(copy_16);
10704    movl(len, result);
10705    andl(len, 0xfffffff8);    // vector count (in chars)
10706    andl(result, 0x00000007);    // tail count (in chars)
10707    testl(len, len);
10708    jccb(Assembler::zero, copy_tail);
10709
10710    movdl(tmp1Reg, tmp5);
10711    pshufd(tmp1Reg, tmp1Reg, 0);   // broadcast the 0xFF00-per-char mask to all lanes of tmp1Reg
10712    pxor(tmp3Reg, tmp3Reg);
10713
10714    movdqu(tmp2Reg, Address(src, 0));
10715    ptest(tmp2Reg, tmp1Reg);       // any char >= 0x100 in this vector?
10716    jccb(Assembler::notZero, return_zero);
10717    packuswb(tmp2Reg, tmp3Reg);    // all chars fit in one byte (Latin-1); compress each to 1 byte
10718    movq(Address(dst, 0), tmp2Reg);
10719    addptr(src, 16);
10720    addptr(dst, 8);
10721
10722    bind(copy_tail);
10723    movl(len, result);
10724  }
10725  // compress 1 char per iter
10726  testl(len, len);
10727  jccb(Assembler::zero, return_length);
10728  lea(src, Address(src, len, Address::times_2));
10729  lea(dst, Address(dst, len, Address::times_1));
10730  negptr(len);
10731
10732  bind(copy_chars_loop);
10733  load_unsigned_short(result, Address(src, len, Address::times_2));
10734  testl(result, 0xff00);      // does the char fit in one byte?
10735  jccb(Assembler::notZero, return_zero);
10736  movb(Address(dst, len, Address::times_1), result);  // char fits in one byte; compress to 1 byte
10737  increment(len);
10738  jcc(Assembler::notZero, copy_chars_loop);
10739
10740  // if compression succeeded, return length
10741  bind(return_length);
10742  pop(result);
10743  jmpb(done);
10744
10745  // if compression failed, return 0
10746  bind(return_zero);
10747  xorl(result, result);
10748  addptr(rsp, wordSize);    // discard the length pushed at function entry
10749
10750  bind(done);
10751}
10752
10753// Inflate byte[] array to char[].
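// Each source byte is zero-extended to a 16-bit char.  With SSE4.2
// intrinsics enabled, pmovzxbw inflates 8 bytes per iteration, a 4-byte step
// handles part of the tail, and the last 0-3 chars go through the scalar
// loop.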
10754void MacroAssembler::byte_array_inflate(Register src, Register dst, Register len,
10755                                        XMMRegister tmp1, Register tmp2) {
10756  Label copy_chars_loop, done;
10757
10758  // rsi: src
10759  // rdi: dst
10760  // rdx: len
10761  // rcx: tmp2
10762
10763  // rsi holds start addr of source byte[] to be inflated
10764  // rdi holds start addr of destination char[]
10765  // rdx holds length
10766  assert_different_registers(src, dst, len, tmp2);
10767
10768  if (UseSSE42Intrinsics) {
10769    assert(UseSSE >= 4, "SSE4 must be enabled for SSE4.2 intrinsics to be available");
10770    Label copy_8_loop, copy_bytes, copy_tail;
10771
10772    movl(tmp2, len);
10773    andl(tmp2, 0x00000007);   // tail count (in chars)
10774    andl(len, 0xfffffff8);    // vector count (in chars)
10775    jccb(Assembler::zero, copy_tail);
10776
10777    // vectored inflation
10778    lea(src, Address(src, len, Address::times_1));
10779    lea(dst, Address(dst, len, Address::times_2));
10780    negptr(len);
10781
10782    // inflate 8 chars per iter
10783    bind(copy_8_loop);
10784    pmovzxbw(tmp1, Address(src, len, Address::times_1));  // unpack to 8 words
10785    movdqu(Address(dst, len, Address::times_2), tmp1);
10786    addptr(len, 8);
10787    jcc(Assembler::notZero, copy_8_loop);
10788
10789    bind(copy_tail);
10790    movl(len, tmp2);
10791
10792    cmpl(len, 4);
10793    jccb(Assembler::less, copy_bytes);
10794
10795    movdl(tmp1, Address(src, 0));  // load 4 byte chars
10796    pmovzxbw(tmp1, tmp1);
10797    movq(Address(dst, 0), tmp1);
10798    subptr(len, 4);
10799    addptr(src, 4);
10800    addptr(dst, 8);
10801
10802    bind(copy_bytes);
10803  }
10804  testl(len, len);
10805  jccb(Assembler::zero, done);
10806  lea(src, Address(src, len, Address::times_1));
10807  lea(dst, Address(dst, len, Address::times_2));
10808  negptr(len);
10809
10810  // inflate 1 char per iter
10811  bind(copy_chars_loop);
10812  load_unsigned_byte(tmp2, Address(src, len, Address::times_1));  // load byte char
10813  movw(Address(dst, len, Address::times_2), tmp2);  // inflate byte char to word
10814  increment(len);
10815  jcc(Assembler::notZero, copy_chars_loop);
10816
10817  bind(done);
10818}
10819
10820
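// Map a condition code to its logical complement (e.g. zero <-> notZero,
// less <-> greaterEqual), e.g. for inverting the sense of a branch.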
10821Assembler::Condition MacroAssembler::negate_condition(Assembler::Condition cond) {
10822  switch (cond) {
10823    // Note some conditions are synonyms for others
10824    case Assembler::zero:         return Assembler::notZero;
10825    case Assembler::notZero:      return Assembler::zero;
10826    case Assembler::less:         return Assembler::greaterEqual;
10827    case Assembler::lessEqual:    return Assembler::greater;
10828    case Assembler::greater:      return Assembler::lessEqual;
10829    case Assembler::greaterEqual: return Assembler::less;
10830    case Assembler::below:        return Assembler::aboveEqual;
10831    case Assembler::belowEqual:   return Assembler::above;
10832    case Assembler::above:        return Assembler::belowEqual;
10833    case Assembler::aboveEqual:   return Assembler::below;
10834    case Assembler::overflow:     return Assembler::noOverflow;
10835    case Assembler::noOverflow:   return Assembler::overflow;
10836    case Assembler::negative:     return Assembler::positive;
10837    case Assembler::positive:     return Assembler::negative;
10838    case Assembler::parity:       return Assembler::noParity;
10839    case Assembler::noParity:     return Assembler::parity;
10840  }
10841  ShouldNotReachHere(); return Assembler::overflow;
10842}
10843
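// SkipIfEqual is a scoped helper: the constructor emits a cmp8/jcc that
// jumps over the code generated while the object is live whenever the byte
// at flag_addr equals 'value'; the destructor binds the jump target.
// Illustrative use (flag name hypothetical):
//   { SkipIfEqual skip(masm, &SomeBoolFlag, false);
//     // ... instructions emitted here run only when SomeBoolFlag != false
//   }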
10844SkipIfEqual::SkipIfEqual(
10845    MacroAssembler* masm, const bool* flag_addr, bool value) {
10846  _masm = masm;
10847  _masm->cmp8(ExternalAddress((address)flag_addr), value);
10848  _masm->jcc(Assembler::equal, _label);
10849}
10850
10851SkipIfEqual::~SkipIfEqual() {
10852  _masm->bind(_label);
10853}
10854
10855// 32-bit Windows has its own fast-path implementation
10856// of get_thread
10857#if !defined(WIN32) || defined(_LP64)
10858
10859// This is simply a call to Thread::current()
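// The C call may clobber the caller-saved integer registers, so they are
// all preserved around call_VM_leaf_base (rax only if it is not the
// destination); the Thread* comes back in rax and is moved into 'thread'
// if necessary.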
10860void MacroAssembler::get_thread(Register thread) {
10861  if (thread != rax) {
10862    push(rax);
10863  }
10864  LP64_ONLY(push(rdi);)
10865  LP64_ONLY(push(rsi);)
10866  push(rdx);
10867  push(rcx);
10868#ifdef _LP64
10869  push(r8);
10870  push(r9);
10871  push(r10);
10872  push(r11);
10873#endif
10874
10875  MacroAssembler::call_VM_leaf_base(CAST_FROM_FN_PTR(address, Thread::current), 0);
10876
10877#ifdef _LP64
10878  pop(r11);
10879  pop(r10);
10880  pop(r9);
10881  pop(r8);
10882#endif
10883  pop(rcx);
10884  pop(rdx);
10885  LP64_ONLY(pop(rsi);)
10886  LP64_ONLY(pop(rdi);)
10887  if (thread != rax) {
10888    mov(thread, rax);
10889    pop(rax);
10890  }
10891}
10892
10893#endif
10894