/*
 * Copyright (c) 1997, 2015, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "compiler/disassembler.hpp"
#include "gc/shared/cardTableModRefBS.hpp"
#include "gc/shared/collectedHeap.inline.hpp"
#include "interpreter/interpreter.hpp"
#include "memory/resourceArea.hpp"
#include "memory/universe.hpp"
#include "oops/klass.inline.hpp"
#include "prims/methodHandles.hpp"
#include "runtime/biasedLocking.hpp"
#include "runtime/interfaceSupport.hpp"
#include "runtime/objectMonitor.hpp"
#include "runtime/os.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/stubRoutines.hpp"
#include "runtime/thread.hpp"
#include "utilities/macros.hpp"
#if INCLUDE_ALL_GCS
#include "gc/g1/g1CollectedHeap.inline.hpp"
#include "gc/g1/g1SATBCardTableModRefBS.hpp"
#include "gc/g1/heapRegion.hpp"
#endif // INCLUDE_ALL_GCS
#include "crc32c.h"
#ifdef COMPILER2
#include "opto/intrinsicnode.hpp"
#endif

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#define STOP(error) stop(error)
#else
#define BLOCK_COMMENT(str) block_comment(str)
#define STOP(error) block_comment(error); stop(error)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

#ifdef ASSERT
bool AbstractAssembler::pd_check_instruction_mark() { return true; }
#endif

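// Table of negated condition codes, indexed by Assembler::Condition value
// (the encodings are noted in the comments below); used to flip a branch
// condition to its opposite.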
static Assembler::Condition reverse[] = {
    Assembler::noOverflow     /* overflow      = 0x0 */ ,
    Assembler::overflow       /* noOverflow    = 0x1 */ ,
    Assembler::aboveEqual     /* carrySet      = 0x2, below         = 0x2 */ ,
    Assembler::below          /* aboveEqual    = 0x3, carryClear    = 0x3 */ ,
    Assembler::notZero        /* zero          = 0x4, equal         = 0x4 */ ,
    Assembler::zero           /* notZero       = 0x5, notEqual      = 0x5 */ ,
    Assembler::above          /* belowEqual    = 0x6 */ ,
    Assembler::belowEqual     /* above         = 0x7 */ ,
    Assembler::positive       /* negative      = 0x8 */ ,
    Assembler::negative       /* positive      = 0x9 */ ,
    Assembler::noParity       /* parity        = 0xa */ ,
    Assembler::parity         /* noParity      = 0xb */ ,
    Assembler::greaterEqual   /* less          = 0xc */ ,
    Assembler::less           /* greaterEqual  = 0xd */ ,
    Assembler::greater        /* lessEqual     = 0xe */ ,
    Assembler::lessEqual      /* greater       = 0xf, */

};


// Implementation of MacroAssembler

// First, all the versions that differ between 32-bit and 64-bit,
// unless the difference is trivial (a line or so).

#ifndef _LP64

// 32bit versions

Address MacroAssembler::as_Address(AddressLiteral adr) {
  return Address(adr.target(), adr.rspec());
}

Address MacroAssembler::as_Address(ArrayAddress adr) {
  return Address::make_array(adr);
}

void MacroAssembler::call_VM_leaf_base(address entry_point,
                                       int number_of_arguments) {
  call(RuntimeAddress(entry_point));
  increment(rsp, number_of_arguments * wordSize);
}

void MacroAssembler::cmpklass(Address src1, Metadata* obj) {
  cmp_literal32(src1, (int32_t)obj, metadata_Relocation::spec_for_immediate());
}

void MacroAssembler::cmpklass(Register src1, Metadata* obj) {
  cmp_literal32(src1, (int32_t)obj, metadata_Relocation::spec_for_immediate());
}

void MacroAssembler::cmpoop(Address src1, jobject obj) {
  cmp_literal32(src1, (int32_t)obj, oop_Relocation::spec_for_immediate());
}

void MacroAssembler::cmpoop(Register src1, jobject obj) {
  cmp_literal32(src1, (int32_t)obj, oop_Relocation::spec_for_immediate());
}

void MacroAssembler::extend_sign(Register hi, Register lo) {
  // According to Intel Doc. AP-526, "Integer Divide", p.18.
  if (VM_Version::is_P6() && hi == rdx && lo == rax) {
    cdql();
  } else {
    movl(hi, lo);
    sarl(hi, 31);
  }
}

void MacroAssembler::jC2(Register tmp, Label& L) {
  // set parity bit if FPU flag C2 is set (via rax)
  save_rax(tmp);
  fwait(); fnstsw_ax();
  sahf();
  restore_rax(tmp);
  // branch
  jcc(Assembler::parity, L);
}

void MacroAssembler::jnC2(Register tmp, Label& L) {
  // set parity bit if FPU flag C2 is set (via rax)
  save_rax(tmp);
  fwait(); fnstsw_ax();
  sahf();
  restore_rax(tmp);
  // branch
  jcc(Assembler::noParity, L);
}

// 32bit can do a case table jump in one instruction but we no longer allow the base
// to be installed in the Address class
void MacroAssembler::jump(ArrayAddress entry) {
  jmp(as_Address(entry));
}

// Note: y_lo will be destroyed
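// Sets x_hi to -1, 0 or +1 according to the signed comparison of the
// 64-bit value x_hi:x_lo with y_hi:y_lo.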
void MacroAssembler::lcmp2int(Register x_hi, Register x_lo, Register y_hi, Register y_lo) {
  // Long compare for Java (semantics as described in JVM spec.)
  Label high, low, done;

  cmpl(x_hi, y_hi);
  jcc(Assembler::less, low);
  jcc(Assembler::greater, high);
  // x_hi is the return register
  xorl(x_hi, x_hi);
  cmpl(x_lo, y_lo);
  jcc(Assembler::below, low);
  jcc(Assembler::equal, done);

  bind(high);
  xorl(x_hi, x_hi);
  increment(x_hi);
  jmp(done);

  bind(low);
  xorl(x_hi, x_hi);
  decrementl(x_hi);

  bind(done);
}

void MacroAssembler::lea(Register dst, AddressLiteral src) {
    mov_literal32(dst, (int32_t)src.target(), src.rspec());
}

void MacroAssembler::lea(Address dst, AddressLiteral adr) {
  // leal(dst, as_Address(adr));
  // see note in movl as to why we must use a move
  mov_literal32(dst, (int32_t) adr.target(), adr.rspec());
}

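// Standard frame epilogue: restore rsp from rbp, then pop the caller's rbp.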
void MacroAssembler::leave() {
  mov(rsp, rbp);
  pop(rbp);
}

void MacroAssembler::lmul(int x_rsp_offset, int y_rsp_offset) {
  // Multiplication of two Java long values stored on the stack
  // as illustrated below. Result is in rdx:rax.
  //
  // rsp ---> [  ??  ] \               \
  //            ....    | y_rsp_offset  |
  //          [ y_lo ] /  (in bytes)    | x_rsp_offset
  //          [ y_hi ]                  | (in bytes)
  //            ....                    |
  //          [ x_lo ]                 /
  //          [ x_hi ]
  //            ....
  //
  // Basic idea: lo(result) = lo(x_lo * y_lo)
  //             hi(result) = hi(x_lo * y_lo) + lo(x_hi * y_lo) + lo(x_lo * y_hi)
  Address x_hi(rsp, x_rsp_offset + wordSize); Address x_lo(rsp, x_rsp_offset);
  Address y_hi(rsp, y_rsp_offset + wordSize); Address y_lo(rsp, y_rsp_offset);
  Label quick;
  // load x_hi, y_hi and check if quick
  // multiplication is possible
  movl(rbx, x_hi);
  movl(rcx, y_hi);
  movl(rax, rbx);
  orl(rbx, rcx);                                 // rbx, = 0 <=> x_hi = 0 and y_hi = 0
  jcc(Assembler::zero, quick);                   // if rbx, = 0 do quick multiply
  // do full multiplication
  // 1st step
  mull(y_lo);                                    // x_hi * y_lo
  movl(rbx, rax);                                // save lo(x_hi * y_lo) in rbx,
  // 2nd step
  movl(rax, x_lo);
  mull(rcx);                                     // x_lo * y_hi
  addl(rbx, rax);                                // add lo(x_lo * y_hi) to rbx,
  // 3rd step
  bind(quick);                                   // note: rbx, = 0 if quick multiply!
  movl(rax, x_lo);
  mull(y_lo);                                    // x_lo * y_lo
  addl(rdx, rbx);                                // correct hi(x_lo * y_lo)
}

void MacroAssembler::lneg(Register hi, Register lo) {
  negl(lo);
  adcl(hi, 0);
  negl(hi);
}

void MacroAssembler::lshl(Register hi, Register lo) {
  // Java shift left long support (semantics as described in JVM spec., p.305)
  // (basic idea for shift counts s >= n: x << s == (x << n) << (s - n))
  // shift value is in rcx !
  assert(hi != rcx, "must not use rcx");
  assert(lo != rcx, "must not use rcx");
  const Register s = rcx;                        // shift count
  const int      n = BitsPerWord;
  Label L;
  andl(s, 0x3f);                                 // s := s & 0x3f (s < 0x40)
  cmpl(s, n);                                    // if (s < n)
  jcc(Assembler::less, L);                       // else (s >= n)
  movl(hi, lo);                                  // x := x << n
  xorl(lo, lo);
  // Note: subl(s, n) is not needed since the Intel shift instructions work rcx mod n!
  bind(L);                                       // s (mod n) < n
  shldl(hi, lo);                                 // x := x << s
  shll(lo);
}


void MacroAssembler::lshr(Register hi, Register lo, bool sign_extension) {
  // Java shift right long support (semantics as described in JVM spec., p.306 & p.310)
  // (basic idea for shift counts s >= n: x >> s == (x >> n) >> (s - n))
  assert(hi != rcx, "must not use rcx");
  assert(lo != rcx, "must not use rcx");
  const Register s = rcx;                        // shift count
  const int      n = BitsPerWord;
  Label L;
  andl(s, 0x3f);                                 // s := s & 0x3f (s < 0x40)
  cmpl(s, n);                                    // if (s < n)
  jcc(Assembler::less, L);                       // else (s >= n)
  movl(lo, hi);                                  // x := x >> n
  if (sign_extension) sarl(hi, 31);
  else                xorl(hi, hi);
  // Note: subl(s, n) is not needed since the Intel shift instructions work rcx mod n!
  bind(L);                                       // s (mod n) < n
  shrdl(lo, hi);                                 // x := x >> s
  if (sign_extension) sarl(hi);
  else                shrl(hi);
}

void MacroAssembler::movoop(Register dst, jobject obj) {
  mov_literal32(dst, (int32_t)obj, oop_Relocation::spec_for_immediate());
}

void MacroAssembler::movoop(Address dst, jobject obj) {
  mov_literal32(dst, (int32_t)obj, oop_Relocation::spec_for_immediate());
}

void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
  mov_literal32(dst, (int32_t)obj, metadata_Relocation::spec_for_immediate());
}

void MacroAssembler::mov_metadata(Address dst, Metadata* obj) {
  mov_literal32(dst, (int32_t)obj, metadata_Relocation::spec_for_immediate());
}

void MacroAssembler::movptr(Register dst, AddressLiteral src, Register scratch) {
  // scratch register is not used,
  // it is defined to match parameters of 64-bit version of this method.
  if (src.is_lval()) {
    mov_literal32(dst, (intptr_t)src.target(), src.rspec());
  } else {
    movl(dst, as_Address(src));
  }
}

void MacroAssembler::movptr(ArrayAddress dst, Register src) {
  movl(as_Address(dst), src);
}

void MacroAssembler::movptr(Register dst, ArrayAddress src) {
  movl(dst, as_Address(src));
}

// src should NEVER be a real pointer. Use AddressLiteral for true pointers
void MacroAssembler::movptr(Address dst, intptr_t src) {
  movl(dst, src);
}


void MacroAssembler::pop_callee_saved_registers() {
  pop(rcx);
  pop(rdx);
  pop(rdi);
  pop(rsi);
}

void MacroAssembler::pop_fTOS() {
  fld_d(Address(rsp, 0));
  addl(rsp, 2 * wordSize);
}

void MacroAssembler::push_callee_saved_registers() {
  push(rsi);
  push(rdi);
  push(rdx);
  push(rcx);
}

void MacroAssembler::push_fTOS() {
  subl(rsp, 2 * wordSize);
  fstp_d(Address(rsp, 0));
}


void MacroAssembler::pushoop(jobject obj) {
  push_literal32((int32_t)obj, oop_Relocation::spec_for_immediate());
}

void MacroAssembler::pushklass(Metadata* obj) {
  push_literal32((int32_t)obj, metadata_Relocation::spec_for_immediate());
}

void MacroAssembler::pushptr(AddressLiteral src) {
  if (src.is_lval()) {
    push_literal32((int32_t)src.target(), src.rspec());
  } else {
    pushl(as_Address(src));
  }
}

void MacroAssembler::set_word_if_not_zero(Register dst) {
  xorl(dst, dst);
  set_byte_if_not_zero(dst);
}

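// On 32-bit, VM-call arguments are passed on the stack, so each pass_argN
// helper simply pushes its register.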
static void pass_arg0(MacroAssembler* masm, Register arg) {
  masm->push(arg);
}

static void pass_arg1(MacroAssembler* masm, Register arg) {
  masm->push(arg);
}

static void pass_arg2(MacroAssembler* masm, Register arg) {
  masm->push(arg);
}

static void pass_arg3(MacroAssembler* masm, Register arg) {
  masm->push(arg);
}

#ifndef PRODUCT
extern "C" void findpc(intptr_t x);
#endif

void MacroAssembler::debug32(int rdi, int rsi, int rbp, int rsp, int rbx, int rdx, int rcx, int rax, int eip, char* msg) {
  // In order to get locks to work, we need to fake an in_VM state
  JavaThread* thread = JavaThread::current();
  JavaThreadState saved_state = thread->thread_state();
  thread->set_thread_state(_thread_in_vm);
  if (ShowMessageBoxOnError) {
    JavaThread* thread = JavaThread::current();
    JavaThreadState saved_state = thread->thread_state();
    thread->set_thread_state(_thread_in_vm);
    if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
      ttyLocker ttyl;
      BytecodeCounter::print();
    }
    // To see where a verify_oop failed, get $ebx+40/X for this frame.
    // This is the value of eip which points to where verify_oop will return.
    if (os::message_box(msg, "Execution stopped, print registers?")) {
      print_state32(rdi, rsi, rbp, rsp, rbx, rdx, rcx, rax, eip);
      BREAKPOINT;
    }
  } else {
    ttyLocker ttyl;
    ::tty->print_cr("=============== DEBUG MESSAGE: %s ================\n", msg);
  }
  // Don't assert holding the ttyLock
  assert(false, "DEBUG MESSAGE: %s", msg);
  ThreadStateTransition::transition(thread, _thread_in_vm, saved_state);
}

void MacroAssembler::print_state32(int rdi, int rsi, int rbp, int rsp, int rbx, int rdx, int rcx, int rax, int eip) {
  ttyLocker ttyl;
  FlagSetting fs(Debugging, true);
  tty->print_cr("eip = 0x%08x", eip);
#ifndef PRODUCT
  if ((WizardMode || Verbose) && PrintMiscellaneous) {
    tty->cr();
    findpc(eip);
    tty->cr();
  }
#endif
#define PRINT_REG(rax) \
  { tty->print("%s = ", #rax); os::print_location(tty, rax); }
  PRINT_REG(rax);
  PRINT_REG(rbx);
  PRINT_REG(rcx);
  PRINT_REG(rdx);
  PRINT_REG(rdi);
  PRINT_REG(rsi);
  PRINT_REG(rbp);
  PRINT_REG(rsp);
#undef PRINT_REG
  // Print some words near top of stack.
  int* dump_sp = (int*) rsp;
  for (int col1 = 0; col1 < 8; col1++) {
    tty->print("(rsp+0x%03x) 0x%08x: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp);
    os::print_location(tty, *dump_sp++);
  }
  for (int row = 0; row < 16; row++) {
    tty->print("(rsp+0x%03x) 0x%08x: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp);
    for (int col = 0; col < 8; col++) {
      tty->print(" 0x%08x", *dump_sp++);
    }
    tty->cr();
  }
  // Print some instructions around pc:
  Disassembler::decode((address)eip-64, (address)eip);
  tty->print_cr("--------");
  Disassembler::decode((address)eip, (address)eip+32);
}

void MacroAssembler::stop(const char* msg) {
  ExternalAddress message((address)msg);
  // push address of message
  pushptr(message.addr());
  { Label L; call(L, relocInfo::none); bind(L); }     // push eip
  pusha();                                            // push registers
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::debug32)));
  hlt();
}

void MacroAssembler::warn(const char* msg) {
  push_CPU_state();

  ExternalAddress message((address) msg);
  // push address of message
  pushptr(message.addr());

  call(RuntimeAddress(CAST_FROM_FN_PTR(address, warning)));
  addl(rsp, wordSize);       // discard argument
  pop_CPU_state();
}

void MacroAssembler::print_state() {
  { Label L; call(L, relocInfo::none); bind(L); }     // push eip
  pusha();                                            // push registers

  push_CPU_state();
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::print_state32)));
  pop_CPU_state();

  popa();
  addl(rsp, wordSize);
}

#else // _LP64

// 64 bit versions

Address MacroAssembler::as_Address(AddressLiteral adr) {
  // amd64 always does this as a pc-rel
  // we can be absolute or disp based on the instruction type
  // jmp/call are displacements others are absolute
  assert(!adr.is_lval(), "must be rval");
  assert(reachable(adr), "must be");
  return Address((int32_t)(intptr_t)(adr.target() - pc()), adr.target(), adr.reloc());

}

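// Materialize the array base in rscratch1 and fold the index expression into
// a plain Address the caller can use.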
Address MacroAssembler::as_Address(ArrayAddress adr) {
  AddressLiteral base = adr.base();
  lea(rscratch1, base);
  Address index = adr.index();
  assert(index._disp == 0, "must not have disp"); // maybe it can?
  Address array(rscratch1, index._index, index._scale, index._disp);
  return array;
}

void MacroAssembler::call_VM_leaf_base(address entry_point, int num_args) {
  Label L, E;

#ifdef _WIN64
  // Windows always allocates space for its register args
  assert(num_args <= 4, "only register arguments supported");
  subq(rsp,  frame::arg_reg_save_area_bytes);
#endif

  // Align stack if necessary
  testl(rsp, 15);
  jcc(Assembler::zero, L);

  subq(rsp, 8);
  {
    call(RuntimeAddress(entry_point));
  }
  addq(rsp, 8);
  jmp(E);

  bind(L);
  {
    call(RuntimeAddress(entry_point));
  }

  bind(E);

#ifdef _WIN64
  // restore stack pointer
  addq(rsp, frame::arg_reg_save_area_bytes);
#endif

}

void MacroAssembler::cmp64(Register src1, AddressLiteral src2) {
  assert(!src2.is_lval(), "should use cmpptr");

  if (reachable(src2)) {
    cmpq(src1, as_Address(src2));
  } else {
    lea(rscratch1, src2);
    Assembler::cmpq(src1, Address(rscratch1, 0));
  }
}

int MacroAssembler::corrected_idivq(Register reg) {
  // Full implementation of Java ldiv and lrem; checks for special
  // case as described in JVM spec., p.243 & p.271.  The function
  // returns the (pc) offset of the idivq instruction - may be needed
  // for implicit exceptions.
  //
  //         normal case                           special case
  //
  // input : rax: dividend                         min_long
  //         reg: divisor   (may not be eax/edx)   -1
  //
  // output: rax: quotient  (= rax idiv reg)       min_long
  //         rdx: remainder (= rax irem reg)       0
  assert(reg != rax && reg != rdx, "reg cannot be rax or rdx register");
  static const int64_t min_long = 0x8000000000000000;
  Label normal_case, special_case;

  // check for special case
  cmp64(rax, ExternalAddress((address) &min_long));
  jcc(Assembler::notEqual, normal_case);
  xorl(rdx, rdx); // prepare rdx for possible special case (where
                  // remainder = 0)
  cmpq(reg, -1);
  jcc(Assembler::equal, special_case);

  // handle normal case
  bind(normal_case);
  cdqq();
  int idivq_offset = offset();
  idivq(reg);

  // normal and special case exit
  bind(special_case);

  return idivq_offset;
}

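// Add or subtract a small constant from a 64-bit register or memory operand,
// using the shorter inc/dec encodings when the delta is 1 and UseIncDec is on.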
void MacroAssembler::decrementq(Register reg, int value) {
  if (value == min_jint) { subq(reg, value); return; }
  if (value <  0) { incrementq(reg, -value); return; }
  if (value == 0) {                        ; return; }
  if (value == 1 && UseIncDec) { decq(reg) ; return; }
  /* else */      { subq(reg, value)       ; return; }
}

void MacroAssembler::decrementq(Address dst, int value) {
  if (value == min_jint) { subq(dst, value); return; }
  if (value <  0) { incrementq(dst, -value); return; }
  if (value == 0) {                        ; return; }
  if (value == 1 && UseIncDec) { decq(dst) ; return; }
  /* else */      { subq(dst, value)       ; return; }
}

void MacroAssembler::incrementq(AddressLiteral dst) {
  if (reachable(dst)) {
    incrementq(as_Address(dst));
  } else {
    lea(rscratch1, dst);
    incrementq(Address(rscratch1, 0));
  }
}

void MacroAssembler::incrementq(Register reg, int value) {
  if (value == min_jint) { addq(reg, value); return; }
  if (value <  0) { decrementq(reg, -value); return; }
  if (value == 0) {                        ; return; }
  if (value == 1 && UseIncDec) { incq(reg) ; return; }
  /* else */      { addq(reg, value)       ; return; }
}

void MacroAssembler::incrementq(Address dst, int value) {
  if (value == min_jint) { addq(dst, value); return; }
  if (value <  0) { decrementq(dst, -value); return; }
  if (value == 0) {                        ; return; }
  if (value == 1 && UseIncDec) { incq(dst) ; return; }
  /* else */      { addq(dst, value)       ; return; }
}

// 32bit can do a case table jump in one instruction but we no longer allow the base
// to be installed in the Address class
void MacroAssembler::jump(ArrayAddress entry) {
  lea(rscratch1, entry.base());
  Address dispatch = entry.index();
  assert(dispatch._base == noreg, "must be");
  dispatch._base = rscratch1;
  jmp(dispatch);
}

void MacroAssembler::lcmp2int(Register x_hi, Register x_lo, Register y_hi, Register y_lo) {
  ShouldNotReachHere(); // 64bit doesn't use two regs
  cmpq(x_lo, y_lo);
}

void MacroAssembler::lea(Register dst, AddressLiteral src) {
    mov_literal64(dst, (intptr_t)src.target(), src.rspec());
}

void MacroAssembler::lea(Address dst, AddressLiteral adr) {
  mov_literal64(rscratch1, (intptr_t)adr.target(), adr.rspec());
  movptr(dst, rscratch1);
}

void MacroAssembler::leave() {
  // %%% is this really better? Why not on 32bit too?
  emit_int8((unsigned char)0xC9); // LEAVE
}

void MacroAssembler::lneg(Register hi, Register lo) {
  ShouldNotReachHere(); // 64bit doesn't use two regs
  negq(lo);
}

void MacroAssembler::movoop(Register dst, jobject obj) {
  mov_literal64(dst, (intptr_t)obj, oop_Relocation::spec_for_immediate());
}

void MacroAssembler::movoop(Address dst, jobject obj) {
  mov_literal64(rscratch1, (intptr_t)obj, oop_Relocation::spec_for_immediate());
  movq(dst, rscratch1);
}

void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
  mov_literal64(dst, (intptr_t)obj, metadata_Relocation::spec_for_immediate());
}

void MacroAssembler::mov_metadata(Address dst, Metadata* obj) {
  mov_literal64(rscratch1, (intptr_t)obj, metadata_Relocation::spec_for_immediate());
  movq(dst, rscratch1);
}

void MacroAssembler::movptr(Register dst, AddressLiteral src, Register scratch) {
  if (src.is_lval()) {
    mov_literal64(dst, (intptr_t)src.target(), src.rspec());
  } else {
    if (reachable(src)) {
      movq(dst, as_Address(src));
    } else {
      lea(scratch, src);
      movq(dst, Address(scratch, 0));
    }
  }
}

void MacroAssembler::movptr(ArrayAddress dst, Register src) {
  movq(as_Address(dst), src);
}

void MacroAssembler::movptr(Register dst, ArrayAddress src) {
  movq(dst, as_Address(src));
}

// src should NEVER be a real pointer. Use AddressLiteral for true pointers
void MacroAssembler::movptr(Address dst, intptr_t src) {
  mov64(rscratch1, src);
  movq(dst, rscratch1);
}

// These are mostly for initializing NULL
void MacroAssembler::movptr(Address dst, int32_t src) {
  movslq(dst, src);
}

void MacroAssembler::movptr(Register dst, int32_t src) {
  mov64(dst, (intptr_t)src);
}

void MacroAssembler::pushoop(jobject obj) {
  movoop(rscratch1, obj);
  push(rscratch1);
}

void MacroAssembler::pushklass(Metadata* obj) {
  mov_metadata(rscratch1, obj);
  push(rscratch1);
}

void MacroAssembler::pushptr(AddressLiteral src) {
  lea(rscratch1, src);
  if (src.is_lval()) {
    push(rscratch1);
  } else {
    pushq(Address(rscratch1, 0));
  }
}

void MacroAssembler::reset_last_Java_frame(bool clear_fp,
                                           bool clear_pc) {
  // we must set sp to zero to clear frame
  movptr(Address(r15_thread, JavaThread::last_Java_sp_offset()), NULL_WORD);
  // must clear fp, so that compiled frames are not confused; it is
  // possible that we need it only for debugging
  if (clear_fp) {
    movptr(Address(r15_thread, JavaThread::last_Java_fp_offset()), NULL_WORD);
  }

  if (clear_pc) {
    movptr(Address(r15_thread, JavaThread::last_Java_pc_offset()), NULL_WORD);
  }
}

void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         address  last_java_pc) {
  // determine last_java_sp register
  if (!last_java_sp->is_valid()) {
    last_java_sp = rsp;
  }

  // last_java_fp is optional
  if (last_java_fp->is_valid()) {
    movptr(Address(r15_thread, JavaThread::last_Java_fp_offset()),
           last_java_fp);
  }

  // last_java_pc is optional
  if (last_java_pc != NULL) {
    Address java_pc(r15_thread,
                    JavaThread::frame_anchor_offset() + JavaFrameAnchor::last_Java_pc_offset());
    lea(rscratch1, InternalAddress(last_java_pc));
    movptr(java_pc, rscratch1);
  }

  movptr(Address(r15_thread, JavaThread::last_Java_sp_offset()), last_java_sp);
}

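// On 64-bit, VM-call arguments go in the C calling-convention registers; each
// pass_argN helper moves its value into c_rargN unless it is already there.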
static void pass_arg0(MacroAssembler* masm, Register arg) {
  if (c_rarg0 != arg ) {
    masm->mov(c_rarg0, arg);
  }
}

static void pass_arg1(MacroAssembler* masm, Register arg) {
  if (c_rarg1 != arg ) {
    masm->mov(c_rarg1, arg);
  }
}

static void pass_arg2(MacroAssembler* masm, Register arg) {
  if (c_rarg2 != arg ) {
    masm->mov(c_rarg2, arg);
  }
}

static void pass_arg3(MacroAssembler* masm, Register arg) {
  if (c_rarg3 != arg ) {
    masm->mov(c_rarg3, arg);
  }
}

void MacroAssembler::stop(const char* msg) {
  address rip = pc();
  pusha(); // get regs on stack
  lea(c_rarg0, ExternalAddress((address) msg));
  lea(c_rarg1, InternalAddress(rip));
  movq(c_rarg2, rsp); // pass pointer to regs array
  andq(rsp, -16); // align stack as required by ABI
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::debug64)));
  hlt();
}

void MacroAssembler::warn(const char* msg) {
  push(rbp);
  movq(rbp, rsp);
  andq(rsp, -16);     // align stack as required by push_CPU_state and call
  push_CPU_state();   // keeps alignment at 16 bytes
  lea(c_rarg0, ExternalAddress((address) msg));
  call_VM_leaf(CAST_FROM_FN_PTR(address, warning), c_rarg0);
  pop_CPU_state();
  mov(rsp, rbp);
  pop(rbp);
}

void MacroAssembler::print_state() {
  address rip = pc();
  pusha();            // get regs on stack
  push(rbp);
  movq(rbp, rsp);
  andq(rsp, -16);     // align stack as required by push_CPU_state and call
  push_CPU_state();   // keeps alignment at 16 bytes

  lea(c_rarg0, InternalAddress(rip));
  lea(c_rarg1, Address(rbp, wordSize)); // pass pointer to regs array
  call_VM_leaf(CAST_FROM_FN_PTR(address, MacroAssembler::print_state64), c_rarg0, c_rarg1);

  pop_CPU_state();
  mov(rsp, rbp);
  pop(rbp);
  popa();
}

#ifndef PRODUCT
extern "C" void findpc(intptr_t x);
#endif

void MacroAssembler::debug64(char* msg, int64_t pc, int64_t regs[]) {
  // In order to get locks to work, we need to fake an in_VM state
  if (ShowMessageBoxOnError) {
    JavaThread* thread = JavaThread::current();
    JavaThreadState saved_state = thread->thread_state();
    thread->set_thread_state(_thread_in_vm);
#ifndef PRODUCT
    if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
      ttyLocker ttyl;
      BytecodeCounter::print();
    }
#endif
    // To see where a verify_oop failed, get $ebx+40/X for this frame.
    // XXX correct this offset for amd64
    // This is the value of eip which points to where verify_oop will return.
    if (os::message_box(msg, "Execution stopped, print registers?")) {
      print_state64(pc, regs);
      BREAKPOINT;
      assert(false, "start up GDB");
    }
    ThreadStateTransition::transition(thread, _thread_in_vm, saved_state);
  } else {
    ttyLocker ttyl;
    ::tty->print_cr("=============== DEBUG MESSAGE: %s ================\n",
                    msg);
    assert(false, "DEBUG MESSAGE: %s", msg);
  }
}

void MacroAssembler::print_state64(int64_t pc, int64_t regs[]) {
  ttyLocker ttyl;
  FlagSetting fs(Debugging, true);
  tty->print_cr("rip = 0x%016lx", pc);
#ifndef PRODUCT
  tty->cr();
  findpc(pc);
  tty->cr();
#endif
#define PRINT_REG(rax, value) \
  { tty->print("%s = ", #rax); os::print_location(tty, value); }
  PRINT_REG(rax, regs[15]);
  PRINT_REG(rbx, regs[12]);
  PRINT_REG(rcx, regs[14]);
  PRINT_REG(rdx, regs[13]);
  PRINT_REG(rdi, regs[8]);
  PRINT_REG(rsi, regs[9]);
  PRINT_REG(rbp, regs[10]);
  PRINT_REG(rsp, regs[11]);
  PRINT_REG(r8 , regs[7]);
  PRINT_REG(r9 , regs[6]);
  PRINT_REG(r10, regs[5]);
  PRINT_REG(r11, regs[4]);
  PRINT_REG(r12, regs[3]);
  PRINT_REG(r13, regs[2]);
  PRINT_REG(r14, regs[1]);
  PRINT_REG(r15, regs[0]);
#undef PRINT_REG
  // Print some words near top of stack.
  int64_t* rsp = (int64_t*) regs[11];
  int64_t* dump_sp = rsp;
  for (int col1 = 0; col1 < 8; col1++) {
    tty->print("(rsp+0x%03x) 0x%016lx: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (int64_t)dump_sp);
    os::print_location(tty, *dump_sp++);
  }
  for (int row = 0; row < 25; row++) {
    tty->print("(rsp+0x%03x) 0x%016lx: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (int64_t)dump_sp);
    for (int col = 0; col < 4; col++) {
      tty->print(" 0x%016lx", *dump_sp++);
    }
    tty->cr();
  }
  // Print some instructions around pc:
  Disassembler::decode((address)pc-64, (address)pc);
  tty->print_cr("--------");
  Disassembler::decode((address)pc, (address)pc+32);
}

#endif // _LP64

// Now versions that are common to 32/64 bit

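// Pointer-sized helpers: each expands to the q-form on 64-bit (LP64) and the
// l-form on 32-bit.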
void MacroAssembler::addptr(Register dst, int32_t imm32) {
  LP64_ONLY(addq(dst, imm32)) NOT_LP64(addl(dst, imm32));
}

void MacroAssembler::addptr(Register dst, Register src) {
  LP64_ONLY(addq(dst, src)) NOT_LP64(addl(dst, src));
}

void MacroAssembler::addptr(Address dst, Register src) {
  LP64_ONLY(addq(dst, src)) NOT_LP64(addl(dst, src));
}

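// AddressLiteral forms: use the literal address directly when it is reachable
// from the code being emitted, otherwise materialize it in rscratch1 first.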
void MacroAssembler::addsd(XMMRegister dst, AddressLiteral src) {
  if (reachable(src)) {
    Assembler::addsd(dst, as_Address(src));
  } else {
    lea(rscratch1, src);
    Assembler::addsd(dst, Address(rscratch1, 0));
  }
}

void MacroAssembler::addss(XMMRegister dst, AddressLiteral src) {
  if (reachable(src)) {
    addss(dst, as_Address(src));
  } else {
    lea(rscratch1, src);
    addss(dst, Address(rscratch1, 0));
  }
}

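// Pad the instruction stream with nops until the current code offset is a
// multiple of modulus.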
void MacroAssembler::align(int modulus) {
  align(modulus, offset());
}

void MacroAssembler::align(int modulus, int target) {
  if (target % modulus != 0) {
    nop(modulus - (target % modulus));
  }
}

void MacroAssembler::andpd(XMMRegister dst, AddressLiteral src) {
  // Used in sign-masking with aligned address.
  assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
  if (reachable(src)) {
    Assembler::andpd(dst, as_Address(src));
  } else {
    lea(rscratch1, src);
    Assembler::andpd(dst, Address(rscratch1, 0));
  }
}

void MacroAssembler::andps(XMMRegister dst, AddressLiteral src) {
  // Used in sign-masking with aligned address.
  assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
  if (reachable(src)) {
    Assembler::andps(dst, as_Address(src));
  } else {
    lea(rscratch1, src);
    Assembler::andps(dst, Address(rscratch1, 0));
  }
}

void MacroAssembler::andptr(Register dst, int32_t imm32) {
  LP64_ONLY(andq(dst, imm32)) NOT_LP64(andl(dst, imm32));
}

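// Atomically increment a counter in memory; the lock prefix is only emitted
// on multiprocessor systems.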
void MacroAssembler::atomic_incl(Address counter_addr) {
  if (os::is_MP())
    lock();
  incrementl(counter_addr);
}

void MacroAssembler::atomic_incl(AddressLiteral counter_addr, Register scr) {
  if (reachable(counter_addr)) {
    atomic_incl(as_Address(counter_addr));
  } else {
    lea(scr, counter_addr);
    atomic_incl(Address(scr, 0));
  }
}

#ifdef _LP64
void MacroAssembler::atomic_incq(Address counter_addr) {
  if (os::is_MP())
    lock();
  incrementq(counter_addr);
}

void MacroAssembler::atomic_incq(AddressLiteral counter_addr, Register scr) {
  if (reachable(counter_addr)) {
    atomic_incq(as_Address(counter_addr));
  } else {
    lea(scr, counter_addr);
    atomic_incq(Address(scr, 0));
  }
}
#endif

// Writes to stack successive pages until offset reached to check for
// stack overflow + shadow pages.  This clobbers tmp.
void MacroAssembler::bang_stack_size(Register size, Register tmp) {
  movptr(tmp, rsp);
  // Bang stack for total size given plus shadow page size.
  // Bang one page at a time because large size can bang beyond yellow and
  // red zones.
  Label loop;
  bind(loop);
  movl(Address(tmp, (-os::vm_page_size())), size );
  subptr(tmp, os::vm_page_size());
  subl(size, os::vm_page_size());
  jcc(Assembler::greater, loop);

  // Bang down shadow pages too.
  // At this point, (tmp-0) is the last address touched, so don't
  // touch it again.  (It was touched as (tmp-pagesize) but then tmp
  // was post-decremented.)  Skip this address by starting at i=1, and
  // touch a few more pages below.  N.B.  It is important to touch all
  // the way down including all pages in the shadow zone.
  for (int i = 1; i < ((int)JavaThread::stack_shadow_zone_size() / os::vm_page_size()); i++) {
    // this could be any sized move but this can be a debugging crumb
    // so the bigger the better.
    movptr(Address(tmp, (-i*os::vm_page_size())), size );
  }
}

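// If rsp has grown down into the reserved stack zone, call into the runtime
// to re-enable the zone and throw the delayed StackOverflowError.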
void MacroAssembler::reserved_stack_check() {
    // testing if reserved zone needs to be enabled
    Label no_reserved_zone_enabling;
    Register thread = NOT_LP64(rsi) LP64_ONLY(r15_thread);
    NOT_LP64(get_thread(rsi);)

    cmpptr(rsp, Address(thread, JavaThread::reserved_stack_activation_offset()));
    jcc(Assembler::below, no_reserved_zone_enabling);

    call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone), thread);
    jump(RuntimeAddress(StubRoutines::throw_delayed_StackOverflowError_entry()));
    should_not_reach_here();

    bind(no_reserved_zone_enabling);
}

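// Attempt to acquire a biased lock on obj_reg: jumps to 'done' on success,
// to '*slow_case' (if supplied) when the bias must be revoked or rebiased in
// the runtime, and otherwise falls through to 'cas_label' for the normal
// CAS-based locking path. Returns the code offset usable as the implicit
// null-check site.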
int MacroAssembler::biased_locking_enter(Register lock_reg,
                                         Register obj_reg,
                                         Register swap_reg,
                                         Register tmp_reg,
                                         bool swap_reg_contains_mark,
                                         Label& done,
                                         Label* slow_case,
                                         BiasedLockingCounters* counters) {
  assert(UseBiasedLocking, "why call this otherwise?");
  assert(swap_reg == rax, "swap_reg must be rax for cmpxchgq");
  assert(tmp_reg != noreg, "tmp_reg must be supplied");
  assert_different_registers(lock_reg, obj_reg, swap_reg, tmp_reg);
  assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits, "biased locking makes assumptions about bit layout");
  Address mark_addr      (obj_reg, oopDesc::mark_offset_in_bytes());
  Address saved_mark_addr(lock_reg, 0);

  if (PrintBiasedLockingStatistics && counters == NULL) {
    counters = BiasedLocking::counters();
  }
  // Biased locking
  // See whether the lock is currently biased toward our thread and
  // whether the epoch is still valid
  // Note that the runtime guarantees sufficient alignment of JavaThread
  // pointers to allow age to be placed into low bits
  // First check to see whether biasing is even enabled for this object
  Label cas_label;
  int null_check_offset = -1;
  if (!swap_reg_contains_mark) {
    null_check_offset = offset();
    movptr(swap_reg, mark_addr);
  }
  movptr(tmp_reg, swap_reg);
  andptr(tmp_reg, markOopDesc::biased_lock_mask_in_place);
  cmpptr(tmp_reg, markOopDesc::biased_lock_pattern);
  jcc(Assembler::notEqual, cas_label);
  // The bias pattern is present in the object's header. Need to check
  // whether the bias owner and the epoch are both still current.
#ifndef _LP64
  // Note that because there is no current thread register on x86_32 we
  // need to store off the mark word we read out of the object to
  // avoid reloading it and needing to recheck invariants below. This
  // store is unfortunate but it makes the overall code shorter and
  // simpler.
  movptr(saved_mark_addr, swap_reg);
#endif
  if (swap_reg_contains_mark) {
    null_check_offset = offset();
  }
  load_prototype_header(tmp_reg, obj_reg);
#ifdef _LP64
  orptr(tmp_reg, r15_thread);
  xorptr(tmp_reg, swap_reg);
  Register header_reg = tmp_reg;
#else
  xorptr(tmp_reg, swap_reg);
  get_thread(swap_reg);
  xorptr(swap_reg, tmp_reg);
  Register header_reg = swap_reg;
#endif
  andptr(header_reg, ~((int) markOopDesc::age_mask_in_place));
  if (counters != NULL) {
    cond_inc32(Assembler::zero,
               ExternalAddress((address) counters->biased_lock_entry_count_addr()));
  }
  jcc(Assembler::equal, done);

  Label try_revoke_bias;
  Label try_rebias;

  // At this point we know that the header has the bias pattern and
  // that we are not the bias owner in the current epoch. We need to
  // figure out more details about the state of the header in order to
  // know what operations can be legally performed on the object's
  // header.

  // If the low three bits in the xor result aren't clear, that means
  // the prototype header is no longer biased and we have to revoke
  // the bias on this object.
  testptr(header_reg, markOopDesc::biased_lock_mask_in_place);
  jccb(Assembler::notZero, try_revoke_bias);

  // Biasing is still enabled for this data type. See whether the
  // epoch of the current bias is still valid, meaning that the epoch
  // bits of the mark word are equal to the epoch bits of the
  // prototype header. (Note that the prototype header's epoch bits
  // only change at a safepoint.) If not, attempt to rebias the object
  // toward the current thread. Note that we must be absolutely sure
  // that the current epoch is invalid in order to do this because
  // otherwise the manipulations it performs on the mark word are
  // illegal.
  testptr(header_reg, markOopDesc::epoch_mask_in_place);
  jccb(Assembler::notZero, try_rebias);

  // The epoch of the current bias is still valid but we know nothing
  // about the owner; it might be set or it might be clear. Try to
  // acquire the bias of the object using an atomic operation. If this
  // fails we will go in to the runtime to revoke the object's bias.
  // Note that we first construct the presumed unbiased header so we
  // don't accidentally blow away another thread's valid bias.
  NOT_LP64( movptr(swap_reg, saved_mark_addr); )
  andptr(swap_reg,
         markOopDesc::biased_lock_mask_in_place | markOopDesc::age_mask_in_place | markOopDesc::epoch_mask_in_place);
#ifdef _LP64
  movptr(tmp_reg, swap_reg);
  orptr(tmp_reg, r15_thread);
#else
  get_thread(tmp_reg);
  orptr(tmp_reg, swap_reg);
#endif
  if (os::is_MP()) {
    lock();
  }
  cmpxchgptr(tmp_reg, mark_addr); // compare tmp_reg and swap_reg
  // If the biasing toward our thread failed, this means that
  // another thread succeeded in biasing it toward itself and we
  // need to revoke that bias. The revocation will occur in the
  // interpreter runtime in the slow case.
  if (counters != NULL) {
    cond_inc32(Assembler::zero,
               ExternalAddress((address) counters->anonymously_biased_lock_entry_count_addr()));
  }
  if (slow_case != NULL) {
    jcc(Assembler::notZero, *slow_case);
  }
  jmp(done);

  bind(try_rebias);
  // At this point we know the epoch has expired, meaning that the
  // current "bias owner", if any, is actually invalid. Under these
  // circumstances _only_, we are allowed to use the current header's
  // value as the comparison value when doing the cas to acquire the
  // bias in the current epoch. In other words, we allow transfer of
  // the bias from one thread to another directly in this situation.
  //
  // FIXME: due to a lack of registers we currently blow away the age
  // bits in this situation. Should attempt to preserve them.
  load_prototype_header(tmp_reg, obj_reg);
#ifdef _LP64
  orptr(tmp_reg, r15_thread);
#else
  get_thread(swap_reg);
  orptr(tmp_reg, swap_reg);
  movptr(swap_reg, saved_mark_addr);
#endif
  if (os::is_MP()) {
    lock();
  }
  cmpxchgptr(tmp_reg, mark_addr); // compare tmp_reg and swap_reg
  // If the biasing toward our thread failed, then another thread
  // succeeded in biasing it toward itself and we need to revoke that
  // bias. The revocation will occur in the runtime in the slow case.
  if (counters != NULL) {
    cond_inc32(Assembler::zero,
               ExternalAddress((address) counters->rebiased_lock_entry_count_addr()));
  }
  if (slow_case != NULL) {
    jcc(Assembler::notZero, *slow_case);
  }
  jmp(done);

  bind(try_revoke_bias);
  // The prototype mark in the klass doesn't have the bias bit set any
  // more, indicating that objects of this data type are not supposed
  // to be biased any more. We are going to try to reset the mark of
  // this object to the prototype value and fall through to the
  // CAS-based locking scheme. Note that if our CAS fails, it means
  // that another thread raced us for the privilege of revoking the
  // bias of this particular object, so it's okay to continue in the
  // normal locking code.
  //
  // FIXME: due to a lack of registers we currently blow away the age
  // bits in this situation. Should attempt to preserve them.
  NOT_LP64( movptr(swap_reg, saved_mark_addr); )
  load_prototype_header(tmp_reg, obj_reg);
  if (os::is_MP()) {
    lock();
  }
  cmpxchgptr(tmp_reg, mark_addr); // compare tmp_reg and swap_reg
  // Fall through to the normal CAS-based lock, because no matter what
  // the result of the above CAS, some thread must have succeeded in
  // removing the bias bit from the object's header.
  if (counters != NULL) {
    cond_inc32(Assembler::zero,
               ExternalAddress((address) counters->revoked_lock_entry_count_addr()));
  }

  bind(cas_label);

  return null_check_offset;
}

void MacroAssembler::biased_locking_exit(Register obj_reg, Register temp_reg, Label& done) {
  assert(UseBiasedLocking, "why call this otherwise?");

  // Check for biased locking unlock case, which is a no-op
  // Note: we do not have to check the thread ID for two reasons.
  // First, the interpreter checks for IllegalMonitorStateException at
  // a higher level. Second, if the bias was revoked while we held the
  // lock, the object could not be rebiased toward another thread, so
  // the bias bit would be clear.
  movptr(temp_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
  andptr(temp_reg, markOopDesc::biased_lock_mask_in_place);
  cmpptr(temp_reg, markOopDesc::biased_lock_pattern);
  jcc(Assembler::equal, done);
}

1292#ifdef COMPILER2
1293
1294#if INCLUDE_RTM_OPT
1295
1296// Update rtm_counters based on abort status
1297// input: abort_status
1298//        rtm_counters (RTMLockingCounters*)
1299// flags are killed
1300void MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters) {
1301
1302  atomic_incptr(Address(rtm_counters, RTMLockingCounters::abort_count_offset()));
1303  if (PrintPreciseRTMLockingStatistics) {
1304    for (int i = 0; i < RTMLockingCounters::ABORT_STATUS_LIMIT; i++) {
1305      Label check_abort;
1306      testl(abort_status, (1<<i));
1307      jccb(Assembler::equal, check_abort);
1308      atomic_incptr(Address(rtm_counters, RTMLockingCounters::abortX_count_offset() + (i * sizeof(uintx))));
1309      bind(check_abort);
1310    }
1311  }
1312}
1313
1314// Branch if (random & (count-1) != 0), count is 2^n
1315// tmp, scr and flags are killed
1316void MacroAssembler::branch_on_random_using_rdtsc(Register tmp, Register scr, int count, Label& brLabel) {
1317  assert(tmp == rax, "");
1318  assert(scr == rdx, "");
1319  rdtsc(); // modifies EDX:EAX
1320  andptr(tmp, count-1);
1321  jccb(Assembler::notZero, brLabel);
1322}
1323
1324// Perform abort ratio calculation, set no_rtm bit if high ratio
1325// input:  rtm_counters_Reg (RTMLockingCounters* address)
1326// tmpReg, rtm_counters_Reg and flags are killed
1327void MacroAssembler::rtm_abort_ratio_calculation(Register tmpReg,
1328                                                 Register rtm_counters_Reg,
1329                                                 RTMLockingCounters* rtm_counters,
1330                                                 Metadata* method_data) {
1331  Label L_done, L_check_always_rtm1, L_check_always_rtm2;
1332
1333  if (RTMLockingCalculationDelay > 0) {
1334    // Delay calculation
1335    movptr(tmpReg, ExternalAddress((address) RTMLockingCounters::rtm_calculation_flag_addr()), tmpReg);
1336    testptr(tmpReg, tmpReg);
1337    jccb(Assembler::equal, L_done);
1338  }
1339  // Abort ratio calculation only if abort_count > RTMAbortThreshold
1340  //   Aborted transactions = abort_count * 100
1341  //   All transactions = total_count *  RTMTotalCountIncrRate
1342  //   Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio)
1343
1344  movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::abort_count_offset()));
1345  cmpptr(tmpReg, RTMAbortThreshold);
1346  jccb(Assembler::below, L_check_always_rtm2);
1347  imulptr(tmpReg, tmpReg, 100);
1348
1349  Register scrReg = rtm_counters_Reg;
1350  movptr(scrReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
1351  imulptr(scrReg, scrReg, RTMTotalCountIncrRate);
1352  imulptr(scrReg, scrReg, RTMAbortRatio);
1353  cmpptr(tmpReg, scrReg);
1354  jccb(Assembler::below, L_check_always_rtm1);
1355  if (method_data != NULL) {
1356    // set rtm_state to "no rtm" in MDO
1357    mov_metadata(tmpReg, method_data);
1358    if (os::is_MP()) {
1359      lock();
1360    }
1361    orl(Address(tmpReg, MethodData::rtm_state_offset_in_bytes()), NoRTM);
1362  }
1363  jmpb(L_done);
1364  bind(L_check_always_rtm1);
1365  // Reload RTMLockingCounters* address
1366  lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
1367  bind(L_check_always_rtm2);
1368  movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
1369  cmpptr(tmpReg, RTMLockingThreshold / RTMTotalCountIncrRate);
1370  jccb(Assembler::below, L_done);
1371  if (method_data != NULL) {
1372    // set rtm_state to "always rtm" in MDO
1373    mov_metadata(tmpReg, method_data);
1374    if (os::is_MP()) {
1375      lock();
1376    }
1377    orl(Address(tmpReg, MethodData::rtm_state_offset_in_bytes()), UseRTM);
1378  }
1379  bind(L_done);
1380}
1381
1382// Update counters and perform abort ratio calculation
1383// input:  abort_status_Reg
1384// rtm_counters_Reg, flags are killed
1385void MacroAssembler::rtm_profiling(Register abort_status_Reg,
1386                                   Register rtm_counters_Reg,
1387                                   RTMLockingCounters* rtm_counters,
1388                                   Metadata* method_data,
1389                                   bool profile_rtm) {
1390
1391  assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
1392  // update rtm counters based on rax value at abort
1393  // reads abort_status_Reg, updates flags
1394  lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
1395  rtm_counters_update(abort_status_Reg, rtm_counters_Reg);
1396  if (profile_rtm) {
1397    // Save abort status because abort_status_Reg is used by following code.
1398    if (RTMRetryCount > 0) {
1399      push(abort_status_Reg);
1400    }
1401    assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
1402    rtm_abort_ratio_calculation(abort_status_Reg, rtm_counters_Reg, rtm_counters, method_data);
1403    // restore abort status
1404    if (RTMRetryCount > 0) {
1405      pop(abort_status_Reg);
1406    }
1407  }
1408}
1409
1410// Retry on abort if abort's status is 0x6: can retry (0x2) | memory conflict (0x4)
1411// inputs: retry_count_Reg
1412//       : abort_status_Reg
1413// output: retry_count_Reg decremented by 1
1414// flags are killed
1415void MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg, Label& retryLabel) {
1416  Label doneRetry;
1417  assert(abort_status_Reg == rax, "");
1418  // The abort reason bits are in eax (see all states in rtmLocking.hpp)
1419  // 0x6 = conflict on which we can retry (0x2) | memory conflict (0x4)
1420  // if reason is in 0x6 and retry count != 0 then retry
1421  andptr(abort_status_Reg, 0x6);
1422  jccb(Assembler::zero, doneRetry);
1423  testl(retry_count_Reg, retry_count_Reg);
1424  jccb(Assembler::zero, doneRetry);
1425  pause();
1426  decrementl(retry_count_Reg);
1427  jmp(retryLabel);
1428  bind(doneRetry);
1429}
1430
1431// Spin and retry if lock is busy,
1432// inputs: box_Reg (monitor address)
1433//       : retry_count_Reg
1434// output: retry_count_Reg decremented by 1
1435//       : clear z flag if retry count exceeded
1436// tmp_Reg, scr_Reg, flags are killed
1437void MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register box_Reg,
1438                                            Register tmp_Reg, Register scr_Reg, Label& retryLabel) {
1439  Label SpinLoop, SpinExit, doneRetry;
1440  int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
1441
1442  testl(retry_count_Reg, retry_count_Reg);
1443  jccb(Assembler::zero, doneRetry);
1444  decrementl(retry_count_Reg);
1445  movptr(scr_Reg, RTMSpinLoopCount);
1446
1447  bind(SpinLoop);
1448  pause();
1449  decrementl(scr_Reg);
1450  jccb(Assembler::lessEqual, SpinExit);
1451  movptr(tmp_Reg, Address(box_Reg, owner_offset));
1452  testptr(tmp_Reg, tmp_Reg);
1453  jccb(Assembler::notZero, SpinLoop);
1454
1455  bind(SpinExit);
1456  jmp(retryLabel);
1457  bind(doneRetry);
1458  incrementl(retry_count_Reg); // clear z flag
1459}
1460
1461// Use RTM for normal stack locks
1462// Input: objReg (object to lock)
1463void MacroAssembler::rtm_stack_locking(Register objReg, Register tmpReg, Register scrReg,
1464                                       Register retry_on_abort_count_Reg,
1465                                       RTMLockingCounters* stack_rtm_counters,
1466                                       Metadata* method_data, bool profile_rtm,
1467                                       Label& DONE_LABEL, Label& IsInflated) {
1468  assert(UseRTMForStackLocks, "why call this otherwise?");
1469  assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
1470  assert(tmpReg == rax, "");
1471  assert(scrReg == rdx, "");
1472  Label L_rtm_retry, L_decrement_retry, L_on_abort;
1473
1474  if (RTMRetryCount > 0) {
1475    movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
1476    bind(L_rtm_retry);
1477  }
1478  movptr(tmpReg, Address(objReg, 0));
1479  testptr(tmpReg, markOopDesc::monitor_value);  // inflated vs stack-locked|neutral|biased
1480  jcc(Assembler::notZero, IsInflated);
1481
1482  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
1483    Label L_noincrement;
1484    if (RTMTotalCountIncrRate > 1) {
1485      // tmpReg, scrReg and flags are killed
1486      branch_on_random_using_rdtsc(tmpReg, scrReg, (int)RTMTotalCountIncrRate, L_noincrement);
1487    }
1488    assert(stack_rtm_counters != NULL, "should not be NULL when profiling RTM");
1489    atomic_incptr(ExternalAddress((address)stack_rtm_counters->total_count_addr()), scrReg);
1490    bind(L_noincrement);
1491  }
1492  xbegin(L_on_abort);
1493  movptr(tmpReg, Address(objReg, 0));       // fetch markword
1494  andptr(tmpReg, markOopDesc::biased_lock_mask_in_place); // look at 3 lock bits
1495  cmpptr(tmpReg, markOopDesc::unlocked_value);            // bits = 001 unlocked
1496  jcc(Assembler::equal, DONE_LABEL);        // all done if unlocked
1497
1498  Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
1499  if (UseRTMXendForLockBusy) {
1500    xend();
1501    movptr(abort_status_Reg, 0x2);   // Set the abort status to 2 (so we can retry)
1502    jmp(L_decrement_retry);
1503  }
1504  else {
1505    xabort(0);
1506  }
1507  bind(L_on_abort);
1508  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
1509    rtm_profiling(abort_status_Reg, scrReg, stack_rtm_counters, method_data, profile_rtm);
1510  }
1511  bind(L_decrement_retry);
1512  if (RTMRetryCount > 0) {
1513    // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
1514    rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
1515  }
1516}
1517
1518// Use RTM for inflated locks
1519// inputs: objReg (object to lock)
1520//         boxReg (on-stack box address (displaced header location) - KILLED)
1521//         tmpReg (ObjectMonitor address + markOopDesc::monitor_value)
1522void MacroAssembler::rtm_inflated_locking(Register objReg, Register boxReg, Register tmpReg,
1523                                          Register scrReg, Register retry_on_busy_count_Reg,
1524                                          Register retry_on_abort_count_Reg,
1525                                          RTMLockingCounters* rtm_counters,
1526                                          Metadata* method_data, bool profile_rtm,
1527                                          Label& DONE_LABEL) {
1528  assert(UseRTMLocking, "why call this otherwise?");
1529  assert(tmpReg == rax, "");
1530  assert(scrReg == rdx, "");
1531  Label L_rtm_retry, L_decrement_retry, L_on_abort;
1532  int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
1533
1534  // Without cast to int32_t a movptr will destroy r10 which is typically obj
1535  movptr(Address(boxReg, 0), (int32_t)intptr_t(markOopDesc::unused_mark()));
1536  movptr(boxReg, tmpReg); // Save ObjectMonitor address
1537
1538  if (RTMRetryCount > 0) {
1539    movl(retry_on_busy_count_Reg, RTMRetryCount);  // Retry on lock busy
1540    movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
1541    bind(L_rtm_retry);
1542  }
1543  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
1544    Label L_noincrement;
1545    if (RTMTotalCountIncrRate > 1) {
1546      // tmpReg, scrReg and flags are killed
1547      branch_on_random_using_rdtsc(tmpReg, scrReg, (int)RTMTotalCountIncrRate, L_noincrement);
1548    }
1549    assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
1550    atomic_incptr(ExternalAddress((address)rtm_counters->total_count_addr()), scrReg);
1551    bind(L_noincrement);
1552  }
1553  xbegin(L_on_abort);
1554  movptr(tmpReg, Address(objReg, 0));
1555  movptr(tmpReg, Address(tmpReg, owner_offset));
1556  testptr(tmpReg, tmpReg);
1557  jcc(Assembler::zero, DONE_LABEL);
1558  if (UseRTMXendForLockBusy) {
1559    xend();
1560    jmp(L_decrement_retry);
1561  }
1562  else {
1563    xabort(0);
1564  }
1565  bind(L_on_abort);
1566  Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
1567  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
1568    rtm_profiling(abort_status_Reg, scrReg, rtm_counters, method_data, profile_rtm);
1569  }
1570  if (RTMRetryCount > 0) {
1571    // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
1572    rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
1573  }
1574
1575  movptr(tmpReg, Address(boxReg, owner_offset)) ;
1576  testptr(tmpReg, tmpReg) ;
1577  jccb(Assembler::notZero, L_decrement_retry) ;
1578
1579  // Appears unlocked - try to swing _owner from null to non-null.
1580  // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
1581#ifdef _LP64
1582  Register threadReg = r15_thread;
1583#else
1584  get_thread(scrReg);
1585  Register threadReg = scrReg;
1586#endif
1587  if (os::is_MP()) {
1588    lock();
1589  }
1590  cmpxchgptr(threadReg, Address(boxReg, owner_offset)); // Updates tmpReg
1591
1592  if (RTMRetryCount > 0) {
1593    // success: done, else retry
1594    jccb(Assembler::equal, DONE_LABEL) ;
1595    bind(L_decrement_retry);
1596    // Spin and retry if lock is busy.
1597    rtm_retry_lock_on_busy(retry_on_busy_count_Reg, boxReg, tmpReg, scrReg, L_rtm_retry);
1598  }
1599  else {
1600    bind(L_decrement_retry);
1601  }
1602}
1603
1604#endif //  INCLUDE_RTM_OPT
1605
1606// Fast_Lock and Fast_Unlock used by C2
1607
1608// Because the transitions from emitted code to the runtime
1609// monitorenter/exit helper stubs are so slow it's critical that
1610// we inline both the stack-locking fast-path and the inflated fast path.
1611//
1612// See also: cmpFastLock and cmpFastUnlock.
1613//
1614// What follows is a specialized inline transliteration of the code
1615// in slow_enter() and slow_exit().  If we're concerned about I$ bloat
1616// another option would be to emit TrySlowEnter and TrySlowExit methods
1617// at startup-time.  These methods would accept arguments as
1618// (rax=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
1619// indications in the icc.ZFlag.  Fast_Lock and Fast_Unlock would simply
1620// marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
1621// In practice, however, the # of lock sites is bounded and is usually small.
1622// Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
1623// if the processor uses simple bimodal branch predictors keyed by EIP,
1624// since the helper routines would be called from multiple synchronization
1625// sites.
1626//
1627// An even better approach would be to write "MonitorEnter()" and "MonitorExit()"
1628// in java - using j.u.c and unsafe - and just bind the lock and unlock sites
1629// to those specialized methods.  That'd give us a mostly platform-independent
1630// implementation that the JITs could optimize and inline at their pleasure.
1631// Done correctly, the only time we'd need to cross to native code would be
1632// to park() or unpark() threads.  We'd also need a few more unsafe operators
1633// to (a) prevent compiler-JIT reordering of non-volatile accesses, and
1634// (b) issue explicit barriers or fence operations.
1635//
1636// TODO:
1637//
1638// *  Arrange for C2 to pass "Self" into Fast_Lock and Fast_Unlock in one of the registers (scr).
1639//    This avoids manifesting the Self pointer in the Fast_Lock and Fast_Unlock terminals.
1640//    Given TLAB allocation, Self is usually manifested in a register, so passing it into
1641//    the lock operators would typically be faster than reifying Self.
1642//
1643// *  Ideally I'd define the primitives as:
1644//       fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
1645//       fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
1646//    Unfortunately ADLC bugs prevent us from expressing the ideal form.
1647//    Instead, we're stuck with the rather awkward and brittle register assignments below.
1648//    Furthermore the register assignments are overconstrained, possibly resulting in
1649//    sub-optimal code near the synchronization site.
1650//
1651// *  Eliminate the sp-proximity tests and just use "== Self" tests instead.
1652//    Alternately, use a better sp-proximity test.
1653//
1654// *  Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
1655//    Either one is sufficient to uniquely identify a thread.
1656//    TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
1657//
1658// *  Intrinsify notify() and notifyAll() for the common cases where the
1659//    object is locked by the calling thread but the waitlist is empty.
1660//    avoid the expensive JNI call to JVM_Notify() and JVM_NotifyAll().
1661//
1662// *  use jccb and jmpb instead of jcc and jmp to improve code density.
1663//    But beware of excessive branch density on AMD Opterons.
1664//
1665// *  Both Fast_Lock and Fast_Unlock set the ICC.ZF to indicate success
1666//    or failure of the fast-path.  If the fast-path fails then we pass
1667//    control to the slow-path, typically in C.  In Fast_Lock and
1668//    Fast_Unlock we often branch to DONE_LABEL, just to find that C2
1669//    will emit a conditional branch immediately after the node.
1670//    So we have branches to branches and lots of ICC.ZF games.
1671//    Instead, it might be better to have C2 pass a "FailureLabel"
1672//    into Fast_Lock and Fast_Unlock.  In the case of success, control
1673//    will drop through the node.  ICC.ZF is undefined at exit.
1674//    In the case of failure, the node will branch directly to the
1675//    FailureLabel.
1676
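// Illustrative sketch only (simplified; not emitted code): the fast-path
// protocol implemented by fast_lock() below amounts to roughly the following,
// where "box" is the on-stack BasicLock and CAS is the hardware compare-and-swap:
//
//   mark = obj->mark;
//   if (mark & monitor_value) {                          // already inflated
//     ZF = CAS(&monitor->_owner, NULL, Self/box);        // claim the ObjectMonitor
//   } else {
//     box->dhw = mark | unlocked_value;                  // anticipate success
//     if (CAS(&obj->mark, mark | unlocked_value, box)) { // try stack-locking
//       ZF = 1;                                          // newly stack-locked
//     } else {
//       box->dhw = (obj->mark - rsp) & page_mask;        // 0 iff locked by this frame
//       ZF = (box->dhw == 0);                            // recursive stack-lock
//     }
//   }
//   // ZF == 1 => fast-path success; ZF == 0 => fall through to the slow path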
1677
1678// obj: object to lock
1679// box: on-stack box address (displaced header location) - KILLED
1680// rax: tmp -- KILLED
1681// scr: tmp -- KILLED
1682void MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg,
1683                               Register scrReg, Register cx1Reg, Register cx2Reg,
1684                               BiasedLockingCounters* counters,
1685                               RTMLockingCounters* rtm_counters,
1686                               RTMLockingCounters* stack_rtm_counters,
1687                               Metadata* method_data,
1688                               bool use_rtm, bool profile_rtm) {
1689  // Ensure the register assignments are disjoint
1690  assert(tmpReg == rax, "");
1691
1692  if (use_rtm) {
1693    assert_different_registers(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg);
1694  } else {
1695    assert(cx1Reg == noreg, "");
1696    assert(cx2Reg == noreg, "");
1697    assert_different_registers(objReg, boxReg, tmpReg, scrReg);
1698  }
1699
1700  if (counters != NULL) {
1701    atomic_incl(ExternalAddress((address)counters->total_entry_count_addr()), scrReg);
1702  }
1703  if (EmitSync & 1) {
1704      // set box->dhw = markOopDesc::unused_mark()
1705      // Force all sync thru slow-path: slow_enter() and slow_exit()
1706      movptr (Address(boxReg, 0), (int32_t)intptr_t(markOopDesc::unused_mark()));
1707      cmpptr (rsp, (int32_t)NULL_WORD);
1708  } else {
1709    // Possible cases that we'll encounter in fast_lock
1710    // ------------------------------------------------
1711    // * Inflated
1712    //    -- unlocked
1713    //    -- Locked
1714    //       = by self
1715    //       = by other
1716    // * biased
1717    //    -- by Self
1718    //    -- by other
1719    // * neutral
1720    // * stack-locked
1721    //    -- by self
1722    //       = sp-proximity test hits
1723    //       = sp-proximity test generates false-negative
1724    //    -- by other
1725    //
1726
1727    Label IsInflated, DONE_LABEL;
1728
1729    // it's stack-locked, biased or neutral
1730    // TODO: optimize away redundant LDs of obj->mark and improve the markword triage
1731    // order to reduce the number of conditional branches in the most common cases.
1732    // Beware -- there's a subtle invariant that fetch of the markword
1733    // at [FETCH], below, will never observe a biased encoding (*101b).
1734    // If this invariant is not held we risk exclusion (safety) failure.
1735    if (UseBiasedLocking && !UseOptoBiasInlining) {
1736      biased_locking_enter(boxReg, objReg, tmpReg, scrReg, false, DONE_LABEL, NULL, counters);
1737    }
1738
1739#if INCLUDE_RTM_OPT
1740    if (UseRTMForStackLocks && use_rtm) {
1741      rtm_stack_locking(objReg, tmpReg, scrReg, cx2Reg,
1742                        stack_rtm_counters, method_data, profile_rtm,
1743                        DONE_LABEL, IsInflated);
1744    }
1745#endif // INCLUDE_RTM_OPT
1746
1747    movptr(tmpReg, Address(objReg, 0));          // [FETCH]
1748    testptr(tmpReg, markOopDesc::monitor_value); // inflated vs stack-locked|neutral|biased
1749    jccb(Assembler::notZero, IsInflated);
1750
1751    // Attempt stack-locking ...
1752    orptr (tmpReg, markOopDesc::unlocked_value);
1753    movptr(Address(boxReg, 0), tmpReg);          // Anticipate successful CAS
1754    if (os::is_MP()) {
1755      lock();
1756    }
1757    cmpxchgptr(boxReg, Address(objReg, 0));      // Updates tmpReg
1758    if (counters != NULL) {
1759      cond_inc32(Assembler::equal,
1760                 ExternalAddress((address)counters->fast_path_entry_count_addr()));
1761    }
1762    jcc(Assembler::equal, DONE_LABEL);           // Success
1763
1764    // Recursive locking.
1765    // The object is stack-locked: markword contains stack pointer to BasicLock.
1766    // Locked by current thread if difference with current SP is less than one page.
1767    subptr(tmpReg, rsp);
1768    // The next instruction sets ZFlag == 1 (Success) if the difference is less than one page.
1769    andptr(tmpReg, (int32_t) (NOT_LP64(0xFFFFF003) LP64_ONLY(7 - os::vm_page_size())) );
1770    movptr(Address(boxReg, 0), tmpReg);
1771    if (counters != NULL) {
1772      cond_inc32(Assembler::equal,
1773                 ExternalAddress((address)counters->fast_path_entry_count_addr()));
1774    }
1775    jmp(DONE_LABEL);
1776
1777    bind(IsInflated);
1778    // The object is inflated. tmpReg contains pointer to ObjectMonitor* + markOopDesc::monitor_value
1779
1780#if INCLUDE_RTM_OPT
1781    // Use the same RTM locking code in 32- and 64-bit VM.
1782    if (use_rtm) {
1783      rtm_inflated_locking(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg,
1784                           rtm_counters, method_data, profile_rtm, DONE_LABEL);
1785    } else {
1786#endif // INCLUDE_RTM_OPT
1787
1788#ifndef _LP64
1789    // The object is inflated.
1790
1791    // boxReg refers to the on-stack BasicLock in the current frame.
1792    // We'd like to write:
1793    //   set box->_displaced_header = markOopDesc::unused_mark().  Any non-0 value suffices.
1794    // This is convenient but results in an ST-before-CAS penalty.  The following CAS suffers
1795    // additional latency as we have another ST in the store buffer that must drain.
1796
1797    if (EmitSync & 8192) {
1798       movptr(Address(boxReg, 0), 3);            // results in ST-before-CAS penalty
1799       get_thread (scrReg);
1800       movptr(boxReg, tmpReg);                    // consider: LEA box, [tmp-2]
1801       movptr(tmpReg, NULL_WORD);                 // consider: xor vs mov
1802       if (os::is_MP()) {
1803         lock();
1804       }
1805       cmpxchgptr(scrReg, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
1806    } else
1807    if ((EmitSync & 128) == 0) {                      // avoid ST-before-CAS
1808       // register juggle because we need tmpReg for cmpxchgptr below
1809       movptr(scrReg, boxReg);
1810       movptr(boxReg, tmpReg);                   // consider: LEA box, [tmp-2]
1811
1812       // Using a prefetchw helps avoid later RTS->RTO upgrades and cache probes
1813       if ((EmitSync & 2048) && VM_Version::supports_3dnow_prefetch() && os::is_MP()) {
1814          // prefetchw [eax + Offset(_owner)-2]
1815          prefetchw(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
1816       }
1817
1818       if ((EmitSync & 64) == 0) {
1819         // Optimistic form: consider XORL tmpReg,tmpReg
1820         movptr(tmpReg, NULL_WORD);
1821       } else {
1822         // Can suffer RTS->RTO upgrades on shared or cold $ lines
1823         // Test-And-CAS instead of CAS
1824         movptr(tmpReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));   // rax, = m->_owner
1825         testptr(tmpReg, tmpReg);                   // Locked ?
1826         jccb  (Assembler::notZero, DONE_LABEL);
1827       }
1828
1829       // Appears unlocked - try to swing _owner from null to non-null.
1830       // Ideally, I'd manifest "Self" with get_thread and then attempt
1831       // to CAS the register containing Self into m->Owner.
1832       // But we don't have enough registers, so instead we can either try to CAS
1833       // rsp or the address of the box (in scr) into &m->owner.  If the CAS succeeds
1834       // we later store "Self" into m->Owner.  Transiently storing a stack address
1835       // (rsp or the address of the box) into  m->owner is harmless.
1836       // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
1837       if (os::is_MP()) {
1838         lock();
1839       }
1840       cmpxchgptr(scrReg, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
1841       movptr(Address(scrReg, 0), 3);          // box->_displaced_header = 3
1842       // If we weren't able to swing _owner from NULL to the BasicLock
1843       // then take the slow path.
1844       jccb  (Assembler::notZero, DONE_LABEL);
1845       // update _owner from BasicLock to thread
1846       get_thread (scrReg);                    // beware: clobbers ICCs
1847       movptr(Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), scrReg);
1848       xorptr(boxReg, boxReg);                 // set icc.ZFlag = 1 to indicate success
1849
1850       // If the CAS fails we can either retry or pass control to the slow-path.
1851       // We use the latter tactic.
1852       // Pass the CAS result in the icc.ZFlag into DONE_LABEL
1853       // If the CAS was successful ...
1854       //   Self has acquired the lock
1855       //   Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
1856       // Intentional fall-through into DONE_LABEL ...
1857    } else {
1858       movptr(Address(boxReg, 0), intptr_t(markOopDesc::unused_mark()));  // results in ST-before-CAS penalty
1859       movptr(boxReg, tmpReg);
1860
1861       // Using a prefetchw helps avoid later RTS->RTO upgrades and cache probes
1862       if ((EmitSync & 2048) && VM_Version::supports_3dnow_prefetch() && os::is_MP()) {
1863          // prefetchw [eax + Offset(_owner)-2]
1864          prefetchw(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
1865       }
1866
1867       if ((EmitSync & 64) == 0) {
1868         // Optimistic form
1869         xorptr  (tmpReg, tmpReg);
1870       } else {
1871         // Can suffer RTS->RTO upgrades on shared or cold $ lines
1872         movptr(tmpReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));   // rax, = m->_owner
1873         testptr(tmpReg, tmpReg);                   // Locked ?
1874         jccb  (Assembler::notZero, DONE_LABEL);
1875       }
1876
1877       // Appears unlocked - try to swing _owner from null to non-null.
1878       // Use either "Self" (in scr) or rsp as thread identity in _owner.
1879       // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
1880       get_thread (scrReg);
1881       if (os::is_MP()) {
1882         lock();
1883       }
1884       cmpxchgptr(scrReg, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
1885
1886       // If the CAS fails we can either retry or pass control to the slow-path.
1887       // We use the latter tactic.
1888       // Pass the CAS result in the icc.ZFlag into DONE_LABEL
1889       // If the CAS was successful ...
1890       //   Self has acquired the lock
1891       //   Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
1892       // Intentional fall-through into DONE_LABEL ...
1893    }
1894#else // _LP64
1895    // It's inflated
1896    movq(scrReg, tmpReg);
1897    xorq(tmpReg, tmpReg);
1898
1899    if (os::is_MP()) {
1900      lock();
1901    }
1902    cmpxchgptr(r15_thread, Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
1903    // Unconditionally set box->_displaced_header = markOopDesc::unused_mark().
1904    // Without cast to int32_t movptr will destroy r10 which is typically obj.
1905    movptr(Address(boxReg, 0), (int32_t)intptr_t(markOopDesc::unused_mark()));
1906    // Intentional fall-through into DONE_LABEL ...
1907    // Propagate ICC.ZF from CAS above into DONE_LABEL.
1908#endif // _LP64
1909#if INCLUDE_RTM_OPT
1910    } // use_rtm()
1911#endif
1912    // DONE_LABEL is a hot target - we'd really like to place it at the
1913    // start of cache line by padding with NOPs.
1914    // See the AMD and Intel software optimization manuals for the
1915    // most efficient "long" NOP encodings.
1916    // Unfortunately none of our alignment mechanisms suffice.
1917    bind(DONE_LABEL);
1918
1919    // At DONE_LABEL the icc ZFlag is set as follows ...
1920    // Fast_Unlock uses the same protocol.
1921    // ZFlag == 1 -> Success
1922    // ZFlag == 0 -> Failure - force control through the slow-path
1923  }
1924}
1925
1926// obj: object to unlock
1927// box: box address (displaced header location), killed.  Must be EAX.
1928// tmp: killed, cannot be obj nor box.
1929//
1930// Some commentary on balanced locking:
1931//
1932// Fast_Lock and Fast_Unlock are emitted only for provably balanced lock sites.
1933// Methods that don't have provably balanced locking are forced to run in the
1934// interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
1935// The interpreter provides two properties:
1936// I1:  At return-time the interpreter automatically and quietly unlocks any
1937//      objects acquired in the current activation (frame).  Recall that the
1938//      interpreter maintains an on-stack list of locks currently held by
1939//      a frame.
1940// I2:  If a method attempts to unlock an object that is not held by
1941//      the frame, the interpreter throws IMSX.
1942//
1943// Let's say A(), which has provably balanced locking, acquires O and then calls B().
1944// B() doesn't have provably balanced locking so it runs in the interpreter.
1945// Control returns to A() and A() unlocks O.  By I1 and I2, above, we know that O
1946// is still locked by A().
1947//
1948// The only other source of unbalanced locking would be JNI.  The "Java Native Interface:
1949// Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
1950// should not be unlocked by "normal" java-level locking and vice-versa.  The specification
1951// doesn't specify what will occur if a program engages in such mixed-mode locking, however.
1952// Arguably, given that the spec legislates the JNI case as undefined, our implementation
1953// could reasonably *avoid* checking owner in Fast_Unlock().
1954// In the interest of performance we elide the m->Owner==Self check in unlock.
1955// A perfectly viable alternative is to elide the owner check except when
1956// Xcheck:jni is enabled.
1957
1958void MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg, bool use_rtm) {
1959  assert(boxReg == rax, "");
1960  assert_different_registers(objReg, boxReg, tmpReg);
1961
1962  if (EmitSync & 4) {
1963    // Disable - inhibit all inlining.  Force control through the slow-path
1964    cmpptr (rsp, 0);
1965  } else {
1966    Label DONE_LABEL, Stacked, CheckSucc;
1967
1968    // Critically, the biased locking test must have precedence over
1969    // and appear before the (box->dhw == 0) recursive stack-lock test.
1970    if (UseBiasedLocking && !UseOptoBiasInlining) {
1971       biased_locking_exit(objReg, tmpReg, DONE_LABEL);
1972    }
1973
1974#if INCLUDE_RTM_OPT
1975    if (UseRTMForStackLocks && use_rtm) {
1976      assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
1977      Label L_regular_unlock;
1978      movptr(tmpReg, Address(objReg, 0));           // fetch markword
1979      andptr(tmpReg, markOopDesc::biased_lock_mask_in_place); // look at 3 lock bits
1980      cmpptr(tmpReg, markOopDesc::unlocked_value);            // bits = 001 unlocked
1981      jccb(Assembler::notEqual, L_regular_unlock);  // if !HLE RegularLock
1982      xend();                                       // otherwise end...
1983      jmp(DONE_LABEL);                              // ... and we're done
1984      bind(L_regular_unlock);
1985    }
1986#endif
1987
1988    cmpptr(Address(boxReg, 0), (int32_t)NULL_WORD); // Examine the displaced header
1989    jcc   (Assembler::zero, DONE_LABEL);            // 0 indicates recursive stack-lock
1990    movptr(tmpReg, Address(objReg, 0));             // Examine the object's markword
1991    testptr(tmpReg, markOopDesc::monitor_value);    // Inflated?
1992    jccb  (Assembler::zero, Stacked);
1993
1994    // It's inflated.
1995#if INCLUDE_RTM_OPT
1996    if (use_rtm) {
1997      Label L_regular_inflated_unlock;
1998      int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
1999      movptr(boxReg, Address(tmpReg, owner_offset));
2000      testptr(boxReg, boxReg);
2001      jccb(Assembler::notZero, L_regular_inflated_unlock);
2002      xend();
2003      jmpb(DONE_LABEL);
2004      bind(L_regular_inflated_unlock);
2005    }
2006#endif
2007
2008    // Despite our balanced locking property we still check that m->_owner == Self
2009    // as java routines or native JNI code called by this thread might
2010    // have released the lock.
2011    // Refer to the comments in synchronizer.cpp for how we might encode extra
2012    // state in _succ so we can avoid fetching EntryList|cxq.
2013    //
2014    // I'd like to add more cases in fast_lock() and fast_unlock() --
2015    // such as recursive enter and exit -- but we have to be wary of
2016    // I$ bloat, T$ effects and BP$ effects.
2017    //
2018    // If there's no contention try a 1-0 exit.  That is, exit without
2019    // a costly MEMBAR or CAS.  See synchronizer.cpp for details on how
2020    // we detect and recover from the race that the 1-0 exit admits.
2021    //
2022    // Conceptually Fast_Unlock() must execute a STST|LDST "release" barrier
2023    // before it STs null into _owner, releasing the lock.  Updates
2024    // to data protected by the critical section must be visible before
2025    // we drop the lock (and thus before any other thread could acquire
2026    // the lock and observe the fields protected by the lock).
2027    // IA32's memory-model is SPO, so STs are ordered with respect to
2028    // each other and there's no need for an explicit barrier (fence).
2029    // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
2030#ifndef _LP64
2031    get_thread (boxReg);
2032    if ((EmitSync & 4096) && VM_Version::supports_3dnow_prefetch() && os::is_MP()) {
2033      // prefetchw [ebx + Offset(_owner)-2]
2034      prefetchw(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
2035    }
2036
2037    // Note that we could employ various encoding schemes to reduce
2038    // the number of loads below (currently 4) to just 2 or 3.
2039    // Refer to the comments in synchronizer.cpp.
2040    // In practice the chain of fetches doesn't seem to impact performance, however.
2041    xorptr(boxReg, boxReg);
2042    if ((EmitSync & 65536) == 0 && (EmitSync & 256)) {
2043       // Attempt to reduce branch density - AMD's branch predictor.
2044       orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
2045       orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
2046       orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
2047       jccb  (Assembler::notZero, DONE_LABEL);
2048       movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
2049       jmpb  (DONE_LABEL);
2050    } else {
2051       orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
2052       jccb  (Assembler::notZero, DONE_LABEL);
2053       movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
2054       orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
2055       jccb  (Assembler::notZero, CheckSucc);
2056       movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
2057       jmpb  (DONE_LABEL);
2058    }
2059
2060    // The following code fragment (EmitSync & 65536) improves the performance of
2061    // contended applications and contended synchronization microbenchmarks.
2062    // Unfortunately the emission of the code - even though not executed - causes regressions
2063    // in scimark and jetstream, evidently because of $ effects.  Replacing the code
2064    // with an equal number of never-executed NOPs results in the same regression.
2065    // We leave it off by default.
2066
2067    if ((EmitSync & 65536) != 0) {
2068       Label LSuccess, LGoSlowPath ;
2069
2070       bind  (CheckSucc);
2071
2072       // Optional pre-test ... it's safe to elide this
2073       cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD);
2074       jccb(Assembler::zero, LGoSlowPath);
2075
2076       // We have a classic Dekker-style idiom:
2077       //    ST m->_owner = 0 ; MEMBAR; LD m->_succ
2078       // There are a number of ways to implement the barrier:
2079       // (1) lock:andl &m->_owner, 0
2080       //     is fast, but masm doesn't currently support the "ANDL M,IMM32" form.
2081       //     LOCK: ANDL [ebx+Offset(_Owner)-2], 0
2082       //     Encodes as 81 31 OFF32 IMM32 or 83 63 OFF8 IMM8
2083       // (2) If supported, an explicit MFENCE is appealing.
2084       //     In older IA32 processors MFENCE is slower than lock:add or xchg
2085       //     particularly if the write-buffer is full as might be the case if
2086       //     if stores closely precede the fence or fence-equivalent instruction.
2087       //     See https://blogs.oracle.com/dave/entry/instruction_selection_for_volatile_fences
2088       //     as the situation has changed with Nehalem and Shanghai.
2089       // (3) In lieu of an explicit fence, use lock:addl to the top-of-stack
2090       //     The $lines underlying the top-of-stack should be in M-state.
2091       //     The locked add instruction is serializing, of course.
2092       // (4) Use xchg, which is serializing
2093       //     mov boxReg, 0; xchgl boxReg, [tmpReg + Offset(_owner)-2] also works
2094       // (5) ST m->_owner = 0 and then execute lock:orl &m->_succ, 0.
2095       //     The integer condition codes will tell us if succ was 0.
2096       //     Since _succ and _owner should reside in the same $line and
2097       //     we just stored into _owner, it's likely that the $line
2098       //     remains in M-state for the lock:orl.
2099       //
2100       // We currently use (3), although it's likely that switching to (2)
2101       // is correct for the future.
2102
2103       movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
2104       if (os::is_MP()) {
2105         lock(); addptr(Address(rsp, 0), 0);
2106       }
2107       // Ratify _succ remains non-null
2108       cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), 0);
2109       jccb  (Assembler::notZero, LSuccess);
2110
2111       xorptr(boxReg, boxReg);                  // box is really EAX
2112       if (os::is_MP()) { lock(); }
2113       cmpxchgptr(rsp, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
2114       // There's no successor so we tried to regrab the lock with the
2115       // placeholder value. If that didn't work, then another thread
2116       // grabbed the lock so we're done (and exit was a success).
2117       jccb  (Assembler::notEqual, LSuccess);
2118       // Since we're low on registers we installed rsp as a placeholder in _owner.
2119       // Now install Self over rsp.  This is safe as we're transitioning from
2120       // non-null to non-null.
2121       get_thread (boxReg);
2122       movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), boxReg);
2123       // Intentional fall-through into LGoSlowPath ...
2124
2125       bind  (LGoSlowPath);
2126       orptr(boxReg, 1);                      // set ICC.ZF=0 to indicate failure
2127       jmpb  (DONE_LABEL);
2128
2129       bind  (LSuccess);
2130       xorptr(boxReg, boxReg);                 // set ICC.ZF=1 to indicate success
2131       jmpb  (DONE_LABEL);
2132    }
2133
2134    bind (Stacked);
2135    // It's not inflated and it's not recursively stack-locked and it's not biased.
2136    // It must be stack-locked.
2137    // Try to reset the header to displaced header.
2138    // The "box" value on the stack is stable, so we can reload
2139    // and be assured we observe the same value as above.
2140    movptr(tmpReg, Address(boxReg, 0));
2141    if (os::is_MP()) {
2142      lock();
2143    }
2144    cmpxchgptr(tmpReg, Address(objReg, 0)); // Uses RAX which is box
2145    // Intentional fall-through into DONE_LABEL
2146
2147    // DONE_LABEL is a hot target - we'd really like to place it at the
2148    // start of cache line by padding with NOPs.
2149    // See the AMD and Intel software optimization manuals for the
2150    // most efficient "long" NOP encodings.
2151    // Unfortunately none of our alignment mechanisms suffice.
2152    if ((EmitSync & 65536) == 0) {
2153       bind (CheckSucc);
2154    }
2155#else // _LP64
2156    // It's inflated
2157    if (EmitSync & 1024) {
2158      // Emit code to check that _owner == Self
2159      // We could fold the _owner test into subsequent code more efficiently
2160      // than using a stand-alone check, but since _owner checking is off by
2161      // default we don't bother. We also might consider predicating the
2162      // _owner==Self check on Xcheck:jni or running on a debug build.
2163      movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
2164      xorptr(boxReg, r15_thread);
2165    } else {
2166      xorptr(boxReg, boxReg);
2167    }
2168    orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
2169    jccb  (Assembler::notZero, DONE_LABEL);
2170    movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
2171    orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
2172    jccb  (Assembler::notZero, CheckSucc);
2173    movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t)NULL_WORD);
2174    jmpb  (DONE_LABEL);
2175
2176    if ((EmitSync & 65536) == 0) {
2177      // Try to avoid passing control into the slow_path ...
2178      Label LSuccess, LGoSlowPath ;
2179      bind  (CheckSucc);
2180
2181      // The following optional optimization can be elided if necessary
2182      // Effectively: if (succ == null) goto SlowPath
2183      // The code reduces the window for a race, however,
2184      // and thus benefits performance.
2185      cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD);
2186      jccb  (Assembler::zero, LGoSlowPath);
2187
2188      if ((EmitSync & 16) && os::is_MP()) {
2189        orptr(boxReg, boxReg);
2190        xchgptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
2191      } else {
2192        movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t)NULL_WORD);
2193        if (os::is_MP()) {
2194          // Memory barrier/fence
2195          // Dekker pivot point -- fulcrum : ST Owner; MEMBAR; LD Succ
2196          // Instead of MFENCE we use a dummy locked add of 0 to the top-of-stack.
2197          // This is faster on Nehalem and AMD Shanghai/Barcelona.
2198          // See https://blogs.oracle.com/dave/entry/instruction_selection_for_volatile_fences
2199          // We might also restructure (ST Owner=0;barrier;LD _Succ) to
2200          // (mov box,0; xchgq box, &m->Owner; LD _succ) .
2201          lock(); addl(Address(rsp, 0), 0);
2202        }
2203      }
2204      cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD);
2205      jccb  (Assembler::notZero, LSuccess);
2206
2207      // Rare inopportune interleaving - race.
2208      // The successor vanished in the small window above.
2209      // The lock is contended -- (cxq|EntryList) != null -- and there's no apparent successor.
2210      // We need to ensure progress and succession.
2211      // Try to reacquire the lock.
2212      // If that fails then the new owner is responsible for succession and this
2213      // thread needs to take no further action and can exit via the fast path (success).
2214      // If the re-acquire succeeds then pass control into the slow path.
2215      // As implemented, this latter mode is horrible because we generate more
2216      // coherence traffic on the lock *and* artificially extend the critical section
2217      // length by virtue of passing control into the slow path.
2218
2219      // box is really RAX -- the following CMPXCHG depends on that binding
2220      // cmpxchg R,[M] is equivalent to rax = CAS(M,rax,R)
2221      movptr(boxReg, (int32_t)NULL_WORD);
2222      if (os::is_MP()) { lock(); }
2223      cmpxchgptr(r15_thread, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
2224      // There's no successor so we tried to regrab the lock.
2225      // If that didn't work, then another thread grabbed the
2226      // lock so we're done (and exit was a success).
2227      jccb  (Assembler::notEqual, LSuccess);
2228      // Intentional fall-through into slow-path
2229
2230      bind  (LGoSlowPath);
2231      orl   (boxReg, 1);                      // set ICC.ZF=0 to indicate failure
2232      jmpb  (DONE_LABEL);
2233
2234      bind  (LSuccess);
2235      testl (boxReg, 0);                      // set ICC.ZF=1 to indicate success
2236      jmpb  (DONE_LABEL);
2237    }
2238
2239    bind  (Stacked);
2240    movptr(tmpReg, Address (boxReg, 0));      // re-fetch
2241    if (os::is_MP()) { lock(); }
2242    cmpxchgptr(tmpReg, Address(objReg, 0)); // Uses RAX which is box
2243
2244    if (EmitSync & 65536) {
2245       bind (CheckSucc);
2246    }
2247#endif
2248    bind(DONE_LABEL);
2249  }
2250}
2251#endif // COMPILER2
2252
2253void MacroAssembler::c2bool(Register x) {
2254  // implements x == 0 ? 0 : 1
2255  // note: must only look at the least-significant byte of x
2256  //       since C-style booleans are stored in one byte
2257  //       only! (was bug)
2258  andl(x, 0xFF);
2259  setb(Assembler::notZero, x);
2260}
2261
2262// Wouldn't be needed if the AddressLiteral version had a new name
2263void MacroAssembler::call(Label& L, relocInfo::relocType rtype) {
2264  Assembler::call(L, rtype);
2265}
2266
2267void MacroAssembler::call(Register entry) {
2268  Assembler::call(entry);
2269}
2270
2271void MacroAssembler::call(AddressLiteral entry) {
2272  if (reachable(entry)) {
2273    Assembler::call_literal(entry.target(), entry.rspec());
2274  } else {
2275    lea(rscratch1, entry);
2276    Assembler::call(rscratch1);
2277  }
2278}
2279
2280void MacroAssembler::ic_call(address entry, jint method_index) {
2281  RelocationHolder rh = virtual_call_Relocation::spec(pc(), method_index);
2282  movptr(rax, (intptr_t)Universe::non_oop_word());
2283  call(AddressLiteral(entry, rh));
2284}
2285
2286// Implementation of call_VM versions
2287
2288void MacroAssembler::call_VM(Register oop_result,
2289                             address entry_point,
2290                             bool check_exceptions) {
2291  Label C, E;
2292  call(C, relocInfo::none);
2293  jmp(E);
2294
2295  bind(C);
2296  call_VM_helper(oop_result, entry_point, 0, check_exceptions);
2297  ret(0);
2298
2299  bind(E);
2300}
2301
2302void MacroAssembler::call_VM(Register oop_result,
2303                             address entry_point,
2304                             Register arg_1,
2305                             bool check_exceptions) {
2306  Label C, E;
2307  call(C, relocInfo::none);
2308  jmp(E);
2309
2310  bind(C);
2311  pass_arg1(this, arg_1);
2312  call_VM_helper(oop_result, entry_point, 1, check_exceptions);
2313  ret(0);
2314
2315  bind(E);
2316}
2317
2318void MacroAssembler::call_VM(Register oop_result,
2319                             address entry_point,
2320                             Register arg_1,
2321                             Register arg_2,
2322                             bool check_exceptions) {
2323  Label C, E;
2324  call(C, relocInfo::none);
2325  jmp(E);
2326
2327  bind(C);
2328
2329  LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
2330
2331  pass_arg2(this, arg_2);
2332  pass_arg1(this, arg_1);
2333  call_VM_helper(oop_result, entry_point, 2, check_exceptions);
2334  ret(0);
2335
2336  bind(E);
2337}
2338
2339void MacroAssembler::call_VM(Register oop_result,
2340                             address entry_point,
2341                             Register arg_1,
2342                             Register arg_2,
2343                             Register arg_3,
2344                             bool check_exceptions) {
2345  Label C, E;
2346  call(C, relocInfo::none);
2347  jmp(E);
2348
2349  bind(C);
2350
2351  LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
2352  LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
2353  pass_arg3(this, arg_3);
2354
2355  LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
2356  pass_arg2(this, arg_2);
2357
2358  pass_arg1(this, arg_1);
2359  call_VM_helper(oop_result, entry_point, 3, check_exceptions);
2360  ret(0);
2361
2362  bind(E);
2363}
2364
2365void MacroAssembler::call_VM(Register oop_result,
2366                             Register last_java_sp,
2367                             address entry_point,
2368                             int number_of_arguments,
2369                             bool check_exceptions) {
2370  Register thread = LP64_ONLY(r15_thread) NOT_LP64(noreg);
2371  call_VM_base(oop_result, thread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
2372}
2373
2374void MacroAssembler::call_VM(Register oop_result,
2375                             Register last_java_sp,
2376                             address entry_point,
2377                             Register arg_1,
2378                             bool check_exceptions) {
2379  pass_arg1(this, arg_1);
2380  call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
2381}
2382
2383void MacroAssembler::call_VM(Register oop_result,
2384                             Register last_java_sp,
2385                             address entry_point,
2386                             Register arg_1,
2387                             Register arg_2,
2388                             bool check_exceptions) {
2389
2390  LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
2391  pass_arg2(this, arg_2);
2392  pass_arg1(this, arg_1);
2393  call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
2394}
2395
2396void MacroAssembler::call_VM(Register oop_result,
2397                             Register last_java_sp,
2398                             address entry_point,
2399                             Register arg_1,
2400                             Register arg_2,
2401                             Register arg_3,
2402                             bool check_exceptions) {
2403  LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
2404  LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
2405  pass_arg3(this, arg_3);
2406  LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
2407  pass_arg2(this, arg_2);
2408  pass_arg1(this, arg_1);
2409  call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
2410}
2411
2412void MacroAssembler::super_call_VM(Register oop_result,
2413                                   Register last_java_sp,
2414                                   address entry_point,
2415                                   int number_of_arguments,
2416                                   bool check_exceptions) {
2417  Register thread = LP64_ONLY(r15_thread) NOT_LP64(noreg);
2418  MacroAssembler::call_VM_base(oop_result, thread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
2419}
2420
2421void MacroAssembler::super_call_VM(Register oop_result,
2422                                   Register last_java_sp,
2423                                   address entry_point,
2424                                   Register arg_1,
2425                                   bool check_exceptions) {
2426  pass_arg1(this, arg_1);
2427  super_call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
2428}
2429
2430void MacroAssembler::super_call_VM(Register oop_result,
2431                                   Register last_java_sp,
2432                                   address entry_point,
2433                                   Register arg_1,
2434                                   Register arg_2,
2435                                   bool check_exceptions) {
2436
2437  LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
2438  pass_arg2(this, arg_2);
2439  pass_arg1(this, arg_1);
2440  super_call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
2441}
2442
2443void MacroAssembler::super_call_VM(Register oop_result,
2444                                   Register last_java_sp,
2445                                   address entry_point,
2446                                   Register arg_1,
2447                                   Register arg_2,
2448                                   Register arg_3,
2449                                   bool check_exceptions) {
2450  LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
2451  LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
2452  pass_arg3(this, arg_3);
2453  LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
2454  pass_arg2(this, arg_2);
2455  pass_arg1(this, arg_1);
2456  super_call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
2457}
2458
2459void MacroAssembler::call_VM_base(Register oop_result,
2460                                  Register java_thread,
2461                                  Register last_java_sp,
2462                                  address  entry_point,
2463                                  int      number_of_arguments,
2464                                  bool     check_exceptions) {
2465  // determine java_thread register
2466  if (!java_thread->is_valid()) {
2467#ifdef _LP64
2468    java_thread = r15_thread;
2469#else
2470    java_thread = rdi;
2471    get_thread(java_thread);
2472#endif // LP64
2473  }
2474  // determine last_java_sp register
2475  if (!last_java_sp->is_valid()) {
2476    last_java_sp = rsp;
2477  }
2478  // debugging support
2479  assert(number_of_arguments >= 0   , "cannot have negative number of arguments");
2480  LP64_ONLY(assert(java_thread == r15_thread, "unexpected register"));
2481#ifdef ASSERT
2482  // TraceBytecodes does not use r12 but saves it over the call, so don't verify
2483  // r12 is the heapbase.
2484  LP64_ONLY(if ((UseCompressedOops || UseCompressedClassPointers) && !TraceBytecodes) verify_heapbase("call_VM_base: heap base corrupted?");)
2485#endif // ASSERT
2486
2487  assert(java_thread != oop_result  , "cannot use the same register for java_thread & oop_result");
2488  assert(java_thread != last_java_sp, "cannot use the same register for java_thread & last_java_sp");
2489
2490  // push java thread (becomes first argument of C function)
2491
2492  NOT_LP64(push(java_thread); number_of_arguments++);
2493  LP64_ONLY(mov(c_rarg0, r15_thread));
2494
2495  // set last Java frame before call
2496  assert(last_java_sp != rbp, "can't use ebp/rbp");
2497
2498  // Only interpreter should have to set fp
2499  set_last_Java_frame(java_thread, last_java_sp, rbp, NULL);
2500
2501  // do the call, remove parameters
2502  MacroAssembler::call_VM_leaf_base(entry_point, number_of_arguments);
2503
2504  // restore the thread (cannot use the pushed argument since arguments
2505  // may be overwritten by C code generated by an optimizing compiler);
2506  // however, we can use the register value directly if it is callee saved.
2507  if (LP64_ONLY(true ||) java_thread == rdi || java_thread == rsi) {
2508    // rdi & rsi (also r15) are callee saved -> nothing to do
2509#ifdef ASSERT
2510    guarantee(java_thread != rax, "change this code");
2511    push(rax);
2512    { Label L;
2513      get_thread(rax);
2514      cmpptr(java_thread, rax);
2515      jcc(Assembler::equal, L);
2516      STOP("MacroAssembler::call_VM_base: rdi not callee saved?");
2517      bind(L);
2518    }
2519    pop(rax);
2520#endif
2521  } else {
2522    get_thread(java_thread);
2523  }
2524  // reset last Java frame
2525  // Only interpreter should have to clear fp
2526  reset_last_Java_frame(java_thread, true, false);
2527
2528   // C++ interp handles this in the interpreter
2529  check_and_handle_popframe(java_thread);
2530  check_and_handle_earlyret(java_thread);
2531
2532  if (check_exceptions) {
2533    // check for pending exceptions (java_thread is set upon return)
2534    cmpptr(Address(java_thread, Thread::pending_exception_offset()), (int32_t) NULL_WORD);
2535#ifndef _LP64
2536    jump_cc(Assembler::notEqual,
2537            RuntimeAddress(StubRoutines::forward_exception_entry()));
2538#else
2539    // This used to conditionally jump to forward_exception however it is
2540    // possible if we relocate that the branch will not reach. So we must jump
2541    // around so we can always reach
2542
2543    Label ok;
2544    jcc(Assembler::equal, ok);
2545    jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
2546    bind(ok);
2547#endif // LP64
2548  }
2549
2550  // get oop result if there is one and reset the value in the thread
2551  if (oop_result->is_valid()) {
2552    get_vm_result(oop_result, java_thread);
2553  }
2554}
2555
2556void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions) {
2557
2558  // Calculate the value for last_Java_sp
2559  // somewhat subtle. call_VM does an intermediate call
2560  // which places a return address on the stack just under the
2561  // stack pointer as the user finsihed with it. This allows
2562  // use to retrieve last_Java_pc from last_Java_sp[-1].
2563  // On 32bit we then have to push additional args on the stack to accomplish
2564  // the actual requested call. On 64bit call_VM only can use register args
2565  // so the only extra space is the return address that call_VM created.
2566  // This hopefully explains the calculations here.
2567
2568#ifdef _LP64
2569  // We've pushed one address, correct last_Java_sp
2570  lea(rax, Address(rsp, wordSize));
2571#else
2572  lea(rax, Address(rsp, (1 + number_of_arguments) * wordSize));
2573#endif // LP64
2574
2575  call_VM_base(oop_result, noreg, rax, entry_point, number_of_arguments, check_exceptions);
2576
2577}
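// Rough stack picture at the point the lea() above computes last_Java_sp
// (64-bit case; simplified, based on the comment in call_VM_helper):
//
//        ...  frame of call_VM's caller  ...
//        [return address pushed by the intermediate call]   <-- rsp
//
//   last_Java_sp = rsp + wordSize, so last_Java_sp[-1] is exactly that return
//   address, which is later recovered as last_Java_pc.  On 32-bit the pushed
//   arguments sit below the return address, hence the
//   (1 + number_of_arguments) * wordSize offset.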
2578
2579void MacroAssembler::call_VM_leaf(address entry_point, int number_of_arguments) {
2580  call_VM_leaf_base(entry_point, number_of_arguments);
2581}
2582
2583void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0) {
2584  pass_arg0(this, arg_0);
2585  call_VM_leaf(entry_point, 1);
2586}
2587
2588void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
2589
2590  LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
2591  pass_arg1(this, arg_1);
2592  pass_arg0(this, arg_0);
2593  call_VM_leaf(entry_point, 2);
2594}
2595
2596void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
2597  LP64_ONLY(assert(arg_0 != c_rarg2, "smashed arg"));
2598  LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
2599  pass_arg2(this, arg_2);
2600  LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
2601  pass_arg1(this, arg_1);
2602  pass_arg0(this, arg_0);
2603  call_VM_leaf(entry_point, 3);
2604}
2605
2606void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0) {
2607  pass_arg0(this, arg_0);
2608  MacroAssembler::call_VM_leaf_base(entry_point, 1);
2609}
2610
2611void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
2612
2613  LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
2614  pass_arg1(this, arg_1);
2615  pass_arg0(this, arg_0);
2616  MacroAssembler::call_VM_leaf_base(entry_point, 2);
2617}
2618
2619void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
2620  LP64_ONLY(assert(arg_0 != c_rarg2, "smashed arg"));
2621  LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
2622  pass_arg2(this, arg_2);
2623  LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
2624  pass_arg1(this, arg_1);
2625  pass_arg0(this, arg_0);
2626  MacroAssembler::call_VM_leaf_base(entry_point, 3);
2627}
2628
2629void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) {
2630  LP64_ONLY(assert(arg_0 != c_rarg3, "smashed arg"));
2631  LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
2632  LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
2633  pass_arg3(this, arg_3);
2634  LP64_ONLY(assert(arg_0 != c_rarg2, "smashed arg"));
2635  LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
2636  pass_arg2(this, arg_2);
2637  LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
2638  pass_arg1(this, arg_1);
2639  pass_arg0(this, arg_0);
2640  MacroAssembler::call_VM_leaf_base(entry_point, 4);
2641}
2642
2643void MacroAssembler::get_vm_result(Register oop_result, Register java_thread) {
2644  movptr(oop_result, Address(java_thread, JavaThread::vm_result_offset()));
2645  movptr(Address(java_thread, JavaThread::vm_result_offset()), NULL_WORD);
2646  verify_oop(oop_result, "broken oop in call_VM_base");
2647}
2648
2649void MacroAssembler::get_vm_result_2(Register metadata_result, Register java_thread) {
2650  movptr(metadata_result, Address(java_thread, JavaThread::vm_result_2_offset()));
2651  movptr(Address(java_thread, JavaThread::vm_result_2_offset()), NULL_WORD);
2652}
2653
2654void MacroAssembler::check_and_handle_earlyret(Register java_thread) {
2655}
2656
2657void MacroAssembler::check_and_handle_popframe(Register java_thread) {
2658}
2659
2660void MacroAssembler::cmp32(AddressLiteral src1, int32_t imm) {
2661  if (reachable(src1)) {
2662    cmpl(as_Address(src1), imm);
2663  } else {
2664    lea(rscratch1, src1);
2665    cmpl(Address(rscratch1, 0), imm);
2666  }
2667}
2668
2669void MacroAssembler::cmp32(Register src1, AddressLiteral src2) {
2670  assert(!src2.is_lval(), "use cmpptr");
2671  if (reachable(src2)) {
2672    cmpl(src1, as_Address(src2));
2673  } else {
2674    lea(rscratch1, src2);
2675    cmpl(src1, Address(rscratch1, 0));
2676  }
2677}
2678
2679void MacroAssembler::cmp32(Register src1, int32_t imm) {
2680  Assembler::cmpl(src1, imm);
2681}
2682
2683void MacroAssembler::cmp32(Register src1, Address src2) {
2684  Assembler::cmpl(src1, src2);
2685}
2686
2687void MacroAssembler::cmpsd2int(XMMRegister opr1, XMMRegister opr2, Register dst, bool unordered_is_less) {
2688  ucomisd(opr1, opr2);
2689
2690  Label L;
2691  if (unordered_is_less) {
2692    movl(dst, -1);
2693    jcc(Assembler::parity, L);
2694    jcc(Assembler::below , L);
2695    movl(dst, 0);
2696    jcc(Assembler::equal , L);
2697    increment(dst);
2698  } else { // unordered is greater
2699    movl(dst, 1);
2700    jcc(Assembler::parity, L);
2701    jcc(Assembler::above , L);
2702    movl(dst, 0);
2703    jcc(Assembler::equal , L);
2704    decrementl(dst);
2705  }
2706  bind(L);
2707}
2708
2709void MacroAssembler::cmpss2int(XMMRegister opr1, XMMRegister opr2, Register dst, bool unordered_is_less) {
2710  ucomiss(opr1, opr2);
2711
2712  Label L;
2713  if (unordered_is_less) {
2714    movl(dst, -1);
2715    jcc(Assembler::parity, L);
2716    jcc(Assembler::below , L);
2717    movl(dst, 0);
2718    jcc(Assembler::equal , L);
2719    increment(dst);
2720  } else { // unordered is greater
2721    movl(dst, 1);
2722    jcc(Assembler::parity, L);
2723    jcc(Assembler::above , L);
2724    movl(dst, 0);
2725    jcc(Assembler::equal , L);
2726    decrementl(dst);
2727  }
2728  bind(L);
2729}
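// Both helpers above produce the three-way result (-1, 0, 1) used for Java's
// floating-point comparisons, presumably feeding the fcmpl/fcmpg and dcmpl/dcmpg
// bytecode semantics: with unordered_is_less == true a NaN operand yields -1
// (the 'l' flavor), otherwise it yields +1 (the 'g' flavor).  Roughly:
//
//   result = unordered ? (unordered_is_less ? -1 : 1)
//          : (opr1 < opr2 ? -1 : (opr1 > opr2 ? 1 : 0));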
2730
2731
2732void MacroAssembler::cmp8(AddressLiteral src1, int imm) {
2733  if (reachable(src1)) {
2734    cmpb(as_Address(src1), imm);
2735  } else {
2736    lea(rscratch1, src1);
2737    cmpb(Address(rscratch1, 0), imm);
2738  }
2739}
2740
2741void MacroAssembler::cmpptr(Register src1, AddressLiteral src2) {
2742#ifdef _LP64
2743  if (src2.is_lval()) {
2744    movptr(rscratch1, src2);
2745    Assembler::cmpq(src1, rscratch1);
2746  } else if (reachable(src2)) {
2747    cmpq(src1, as_Address(src2));
2748  } else {
2749    lea(rscratch1, src2);
2750    Assembler::cmpq(src1, Address(rscratch1, 0));
2751  }
2752#else
2753  if (src2.is_lval()) {
2754    cmp_literal32(src1, (int32_t) src2.target(), src2.rspec());
2755  } else {
2756    cmpl(src1, as_Address(src2));
2757  }
2758#endif // _LP64
2759}
2760
2761void MacroAssembler::cmpptr(Address src1, AddressLiteral src2) {
2762  assert(src2.is_lval(), "not a mem-mem compare");
2763#ifdef _LP64
2764  // moves src2's literal address
2765  movptr(rscratch1, src2);
2766  Assembler::cmpq(src1, rscratch1);
2767#else
2768  cmp_literal32(src1, (int32_t) src2.target(), src2.rspec());
2769#endif // _LP64
2770}
2771
2772void MacroAssembler::locked_cmpxchgptr(Register reg, AddressLiteral adr) {
2773  if (reachable(adr)) {
2774    if (os::is_MP())
2775      lock();
2776    cmpxchgptr(reg, as_Address(adr));
2777  } else {
2778    lea(rscratch1, adr);
2779    if (os::is_MP())
2780      lock();
2781    cmpxchgptr(reg, Address(rscratch1, 0));
2782  }
2783}
2784
2785void MacroAssembler::cmpxchgptr(Register reg, Address adr) {
2786  LP64_ONLY(cmpxchgq(reg, adr)) NOT_LP64(cmpxchgl(reg, adr));
2787}
2788
2789void MacroAssembler::comisd(XMMRegister dst, AddressLiteral src) {
2790  if (reachable(src)) {
2791    Assembler::comisd(dst, as_Address(src));
2792  } else {
2793    lea(rscratch1, src);
2794    Assembler::comisd(dst, Address(rscratch1, 0));
2795  }
2796}
2797
2798void MacroAssembler::comiss(XMMRegister dst, AddressLiteral src) {
2799  if (reachable(src)) {
2800    Assembler::comiss(dst, as_Address(src));
2801  } else {
2802    lea(rscratch1, src);
2803    Assembler::comiss(dst, Address(rscratch1, 0));
2804  }
2805}
2806
2807
2808void MacroAssembler::cond_inc32(Condition cond, AddressLiteral counter_addr) {
2809  Condition negated_cond = negate_condition(cond);
2810  Label L;
2811  jcc(negated_cond, L);
2812  pushf(); // Preserve flags
2813  atomic_incl(counter_addr);
2814  popf();
2815  bind(L);
2816}
2817
2818int MacroAssembler::corrected_idivl(Register reg) {
2819  // Full implementation of Java idiv and irem; checks for
2820  // special case as described in JVM spec., p.243 & p.271.
2821  // The function returns the (pc) offset of the idivl
2822  // instruction - may be needed for implicit exceptions.
2823  //
2824  //         normal case                           special case
2825  //
2826  // input : rax: dividend                          min_int
2827  //         reg: divisor   (may not be rax or rdx) -1
2828  //
2829  // output: rax: quotient  (= rax idiv reg)        min_int
2830  //         rdx: remainder (= rax irem reg)        0
2831  assert(reg != rax && reg != rdx, "reg cannot be rax or rdx register");
2832  const int min_int = 0x80000000;
2833  Label normal_case, special_case;
2834
2835  // check for special case
2836  cmpl(rax, min_int);
2837  jcc(Assembler::notEqual, normal_case);
2838  xorl(rdx, rdx); // prepare rdx for possible special case (where remainder = 0)
2839  cmpl(reg, -1);
2840  jcc(Assembler::equal, special_case);
2841
2842  // handle normal case
2843  bind(normal_case);
2844  cdql();
2845  int idivl_offset = offset();
2846  idivl(reg);
2847
2848  // normal and special case exit
2849  bind(special_case);
2850
2851  return idivl_offset;
2852}
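
// Worked example of the special case above: for min_int / -1 the mathematically
// correct quotient (2^31) is not representable and the hardware idiv would raise
// #DE, so the code skips the divide and returns rax = min_int, rdx = 0, which is
// what the JVM spec requires. Hedged usage sketch (register choice illustrative):
//   int off = __ corrected_idivl(rcx);   // rax = rax / rcx, rdx = rax % rcx
//   // 'off' can be recorded so a divide-by-zero fault at that pc maps to an
//   // ArithmeticException via the implicit-exception mechanism.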
2853
2854
2855
2856void MacroAssembler::decrementl(Register reg, int value) {
2857  if (value == min_jint) {subl(reg, value) ; return; }
2858  if (value <  0) { incrementl(reg, -value); return; }
2859  if (value == 0) {                        ; return; }
2860  if (value == 1 && UseIncDec) { decl(reg) ; return; }
2861  /* else */      { subl(reg, value)       ; return; }
2862}
2863
2864void MacroAssembler::decrementl(Address dst, int value) {
2865  if (value == min_jint) {subl(dst, value) ; return; }
2866  if (value <  0) { incrementl(dst, -value); return; }
2867  if (value == 0) {                        ; return; }
2868  if (value == 1 && UseIncDec) { decl(dst) ; return; }
2869  /* else */      { subl(dst, value)       ; return; }
2870}
2871
2872void MacroAssembler::division_with_shift (Register reg, int shift_value) {
2873  assert (shift_value > 0, "illegal shift value");
2874  Label _is_positive;
2875  testl (reg, reg);
2876  jcc (Assembler::positive, _is_positive);
2877  int offset = (1 << shift_value) - 1 ;
2878
2879  if (offset == 1) {
2880    incrementl(reg);
2881  } else {
2882    addl(reg, offset);
2883  }
2884
2885  bind (_is_positive);
2886  sarl(reg, shift_value);
2887}
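
// Worked example of the bias above: Java division truncates toward zero, while a
// plain arithmetic shift rounds toward negative infinity. For reg = -7 and
// shift_value = 2 (divide by 4), -7 >> 2 yields -2, but -7 / 4 must be -1; adding
// the bias (1 << 2) - 1 = 3 first gives (-7 + 3) >> 2 = -1, the correct result.
// Non-negative values skip the bias, so they are unaffected.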
2888
2889void MacroAssembler::divsd(XMMRegister dst, AddressLiteral src) {
2890  if (reachable(src)) {
2891    Assembler::divsd(dst, as_Address(src));
2892  } else {
2893    lea(rscratch1, src);
2894    Assembler::divsd(dst, Address(rscratch1, 0));
2895  }
2896}
2897
2898void MacroAssembler::divss(XMMRegister dst, AddressLiteral src) {
2899  if (reachable(src)) {
2900    Assembler::divss(dst, as_Address(src));
2901  } else {
2902    lea(rscratch1, src);
2903    Assembler::divss(dst, Address(rscratch1, 0));
2904  }
2905}
2906
2907// !defined(COMPILER2) is needed for core (interpreter-only) builds
2908#if !defined(_LP64) || defined(COMPILER1) || !defined(COMPILER2) || INCLUDE_JVMCI
2909void MacroAssembler::empty_FPU_stack() {
2910  if (VM_Version::supports_mmx()) {
2911    emms();
2912  } else {
2913    for (int i = 8; i-- > 0; ) ffree(i);
2914  }
2915}
2916#endif // !LP64 || C1 || !C2 || INCLUDE_JVMCI
2917
2918
2919// Defines obj, preserves var_size_in_bytes
2920void MacroAssembler::eden_allocate(Register obj,
2921                                   Register var_size_in_bytes,
2922                                   int con_size_in_bytes,
2923                                   Register t1,
2924                                   Label& slow_case) {
2925  assert(obj == rax, "obj must be in rax, for cmpxchg");
2926  assert_different_registers(obj, var_size_in_bytes, t1);
2927  if (!Universe::heap()->supports_inline_contig_alloc()) {
2928    jmp(slow_case);
2929  } else {
2930    Register end = t1;
2931    Label retry;
2932    bind(retry);
2933    ExternalAddress heap_top((address) Universe::heap()->top_addr());
2934    movptr(obj, heap_top);
2935    if (var_size_in_bytes == noreg) {
2936      lea(end, Address(obj, con_size_in_bytes));
2937    } else {
2938      lea(end, Address(obj, var_size_in_bytes, Address::times_1));
2939    }
2940    // if end < obj then we wrapped around => object too long => slow case
2941    cmpptr(end, obj);
2942    jcc(Assembler::below, slow_case);
2943    cmpptr(end, ExternalAddress((address) Universe::heap()->end_addr()));
2944    jcc(Assembler::above, slow_case);
2945    // Compare obj with the top addr; if they are still equal, store the new top addr
2946    // (held in end) at the address of the top addr pointer. Sets ZF if they were equal,
2947    // and clears it otherwise. Use the lock prefix for atomicity on MPs.
2948    locked_cmpxchgptr(end, heap_top);
2949    jcc(Assembler::notEqual, retry);
2950  }
2951}
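
// Hedged usage sketch (not from this file; label and size are illustrative): a
// fast-path allocation typically tries eden first and falls back to the runtime:
//   Label slow;
//   __ eden_allocate(rax, noreg, instance_size_in_bytes, rcx, slow);
//   // ... initialize mark word, klass pointer and fields of the new object ...
//   __ bind(slow);   // call into the VM to allocate (and possibly GC)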
2952
2953void MacroAssembler::enter() {
2954  push(rbp);
2955  mov(rbp, rsp);
2956}
2957
2958// A 5 byte nop that is safe for patching (see patch_verified_entry)
2959void MacroAssembler::fat_nop() {
2960  if (UseAddressNop) {
2961    addr_nop_5();
2962  } else {
2963    emit_int8(0x26); // es:
2964    emit_int8(0x2e); // cs:
2965    emit_int8(0x64); // fs:
2966    emit_int8(0x65); // gs:
2967    emit_int8((unsigned char)0x90);
2968  }
2969}
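
// Note (added for clarity): the four segment-override prefixes followed by 0x90
// form a single 5-byte instruction, so the verified entry point can later be
// overwritten with a 5-byte jump (see patch_verified_entry) without the patch
// landing in the middle of an instruction.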
2970
2971void MacroAssembler::fcmp(Register tmp) {
2972  fcmp(tmp, 1, true, true);
2973}
2974
2975void MacroAssembler::fcmp(Register tmp, int index, bool pop_left, bool pop_right) {
2976  assert(!pop_right || pop_left, "usage error");
2977  if (VM_Version::supports_cmov()) {
2978    assert(tmp == noreg, "unneeded temp");
2979    if (pop_left) {
2980      fucomip(index);
2981    } else {
2982      fucomi(index);
2983    }
2984    if (pop_right) {
2985      fpop();
2986    }
2987  } else {
2988    assert(tmp != noreg, "need temp");
2989    if (pop_left) {
2990      if (pop_right) {
2991        fcompp();
2992      } else {
2993        fcomp(index);
2994      }
2995    } else {
2996      fcom(index);
2997    }
2998    // convert FPU condition into eflags condition via rax
2999    save_rax(tmp);
3000    fwait(); fnstsw_ax();
3001    sahf();
3002    restore_rax(tmp);
3003  }
3004  // condition codes set as follows:
3005  //
3006  // CF (corresponds to C0) if x < y
3007  // PF (corresponds to C2) if unordered
3008  // ZF (corresponds to C3) if x = y
3009}
3010
3011void MacroAssembler::fcmp2int(Register dst, bool unordered_is_less) {
3012  fcmp2int(dst, unordered_is_less, 1, true, true);
3013}
3014
3015void MacroAssembler::fcmp2int(Register dst, bool unordered_is_less, int index, bool pop_left, bool pop_right) {
3016  fcmp(VM_Version::supports_cmov() ? noreg : dst, index, pop_left, pop_right);
3017  Label L;
3018  if (unordered_is_less) {
3019    movl(dst, -1);
3020    jcc(Assembler::parity, L);
3021    jcc(Assembler::below , L);
3022    movl(dst, 0);
3023    jcc(Assembler::equal , L);
3024    increment(dst);
3025  } else { // unordered is greater
3026    movl(dst, 1);
3027    jcc(Assembler::parity, L);
3028    jcc(Assembler::above , L);
3029    movl(dst, 0);
3030    jcc(Assembler::equal , L);
3031    decrementl(dst);
3032  }
3033  bind(L);
3034}
3035
3036void MacroAssembler::fld_d(AddressLiteral src) {
3037  fld_d(as_Address(src));
3038}
3039
3040void MacroAssembler::fld_s(AddressLiteral src) {
3041  fld_s(as_Address(src));
3042}
3043
3044void MacroAssembler::fld_x(AddressLiteral src) {
3045  Assembler::fld_x(as_Address(src));
3046}
3047
3048void MacroAssembler::fldcw(AddressLiteral src) {
3049  Assembler::fldcw(as_Address(src));
3050}
3051
3052void MacroAssembler::mulpd(XMMRegister dst, AddressLiteral src) {
3053  if (reachable(src)) {
3054    Assembler::mulpd(dst, as_Address(src));
3055  } else {
3056    lea(rscratch1, src);
3057    Assembler::mulpd(dst, Address(rscratch1, 0));
3058  }
3059}
3060
3061void MacroAssembler::increase_precision() {
3062  subptr(rsp, BytesPerWord);
3063  fnstcw(Address(rsp, 0));
3064  movl(rax, Address(rsp, 0));
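  // 0x300 sets both x87 precision-control bits (PC = 11b), selecting 64-bit
  // extended precision for subsequent x87 arithmetic.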
3065  orl(rax, 0x300);
3066  push(rax);
3067  fldcw(Address(rsp, 0));
3068  pop(rax);
3069}
3070
3071void MacroAssembler::restore_precision() {
3072  fldcw(Address(rsp, 0));
3073  addptr(rsp, BytesPerWord);
3074}
3075
3076void MacroAssembler::fpop() {
3077  ffree();
3078  fincstp();
3079}
3080
3081void MacroAssembler::load_float(Address src) {
3082  if (UseSSE >= 1) {
3083    movflt(xmm0, src);
3084  } else {
3085    LP64_ONLY(ShouldNotReachHere());
3086    NOT_LP64(fld_s(src));
3087  }
3088}
3089
3090void MacroAssembler::store_float(Address dst) {
3091  if (UseSSE >= 1) {
3092    movflt(dst, xmm0);
3093  } else {
3094    LP64_ONLY(ShouldNotReachHere());
3095    NOT_LP64(fstp_s(dst));
3096  }
3097}
3098
3099void MacroAssembler::load_double(Address src) {
3100  if (UseSSE >= 2) {
3101    movdbl(xmm0, src);
3102  } else {
3103    LP64_ONLY(ShouldNotReachHere());
3104    NOT_LP64(fld_d(src));
3105  }
3106}
3107
3108void MacroAssembler::store_double(Address dst) {
3109  if (UseSSE >= 2) {
3110    movdbl(dst, xmm0);
3111  } else {
3112    LP64_ONLY(ShouldNotReachHere());
3113    NOT_LP64(fstp_d(dst));
3114  }
3115}
3116
3117void MacroAssembler::fremr(Register tmp) {
3118  save_rax(tmp);
3119  { Label L;
3120    bind(L);
3121    fprem();
3122    fwait(); fnstsw_ax();
3123#ifdef _LP64
3124    testl(rax, 0x400);
3125    jcc(Assembler::notEqual, L);
3126#else
3127    sahf();
3128    jcc(Assembler::parity, L);
3129#endif // _LP64
3130  }
3131  restore_rax(tmp);
3132  // Result is in ST0.
3133  // Note: fxch & fpop to get rid of ST1
3134  // (otherwise FPU stack could overflow eventually)
3135  fxch(1);
3136  fpop();
3137}
3138
3139
3140void MacroAssembler::incrementl(AddressLiteral dst) {
3141  if (reachable(dst)) {
3142    incrementl(as_Address(dst));
3143  } else {
3144    lea(rscratch1, dst);
3145    incrementl(Address(rscratch1, 0));
3146  }
3147}
3148
3149void MacroAssembler::incrementl(ArrayAddress dst) {
3150  incrementl(as_Address(dst));
3151}
3152
3153void MacroAssembler::incrementl(Register reg, int value) {
3154  if (value == min_jint) {addl(reg, value) ; return; }
3155  if (value <  0) { decrementl(reg, -value); return; }
3156  if (value == 0) {                        ; return; }
3157  if (value == 1 && UseIncDec) { incl(reg) ; return; }
3158  /* else */      { addl(reg, value)       ; return; }
3159}
3160
3161void MacroAssembler::incrementl(Address dst, int value) {
3162  if (value == min_jint) {addl(dst, value) ; return; }
3163  if (value <  0) { decrementl(dst, -value); return; }
3164  if (value == 0) {                        ; return; }
3165  if (value == 1 && UseIncDec) { incl(dst) ; return; }
3166  /* else */      { addl(dst, value)       ; return; }
3167}
3168
3169void MacroAssembler::jump(AddressLiteral dst) {
3170  if (reachable(dst)) {
3171    jmp_literal(dst.target(), dst.rspec());
3172  } else {
3173    lea(rscratch1, dst);
3174    jmp(rscratch1);
3175  }
3176}
3177
3178void MacroAssembler::jump_cc(Condition cc, AddressLiteral dst) {
3179  if (reachable(dst)) {
3180    InstructionMark im(this);
3181    relocate(dst.reloc());
3182    const int short_size = 2;
3183    const int long_size = 6;
3184    int offs = (intptr_t)dst.target() - ((intptr_t)pc());
3185    if (dst.reloc() == relocInfo::none && is8bit(offs - short_size)) {
3186      // 0111 tttn #8-bit disp
3187      emit_int8(0x70 | cc);
3188      emit_int8((offs - short_size) & 0xFF);
3189    } else {
3190      // 0000 1111 1000 tttn #32-bit disp
3191      emit_int8(0x0F);
3192      emit_int8((unsigned char)(0x80 | cc));
3193      emit_int32(offs - long_size);
3194    }
3195  } else {
3196#ifdef ASSERT
3197    warning("reversing conditional branch");
3198#endif /* ASSERT */
3199    Label skip;
3200    jccb(reverse[cc], skip);
3201    lea(rscratch1, dst);
3202    Assembler::jmp(rscratch1);
3203    bind(skip);
3204  }
3205}
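
// Note (added for clarity): for reachable targets the code above picks the 2-byte
// short form (0x70|cc, rel8) when the displacement fits in 8 bits and relocation
// permits, and the 6-byte near form (0x0F, 0x80|cc, rel32) otherwise. For targets
// that are not RIP-reachable, it inverts the condition and branches around an
// indirect jump through rscratch1, preserving the original branch semantics.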
3206
3207void MacroAssembler::ldmxcsr(AddressLiteral src) {
3208  if (reachable(src)) {
3209    Assembler::ldmxcsr(as_Address(src));
3210  } else {
3211    lea(rscratch1, src);
3212    Assembler::ldmxcsr(Address(rscratch1, 0));
3213  }
3214}
3215
3216int MacroAssembler::load_signed_byte(Register dst, Address src) {
3217  int off;
3218  if (LP64_ONLY(true ||) VM_Version::is_P6()) {
3219    off = offset();
3220    movsbl(dst, src); // movsxb
3221  } else {
3222    off = load_unsigned_byte(dst, src);
3223    shll(dst, 24);
3224    sarl(dst, 24);
3225  }
3226  return off;
3227}
3228
3229// Note: load_signed_short used to be called load_signed_word.
3230// Although the 'w' in x86 opcodes refers to the term "word" in the assembler
3231// manual, which means 16 bits, that usage is found nowhere in HotSpot code.
3232// The term "word" in HotSpot means a 32- or 64-bit machine word.
3233int MacroAssembler::load_signed_short(Register dst, Address src) {
3234  int off;
3235  if (LP64_ONLY(true ||) VM_Version::is_P6()) {
3236    // This is dubious since it seems safe to do a signed 16 => 64 bit
3237    // version, but this is what 64-bit has always done. This seems to imply
3238    // that callers are only using 32 bits' worth.
3239    off = offset();
3240    movswl(dst, src); // movsxw
3241  } else {
3242    off = load_unsigned_short(dst, src);
3243    shll(dst, 16);
3244    sarl(dst, 16);
3245  }
3246  return off;
3247}
3248
3249int MacroAssembler::load_unsigned_byte(Register dst, Address src) {
3250  // According to Intel Doc. AP-526, "Zero-Extension of Short", p. 16,
3251  // and "3.9 Partial Register Penalties", p. 22.
3252  int off;
3253  if (LP64_ONLY(true || ) VM_Version::is_P6() || src.uses(dst)) {
3254    off = offset();
3255    movzbl(dst, src); // movzxb
3256  } else {
3257    xorl(dst, dst);
3258    off = offset();
3259    movb(dst, src);
3260  }
3261  return off;
3262}
3263
3264// Note: load_unsigned_short used to be called load_unsigned_word.
3265int MacroAssembler::load_unsigned_short(Register dst, Address src) {
3266  // According to Intel Doc. AP-526, "Zero-Extension of Short", p. 16,
3267  // and "3.9 Partial Register Penalties", p. 22.
3268  int off;
3269  if (LP64_ONLY(true ||) VM_Version::is_P6() || src.uses(dst)) {
3270    off = offset();
3271    movzwl(dst, src); // movzxw
3272  } else {
3273    xorl(dst, dst);
3274    off = offset();
3275    movw(dst, src);
3276  }
3277  return off;
3278}
3279
3280void MacroAssembler::load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed, Register dst2) {
3281  switch (size_in_bytes) {
3282#ifndef _LP64
3283  case  8:
3284    assert(dst2 != noreg, "second dest register required");
3285    movl(dst,  src);
3286    movl(dst2, src.plus_disp(BytesPerInt));
3287    break;
3288#else
3289  case  8:  movq(dst, src); break;
3290#endif
3291  case  4:  movl(dst, src); break;
3292  case  2:  is_signed ? load_signed_short(dst, src) : load_unsigned_short(dst, src); break;
3293  case  1:  is_signed ? load_signed_byte( dst, src) : load_unsigned_byte( dst, src); break;
3294  default:  ShouldNotReachHere();
3295  }
3296}
3297
3298void MacroAssembler::store_sized_value(Address dst, Register src, size_t size_in_bytes, Register src2) {
3299  switch (size_in_bytes) {
3300#ifndef _LP64
3301  case  8:
3302    assert(src2 != noreg, "second source register required");
3303    movl(dst,                        src);
3304    movl(dst.plus_disp(BytesPerInt), src2);
3305    break;
3306#else
3307  case  8:  movq(dst, src); break;
3308#endif
3309  case  4:  movl(dst, src); break;
3310  case  2:  movw(dst, src); break;
3311  case  1:  movb(dst, src); break;
3312  default:  ShouldNotReachHere();
3313  }
3314}
3315
3316void MacroAssembler::mov32(AddressLiteral dst, Register src) {
3317  if (reachable(dst)) {
3318    movl(as_Address(dst), src);
3319  } else {
3320    lea(rscratch1, dst);
3321    movl(Address(rscratch1, 0), src);
3322  }
3323}
3324
3325void MacroAssembler::mov32(Register dst, AddressLiteral src) {
3326  if (reachable(src)) {
3327    movl(dst, as_Address(src));
3328  } else {
3329    lea(rscratch1, src);
3330    movl(dst, Address(rscratch1, 0));
3331  }
3332}
3333
3334// C++ bool manipulation
3335
3336void MacroAssembler::movbool(Register dst, Address src) {
3337  if(sizeof(bool) == 1)
3338    movb(dst, src);
3339  else if(sizeof(bool) == 2)
3340    movw(dst, src);
3341  else if(sizeof(bool) == 4)
3342    movl(dst, src);
3343  else
3344    // unsupported
3345    ShouldNotReachHere();
3346}
3347
3348void MacroAssembler::movbool(Address dst, bool boolconst) {
3349  if(sizeof(bool) == 1)
3350    movb(dst, (int) boolconst);
3351  else if(sizeof(bool) == 2)
3352    movw(dst, (int) boolconst);
3353  else if(sizeof(bool) == 4)
3354    movl(dst, (int) boolconst);
3355  else
3356    // unsupported
3357    ShouldNotReachHere();
3358}
3359
3360void MacroAssembler::movbool(Address dst, Register src) {
3361  if(sizeof(bool) == 1)
3362    movb(dst, src);
3363  else if(sizeof(bool) == 2)
3364    movw(dst, src);
3365  else if(sizeof(bool) == 4)
3366    movl(dst, src);
3367  else
3368    // unsupported
3369    ShouldNotReachHere();
3370}
3371
3372void MacroAssembler::movbyte(ArrayAddress dst, int src) {
3373  movb(as_Address(dst), src);
3374}
3375
3376void MacroAssembler::movdl(XMMRegister dst, AddressLiteral src) {
3377  if (reachable(src)) {
3378    movdl(dst, as_Address(src));
3379  } else {
3380    lea(rscratch1, src);
3381    movdl(dst, Address(rscratch1, 0));
3382  }
3383}
3384
3385void MacroAssembler::movq(XMMRegister dst, AddressLiteral src) {
3386  if (reachable(src)) {
3387    movq(dst, as_Address(src));
3388  } else {
3389    lea(rscratch1, src);
3390    movq(dst, Address(rscratch1, 0));
3391  }
3392}
3393
3394void MacroAssembler::movdbl(XMMRegister dst, AddressLiteral src) {
3395  if (reachable(src)) {
3396    if (UseXmmLoadAndClearUpper) {
3397      movsd (dst, as_Address(src));
3398    } else {
3399      movlpd(dst, as_Address(src));
3400    }
3401  } else {
3402    lea(rscratch1, src);
3403    if (UseXmmLoadAndClearUpper) {
3404      movsd (dst, Address(rscratch1, 0));
3405    } else {
3406      movlpd(dst, Address(rscratch1, 0));
3407    }
3408  }
3409}
3410
3411void MacroAssembler::movflt(XMMRegister dst, AddressLiteral src) {
3412  if (reachable(src)) {
3413    movss(dst, as_Address(src));
3414  } else {
3415    lea(rscratch1, src);
3416    movss(dst, Address(rscratch1, 0));
3417  }
3418}
3419
3420void MacroAssembler::movptr(Register dst, Register src) {
3421  LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
3422}
3423
3424void MacroAssembler::movptr(Register dst, Address src) {
3425  LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
3426}
3427
3428// src should NEVER be a real pointer. Use AddressLiteral for true pointers
3429void MacroAssembler::movptr(Register dst, intptr_t src) {
3430  LP64_ONLY(mov64(dst, src)) NOT_LP64(movl(dst, src));
3431}
3432
3433void MacroAssembler::movptr(Address dst, Register src) {
3434  LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
3435}
3436
3437void MacroAssembler::movdqu(Address dst, XMMRegister src) {
3438  if (UseAVX > 2 && !VM_Version::supports_avx512vl() && (src->encoding() > 15)) {
3439    Assembler::vextractf32x4h(dst, src, 0);
3440  } else {
3441    Assembler::movdqu(dst, src);
3442  }
3443}
3444
3445void MacroAssembler::movdqu(XMMRegister dst, Address src) {
3446  if (UseAVX > 2 && !VM_Version::supports_avx512vl() && (dst->encoding() > 15)) {
3447    Assembler::vinsertf32x4h(dst, src, 0);
3448  } else {
3449    Assembler::movdqu(dst, src);
3450  }
3451}
3452
3453void MacroAssembler::movdqu(XMMRegister dst, XMMRegister src) {
3454  if (UseAVX > 2 && !VM_Version::supports_avx512vl()) {
3455    Assembler::evmovdqul(dst, src, Assembler::AVX_512bit);
3456  } else {
3457    Assembler::movdqu(dst, src);
3458  }
3459}
3460
3461void MacroAssembler::movdqu(XMMRegister dst, AddressLiteral src) {
3462  if (reachable(src)) {
3463    movdqu(dst, as_Address(src));
3464  } else {
3465    lea(rscratch1, src);
3466    movdqu(dst, Address(rscratch1, 0));
3467  }
3468}
3469
3470void MacroAssembler::vmovdqu(Address dst, XMMRegister src) {
3471  if (UseAVX > 2 && !VM_Version::supports_avx512vl() && (src->encoding() > 15)) {
3472    Assembler::vextractf64x4h(dst, src, 0);
3473  } else {
3474    Assembler::vmovdqu(dst, src);
3475  }
3476}
3477
3478void MacroAssembler::vmovdqu(XMMRegister dst, Address src) {
3479  if (UseAVX > 2 && !VM_Version::supports_avx512vl() && (dst->encoding() > 15)) {
3480    Assembler::vinsertf64x4h(dst, src, 0);
3481  } else {
3482    Assembler::vmovdqu(dst, src);
3483  }
3484}
3485
3486void MacroAssembler::vmovdqu(XMMRegister dst, XMMRegister src) {
3487  if (UseAVX > 2 && !VM_Version::supports_avx512vl()) {
3488    Assembler::evmovdqul(dst, src, Assembler::AVX_512bit);
3489  }
3490  else {
3491    Assembler::vmovdqu(dst, src);
3492  }
3493}
3494
3495void MacroAssembler::vmovdqu(XMMRegister dst, AddressLiteral src) {
3496  if (reachable(src)) {
3497    vmovdqu(dst, as_Address(src));
3498  }
3499  else {
3500    lea(rscratch1, src);
3501    vmovdqu(dst, Address(rscratch1, 0));
3502  }
3503}
3504
3505void MacroAssembler::movdqa(XMMRegister dst, AddressLiteral src) {
3506  if (reachable(src)) {
3507    Assembler::movdqa(dst, as_Address(src));
3508  } else {
3509    lea(rscratch1, src);
3510    Assembler::movdqa(dst, Address(rscratch1, 0));
3511  }
3512}
3513
3514void MacroAssembler::movsd(XMMRegister dst, AddressLiteral src) {
3515  if (reachable(src)) {
3516    Assembler::movsd(dst, as_Address(src));
3517  } else {
3518    lea(rscratch1, src);
3519    Assembler::movsd(dst, Address(rscratch1, 0));
3520  }
3521}
3522
3523void MacroAssembler::movss(XMMRegister dst, AddressLiteral src) {
3524  if (reachable(src)) {
3525    Assembler::movss(dst, as_Address(src));
3526  } else {
3527    lea(rscratch1, src);
3528    Assembler::movss(dst, Address(rscratch1, 0));
3529  }
3530}
3531
3532void MacroAssembler::mulsd(XMMRegister dst, AddressLiteral src) {
3533  if (reachable(src)) {
3534    Assembler::mulsd(dst, as_Address(src));
3535  } else {
3536    lea(rscratch1, src);
3537    Assembler::mulsd(dst, Address(rscratch1, 0));
3538  }
3539}
3540
3541void MacroAssembler::mulss(XMMRegister dst, AddressLiteral src) {
3542  if (reachable(src)) {
3543    Assembler::mulss(dst, as_Address(src));
3544  } else {
3545    lea(rscratch1, src);
3546    Assembler::mulss(dst, Address(rscratch1, 0));
3547  }
3548}
3549
3550void MacroAssembler::null_check(Register reg, int offset) {
3551  if (needs_explicit_null_check(offset)) {
3552    // provoke OS NULL exception if reg = NULL by
3553    // accessing M[reg] w/o changing any (non-CC) registers
3554    // NOTE: cmpl is plenty here to provoke a segv
3555    cmpptr(rax, Address(reg, 0));
3556    // Note: should probably use testl(rax, Address(reg, 0));
3557    //       may be shorter code (however, this version of
3558    //       testl needs to be implemented first)
3559  } else {
3560    // nothing to do, (later) access of M[reg + offset]
3561    // will provoke OS NULL exception if reg = NULL
3562  }
3563}
3564
3565void MacroAssembler::os_breakpoint() {
3566  // instead of directly emitting a breakpoint, call os::breakpoint for better debuggability
3567  // (e.g., MSVC can't call ps() otherwise)
3568  call(RuntimeAddress(CAST_FROM_FN_PTR(address, os::breakpoint)));
3569}
3570
3571#ifdef _LP64
3572#define XSTATE_BV 0x200
3573#endif
3574
3575void MacroAssembler::pop_CPU_state() {
3576  pop_FPU_state();
3577  pop_IU_state();
3578}
3579
3580void MacroAssembler::pop_FPU_state() {
3581#ifndef _LP64
3582  frstor(Address(rsp, 0));
3583#else
3584  fxrstor(Address(rsp, 0));
3585#endif
3586  addptr(rsp, FPUStateSizeInWords * wordSize);
3587}
3588
3589void MacroAssembler::pop_IU_state() {
3590  popa();
3591  LP64_ONLY(addq(rsp, 8));
3592  popf();
3593}
3594
3595// Save Integer and Float state
3596// Warning: Stack must be 16 byte aligned (64bit)
3597void MacroAssembler::push_CPU_state() {
3598  push_IU_state();
3599  push_FPU_state();
3600}
3601
3602void MacroAssembler::push_FPU_state() {
3603  subptr(rsp, FPUStateSizeInWords * wordSize);
3604#ifndef _LP64
3605  fnsave(Address(rsp, 0));
3606  fwait();
3607#else
3608  fxsave(Address(rsp, 0));
3609#endif // LP64
3610}
3611
3612void MacroAssembler::push_IU_state() {
3613  // Push flags first because pusha kills them
3614  pushf();
3615  // Make sure rsp stays 16-byte aligned
3616  LP64_ONLY(subq(rsp, 8));
3617  pusha();
3618}
3619
3620void MacroAssembler::reset_last_Java_frame(Register java_thread, bool clear_fp, bool clear_pc) {
3621  // determine java_thread register
3622  if (!java_thread->is_valid()) {
3623    java_thread = rdi;
3624    get_thread(java_thread);
3625  }
3626  // we must set sp to zero to clear frame
3627  movptr(Address(java_thread, JavaThread::last_Java_sp_offset()), NULL_WORD);
3628  if (clear_fp) {
3629    movptr(Address(java_thread, JavaThread::last_Java_fp_offset()), NULL_WORD);
3630  }
3631
3632  if (clear_pc)
3633    movptr(Address(java_thread, JavaThread::last_Java_pc_offset()), NULL_WORD);
3634
3635}
3636
3637void MacroAssembler::restore_rax(Register tmp) {
3638  if (tmp == noreg) pop(rax);
3639  else if (tmp != rax) mov(rax, tmp);
3640}
3641
3642void MacroAssembler::round_to(Register reg, int modulus) {
3643  addptr(reg, modulus - 1);
3644  andptr(reg, -modulus);
3645}
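
// Worked example: round_to(reg, 8) computes (reg + 7) & -8, so reg = 13 becomes
// 16 and a value that is already a multiple of 8 is left unchanged. The modulus
// must be a power of two for the mask to be valid.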
3646
3647void MacroAssembler::save_rax(Register tmp) {
3648  if (tmp == noreg) push(rax);
3649  else if (tmp != rax) mov(tmp, rax);
3650}
3651
3652// Write to the serialization page so the VM thread can do a pseudo remote membar.
3653// We use the current thread pointer to calculate a thread-specific
3654// offset to write to within the page. This minimizes bus traffic
3655// due to cache line collisions.
3656void MacroAssembler::serialize_memory(Register thread, Register tmp) {
3657  movl(tmp, thread);
3658  shrl(tmp, os::get_serialize_page_shift_count());
3659  andl(tmp, (os::vm_page_size() - sizeof(int)));
3660
3661  Address index(noreg, tmp, Address::times_1);
3662  ExternalAddress page(os::get_memory_serialize_page());
3663
3664  // Size of store must match masking code above
3665  movl(as_Address(ArrayAddress(page, index)), tmp);
3666}
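
// Illustrative arithmetic (assuming a 4 KiB page): the masked value
// (thread >> shift) & (4096 - sizeof(int)) is always a multiple of 4 in
// [0, 4092], so each thread writes to one of 1024 distinct int-sized slots in
// the serialization page, helping keep different threads off the same cache line.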
3667
3668// Calls to C land
3669//
3670// When entering C land, the rbp & rsp of the last Java frame have to be recorded
3671// in the (thread-local) JavaThread object. When leaving C land, the last Java fp
3672// has to be reset to 0. This is required to allow proper stack traversal.
3673void MacroAssembler::set_last_Java_frame(Register java_thread,
3674                                         Register last_java_sp,
3675                                         Register last_java_fp,
3676                                         address  last_java_pc) {
3677  // determine java_thread register
3678  if (!java_thread->is_valid()) {
3679    java_thread = rdi;
3680    get_thread(java_thread);
3681  }
3682  // determine last_java_sp register
3683  if (!last_java_sp->is_valid()) {
3684    last_java_sp = rsp;
3685  }
3686
3687  // last_java_fp is optional
3688
3689  if (last_java_fp->is_valid()) {
3690    movptr(Address(java_thread, JavaThread::last_Java_fp_offset()), last_java_fp);
3691  }
3692
3693  // last_java_pc is optional
3694
3695  if (last_java_pc != NULL) {
3696    lea(Address(java_thread,
3697                 JavaThread::frame_anchor_offset() + JavaFrameAnchor::last_Java_pc_offset()),
3698        InternalAddress(last_java_pc));
3699
3700  }
3701  movptr(Address(java_thread, JavaThread::last_Java_sp_offset()), last_java_sp);
3702}
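
// Hedged usage sketch (illustrative only): a native-call wrapper typically brackets
// the C call with this pair so stack walking stays possible while in C land:
//   __ set_last_Java_frame(noreg, noreg, noreg, pc_after_call);
//   __ call(RuntimeAddress(entry_point));
//   __ reset_last_Java_frame(noreg, /*clear_fp*/ true, /*clear_pc*/ true);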
3703
3704void MacroAssembler::shlptr(Register dst, int imm8) {
3705  LP64_ONLY(shlq(dst, imm8)) NOT_LP64(shll(dst, imm8));
3706}
3707
3708void MacroAssembler::shrptr(Register dst, int imm8) {
3709  LP64_ONLY(shrq(dst, imm8)) NOT_LP64(shrl(dst, imm8));
3710}
3711
3712void MacroAssembler::sign_extend_byte(Register reg) {
3713  if (LP64_ONLY(true ||) (VM_Version::is_P6() && reg->has_byte_register())) {
3714    movsbl(reg, reg); // movsxb
3715  } else {
3716    shll(reg, 24);
3717    sarl(reg, 24);
3718  }
3719}
3720
3721void MacroAssembler::sign_extend_short(Register reg) {
3722  if (LP64_ONLY(true ||) VM_Version::is_P6()) {
3723    movswl(reg, reg); // movsxw
3724  } else {
3725    shll(reg, 16);
3726    sarl(reg, 16);
3727  }
3728}
3729
3730void MacroAssembler::testl(Register dst, AddressLiteral src) {
3731  assert(reachable(src), "Address should be reachable");
3732  testl(dst, as_Address(src));
3733}
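
// Note (added for clarity): the following wrappers exist because, when AVX-512F is
// available without AVX512VL/AVX512BW (e.g. Knights Landing), the legacy SSE/AVX
// encodings of these instructions cannot name registers xmm16-xmm31. Operands that
// live in that upper bank are therefore shuffled through xmm0/xmm1, with the
// displaced registers spilled to a 64-byte stack slot and restored afterwards.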
3734
3735void MacroAssembler::pcmpeqb(XMMRegister dst, XMMRegister src) {
3736  int dst_enc = dst->encoding();
3737  int src_enc = src->encoding();
3738  if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
3739    Assembler::pcmpeqb(dst, src);
3740  } else if ((dst_enc < 16) && (src_enc < 16)) {
3741    Assembler::pcmpeqb(dst, src);
3742  } else if (src_enc < 16) {
3743    subptr(rsp, 64);
3744    evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
3745    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
3746    Assembler::pcmpeqb(xmm0, src);
3747    movdqu(dst, xmm0);
3748    evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
3749    addptr(rsp, 64);
3750  } else if (dst_enc < 16) {
3751    subptr(rsp, 64);
3752    evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
3753    evmovdqul(xmm0, src, Assembler::AVX_512bit);
3754    Assembler::pcmpeqb(dst, xmm0);
3755    evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
3756    addptr(rsp, 64);
3757  } else {
3758    subptr(rsp, 64);
3759    evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
3760    subptr(rsp, 64);
3761    evmovdqul(Address(rsp, 0), xmm1, Assembler::AVX_512bit);
3762    movdqu(xmm0, src);
3763    movdqu(xmm1, dst);
3764    Assembler::pcmpeqb(xmm1, xmm0);
3765    movdqu(dst, xmm1);
3766    evmovdqul(xmm1, Address(rsp, 0), Assembler::AVX_512bit);
3767    addptr(rsp, 64);
3768    evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
3769    addptr(rsp, 64);
3770  }
3771}
3772
3773void MacroAssembler::pcmpeqw(XMMRegister dst, XMMRegister src) {
3774  int dst_enc = dst->encoding();
3775  int src_enc = src->encoding();
3776  if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
3777    Assembler::pcmpeqw(dst, src);
3778  } else if ((dst_enc < 16) && (src_enc < 16)) {
3779    Assembler::pcmpeqw(dst, src);
3780  } else if (src_enc < 16) {
3781    subptr(rsp, 64);
3782    evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
3783    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
3784    Assembler::pcmpeqw(xmm0, src);
3785    movdqu(dst, xmm0);
3786    evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
3787    addptr(rsp, 64);
3788  } else if (dst_enc < 16) {
3789    subptr(rsp, 64);
3790    evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
3791    evmovdqul(xmm0, src, Assembler::AVX_512bit);
3792    Assembler::pcmpeqw(dst, xmm0);
3793    evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
3794    addptr(rsp, 64);
3795  } else {
3796    subptr(rsp, 64);
3797    evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
3798    subptr(rsp, 64);
3799    evmovdqul(Address(rsp, 0), xmm1, Assembler::AVX_512bit);
3800    movdqu(xmm0, src);
3801    movdqu(xmm1, dst);
3802    Assembler::pcmpeqw(xmm1, xmm0);
3803    movdqu(dst, xmm1);
3804    evmovdqul(xmm1, Address(rsp, 0), Assembler::AVX_512bit);
3805    addptr(rsp, 64);
3806    evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
3807    addptr(rsp, 64);
3808  }
3809}
3810
3811void MacroAssembler::pcmpestri(XMMRegister dst, Address src, int imm8) {
3812  int dst_enc = dst->encoding();
3813  if (dst_enc < 16) {
3814    Assembler::pcmpestri(dst, src, imm8);
3815  } else {
3816    subptr(rsp, 64);
3817    evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
3818    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
3819    Assembler::pcmpestri(xmm0, src, imm8);
3820    movdqu(dst, xmm0);
3821    evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
3822    addptr(rsp, 64);
3823  }
3824}
3825
3826void MacroAssembler::pcmpestri(XMMRegister dst, XMMRegister src, int imm8) {
3827  int dst_enc = dst->encoding();
3828  int src_enc = src->encoding();
3829  if ((dst_enc < 16) && (src_enc < 16)) {
3830    Assembler::pcmpestri(dst, src, imm8);
3831  } else if (src_enc < 16) {
3832    subptr(rsp, 64);
3833    evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
3834    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
3835    Assembler::pcmpestri(xmm0, src, imm8);
3836    movdqu(dst, xmm0);
3837    evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
3838    addptr(rsp, 64);
3839  } else if (dst_enc < 16) {
3840    subptr(rsp, 64);
3841    evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
3842    evmovdqul(xmm0, src, Assembler::AVX_512bit);
3843    Assembler::pcmpestri(dst, xmm0, imm8);
3844    evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
3845    addptr(rsp, 64);
3846  } else {
3847    subptr(rsp, 64);
3848    evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
3849    subptr(rsp, 64);
3850    evmovdqul(Address(rsp, 0), xmm1, Assembler::AVX_512bit);
3851    movdqu(xmm0, src);
3852    movdqu(xmm1, dst);
3853    Assembler::pcmpestri(xmm1, xmm0, imm8);
3854    movdqu(dst, xmm1);
3855    evmovdqul(xmm1, Address(rsp, 0), Assembler::AVX_512bit);
3856    addptr(rsp, 64);
3857    evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
3858    addptr(rsp, 64);
3859  }
3860}
3861
3862void MacroAssembler::pmovzxbw(XMMRegister dst, XMMRegister src) {
3863  int dst_enc = dst->encoding();
3864  int src_enc = src->encoding();
3865  if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
3866    Assembler::pmovzxbw(dst, src);
3867  } else if ((dst_enc < 16) && (src_enc < 16)) {
3868    Assembler::pmovzxbw(dst, src);
3869  } else if (src_enc < 16) {
3870    subptr(rsp, 64);
3871    evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
3872    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
3873    Assembler::pmovzxbw(xmm0, src);
3874    movdqu(dst, xmm0);
3875    evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
3876    addptr(rsp, 64);
3877  } else if (dst_enc < 16) {
3878    subptr(rsp, 64);
3879    evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
3880    evmovdqul(xmm0, src, Assembler::AVX_512bit);
3881    Assembler::pmovzxbw(dst, xmm0);
3882    evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
3883    addptr(rsp, 64);
3884  } else {
3885    subptr(rsp, 64);
3886    evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
3887    subptr(rsp, 64);
3888    evmovdqul(Address(rsp, 0), xmm1, Assembler::AVX_512bit);
3889    movdqu(xmm0, src);
3890    movdqu(xmm1, dst);
3891    Assembler::pmovzxbw(xmm1, xmm0);
3892    movdqu(dst, xmm1);
3893    evmovdqul(xmm1, Address(rsp, 0), Assembler::AVX_512bit);
3894    addptr(rsp, 64);
3895    evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
3896    addptr(rsp, 64);
3897  }
3898}
3899
3900void MacroAssembler::pmovzxbw(XMMRegister dst, Address src) {
3901  int dst_enc = dst->encoding();
3902  if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
3903    Assembler::pmovzxbw(dst, src);
3904  } else if (dst_enc < 16) {
3905    Assembler::pmovzxbw(dst, src);
3906  } else {
3907    subptr(rsp, 64);
3908    evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
3909    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
3910    Assembler::pmovzxbw(xmm0, src);
3911    movdqu(dst, xmm0);
3912    evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
3913    addptr(rsp, 64);
3914  }
3915}
3916
3917void MacroAssembler::pmovmskb(Register dst, XMMRegister src) {
3918  int src_enc = src->encoding();
3919  if (src_enc < 16) {
3920    Assembler::pmovmskb(dst, src);
3921  } else {
3922    subptr(rsp, 64);
3923    evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
3924    evmovdqul(xmm0, src, Assembler::AVX_512bit);
3925    Assembler::pmovmskb(dst, xmm0);
3926    evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
3927    addptr(rsp, 64);
3928  }
3929}
3930
3931void MacroAssembler::ptest(XMMRegister dst, XMMRegister src) {
3932  int dst_enc = dst->encoding();
3933  int src_enc = src->encoding();
3934  if ((dst_enc < 16) && (src_enc < 16)) {
3935    Assembler::ptest(dst, src);
3936  } else if (src_enc < 16) {
3937    subptr(rsp, 64);
3938    evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
3939    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
3940    Assembler::ptest(xmm0, src);
3941    evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
3942    addptr(rsp, 64);
3943  } else if (dst_enc < 16) {
3944    subptr(rsp, 64);
3945    evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
3946    evmovdqul(xmm0, src, Assembler::AVX_512bit);
3947    Assembler::ptest(dst, xmm0);
3948    evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
3949    addptr(rsp, 64);
3950  } else {
3951    subptr(rsp, 64);
3952    evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
3953    subptr(rsp, 64);
3954    evmovdqul(Address(rsp, 0), xmm1, Assembler::AVX_512bit);
3955    movdqu(xmm0, src);
3956    movdqu(xmm1, dst);
3957    Assembler::ptest(xmm1, xmm0);
3958    evmovdqul(xmm1, Address(rsp, 0), Assembler::AVX_512bit);
3959    addptr(rsp, 64);
3960    evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
3961    addptr(rsp, 64);
3962  }
3963}
3964
3965void MacroAssembler::sqrtsd(XMMRegister dst, AddressLiteral src) {
3966  if (reachable(src)) {
3967    Assembler::sqrtsd(dst, as_Address(src));
3968  } else {
3969    lea(rscratch1, src);
3970    Assembler::sqrtsd(dst, Address(rscratch1, 0));
3971  }
3972}
3973
3974void MacroAssembler::sqrtss(XMMRegister dst, AddressLiteral src) {
3975  if (reachable(src)) {
3976    Assembler::sqrtss(dst, as_Address(src));
3977  } else {
3978    lea(rscratch1, src);
3979    Assembler::sqrtss(dst, Address(rscratch1, 0));
3980  }
3981}
3982
3983void MacroAssembler::subsd(XMMRegister dst, AddressLiteral src) {
3984  if (reachable(src)) {
3985    Assembler::subsd(dst, as_Address(src));
3986  } else {
3987    lea(rscratch1, src);
3988    Assembler::subsd(dst, Address(rscratch1, 0));
3989  }
3990}
3991
3992void MacroAssembler::subss(XMMRegister dst, AddressLiteral src) {
3993  if (reachable(src)) {
3994    Assembler::subss(dst, as_Address(src));
3995  } else {
3996    lea(rscratch1, src);
3997    Assembler::subss(dst, Address(rscratch1, 0));
3998  }
3999}
4000
4001void MacroAssembler::ucomisd(XMMRegister dst, AddressLiteral src) {
4002  if (reachable(src)) {
4003    Assembler::ucomisd(dst, as_Address(src));
4004  } else {
4005    lea(rscratch1, src);
4006    Assembler::ucomisd(dst, Address(rscratch1, 0));
4007  }
4008}
4009
4010void MacroAssembler::ucomiss(XMMRegister dst, AddressLiteral src) {
4011  if (reachable(src)) {
4012    Assembler::ucomiss(dst, as_Address(src));
4013  } else {
4014    lea(rscratch1, src);
4015    Assembler::ucomiss(dst, Address(rscratch1, 0));
4016  }
4017}
4018
4019void MacroAssembler::xorpd(XMMRegister dst, AddressLiteral src) {
4020  // Used in sign-bit flipping with aligned address.
4021  assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
4022  if (reachable(src)) {
4023    Assembler::xorpd(dst, as_Address(src));
4024  } else {
4025    lea(rscratch1, src);
4026    Assembler::xorpd(dst, Address(rscratch1, 0));
4027  }
4028}
4029
4030void MacroAssembler::xorpd(XMMRegister dst, XMMRegister src) {
4031  if (UseAVX > 2 && !VM_Version::supports_avx512dq() && (dst->encoding() == src->encoding())) {
4032    Assembler::vpxor(dst, dst, src, Assembler::AVX_512bit);
4033  }
4034  else {
4035    Assembler::xorpd(dst, src);
4036  }
4037}
4038
4039void MacroAssembler::xorps(XMMRegister dst, XMMRegister src) {
4040  if (UseAVX > 2 && !VM_Version::supports_avx512dq() && (dst->encoding() == src->encoding())) {
4041    Assembler::vpxor(dst, dst, src, Assembler::AVX_512bit);
4042  } else {
4043    Assembler::xorps(dst, src);
4044  }
4045}
4046
4047void MacroAssembler::xorps(XMMRegister dst, AddressLiteral src) {
4048  // Used in sign-bit flipping with aligned address.
4049  assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
4050  if (reachable(src)) {
4051    Assembler::xorps(dst, as_Address(src));
4052  } else {
4053    lea(rscratch1, src);
4054    Assembler::xorps(dst, Address(rscratch1, 0));
4055  }
4056}
4057
4058void MacroAssembler::pshufb(XMMRegister dst, AddressLiteral src) {
4059  // Used in sign-bit flipping with aligned address.
4060  bool aligned_adr = (((intptr_t)src.target() & 15) == 0);
4061  assert((UseAVX > 0) || aligned_adr, "SSE mode requires address alignment 16 bytes");
4062  if (reachable(src)) {
4063    Assembler::pshufb(dst, as_Address(src));
4064  } else {
4065    lea(rscratch1, src);
4066    Assembler::pshufb(dst, Address(rscratch1, 0));
4067  }
4068}
4069
4070// AVX 3-operands instructions
4071
4072void MacroAssembler::vaddsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
4073  if (reachable(src)) {
4074    vaddsd(dst, nds, as_Address(src));
4075  } else {
4076    lea(rscratch1, src);
4077    vaddsd(dst, nds, Address(rscratch1, 0));
4078  }
4079}
4080
4081void MacroAssembler::vaddss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
4082  if (reachable(src)) {
4083    vaddss(dst, nds, as_Address(src));
4084  } else {
4085    lea(rscratch1, src);
4086    vaddss(dst, nds, Address(rscratch1, 0));
4087  }
4088}
4089
4090void MacroAssembler::vabsss(XMMRegister dst, XMMRegister nds, XMMRegister src, AddressLiteral negate_field, int vector_len) {
4091  int dst_enc = dst->encoding();
4092  int nds_enc = nds->encoding();
4093  int src_enc = src->encoding();
4094  if ((dst_enc < 16) && (nds_enc < 16)) {
4095    vandps(dst, nds, negate_field, vector_len);
4096  } else if ((src_enc < 16) && (dst_enc < 16)) {
4097    movss(src, nds);
4098    vandps(dst, src, negate_field, vector_len);
4099  } else if (src_enc < 16) {
4100    movss(src, nds);
4101    vandps(src, src, negate_field, vector_len);
4102    movss(dst, src);
4103  } else if (dst_enc < 16) {
4104    movdqu(src, xmm0);
4105    movss(xmm0, nds);
4106    vandps(dst, xmm0, negate_field, vector_len);
4107    movdqu(xmm0, src);
4108  } else if (nds_enc < 16) {
4109    movdqu(src, xmm0);
4110    vandps(xmm0, nds, negate_field, vector_len);
4111    movss(dst, xmm0);
4112    movdqu(xmm0, src);
4113  } else {
4114    movdqu(src, xmm0);
4115    movss(xmm0, nds);
4116    vandps(xmm0, xmm0, negate_field, vector_len);
4117    movss(dst, xmm0);
4118    movdqu(xmm0, src);
4119  }
4120}
4121
4122void MacroAssembler::vabssd(XMMRegister dst, XMMRegister nds, XMMRegister src, AddressLiteral negate_field, int vector_len) {
4123  int dst_enc = dst->encoding();
4124  int nds_enc = nds->encoding();
4125  int src_enc = src->encoding();
4126  if ((dst_enc < 16) && (nds_enc < 16)) {
4127    vandpd(dst, nds, negate_field, vector_len);
4128  } else if ((src_enc < 16) && (dst_enc < 16)) {
4129    movsd(src, nds);
4130    vandpd(dst, src, negate_field, vector_len);
4131  } else if (src_enc < 16) {
4132    movsd(src, nds);
4133    vandpd(src, src, negate_field, vector_len);
4134    movsd(dst, src);
4135  } else if (dst_enc < 16) {
4136    movdqu(src, xmm0);
4137    movsd(xmm0, nds);
4138    vandpd(dst, xmm0, negate_field, vector_len);
4139    movdqu(xmm0, src);
4140  } else if (nds_enc < 16) {
4141    movdqu(src, xmm0);
4142    vandpd(xmm0, nds, negate_field, vector_len);
4143    movsd(dst, xmm0);
4144    movdqu(xmm0, src);
4145  } else {
4146    movdqu(src, xmm0);
4147    movsd(xmm0, nds);
4148    vandpd(xmm0, xmm0, negate_field, vector_len);
4149    movsd(dst, xmm0);
4150    movdqu(xmm0, src);
4151  }
4152}
4153
4154void MacroAssembler::vpaddb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
4155  int dst_enc = dst->encoding();
4156  int nds_enc = nds->encoding();
4157  int src_enc = src->encoding();
4158  if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
4159    Assembler::vpaddb(dst, nds, src, vector_len);
4160  } else if ((dst_enc < 16) && (src_enc < 16)) {
4161    Assembler::vpaddb(dst, dst, src, vector_len);
4162  } else if ((dst_enc < 16) && (nds_enc < 16)) {
4163    // use nds as scratch for src
4164    evmovdqul(nds, src, Assembler::AVX_512bit);
4165    Assembler::vpaddb(dst, dst, nds, vector_len);
4166  } else if ((src_enc < 16) && (nds_enc < 16)) {
4167    // use nds as scratch for dst
4168    evmovdqul(nds, dst, Assembler::AVX_512bit);
4169    Assembler::vpaddb(nds, nds, src, vector_len);
4170    evmovdqul(dst, nds, Assembler::AVX_512bit);
4171  } else if (dst_enc < 16) {
4172    // use nds as scratch for xmm0 to hold src
4173    evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4174    evmovdqul(xmm0, src, Assembler::AVX_512bit);
4175    Assembler::vpaddb(dst, dst, xmm0, vector_len);
4176    evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4177  } else {
4178    // worst-case scenario, all regs are in the upper bank
4179    subptr(rsp, 64);
4180    evmovdqul(Address(rsp, 0), xmm1, Assembler::AVX_512bit);
4181    evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4182    evmovdqul(xmm1, src, Assembler::AVX_512bit);
4183    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4184    Assembler::vpaddb(xmm0, xmm0, xmm1, vector_len);
4185    evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4186    evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4187    evmovdqul(xmm1, Address(rsp, 0), Assembler::AVX_512bit);
4188    addptr(rsp, 64);
4189  }
4190}
4191
4192void MacroAssembler::vpaddb(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
4193  int dst_enc = dst->encoding();
4194  int nds_enc = nds->encoding();
4195  if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
4196    Assembler::vpaddb(dst, nds, src, vector_len);
4197  } else if (dst_enc < 16) {
4198    Assembler::vpaddb(dst, dst, src, vector_len);
4199  } else if (nds_enc < 16) {
4200    // implies dst_enc in upper bank, with nds used as the scratch
4201    evmovdqul(nds, dst, Assembler::AVX_512bit);
4202    Assembler::vpaddb(nds, nds, src, vector_len);
4203    evmovdqul(dst, nds, Assembler::AVX_512bit);
4204  } else {
4205    // worst-case scenario, all regs in upper bank
4206    evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4207    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4208    Assembler::vpaddb(xmm0, xmm0, src, vector_len);
4209    evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4210  }
4211}
4212
4213void MacroAssembler::vpaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
4214  int dst_enc = dst->encoding();
4215  int nds_enc = nds->encoding();
4216  int src_enc = src->encoding();
4217  if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
4218    Assembler::vpaddw(dst, nds, src, vector_len);
4219  } else if ((dst_enc < 16) && (src_enc < 16)) {
4220    Assembler::vpaddw(dst, dst, src, vector_len);
4221  } else if ((dst_enc < 16) && (nds_enc < 16)) {
4222    // use nds as scratch for src
4223    evmovdqul(nds, src, Assembler::AVX_512bit);
4224    Assembler::vpaddw(dst, dst, nds, vector_len);
4225  } else if ((src_enc < 16) && (nds_enc < 16)) {
4226    // use nds as scratch for dst
4227    evmovdqul(nds, dst, Assembler::AVX_512bit);
4228    Assembler::vpaddw(nds, nds, src, vector_len);
4229    evmovdqul(dst, nds, Assembler::AVX_512bit);
4230  } else if (dst_enc < 16) {
4231    // use nds as scratch for xmm0 to hold src
4232    evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4233    evmovdqul(xmm0, src, Assembler::AVX_512bit);
4234    Assembler::vpaddw(dst, dst, xmm0, vector_len);
4235    evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4236  } else {
4237    // worst-case scenario, all regs are in the upper bank
4238    subptr(rsp, 64);
4239    evmovdqul(Address(rsp, 0), xmm1, Assembler::AVX_512bit);
4240    evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4241    evmovdqul(xmm1, src, Assembler::AVX_512bit);
4242    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4243    Assembler::vpaddw(xmm0, xmm0, xmm1, vector_len);
4244    evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4245    evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4246    evmovdqul(xmm1, Address(rsp, 0), Assembler::AVX_512bit);
4247    addptr(rsp, 64);
4248  }
4249}
4250
4251void MacroAssembler::vpaddw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
4252  int dst_enc = dst->encoding();
4253  int nds_enc = nds->encoding();
4254  if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
4255    Assembler::vpaddw(dst, nds, src, vector_len);
4256  } else if (dst_enc < 16) {
4257    Assembler::vpaddw(dst, dst, src, vector_len);
4258  } else if (nds_enc < 16) {
4259    // implies dst_enc in upper bank, with nds used as the scratch
4260    evmovdqul(nds, dst, Assembler::AVX_512bit);
4261    Assembler::vpaddw(nds, nds, src, vector_len);
4262    evmovdqul(dst, nds, Assembler::AVX_512bit);
4263  } else {
4264    // worst-case scenario, all regs in upper bank
4265    evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4266    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4267    Assembler::vpaddw(xmm0, xmm0, src, vector_len);
4268    evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4269  }
4270}
4271
4272void MacroAssembler::vpbroadcastw(XMMRegister dst, XMMRegister src) {
4273  int dst_enc = dst->encoding();
4274  int src_enc = src->encoding();
4275  if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
4276    Assembler::vpbroadcastw(dst, src);
4277  } else if ((dst_enc < 16) && (src_enc < 16)) {
4278    Assembler::vpbroadcastw(dst, src);
4279  } else if (src_enc < 16) {
4280    subptr(rsp, 64);
4281    evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
4282    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4283    Assembler::vpbroadcastw(xmm0, src);
4284    movdqu(dst, xmm0);
4285    evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
4286    addptr(rsp, 64);
4287  } else if (dst_enc < 16) {
4288    subptr(rsp, 64);
4289    evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
4290    evmovdqul(xmm0, src, Assembler::AVX_512bit);
4291    Assembler::vpbroadcastw(dst, xmm0);
4292    evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
4293    addptr(rsp, 64);
4294  } else {
4295    subptr(rsp, 64);
4296    evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
4297    subptr(rsp, 64);
4298    evmovdqul(Address(rsp, 0), xmm1, Assembler::AVX_512bit);
4299    movdqu(xmm0, src);
4300    movdqu(xmm1, dst);
4301    Assembler::vpbroadcastw(xmm1, xmm0);
4302    movdqu(dst, xmm1);
4303    evmovdqul(xmm1, Address(rsp, 0), Assembler::AVX_512bit);
4304    addptr(rsp, 64);
4305    evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
4306    addptr(rsp, 64);
4307  }
4308}
4309
4310void MacroAssembler::vpcmpeqb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
4311  int dst_enc = dst->encoding();
4312  int nds_enc = nds->encoding();
4313  int src_enc = src->encoding();
4314  assert(dst_enc == nds_enc, "");
4315  if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
4316    Assembler::vpcmpeqb(dst, nds, src, vector_len);
4317  } else if ((dst_enc < 16) && (src_enc < 16)) {
4318    Assembler::vpcmpeqb(dst, nds, src, vector_len);
4319  } else if (src_enc < 16) {
4320    subptr(rsp, 64);
4321    evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
4322    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4323    Assembler::vpcmpeqb(xmm0, xmm0, src, vector_len);
4324    movdqu(dst, xmm0);
4325    evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
4326    addptr(rsp, 64);
4327  } else if (dst_enc < 16) {
4328    subptr(rsp, 64);
4329    evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
4330    evmovdqul(xmm0, src, Assembler::AVX_512bit);
4331    Assembler::vpcmpeqb(dst, dst, xmm0, vector_len);
4332    evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
4333    addptr(rsp, 64);
4334  } else {
4335    subptr(rsp, 64);
4336    evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
4337    subptr(rsp, 64);
4338    evmovdqul(Address(rsp, 0), xmm1, Assembler::AVX_512bit);
4339    movdqu(xmm0, src);
4340    movdqu(xmm1, dst);
4341    Assembler::vpcmpeqb(xmm1, xmm1, xmm0, vector_len);
4342    movdqu(dst, xmm1);
4343    evmovdqul(xmm1, Address(rsp, 0), Assembler::AVX_512bit);
4344    addptr(rsp, 64);
4345    evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
4346    addptr(rsp, 64);
4347  }
4348}
4349
4350void MacroAssembler::vpcmpeqw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
4351  int dst_enc = dst->encoding();
4352  int nds_enc = nds->encoding();
4353  int src_enc = src->encoding();
4354  assert(dst_enc == nds_enc, "");
4355  if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
4356    Assembler::vpcmpeqw(dst, nds, src, vector_len);
4357  } else if ((dst_enc < 16) && (src_enc < 16)) {
4358    Assembler::vpcmpeqw(dst, nds, src, vector_len);
4359  } else if (src_enc < 16) {
4360    subptr(rsp, 64);
4361    evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
4362    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4363    Assembler::vpcmpeqw(xmm0, xmm0, src, vector_len);
4364    movdqu(dst, xmm0);
4365    evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
4366    addptr(rsp, 64);
4367  } else if (dst_enc < 16) {
4368    subptr(rsp, 64);
4369    evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
4370    evmovdqul(xmm0, src, Assembler::AVX_512bit);
4371    Assembler::vpcmpeqw(dst, dst, xmm0, vector_len);
4372    evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
4373    addptr(rsp, 64);
4374  } else {
4375    subptr(rsp, 64);
4376    evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
4377    subptr(rsp, 64);
4378    evmovdqul(Address(rsp, 0), xmm1, Assembler::AVX_512bit);
4379    movdqu(xmm0, src);
4380    movdqu(xmm1, dst);
4381    Assembler::vpcmpeqw(xmm1, xmm1, xmm0, vector_len);
4382    movdqu(dst, xmm1);
4383    evmovdqul(xmm1, Address(rsp, 0), Assembler::AVX_512bit);
4384    addptr(rsp, 64);
4385    evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
4386    addptr(rsp, 64);
4387  }
4388}
4389
4390void MacroAssembler::vpmovzxbw(XMMRegister dst, Address src, int vector_len) {
4391  int dst_enc = dst->encoding();
4392  if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
4393    Assembler::vpmovzxbw(dst, src, vector_len);
4394  } else if (dst_enc < 16) {
4395    Assembler::vpmovzxbw(dst, src, vector_len);
4396  } else {
4397    subptr(rsp, 64);
4398    evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
4399    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4400    Assembler::vpmovzxbw(xmm0, src, vector_len);
4401    movdqu(dst, xmm0);
4402    evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
4403    addptr(rsp, 64);
4404  }
4405}
4406
4407void MacroAssembler::vpmovmskb(Register dst, XMMRegister src) {
4408  int src_enc = src->encoding();
4409  if (src_enc < 16) {
4410    Assembler::vpmovmskb(dst, src);
4411  } else {
4412    subptr(rsp, 64);
4413    evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
4414    evmovdqul(xmm0, src, Assembler::AVX_512bit);
4415    Assembler::vpmovmskb(dst, xmm0);
4416    evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
4417    addptr(rsp, 64);
4418  }
4419}
4420
4421void MacroAssembler::vpmullw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
4422  int dst_enc = dst->encoding();
4423  int nds_enc = nds->encoding();
4424  int src_enc = src->encoding();
4425  if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
4426    Assembler::vpmullw(dst, nds, src, vector_len);
4427  } else if ((dst_enc < 16) && (src_enc < 16)) {
4428    Assembler::vpmullw(dst, dst, src, vector_len);
4429  } else if ((dst_enc < 16) && (nds_enc < 16)) {
4430    // use nds as scratch for src
4431    evmovdqul(nds, src, Assembler::AVX_512bit);
4432    Assembler::vpmullw(dst, dst, nds, vector_len);
4433  } else if ((src_enc < 16) && (nds_enc < 16)) {
4434    // use nds as scratch for dst
4435    evmovdqul(nds, dst, Assembler::AVX_512bit);
4436    Assembler::vpmullw(nds, nds, src, vector_len);
4437    evmovdqul(dst, nds, Assembler::AVX_512bit);
4438  } else if (dst_enc < 16) {
4439    // save xmm0 in nds and use xmm0 to hold src
4440    evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4441    evmovdqul(xmm0, src, Assembler::AVX_512bit);
4442    Assembler::vpmullw(dst, dst, xmm0, vector_len);
4443    evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4444  } else {
4445    // worst case scenario, all regs are in the upper bank
4446    subptr(rsp, 64);
4447    evmovdqul(Address(rsp, 0), xmm1, Assembler::AVX_512bit);
4448    evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4449    evmovdqul(xmm1, src, Assembler::AVX_512bit);
4450    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4451    Assembler::vpmullw(xmm0, xmm0, xmm1, vector_len);
4452    evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4453    evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4454    evmovdqul(xmm1, Address(rsp, 0), Assembler::AVX_512bit);
4455    addptr(rsp, 64);
4456  }
4457}
4458
4459void MacroAssembler::vpmullw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
4460  int dst_enc = dst->encoding();
4461  int nds_enc = nds->encoding();
4462  if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
4463    Assembler::vpmullw(dst, nds, src, vector_len);
4464  } else if (dst_enc < 16) {
4465    Assembler::vpmullw(dst, dst, src, vector_len);
4466  } else if (nds_enc < 16) {
4467    // implies dst_enc is in the upper bank; use nds as scratch for dst
4468    evmovdqul(nds, dst, Assembler::AVX_512bit);
4469    Assembler::vpmullw(nds, nds, src, vector_len);
4470    evmovdqul(dst, nds, Assembler::AVX_512bit);
4471  } else {
4472    // worst case scenario, all regs in upper bank
4473    evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4474    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4475    Assembler::vpmullw(xmm0, xmm0, src, vector_len);
4476    evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4477  }
4478}
4479
4480void MacroAssembler::vpsubb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
4481  int dst_enc = dst->encoding();
4482  int nds_enc = nds->encoding();
4483  int src_enc = src->encoding();
4484  if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
4485    Assembler::vpsubb(dst, nds, src, vector_len);
4486  } else if ((dst_enc < 16) && (src_enc < 16)) {
4487    Assembler::vpsubb(dst, dst, src, vector_len);
4488  } else if ((dst_enc < 16) && (nds_enc < 16)) {
4489    // use nds as scratch for src
4490    evmovdqul(nds, src, Assembler::AVX_512bit);
4491    Assembler::vpsubb(dst, dst, nds, vector_len);
4492  } else if ((src_enc < 16) && (nds_enc < 16)) {
4493    // use nds as scratch for dst
4494    evmovdqul(nds, dst, Assembler::AVX_512bit);
4495    Assembler::vpsubb(nds, nds, src, vector_len);
4496    evmovdqul(dst, nds, Assembler::AVX_512bit);
4497  } else if (dst_enc < 16) {
4498    // save xmm0 in nds and use xmm0 to hold src
4499    evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4500    evmovdqul(xmm0, src, Assembler::AVX_512bit);
4501    Assembler::vpsubb(dst, dst, xmm0, vector_len);
4502    evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4503  } else {
4504    // worst case scenario, all regs are in the upper bank
4505    subptr(rsp, 64);
4506    evmovdqul(Address(rsp, 0), xmm1, Assembler::AVX_512bit);
4507    evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4508    evmovdqul(xmm1, src, Assembler::AVX_512bit);
4509    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4510    Assembler::vpsubb(xmm0, xmm0, xmm1, vector_len);
4511    evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4512    evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4513    evmovdqul(xmm1, Address(rsp, 0), Assembler::AVX_512bit);
4514    addptr(rsp, 64);
4515  }
4516}
4517
4518void MacroAssembler::vpsubb(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
4519  int dst_enc = dst->encoding();
4520  int nds_enc = nds->encoding();
4521  if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
4522    Assembler::vpsubb(dst, nds, src, vector_len);
4523  } else if (dst_enc < 16) {
4524    Assembler::vpsubb(dst, dst, src, vector_len);
4525  } else if (nds_enc < 16) {
4526    // implies dst_enc is in the upper bank; use nds as scratch for dst
4527    evmovdqul(nds, dst, Assembler::AVX_512bit);
4528    Assembler::vpsubb(nds, nds, src, vector_len);
4529    evmovdqul(dst, nds, Assembler::AVX_512bit);
4530  } else {
4531    // worst case scenario, all regs in upper bank
4532    evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4533    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4534    Assembler::vpsubb(xmm0, xmm0, src, vector_len);
4535    evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4536  }
4537}
4538
4539void MacroAssembler::vpsubw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
4540  int dst_enc = dst->encoding();
4541  int nds_enc = nds->encoding();
4542  int src_enc = src->encoding();
4543  if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
4544    Assembler::vpsubw(dst, nds, src, vector_len);
4545  } else if ((dst_enc < 16) && (src_enc < 16)) {
4546    Assembler::vpsubw(dst, dst, src, vector_len);
4547  } else if ((dst_enc < 16) && (nds_enc < 16)) {
4548    // use nds as scratch for src
4549    evmovdqul(nds, src, Assembler::AVX_512bit);
4550    Assembler::vpsubw(dst, dst, nds, vector_len);
4551  } else if ((src_enc < 16) && (nds_enc < 16)) {
4552    // use nds as scratch for dst
4553    evmovdqul(nds, dst, Assembler::AVX_512bit);
4554    Assembler::vpsubw(nds, nds, src, vector_len);
4555    evmovdqul(dst, nds, Assembler::AVX_512bit);
4556  } else if (dst_enc < 16) {
4557    // save xmm0 in nds and use xmm0 to hold src
4558    evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4559    evmovdqul(xmm0, src, Assembler::AVX_512bit);
4560    Assembler::vpsubw(dst, dst, xmm0, vector_len);
4561    evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4562  } else {
4563    // worst case scenario, all regs are in the upper bank
4564    subptr(rsp, 64);
4565    evmovdqul(Address(rsp, 0), xmm1, Assembler::AVX_512bit);
4566    evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4567    evmovdqul(xmm1, src, Assembler::AVX_512bit);
4568    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4569    Assembler::vpsubw(xmm0, xmm0, xmm1, vector_len);
4570    evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4571    evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4572    evmovdqul(xmm1, Address(rsp, 0), Assembler::AVX_512bit);
4573    addptr(rsp, 64);
4574  }
4575}
4576
4577void MacroAssembler::vpsubw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
4578  int dst_enc = dst->encoding();
4579  int nds_enc = nds->encoding();
4580  if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
4581    Assembler::vpsubw(dst, nds, src, vector_len);
4582  } else if (dst_enc < 16) {
4583    Assembler::vpsubw(dst, dst, src, vector_len);
4584  } else if (nds_enc < 16) {
4585    // implies dst_enc is in the upper bank; use nds as scratch for dst
4586    evmovdqul(nds, dst, Assembler::AVX_512bit);
4587    Assembler::vpsubw(nds, nds, src, vector_len);
4588    evmovdqul(dst, nds, Assembler::AVX_512bit);
4589  } else {
4590    // worst case scenario, all regs in upper bank
4591    evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4592    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4593    Assembler::vpsubw(xmm0, xmm0, src, vector_len);
4594    evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4595  }
4596}
4597
4598void MacroAssembler::vpsraw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
4599  int dst_enc = dst->encoding();
4600  int nds_enc = nds->encoding();
4601  int shift_enc = shift->encoding();
4602  if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
4603    Assembler::vpsraw(dst, nds, shift, vector_len);
4604  } else if ((dst_enc < 16) && (shift_enc < 16)) {
4605    Assembler::vpsraw(dst, dst, shift, vector_len);
4606  } else if ((dst_enc < 16) && (nds_enc < 16)) {
4607    // use nds as scratch to hold shift
4608    evmovdqul(nds, shift, Assembler::AVX_512bit);
4609    Assembler::vpsraw(dst, dst, nds, vector_len);
4610  } else if ((shift_enc < 16) && (nds_enc < 16)) {
4611    // use nds as scratch to hold dst
4612    evmovdqul(nds, dst, Assembler::AVX_512bit);
4613    Assembler::vpsraw(nds, nds, shift, vector_len);
4614    evmovdqul(dst, nds, Assembler::AVX_512bit);
4615  } else if (dst_enc < 16) {
4616    // save a copy of xmm0 in nds, then use xmm0 to hold shift
4617    evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4618    evmovdqul(xmm0, shift, Assembler::AVX_512bit);
4619    Assembler::vpsraw(dst, dst, xmm0, vector_len);
4620    evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4621  } else if (nds_enc < 16) {
4622    // use both nds and dst as temps
4623    evmovdqul(nds, dst, Assembler::AVX_512bit);
4624    evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4625    evmovdqul(xmm0, shift, Assembler::AVX_512bit);
4626    Assembler::vpsraw(nds, nds, xmm0, vector_len);
4627    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4628    evmovdqul(dst, nds, Assembler::AVX_512bit);
4629  } else {
4630    // worst case scenario, all regs are in the upper bank
4631    subptr(rsp, 64);
4632    evmovdqul(Address(rsp, 0), xmm1, Assembler::AVX_512bit);
4633    evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4634    evmovdqul(xmm1, shift, Assembler::AVX_512bit);
4635    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4636    Assembler::vpsraw(xmm0, xmm0, xmm1, vector_len);
4637    evmovdqul(xmm1, dst, Assembler::AVX_512bit);
4638    evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4639    evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4640    evmovdqul(xmm1, Address(rsp, 0), Assembler::AVX_512bit);
4641    addptr(rsp, 64);
4642  }
4643}
4644
4645void MacroAssembler::vpsraw(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
4646  int dst_enc = dst->encoding();
4647  int nds_enc = nds->encoding();
4648  if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
4649    Assembler::vpsraw(dst, nds, shift, vector_len);
4650  } else if (dst_enc < 16) {
4651    Assembler::vpsraw(dst, dst, shift, vector_len);
4652  } else if (nds_enc < 16) {
4653    // use nds as scratch
4654    evmovdqul(nds, dst, Assembler::AVX_512bit);
4655    Assembler::vpsraw(nds, nds, shift, vector_len);
4656    evmovdqul(dst, nds, Assembler::AVX_512bit);
4657  } else {
4658    // use nds as scratch for xmm0
4659    evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4660    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4661    Assembler::vpsraw(xmm0, xmm0, shift, vector_len);
4662    evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4663  }
4664}
4665
4666void MacroAssembler::vpsrlw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
4667  int dst_enc = dst->encoding();
4668  int nds_enc = nds->encoding();
4669  int shift_enc = shift->encoding();
4670  if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
4671    Assembler::vpsrlw(dst, nds, shift, vector_len);
4672  } else if ((dst_enc < 16) && (shift_enc < 16)) {
4673    Assembler::vpsrlw(dst, dst, shift, vector_len);
4674  } else if ((dst_enc < 16) && (nds_enc < 16)) {
4675    // use nds as scratch to hold shift
4676    evmovdqul(nds, shift, Assembler::AVX_512bit);
4677    Assembler::vpsrlw(dst, dst, nds, vector_len);
4678  } else if ((shift_enc < 16) && (nds_enc < 16)) {
4679    // use nds as scratch to hold dst
4680    evmovdqul(nds, dst, Assembler::AVX_512bit);
4681    Assembler::vpsrlw(nds, nds, shift, vector_len);
4682    evmovdqul(dst, nds, Assembler::AVX_512bit);
4683  } else if (dst_enc < 16) {
4684    // save a copy of xmm0 in nds, then use xmm0 to hold shift
4685    evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4686    evmovdqul(xmm0, shift, Assembler::AVX_512bit);
4687    Assembler::vpsrlw(dst, dst, xmm0, vector_len);
4688    evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4689  } else if (nds_enc < 16) {
4690    // use both nds and dst as temps
4691    evmovdqul(nds, dst, Assembler::AVX_512bit);
4692    evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4693    evmovdqul(xmm0, shift, Assembler::AVX_512bit);
4694    Assembler::vpsrlw(nds, nds, xmm0, vector_len);
4695    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4696    evmovdqul(dst, nds, Assembler::AVX_512bit);
4697  } else {
4698    // worst case scenario, all regs are in the upper bank
4699    subptr(rsp, 64);
4700    evmovdqul(Address(rsp, 0), xmm1, Assembler::AVX_512bit);
4701    evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4702    evmovdqul(xmm1, shift, Assembler::AVX_512bit);
4703    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4704    Assembler::vpsrlw(xmm0, xmm0, xmm1, vector_len);
4705    evmovdqul(xmm1, dst, Assembler::AVX_512bit);
4706    evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4707    evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4708    evmovdqul(xmm1, Address(rsp, 0), Assembler::AVX_512bit);
4709    addptr(rsp, 64);
4710  }
4711}
4712
4713void MacroAssembler::vpsrlw(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
4714  int dst_enc = dst->encoding();
4715  int nds_enc = nds->encoding();
4716  if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
4717    Assembler::vpsrlw(dst, nds, shift, vector_len);
4718  } else if (dst_enc < 16) {
4719    Assembler::vpsrlw(dst, dst, shift, vector_len);
4720  } else if (nds_enc < 16) {
4721    // use nds as scratch
4722    evmovdqul(nds, dst, Assembler::AVX_512bit);
4723    Assembler::vpsrlw(nds, nds, shift, vector_len);
4724    evmovdqul(dst, nds, Assembler::AVX_512bit);
4725  } else {
4726    // use nds as scratch for xmm0
4727    evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4728    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4729    Assembler::vpsrlw(xmm0, xmm0, shift, vector_len);
4730    evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4731  }
4732}
4733
4734void MacroAssembler::vpsllw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) {
4735  int dst_enc = dst->encoding();
4736  int nds_enc = nds->encoding();
4737  int shift_enc = shift->encoding();
4738  if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
4739    Assembler::vpsllw(dst, nds, shift, vector_len);
4740  } else if ((dst_enc < 16) && (shift_enc < 16)) {
4741    Assembler::vpsllw(dst, dst, shift, vector_len);
4742  } else if ((dst_enc < 16) && (nds_enc < 16)) {
4743    // use nds as scratch to hold shift
4744    evmovdqul(nds, shift, Assembler::AVX_512bit);
4745    Assembler::vpsllw(dst, dst, nds, vector_len);
4746  } else if ((shift_enc < 16) && (nds_enc < 16)) {
4747    // use nds as scratch to hold dst
4748    evmovdqul(nds, dst, Assembler::AVX_512bit);
4749    Assembler::vpsllw(nds, nds, shift, vector_len);
4750    evmovdqul(dst, nds, Assembler::AVX_512bit);
4751  } else if (dst_enc < 16) {
4752    // save a copy of xmm0 in nds, then use xmm0 to hold shift
4753    evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4754    evmovdqul(xmm0, shift, Assembler::AVX_512bit);
4755    Assembler::vpsllw(dst, dst, xmm0, vector_len);
4756    evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4757  } else if (nds_enc < 16) {
4758    // use both nds and dst as temps
4759    evmovdqul(nds, dst, Assembler::AVX_512bit);
4760    evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4761    evmovdqul(xmm0, shift, Assembler::AVX_512bit);
4762    Assembler::vpsllw(nds, nds, xmm0, vector_len);
4763    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4764    evmovdqul(dst, nds, Assembler::AVX_512bit);
4765  } else {
4766    // worst case scenario, all regs are in the upper bank
4767    subptr(rsp, 64);
4768    evmovdqul(Address(rsp, 0), xmm1, Assembler::AVX_512bit);
4769    evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4770    evmovdqul(xmm1, shift, Assembler::AVX_512bit);
4771    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4772    Assembler::vpsllw(xmm0, xmm0, xmm1, vector_len);
4773    evmovdqul(xmm1, dst, Assembler::AVX_512bit);
4774    evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4775    evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4776    evmovdqul(xmm1, Address(rsp, 0), Assembler::AVX_512bit);
4777    addptr(rsp, 64);
4778  }
4779}
4780
4781void MacroAssembler::vpsllw(XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
4782  int dst_enc = dst->encoding();
4783  int nds_enc = nds->encoding();
4784  if (VM_Version::supports_avxonly() || VM_Version::supports_avx512bw()) {
4785    Assembler::vpsllw(dst, nds, shift, vector_len);
4786  } else if (dst_enc < 16) {
4787    Assembler::vpsllw(dst, dst, shift, vector_len);
4788  } else if (nds_enc < 16) {
4789    // use nds as scratch
4790    evmovdqul(nds, dst, Assembler::AVX_512bit);
4791    Assembler::vpsllw(nds, nds, shift, vector_len);
4792    evmovdqul(dst, nds, Assembler::AVX_512bit);
4793  } else {
4794    // use nds as scratch for xmm0
4795    evmovdqul(nds, xmm0, Assembler::AVX_512bit);
4796    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4797    Assembler::vpsllw(xmm0, xmm0, shift, vector_len);
4798    evmovdqul(xmm0, nds, Assembler::AVX_512bit);
4799  }
4800}
4801
4802void MacroAssembler::vptest(XMMRegister dst, XMMRegister src) {
4803  int dst_enc = dst->encoding();
4804  int src_enc = src->encoding();
4805  if ((dst_enc < 16) && (src_enc < 16)) {
4806    Assembler::vptest(dst, src);
4807  } else if (src_enc < 16) {
4808    subptr(rsp, 64);
4809    evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
4810    evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4811    Assembler::vptest(xmm0, src);
4812    evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
4813    addptr(rsp, 64);
4814  } else if (dst_enc < 16) {
4815    subptr(rsp, 64);
4816    evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
4817    evmovdqul(xmm0, src, Assembler::AVX_512bit);
4818    Assembler::vptest(dst, xmm0);
4819    evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
4820    addptr(rsp, 64);
4821  } else {
4822    subptr(rsp, 64);
4823    evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
4824    subptr(rsp, 64);
4825    evmovdqul(Address(rsp, 0), xmm1, Assembler::AVX_512bit);
4826    movdqu(xmm0, src);
4827    movdqu(xmm1, dst);
4828    Assembler::vptest(xmm1, xmm0);
4829    evmovdqul(xmm1, Address(rsp, 0), Assembler::AVX_512bit);
4830    addptr(rsp, 64);
4831    evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
4832    addptr(rsp, 64);
4833  }
4834}
4835
4836// This instruction is used inside other macro patterns, so we cannot control
4837// which registers it is handed when emitted through those patterns.
4838void MacroAssembler::punpcklbw(XMMRegister dst, XMMRegister src) {
4839  if (VM_Version::supports_avx512nobw()) {
4840    int dst_enc = dst->encoding();
4841    int src_enc = src->encoding();
4842    if (dst_enc == src_enc) {
4843      if (dst_enc < 16) {
4844        Assembler::punpcklbw(dst, src);
4845      } else {
4846        subptr(rsp, 64);
4847        evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
4848        evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4849        Assembler::punpcklbw(xmm0, xmm0);
4850        evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4851        evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
4852        addptr(rsp, 64);
4853      }
4854    } else {
4855      if ((src_enc < 16) && (dst_enc < 16)) {
4856        Assembler::punpcklbw(dst, src);
4857      } else if (src_enc < 16) {
4858        subptr(rsp, 64);
4859        evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
4860        evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4861        Assembler::punpcklbw(xmm0, src);
4862        evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4863        evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
4864        addptr(rsp, 64);
4865      } else if (dst_enc < 16) {
4866        subptr(rsp, 64);
4867        evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
4868        evmovdqul(xmm0, src, Assembler::AVX_512bit);
4869        Assembler::punpcklbw(dst, xmm0);
4870        evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
4871        addptr(rsp, 64);
4872      } else {
4873        subptr(rsp, 64);
4874        evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
4875        subptr(rsp, 64);
4876        evmovdqul(Address(rsp, 0), xmm1, Assembler::AVX_512bit);
4877        evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4878        evmovdqul(xmm1, src, Assembler::AVX_512bit);
4879        Assembler::punpcklbw(xmm0, xmm1);
4880        evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4881        evmovdqul(xmm1, Address(rsp, 0), Assembler::AVX_512bit);
4882        addptr(rsp, 64);
4883        evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
4884        addptr(rsp, 64);
4885      }
4886    }
4887  } else {
4888    Assembler::punpcklbw(dst, src);
4889  }
4890}
4891
4892// This instruction is used inside other macro patterns, so we cannot control
4893// which registers it is handed when emitted through those patterns.
4894void MacroAssembler::pshuflw(XMMRegister dst, XMMRegister src, int mode) {
4895  if (VM_Version::supports_avx512nobw()) {
4896    int dst_enc = dst->encoding();
4897    int src_enc = src->encoding();
4898    if (dst_enc == src_enc) {
4899      if (dst_enc < 16) {
4900        Assembler::pshuflw(dst, src, mode);
4901      } else {
4902        subptr(rsp, 64);
4903        evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
4904        evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4905        Assembler::pshuflw(xmm0, xmm0, mode);
4906        evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4907        evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
4908        addptr(rsp, 64);
4909      }
4910    } else {
4911      if ((src_enc < 16) && (dst_enc < 16)) {
4912        Assembler::pshuflw(dst, src, mode);
4913      } else if (src_enc < 16) {
4914        subptr(rsp, 64);
4915        evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
4916        evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4917        Assembler::pshuflw(xmm0, src, mode);
4918        evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4919        evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
4920        addptr(rsp, 64);
4921      } else if (dst_enc < 16) {
4922        subptr(rsp, 64);
4923        evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
4924        evmovdqul(xmm0, src, Assembler::AVX_512bit);
4925        Assembler::pshuflw(dst, xmm0, mode);
4926        evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
4927        addptr(rsp, 64);
4928      } else {
4929        subptr(rsp, 64);
4930        evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
4931        subptr(rsp, 64);
4932        evmovdqul(Address(rsp, 0), xmm1, Assembler::AVX_512bit);
4933        evmovdqul(xmm0, dst, Assembler::AVX_512bit);
4934        evmovdqul(xmm1, src, Assembler::AVX_512bit);
4935        Assembler::pshuflw(xmm0, xmm1, mode);
4936        evmovdqul(dst, xmm0, Assembler::AVX_512bit);
4937        evmovdqul(xmm1, Address(rsp, 0), Assembler::AVX_512bit);
4938        addptr(rsp, 64);
4939        evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
4940        addptr(rsp, 64);
4941      }
4942    }
4943  } else {
4944    Assembler::pshuflw(dst, src, mode);
4945  }
4946}
4947
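// The AddressLiteral wrappers below all follow one idiom: if the literal is
// reachable with a 32-bit rip-relative displacement it is used directly,
// otherwise its address is first materialized into rscratch1.  Roughly, for a
// generic three-operand op (illustration only):
//
//   if (reachable(src)) {
//     op(dst, nds, as_Address(src));
//   } else {
//     lea(rscratch1, src);
//     op(dst, nds, Address(rscratch1, 0));
//   }
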
4948void MacroAssembler::vandpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len) {
4949  if (reachable(src)) {
4950    vandpd(dst, nds, as_Address(src), vector_len);
4951  } else {
4952    lea(rscratch1, src);
4953    vandpd(dst, nds, Address(rscratch1, 0), vector_len);
4954  }
4955}
4956
4957void MacroAssembler::vandps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len) {
4958  if (reachable(src)) {
4959    vandps(dst, nds, as_Address(src), vector_len);
4960  } else {
4961    lea(rscratch1, src);
4962    vandps(dst, nds, Address(rscratch1, 0), vector_len);
4963  }
4964}
4965
4966void MacroAssembler::vdivsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
4967  if (reachable(src)) {
4968    vdivsd(dst, nds, as_Address(src));
4969  } else {
4970    lea(rscratch1, src);
4971    vdivsd(dst, nds, Address(rscratch1, 0));
4972  }
4973}
4974
4975void MacroAssembler::vdivss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
4976  if (reachable(src)) {
4977    vdivss(dst, nds, as_Address(src));
4978  } else {
4979    lea(rscratch1, src);
4980    vdivss(dst, nds, Address(rscratch1, 0));
4981  }
4982}
4983
4984void MacroAssembler::vmulsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
4985  if (reachable(src)) {
4986    vmulsd(dst, nds, as_Address(src));
4987  } else {
4988    lea(rscratch1, src);
4989    vmulsd(dst, nds, Address(rscratch1, 0));
4990  }
4991}
4992
4993void MacroAssembler::vmulss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
4994  if (reachable(src)) {
4995    vmulss(dst, nds, as_Address(src));
4996  } else {
4997    lea(rscratch1, src);
4998    vmulss(dst, nds, Address(rscratch1, 0));
4999  }
5000}
5001
5002void MacroAssembler::vsubsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
5003  if (reachable(src)) {
5004    vsubsd(dst, nds, as_Address(src));
5005  } else {
5006    lea(rscratch1, src);
5007    vsubsd(dst, nds, Address(rscratch1, 0));
5008  }
5009}
5010
5011void MacroAssembler::vsubss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
5012  if (reachable(src)) {
5013    vsubss(dst, nds, as_Address(src));
5014  } else {
5015    lea(rscratch1, src);
5016    vsubss(dst, nds, Address(rscratch1, 0));
5017  }
5018}
5019
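// Scalar negation is implemented as an XOR with a sign-flip bit mask; 'src' is
// expected by the callers to point at that mask.  The upper-bank special case
// below only matters on AVX-512 parts without VL support, where a 128-bit
// vxorps/vxorpd cannot target xmm16-xmm31 directly.
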
5020void MacroAssembler::vnegatess(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
5021  int nds_enc = nds->encoding();
5022  int dst_enc = dst->encoding();
5023  bool dst_upper_bank = (dst_enc > 15);
5024  bool nds_upper_bank = (nds_enc > 15);
5025  if (VM_Version::supports_avx512novl() &&
5026      (nds_upper_bank || dst_upper_bank)) {
5027    if (dst_upper_bank) {
5028      subptr(rsp, 64);
5029      evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
5030      movflt(xmm0, nds);
5031      vxorps(xmm0, xmm0, src, Assembler::AVX_128bit);
5032      movflt(dst, xmm0);
5033      evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
5034      addptr(rsp, 64);
5035    } else {
5036      movflt(dst, nds);
5037      vxorps(dst, dst, src, Assembler::AVX_128bit);
5038    }
5039  } else {
5040    vxorps(dst, nds, src, Assembler::AVX_128bit);
5041  }
5042}
5043
5044void MacroAssembler::vnegatesd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
5045  int nds_enc = nds->encoding();
5046  int dst_enc = dst->encoding();
5047  bool dst_upper_bank = (dst_enc > 15);
5048  bool nds_upper_bank = (nds_enc > 15);
5049  if (VM_Version::supports_avx512novl() &&
5050      (nds_upper_bank || dst_upper_bank)) {
5051    if (dst_upper_bank) {
5052      subptr(rsp, 64);
5053      evmovdqul(Address(rsp, 0), xmm0, Assembler::AVX_512bit);
5054      movdbl(xmm0, nds);
5055      vxorpd(xmm0, xmm0, src, Assembler::AVX_128bit);
5056      movdbl(dst, xmm0);
5057      evmovdqul(xmm0, Address(rsp, 0), Assembler::AVX_512bit);
5058      addptr(rsp, 64);
5059    } else {
5060      movdbl(dst, nds);
5061      vxorpd(dst, dst, src, Assembler::AVX_128bit);
5062    }
5063  } else {
5064    vxorpd(dst, nds, src, Assembler::AVX_128bit);
5065  }
5066}
5067
5068void MacroAssembler::vxorpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len) {
5069  if (reachable(src)) {
5070    vxorpd(dst, nds, as_Address(src), vector_len);
5071  } else {
5072    lea(rscratch1, src);
5073    vxorpd(dst, nds, Address(rscratch1, 0), vector_len);
5074  }
5075}
5076
5077void MacroAssembler::vxorps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len) {
5078  if (reachable(src)) {
5079    vxorps(dst, nds, as_Address(src), vector_len);
5080  } else {
5081    lea(rscratch1, src);
5082    vxorps(dst, nds, Address(rscratch1, 0), vector_len);
5083  }
5084}
5085
5086
5087//////////////////////////////////////////////////////////////////////////////////
5088#if INCLUDE_ALL_GCS
5089
5090void MacroAssembler::g1_write_barrier_pre(Register obj,
5091                                          Register pre_val,
5092                                          Register thread,
5093                                          Register tmp,
5094                                          bool tosca_live,
5095                                          bool expand_call) {
5096
5097  // If expand_call is true then we expand the call_VM_leaf macro directly,
5098  // skipping the _last_sp check that
5099  // InterpreterMacroAssembler::call_VM_leaf_base would otherwise generate.
5100
5101#ifdef _LP64
5102  assert(thread == r15_thread, "must be");
5103#endif // _LP64
5104
5105  Label done;
5106  Label runtime;
5107
5108  assert(pre_val != noreg, "check this code");
5109
5110  if (obj != noreg) {
5111    assert_different_registers(obj, pre_val, tmp);
5112    assert(pre_val != rax, "check this code");
5113  }
5114
5115  Address in_progress(thread, in_bytes(JavaThread::satb_mark_queue_offset() +
5116                                       SATBMarkQueue::byte_offset_of_active()));
5117  Address index(thread, in_bytes(JavaThread::satb_mark_queue_offset() +
5118                                       SATBMarkQueue::byte_offset_of_index()));
5119  Address buffer(thread, in_bytes(JavaThread::satb_mark_queue_offset() +
5120                                       SATBMarkQueue::byte_offset_of_buf()));
5121
5122
5123  // Is marking active?
5124  if (in_bytes(SATBMarkQueue::byte_width_of_active()) == 4) {
5125    cmpl(in_progress, 0);
5126  } else {
5127    assert(in_bytes(SATBMarkQueue::byte_width_of_active()) == 1, "Assumption");
5128    cmpb(in_progress, 0);
5129  }
5130  jcc(Assembler::equal, done);
5131
5132  // Do we need to load the previous value?
5133  if (obj != noreg) {
5134    load_heap_oop(pre_val, Address(obj, 0));
5135  }
5136
5137  // Is the previous value null?
5138  cmpptr(pre_val, (int32_t) NULL_WORD);
5139  jcc(Assembler::equal, done);
5140
5141  // Can we store original value in the thread's buffer?
5142  // Is index == 0?
5143  // (The index field is typed as size_t.)
5144
5145  movptr(tmp, index);                   // tmp := *index_adr
5146  cmpptr(tmp, 0);                       // tmp == 0?
5147  jcc(Assembler::equal, runtime);       // If yes, goto runtime
5148
5149  subptr(tmp, wordSize);                // tmp := tmp - wordSize
5150  movptr(index, tmp);                   // *index_adr := tmp
5151  addptr(tmp, buffer);                  // tmp := tmp + *buffer_adr
5152
5153  // Record the previous value
5154  movptr(Address(tmp, 0), pre_val);
5155  jmp(done);
5156
5157  bind(runtime);
5158  // save the live input values
5159  if(tosca_live) push(rax);
5160
5161  if (obj != noreg && obj != rax)
5162    push(obj);
5163
5164  if (pre_val != rax)
5165    push(pre_val);
5166
5167  // Calling the runtime using the regular call_VM_leaf mechanism generates
5168  // code (generated by InterpreterMacroAssembler::call_VM_leaf_base)
5169  // that checks that the *(ebp+frame::interpreter_frame_last_sp) == NULL.
5170  //
5171  // If we are generating the pre-barrier without a frame (e.g. in the
5172  // intrinsified Reference.get() routine) then ebp might be pointing to
5173  // the caller frame and so this check will most likely fail at runtime.
5174  //
5175  // Expanding the call directly bypasses the generation of the check.
5176  // So when we do not have a full interpreter frame on the stack,
5177  // expand_call should be passed true.
5178
5179  NOT_LP64( push(thread); )
5180
5181  if (expand_call) {
5182    LP64_ONLY( assert(pre_val != c_rarg1, "smashed arg"); )
5183    pass_arg1(this, thread);
5184    pass_arg0(this, pre_val);
5185    MacroAssembler::call_VM_leaf_base(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_pre), 2);
5186  } else {
5187    call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_pre), pre_val, thread);
5188  }
5189
5190  NOT_LP64( pop(thread); )
5191
5192  // restore the live input values
5193  if (pre_val != rax)
5194    pop(pre_val);
5195
5196  if (obj != noreg && obj != rax)
5197    pop(obj);
5198
5199  if(tosca_live) pop(rax);
5200
5201  bind(done);
5202}
5203
5204void MacroAssembler::g1_write_barrier_post(Register store_addr,
5205                                           Register new_val,
5206                                           Register thread,
5207                                           Register tmp,
5208                                           Register tmp2) {
5209#ifdef _LP64
5210  assert(thread == r15_thread, "must be");
5211#endif // _LP64
5212
5213  Address queue_index(thread, in_bytes(JavaThread::dirty_card_queue_offset() +
5214                                       DirtyCardQueue::byte_offset_of_index()));
5215  Address buffer(thread, in_bytes(JavaThread::dirty_card_queue_offset() +
5216                                       DirtyCardQueue::byte_offset_of_buf()));
5217
5218  CardTableModRefBS* ct =
5219    barrier_set_cast<CardTableModRefBS>(Universe::heap()->barrier_set());
5220  assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
5221
5222  Label done;
5223  Label runtime;
5224
5225  // Does store cross heap regions?
5226
5227  movptr(tmp, store_addr);
5228  xorptr(tmp, new_val);
5229  shrptr(tmp, HeapRegion::LogOfHRGrainBytes);
5230  jcc(Assembler::equal, done);
5231
5232  // crosses regions, storing NULL?
5233
5234  cmpptr(new_val, (int32_t) NULL_WORD);
5235  jcc(Assembler::equal, done);
5236
5237  // storing region crossing non-NULL, is card already dirty?
5238
5239  const Register card_addr = tmp;
5240  const Register cardtable = tmp2;
5241
5242  movptr(card_addr, store_addr);
5243  shrptr(card_addr, CardTableModRefBS::card_shift);
5244  // Do not use ExternalAddress to load 'byte_map_base', since 'byte_map_base' is NOT
5245  // a valid address and therefore is not properly handled by the relocation code.
5246  movptr(cardtable, (intptr_t)ct->byte_map_base);
5247  addptr(card_addr, cardtable);
5248
5249  cmpb(Address(card_addr, 0), (int)G1SATBCardTableModRefBS::g1_young_card_val());
5250  jcc(Assembler::equal, done);
5251
5252  membar(Assembler::Membar_mask_bits(Assembler::StoreLoad));
5253  cmpb(Address(card_addr, 0), (int)CardTableModRefBS::dirty_card_val());
5254  jcc(Assembler::equal, done);
5255
5256
5257  // storing a region crossing, non-NULL oop, card is clean.
5258  // dirty card and log.
5259
5260  movb(Address(card_addr, 0), (int)CardTableModRefBS::dirty_card_val());
5261
5262  cmpl(queue_index, 0);
5263  jcc(Assembler::equal, runtime);
5264  subl(queue_index, wordSize);
5265  movptr(tmp2, buffer);
5266#ifdef _LP64
5267  movslq(rscratch1, queue_index);
5268  addq(tmp2, rscratch1);
5269  movq(Address(tmp2, 0), card_addr);
5270#else
5271  addl(tmp2, queue_index);
5272  movl(Address(tmp2, 0), card_addr);
5273#endif
5274  jmp(done);
5275
5276  bind(runtime);
5277  // save the live input values
5278  push(store_addr);
5279  push(new_val);
5280#ifdef _LP64
5281  call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_post), card_addr, r15_thread);
5282#else
5283  push(thread);
5284  call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_post), card_addr, thread);
5285  pop(thread);
5286#endif
5287  pop(new_val);
5288  pop(store_addr);
5289
5290  bind(done);
5291}
5292
5293#endif // INCLUDE_ALL_GCS
5294//////////////////////////////////////////////////////////////////////////////////
5295
5296
5297void MacroAssembler::store_check(Register obj, Address dst) {
5298  store_check(obj);
5299}
5300
5301void MacroAssembler::store_check(Register obj) {
5302  // Does a store check for the oop in register obj. The content of
5303  // register obj is destroyed afterwards.
5304  BarrierSet* bs = Universe::heap()->barrier_set();
5305  assert(bs->kind() == BarrierSet::CardTableForRS ||
5306         bs->kind() == BarrierSet::CardTableExtension,
5307         "Wrong barrier set kind");
5308
5309  CardTableModRefBS* ct = barrier_set_cast<CardTableModRefBS>(bs);
5310  assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
5311
5312  shrptr(obj, CardTableModRefBS::card_shift);
5313
5314  Address card_addr;
5315
5316  // The calculation for byte_map_base is as follows:
5317  // byte_map_base = _byte_map - (uintptr_t(low_bound) >> card_shift);
5318  // So this essentially converts an address to a displacement and it will
5319  // never need to be relocated. On 64bit however the value may be too
5320  // large for a 32bit displacement.
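  // For illustration (addresses are made up): with card_shift == 9 and
  // byte_map_base == 0x10000000, an oop at 0x20000200 is marked through the
  // card byte at 0x10000000 + (0x20000200 >> 9) == 0x10100001.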
5321  intptr_t disp = (intptr_t) ct->byte_map_base;
5322  if (is_simm32(disp)) {
5323    card_addr = Address(noreg, obj, Address::times_1, disp);
5324  } else {
5325    // By doing it as an ExternalAddress 'disp' could be converted to a rip-relative
5326    // displacement and done in a single instruction given favorable mapping and a
5327    // smarter version of as_Address. However, 'ExternalAddress' generates a relocation
5328    // entry and that entry is not properly handled by the relocation code.
5329    AddressLiteral cardtable((address)ct->byte_map_base, relocInfo::none);
5330    Address index(noreg, obj, Address::times_1);
5331    card_addr = as_Address(ArrayAddress(cardtable, index));
5332  }
5333
5334  int dirty = CardTableModRefBS::dirty_card_val();
5335  if (UseCondCardMark) {
5336    Label L_already_dirty;
5337    if (UseConcMarkSweepGC) {
5338      membar(Assembler::StoreLoad);
5339    }
5340    cmpb(card_addr, dirty);
5341    jcc(Assembler::equal, L_already_dirty);
5342    movb(card_addr, dirty);
5343    bind(L_already_dirty);
5344  } else {
5345    movb(card_addr, dirty);
5346  }
5347}
5348
5349void MacroAssembler::subptr(Register dst, int32_t imm32) {
5350  LP64_ONLY(subq(dst, imm32)) NOT_LP64(subl(dst, imm32));
5351}
5352
5353// Force generation of a 4 byte immediate value even if it fits into 8bit
5354void MacroAssembler::subptr_imm32(Register dst, int32_t imm32) {
5355  LP64_ONLY(subq_imm32(dst, imm32)) NOT_LP64(subl_imm32(dst, imm32));
5356}
5357
5358void MacroAssembler::subptr(Register dst, Register src) {
5359  LP64_ONLY(subq(dst, src)) NOT_LP64(subl(dst, src));
5360}
5361
5362// C++ bool manipulation
5363void MacroAssembler::testbool(Register dst) {
5364  if(sizeof(bool) == 1)
5365    testb(dst, 0xff);
5366  else if(sizeof(bool) == 2) {
5367    // testw implementation needed for two byte bools
5368    ShouldNotReachHere();
5369  } else if(sizeof(bool) == 4)
5370    testl(dst, dst);
5371  else
5372    // unsupported
5373    ShouldNotReachHere();
5374}
5375
5376void MacroAssembler::testptr(Register dst, Register src) {
5377  LP64_ONLY(testq(dst, src)) NOT_LP64(testl(dst, src));
5378}
5379
5380// Defines obj, preserves var_size_in_bytes, okay for t2 == var_size_in_bytes.
5381void MacroAssembler::tlab_allocate(Register obj,
5382                                   Register var_size_in_bytes,
5383                                   int con_size_in_bytes,
5384                                   Register t1,
5385                                   Register t2,
5386                                   Label& slow_case) {
5387  assert_different_registers(obj, t1, t2);
5388  assert_different_registers(obj, var_size_in_bytes, t1);
5389  Register end = t2;
5390  Register thread = NOT_LP64(t1) LP64_ONLY(r15_thread);
5391
5392  verify_tlab();
5393
5394  NOT_LP64(get_thread(thread));
5395
5396  movptr(obj, Address(thread, JavaThread::tlab_top_offset()));
5397  if (var_size_in_bytes == noreg) {
5398    lea(end, Address(obj, con_size_in_bytes));
5399  } else {
5400    lea(end, Address(obj, var_size_in_bytes, Address::times_1));
5401  }
5402  cmpptr(end, Address(thread, JavaThread::tlab_end_offset()));
5403  jcc(Assembler::above, slow_case);
5404
5405  // update the tlab top pointer
5406  movptr(Address(thread, JavaThread::tlab_top_offset()), end);
5407
5408  // recover var_size_in_bytes if necessary
5409  if (var_size_in_bytes == end) {
5410    subptr(var_size_in_bytes, obj);
5411  }
5412  verify_tlab();
5413}
5414
5415// Preserves rbx and rdx.
5416Register MacroAssembler::tlab_refill(Label& retry,
5417                                     Label& try_eden,
5418                                     Label& slow_case) {
5419  Register top = rax;
5420  Register t1  = rcx;
5421  Register t2  = rsi;
5422  Register thread_reg = NOT_LP64(rdi) LP64_ONLY(r15_thread);
5423  assert_different_registers(top, thread_reg, t1, t2, /* preserve: */ rbx, rdx);
5424  Label do_refill, discard_tlab;
5425
5426  if (!Universe::heap()->supports_inline_contig_alloc()) {
5427    // No allocation in the shared eden.
5428    jmp(slow_case);
5429  }
5430
5431  NOT_LP64(get_thread(thread_reg));
5432
5433  movptr(top, Address(thread_reg, in_bytes(JavaThread::tlab_top_offset())));
5434  movptr(t1,  Address(thread_reg, in_bytes(JavaThread::tlab_end_offset())));
5435
5436  // calculate amount of free space
5437  subptr(t1, top);
5438  shrptr(t1, LogHeapWordSize);
5439
5440  // Retain tlab and allocate object in shared space if
5441  // the amount free in the tlab is too large to discard.
5442  cmpptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_refill_waste_limit_offset())));
5443  jcc(Assembler::lessEqual, discard_tlab);
5444
5445  // Retain
5446  // %%% yuck as movptr...
5447  movptr(t2, (int32_t) ThreadLocalAllocBuffer::refill_waste_limit_increment());
5448  addptr(Address(thread_reg, in_bytes(JavaThread::tlab_refill_waste_limit_offset())), t2);
5449  if (TLABStats) {
5450    // increment number of slow_allocations
5451    addl(Address(thread_reg, in_bytes(JavaThread::tlab_slow_allocations_offset())), 1);
5452  }
5453  jmp(try_eden);
5454
5455  bind(discard_tlab);
5456  if (TLABStats) {
5457    // increment number of refills
5458    addl(Address(thread_reg, in_bytes(JavaThread::tlab_number_of_refills_offset())), 1);
5459    // accumulate wastage -- t1 is amount free in tlab
5460    addl(Address(thread_reg, in_bytes(JavaThread::tlab_fast_refill_waste_offset())), t1);
5461  }
5462
5463  // if tlab is currently allocated (top or end != null) then
5464  // fill [top, end + alignment_reserve) with array object
5465  testptr(top, top);
5466  jcc(Assembler::zero, do_refill);
5467
5468  // set up the mark word
5469  movptr(Address(top, oopDesc::mark_offset_in_bytes()), (intptr_t)markOopDesc::prototype()->copy_set_hash(0x2));
5470  // set the length to the remaining space
5471  subptr(t1, typeArrayOopDesc::header_size(T_INT));
5472  addptr(t1, (int32_t)ThreadLocalAllocBuffer::alignment_reserve());
5473  shlptr(t1, log2_intptr(HeapWordSize/sizeof(jint)));
5474  movl(Address(top, arrayOopDesc::length_offset_in_bytes()), t1);
5475  // set klass to intArrayKlass
5476  // dubious reloc why not an oop reloc?
5477  movptr(t1, ExternalAddress((address)Universe::intArrayKlassObj_addr()));
5478  // store klass last.  concurrent gcs assume klass length is valid if
5479  // the klass field is not null.
5480  store_klass(top, t1);
5481
5482  movptr(t1, top);
5483  subptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_start_offset())));
5484  incr_allocated_bytes(thread_reg, t1, 0);
5485
5486  // refill the tlab with an eden allocation
5487  bind(do_refill);
5488  movptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_size_offset())));
5489  shlptr(t1, LogHeapWordSize);
5490  // allocate new tlab, address returned in top
5491  eden_allocate(top, t1, 0, t2, slow_case);
5492
5493  // Check that t1 was preserved in eden_allocate.
5494#ifdef ASSERT
5495  if (UseTLAB) {
5496    Label ok;
5497    Register tsize = rsi;
5498    assert_different_registers(tsize, thread_reg, t1);
5499    push(tsize);
5500    movptr(tsize, Address(thread_reg, in_bytes(JavaThread::tlab_size_offset())));
5501    shlptr(tsize, LogHeapWordSize);
5502    cmpptr(t1, tsize);
5503    jcc(Assembler::equal, ok);
5504    STOP("assert(t1 != tlab size)");
5505    should_not_reach_here();
5506
5507    bind(ok);
5508    pop(tsize);
5509  }
5510#endif
5511  movptr(Address(thread_reg, in_bytes(JavaThread::tlab_start_offset())), top);
5512  movptr(Address(thread_reg, in_bytes(JavaThread::tlab_top_offset())), top);
5513  addptr(top, t1);
5514  subptr(top, (int32_t)ThreadLocalAllocBuffer::alignment_reserve_in_bytes());
5515  movptr(Address(thread_reg, in_bytes(JavaThread::tlab_end_offset())), top);
5516  verify_tlab();
5517  jmp(retry);
5518
5519  return thread_reg; // for use by caller
5520}
5521
5522void MacroAssembler::incr_allocated_bytes(Register thread,
5523                                          Register var_size_in_bytes,
5524                                          int con_size_in_bytes,
5525                                          Register t1) {
5526  if (!thread->is_valid()) {
5527#ifdef _LP64
5528    thread = r15_thread;
5529#else
5530    assert(t1->is_valid(), "need temp reg");
5531    thread = t1;
5532    get_thread(thread);
5533#endif
5534  }
5535
5536#ifdef _LP64
5537  if (var_size_in_bytes->is_valid()) {
5538    addq(Address(thread, in_bytes(JavaThread::allocated_bytes_offset())), var_size_in_bytes);
5539  } else {
5540    addq(Address(thread, in_bytes(JavaThread::allocated_bytes_offset())), con_size_in_bytes);
5541  }
5542#else
5543  if (var_size_in_bytes->is_valid()) {
5544    addl(Address(thread, in_bytes(JavaThread::allocated_bytes_offset())), var_size_in_bytes);
5545  } else {
5546    addl(Address(thread, in_bytes(JavaThread::allocated_bytes_offset())), con_size_in_bytes);
5547  }
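  // allocated_bytes is a 64-bit counter even on 32-bit platforms, so propagate
  // the carry from the low word into the high word.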
5548  adcl(Address(thread, in_bytes(JavaThread::allocated_bytes_offset())+4), 0);
5549#endif
5550}
5551
5552void MacroAssembler::fp_runtime_fallback(address runtime_entry, int nb_args, int num_fpu_regs_in_use) {
5553  pusha();
5554
5555  // if we are coming from c1, xmm registers may be live
5556  int num_xmm_regs = LP64_ONLY(16) NOT_LP64(8);
5557  if (UseAVX > 2) {
5558    num_xmm_regs = LP64_ONLY(32) NOT_LP64(8);
5559  }
5560
5561  if (UseSSE == 1)  {
5562    subptr(rsp, sizeof(jdouble)*8);
5563    for (int n = 0; n < 8; n++) {
5564      movflt(Address(rsp, n*sizeof(jdouble)), as_XMMRegister(n));
5565    }
5566  } else if (UseSSE >= 2)  {
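    // The EVEX-encoded moves used below implicitly take k1 as their opmask in
    // this assembler, so (presumably to guarantee fully unmasked stores) k1 is
    // first reloaded with all ones.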
5567    if (UseAVX > 2) {
5568      push(rbx);
5569      movl(rbx, 0xffff);
5570      kmovwl(k1, rbx);
5571      pop(rbx);
5572    }
5573#ifdef COMPILER2
5574    if (MaxVectorSize > 16) {
5575      if(UseAVX > 2) {
5576        // Save upper half of ZMM registers
5577        subptr(rsp, 32*num_xmm_regs);
5578        for (int n = 0; n < num_xmm_regs; n++) {
5579          vextractf64x4h(Address(rsp, n*32), as_XMMRegister(n), 1);
5580        }
5581      }
5582      assert(UseAVX > 0, "256 bit vectors are supported only with AVX");
5583      // Save upper half of YMM registers
5584      subptr(rsp, 16*num_xmm_regs);
5585      for (int n = 0; n < num_xmm_regs; n++) {
5586        vextractf128h(Address(rsp, n*16), as_XMMRegister(n));
5587      }
5588    }
5589#endif
5590    // Save whole 128bit (16 bytes) XMM registers
5591    subptr(rsp, 16*num_xmm_regs);
5592#ifdef _LP64
5593    if (VM_Version::supports_evex()) {
5594      for (int n = 0; n < num_xmm_regs; n++) {
5595        vextractf32x4h(Address(rsp, n*16), as_XMMRegister(n), 0);
5596      }
5597    } else {
5598      for (int n = 0; n < num_xmm_regs; n++) {
5599        movdqu(Address(rsp, n*16), as_XMMRegister(n));
5600      }
5601    }
5602#else
5603    for (int n = 0; n < num_xmm_regs; n++) {
5604      movdqu(Address(rsp, n*16), as_XMMRegister(n));
5605    }
5606#endif
5607  }
5608
5609  // Preserve registers across runtime call
5610  int incoming_argument_and_return_value_offset = -1;
5611  if (num_fpu_regs_in_use > 1) {
5612    // Must preserve all other FPU regs (could alternatively convert
5613    // SharedRuntime::dsin, dcos etc. into assembly routines known not to trash
5614  // FPU state, but we cannot trust the C compiler)
5615    NEEDS_CLEANUP;
5616    // NOTE that in this case we also push the incoming argument(s) to
5617    // the stack and restore it later; we also use this stack slot to
5618    // hold the return value from dsin, dcos etc.
5619    for (int i = 0; i < num_fpu_regs_in_use; i++) {
5620      subptr(rsp, sizeof(jdouble));
5621      fstp_d(Address(rsp, 0));
5622    }
5623    incoming_argument_and_return_value_offset = sizeof(jdouble)*(num_fpu_regs_in_use-1);
5624    for (int i = nb_args-1; i >= 0; i--) {
5625      fld_d(Address(rsp, incoming_argument_and_return_value_offset-i*sizeof(jdouble)));
5626    }
5627  }
5628
5629  subptr(rsp, nb_args*sizeof(jdouble));
5630  for (int i = 0; i < nb_args; i++) {
5631    fstp_d(Address(rsp, i*sizeof(jdouble)));
5632  }
5633
5634#ifdef _LP64
5635  if (nb_args > 0) {
5636    movdbl(xmm0, Address(rsp, 0));
5637  }
5638  if (nb_args > 1) {
5639    movdbl(xmm1, Address(rsp, sizeof(jdouble)));
5640  }
5641  assert(nb_args <= 2, "unsupported number of args");
5642#endif // _LP64
5643
5644  // NOTE: we must not use call_VM_leaf here because that requires a
5645  // complete interpreter frame in debug mode -- same bug as 4387334
5646  // MacroAssembler::call_VM_leaf_base is perfectly safe and will
5647  // do proper 64bit abi
5648
5649  NEEDS_CLEANUP;
5650  // Need to add stack banging before this runtime call if it needs to
5651  // be taken; however, there is no generic stack banging routine at
5652  // the MacroAssembler level
5653
5654  MacroAssembler::call_VM_leaf_base(runtime_entry, 0);
5655
5656#ifdef _LP64
5657  movsd(Address(rsp, 0), xmm0);
5658  fld_d(Address(rsp, 0));
5659#endif // _LP64
5660  addptr(rsp, sizeof(jdouble)*nb_args);
5661  if (num_fpu_regs_in_use > 1) {
5662    // Must save return value to stack and then restore entire FPU
5663    // stack except incoming arguments
5664    fstp_d(Address(rsp, incoming_argument_and_return_value_offset));
5665    for (int i = 0; i < num_fpu_regs_in_use - nb_args; i++) {
5666      fld_d(Address(rsp, 0));
5667      addptr(rsp, sizeof(jdouble));
5668    }
5669    fld_d(Address(rsp, (nb_args-1)*sizeof(jdouble)));
5670    addptr(rsp, sizeof(jdouble)*nb_args);
5671  }
5672
5673  if (UseSSE == 1)  {
5674    for (int n = 0; n < 8; n++) {
5675      movflt(as_XMMRegister(n), Address(rsp, n*sizeof(jdouble)));
5676    }
5677    addptr(rsp, sizeof(jdouble)*8);
5678  } else if (UseSSE >= 2)  {
5679    // Restore whole 128bit (16 bytes) XMM registers
5680#ifdef _LP64
5681    if (VM_Version::supports_evex()) {
5682      for (int n = 0; n < num_xmm_regs; n++) {
5683        vinsertf32x4h(as_XMMRegister(n), Address(rsp, n*16), 0);
5684      }
5685    } else {
5686      for (int n = 0; n < num_xmm_regs; n++) {
5687        movdqu(as_XMMRegister(n), Address(rsp, n*16));
5688      }
5689    }
5690#else
5691    for (int n = 0; n < num_xmm_regs; n++) {
5692      movdqu(as_XMMRegister(n), Address(rsp, n*16));
5693    }
5694#endif
5695    addptr(rsp, 16*num_xmm_regs);
5696
5697#ifdef COMPILER2
5698    if (MaxVectorSize > 16) {
5699      // Restore upper half of YMM registers.
5700      for (int n = 0; n < num_xmm_regs; n++) {
5701        vinsertf128h(as_XMMRegister(n), Address(rsp, n*16));
5702      }
5703      addptr(rsp, 16*num_xmm_regs);
5704      if(UseAVX > 2) {
5705        for (int n = 0; n < num_xmm_regs; n++) {
5706          vinsertf64x4h(as_XMMRegister(n), Address(rsp, n*32), 1);
5707        }
5708        addptr(rsp, 32*num_xmm_regs);
5709      }
5710    }
5711#endif
5712  }
5713  popa();
5714}
5715
5716static const double     pi_4 =  0.7853981633974483;
5717
5718void MacroAssembler::trigfunc(char trig, int num_fpu_regs_in_use) {
5719  // A hand-coded argument reduction for values in fabs(pi/4, pi/2)
5720  // was attempted in this code; unfortunately it appears that the
5721  // switch to 80-bit precision and back causes this to be
5722  // unprofitable compared with simply performing a runtime call if
5723  // the argument is out of the (-pi/4, pi/4) range.
5724
5725  Register tmp = noreg;
5726  if (!VM_Version::supports_cmov()) {
5727    // fcmp needs a temporary, so preserve rbx
5728    tmp = rbx;
5729    push(tmp);
5730  }
5731
5732  Label slow_case, done;
5733
5734  ExternalAddress pi4_adr = (address)&pi_4;
5735  if (reachable(pi4_adr)) {
5736    // x ?<= pi/4
5737    fld_d(pi4_adr);
5738    fld_s(1);                // Stack:  X  PI/4  X
5739    fabs();                  // Stack: |X| PI/4  X
5740    fcmp(tmp);
5741    jcc(Assembler::above, slow_case);
5742
5743    // fastest case: -pi/4 <= x <= pi/4
5744    switch(trig) {
5745    case 's':
5746      fsin();
5747      break;
5748    case 'c':
5749      fcos();
5750      break;
5751    case 't':
5752      ftan();
5753      break;
5754    default:
5755      assert(false, "bad intrinsic");
5756      break;
5757    }
5758    jmp(done);
5759  }
5760
5761  // slow case: runtime call
5762  bind(slow_case);
5763
5764  switch(trig) {
5765  case 's':
5766    {
5767      fp_runtime_fallback(CAST_FROM_FN_PTR(address, SharedRuntime::dsin), 1, num_fpu_regs_in_use);
5768    }
5769    break;
5770  case 'c':
5771    {
5772      fp_runtime_fallback(CAST_FROM_FN_PTR(address, SharedRuntime::dcos), 1, num_fpu_regs_in_use);
5773    }
5774    break;
5775  case 't':
5776    {
5777      fp_runtime_fallback(CAST_FROM_FN_PTR(address, SharedRuntime::dtan), 1, num_fpu_regs_in_use);
5778    }
5779    break;
5780  default:
5781    assert(false, "bad intrinsic");
5782    break;
5783  }
5784
5785  // Come here with result in F-TOS
5786  bind(done);
5787
5788  if (tmp != noreg) {
5789    pop(tmp);
5790  }
5791}
5792
5793
5794// Look up the method for a megamorphic invokeinterface call.
5795// The target method is determined by <intf_klass, itable_index>.
5796// The receiver klass is in recv_klass.
5797// On success, the result will be in method_result, and execution falls through.
5798// On failure, execution transfers to the given label.
5799void MacroAssembler::lookup_interface_method(Register recv_klass,
5800                                             Register intf_klass,
5801                                             RegisterOrConstant itable_index,
5802                                             Register method_result,
5803                                             Register scan_temp,
5804                                             Label& L_no_such_interface) {
5805  assert_different_registers(recv_klass, intf_klass, method_result, scan_temp);
5806  assert(itable_index.is_constant() || itable_index.as_register() == method_result,
5807         "caller must use same register for non-constant itable index as for method");
5808
5809  // Compute start of first itableOffsetEntry (which is at the end of the vtable)
5810  int vtable_base = InstanceKlass::vtable_start_offset() * wordSize;
5811  int itentry_off = itableMethodEntry::method_offset_in_bytes();
5812  int scan_step   = itableOffsetEntry::size() * wordSize;
5813  int vte_size    = vtableEntry::size() * wordSize;
5814  Address::ScaleFactor times_vte_scale = Address::times_ptr;
5815  assert(vte_size == wordSize, "else adjust times_vte_scale");
5816
5817  movl(scan_temp, Address(recv_klass, InstanceKlass::vtable_length_offset() * wordSize));
5818
5819  // %%% Could store the aligned, prescaled offset in the klassoop.
5820  lea(scan_temp, Address(recv_klass, scan_temp, times_vte_scale, vtable_base));
5821  if (HeapWordsPerLong > 1) {
5822    // Round up to align_object_offset boundary
5823    // see code for InstanceKlass::start_of_itable!
5824    round_to(scan_temp, BytesPerLong);
5825  }
5826
5827  // Adjust recv_klass by scaled itable_index, so we can free itable_index.
5828  assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
5829  lea(recv_klass, Address(recv_klass, itable_index, Address::times_ptr, itentry_off));
5830
5831  // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) {
5832  //   if (scan->interface() == intf) {
5833  //     result = (klass + scan->offset() + itable_index);
5834  //   }
5835  // }
5836  Label search, found_method;
5837
5838  for (int peel = 1; peel >= 0; peel--) {
5839    movptr(method_result, Address(scan_temp, itableOffsetEntry::interface_offset_in_bytes()));
5840    cmpptr(intf_klass, method_result);
5841
5842    if (peel) {
5843      jccb(Assembler::equal, found_method);
5844    } else {
5845      jccb(Assembler::notEqual, search);
5846      // (invert the test to fall through to found_method...)
5847    }
5848
5849    if (!peel)  break;
5850
5851    bind(search);
5852
5853    // Check that the previous entry is non-null.  A null entry means that
5854    // the receiver class doesn't implement the interface, and wasn't the
5855    // same as when the caller was compiled.
5856    testptr(method_result, method_result);
5857    jcc(Assembler::zero, L_no_such_interface);
5858    addptr(scan_temp, scan_step);
5859  }
5860
5861  bind(found_method);
5862
5863  // Got a hit.
5864  movl(scan_temp, Address(scan_temp, itableOffsetEntry::offset_offset_in_bytes()));
5865  movptr(method_result, Address(recv_klass, scan_temp, Address::times_1));
5866}
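// In C-like terms the scan above behaves roughly as follows (an illustrative
// sketch only; start_of_itable() is shorthand, not the VM's actual API, and
// recv_klass here means the original, unadjusted receiver klass):
//
//   itableOffsetEntry* e = start_of_itable(recv_klass);         // just past the vtable
//   while (e->interface() != intf_klass) {
//     if (e->interface() == NULL) goto L_no_such_interface;     // intf not implemented
//     e = (itableOffsetEntry*)((address)e + scan_step);
//   }
//   method_result = *(Method**)((address)recv_klass + e->offset()
//                               + itable_index * wordSize + itentry_off);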
5867
5868
5869// virtual method calling
5870void MacroAssembler::lookup_virtual_method(Register recv_klass,
5871                                           RegisterOrConstant vtable_index,
5872                                           Register method_result) {
5873  const int base = InstanceKlass::vtable_start_offset() * wordSize;
5874  assert(vtableEntry::size() * wordSize == wordSize, "else adjust the scaling in the code below");
5875  Address vtable_entry_addr(recv_klass,
5876                            vtable_index, Address::times_ptr,
5877                            base + vtableEntry::method_offset_in_bytes());
5878  movptr(method_result, vtable_entry_addr);
5879}
5880
5881
5882void MacroAssembler::check_klass_subtype(Register sub_klass,
5883                           Register super_klass,
5884                           Register temp_reg,
5885                           Label& L_success) {
5886  Label L_failure;
5887  check_klass_subtype_fast_path(sub_klass, super_klass, temp_reg,        &L_success, &L_failure, NULL);
5888  check_klass_subtype_slow_path(sub_klass, super_klass, temp_reg, noreg, &L_success, NULL);
5889  bind(L_failure);
5890}
5891
5892
5893void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
5894                                                   Register super_klass,
5895                                                   Register temp_reg,
5896                                                   Label* L_success,
5897                                                   Label* L_failure,
5898                                                   Label* L_slow_path,
5899                                        RegisterOrConstant super_check_offset) {
5900  assert_different_registers(sub_klass, super_klass, temp_reg);
5901  bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
5902  if (super_check_offset.is_register()) {
5903    assert_different_registers(sub_klass, super_klass,
5904                               super_check_offset.as_register());
5905  } else if (must_load_sco) {
5906    assert(temp_reg != noreg, "supply either a temp or a register offset");
5907  }
5908
5909  Label L_fallthrough;
5910  int label_nulls = 0;
5911  if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
5912  if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
5913  if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; }
5914  assert(label_nulls <= 1, "at most one NULL in the batch");
5915
5916  int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
5917  int sco_offset = in_bytes(Klass::super_check_offset_offset());
5918  Address super_check_offset_addr(super_klass, sco_offset);
5919
5920  // Hacked jcc, which "knows" that L_fallthrough, at least, is in
5921  // range of a jccb.  If this routine grows larger, reconsider at
5922  // least some of these.
5923#define local_jcc(assembler_cond, label)                                \
5924  if (&(label) == &L_fallthrough)  jccb(assembler_cond, label);         \
5925  else                             jcc( assembler_cond, label) /*omit semi*/
5926
5927  // Hacked jmp, which may only be used just before L_fallthrough.
5928#define final_jmp(label)                                                \
5929  if (&(label) == &L_fallthrough) { /*do nothing*/ }                    \
5930  else                            jmp(label)                /*omit semi*/
5931
5932  // If the pointers are equal, we are done (e.g., String[] elements).
5933  // This self-check enables sharing of secondary supertype arrays among
5934  // non-primary types such as array-of-interface.  Otherwise, each such
5935  // type would need its own customized SSA.
5936  // We move this check to the front of the fast path because many
5937  // type checks are in fact trivially successful in this manner,
5938  // so we get a nicely predicted branch right at the start of the check.
5939  cmpptr(sub_klass, super_klass);
5940  local_jcc(Assembler::equal, *L_success);
5941
5942  // Check the supertype display:
5943  if (must_load_sco) {
5944    // A positive movl does the right thing on LP64.
5945    movl(temp_reg, super_check_offset_addr);
5946    super_check_offset = RegisterOrConstant(temp_reg);
5947  }
5948  Address super_check_addr(sub_klass, super_check_offset, Address::times_1, 0);
5949  cmpptr(super_klass, super_check_addr); // load displayed supertype
5950
5951  // This check has worked decisively for primary supers.
5952  // Secondary supers are sought in the super_cache ('super_cache_addr').
5953  // (Secondary supers are interfaces and very deeply nested subtypes.)
5954  // This works in the same check above because of a tricky aliasing
5955  // between the super_cache and the primary super display elements.
5956  // (The 'super_check_addr' can address either, as the case requires.)
5957  // Note that the cache is updated below if it does not help us find
5958  // what we need immediately.
5959  // So if it was a primary super, we can just fail immediately.
5960  // Otherwise, it's the slow path for us (no success at this point).
5961
5962  if (super_check_offset.is_register()) {
5963    local_jcc(Assembler::equal, *L_success);
5964    cmpl(super_check_offset.as_register(), sc_offset);
5965    if (L_failure == &L_fallthrough) {
5966      local_jcc(Assembler::equal, *L_slow_path);
5967    } else {
5968      local_jcc(Assembler::notEqual, *L_failure);
5969      final_jmp(*L_slow_path);
5970    }
5971  } else if (super_check_offset.as_constant() == sc_offset) {
5972    // Need a slow path; fast failure is impossible.
5973    if (L_slow_path == &L_fallthrough) {
5974      local_jcc(Assembler::equal, *L_success);
5975    } else {
5976      local_jcc(Assembler::notEqual, *L_slow_path);
5977      final_jmp(*L_success);
5978    }
5979  } else {
5980    // No slow path; it's a fast decision.
5981    if (L_failure == &L_fallthrough) {
5982      local_jcc(Assembler::equal, *L_success);
5983    } else {
5984      local_jcc(Assembler::notEqual, *L_failure);
5985      final_jmp(*L_success);
5986    }
5987  }
5988
5989  bind(L_fallthrough);
5990
5991#undef local_jcc
5992#undef final_jmp
5993}
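// The fast path above makes the following decision, shown here as a hedged
// C-like sketch (not the VM's actual C++):
//
//   if (sub_klass == super_klass) return success;                // trivial self-check
//   juint off = super_klass->super_check_offset();
//   if (*(Klass**)((address)sub_klass + off) == super_klass) return success;
//   if (off != sc_offset) return failure;                        // primary display miss is decisive
//   return slow_path;                                            // must scan the secondary supers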
5994
5995
5996void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
5997                                                   Register super_klass,
5998                                                   Register temp_reg,
5999                                                   Register temp2_reg,
6000                                                   Label* L_success,
6001                                                   Label* L_failure,
6002                                                   bool set_cond_codes) {
6003  assert_different_registers(sub_klass, super_klass, temp_reg);
6004  if (temp2_reg != noreg)
6005    assert_different_registers(sub_klass, super_klass, temp_reg, temp2_reg);
6006#define IS_A_TEMP(reg) ((reg) == temp_reg || (reg) == temp2_reg)
6007
6008  Label L_fallthrough;
6009  int label_nulls = 0;
6010  if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
6011  if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
6012  assert(label_nulls <= 1, "at most one NULL in the batch");
6013
6014  // a couple of useful fields in sub_klass:
6015  int ss_offset = in_bytes(Klass::secondary_supers_offset());
6016  int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
6017  Address secondary_supers_addr(sub_klass, ss_offset);
6018  Address super_cache_addr(     sub_klass, sc_offset);
6019
6020  // Do a linear scan of the secondary super-klass chain.
6021  // This code is rarely used, so simplicity is a virtue here.
6022  // The repne_scan instruction uses fixed registers, which we must spill.
6023  // Don't worry too much about pre-existing connections with the input regs.
6024
6025  assert(sub_klass != rax, "killed reg"); // killed by mov(rax, super)
6026  assert(sub_klass != rcx, "killed reg"); // killed by lea(rcx, &pst_counter)
6027
6028  // Get super_klass value into rax (even if it was in rdi or rcx).
6029  bool pushed_rax = false, pushed_rcx = false, pushed_rdi = false;
6030  if (super_klass != rax || UseCompressedOops) {
6031    if (!IS_A_TEMP(rax)) { push(rax); pushed_rax = true; }
6032    mov(rax, super_klass);
6033  }
6034  if (!IS_A_TEMP(rcx)) { push(rcx); pushed_rcx = true; }
6035  if (!IS_A_TEMP(rdi)) { push(rdi); pushed_rdi = true; }
6036
6037#ifndef PRODUCT
6038  int* pst_counter = &SharedRuntime::_partial_subtype_ctr;
6039  ExternalAddress pst_counter_addr((address) pst_counter);
6040  NOT_LP64(  incrementl(pst_counter_addr) );
6041  LP64_ONLY( lea(rcx, pst_counter_addr) );
6042  LP64_ONLY( incrementl(Address(rcx, 0)) );
6043#endif //PRODUCT
6044
6045  // We will consult the secondary-super array.
6046  movptr(rdi, secondary_supers_addr);
6047  // Load the array length.  (A positive movl does the right thing on LP64.)
6048  movl(rcx, Address(rdi, Array<Klass*>::length_offset_in_bytes()));
6049  // Skip to start of data.
6050  addptr(rdi, Array<Klass*>::base_offset_in_bytes());
6051
6052  // Scan RCX words at [RDI] for an occurrence of RAX.
6053  // Set NZ/Z based on last compare.
6054  // Z flag value will not be set by 'repne' if RCX == 0 since 'repne' does
6055  // not change flags (only scas instruction which is repeated sets flags).
6056  // Set Z = 0 (not equal) before 'repne' to indicate that class was not found.
6057
6058  testptr(rax, rax); // Set Z = 0
6059  repne_scan();
6060
6061  // Unspill the temp. registers:
6062  if (pushed_rdi)  pop(rdi);
6063  if (pushed_rcx)  pop(rcx);
6064  if (pushed_rax)  pop(rax);
6065
6066  if (set_cond_codes) {
6067    // Special hack for the AD files:  rdi is guaranteed non-zero.
6068    assert(!pushed_rdi, "rdi must be left non-NULL");
6069    // Also, the condition codes are properly set Z/NZ on success/failure.
6070  }
6071
6072  if (L_failure == &L_fallthrough)
6073        jccb(Assembler::notEqual, *L_failure);
6074  else  jcc(Assembler::notEqual, *L_failure);
6075
6076  // Success.  Cache the super we found and proceed in triumph.
6077  movptr(super_cache_addr, super_klass);
6078
6079  if (L_success != &L_fallthrough) {
6080    jmp(*L_success);
6081  }
6082
6083#undef IS_A_TEMP
6084
6085  bind(L_fallthrough);
6086}
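// The repne_scan sequence above is roughly equivalent to this C-like sketch
// (hedged; the accessor names are shorthand for the Array<Klass*> layout used above):
//
//   Array<Klass*>* ss = sub_klass->secondary_supers();
//   for (int i = 0; i < ss->length(); i++) {
//     if (ss->at(i) == super_klass) {
//       sub_klass->set_secondary_super_cache(super_klass);       // cache the hit (Z set)
//       return success;
//     }
//   }
//   return failure;                                              // NZ set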
6087
6088
6089void MacroAssembler::cmov32(Condition cc, Register dst, Address src) {
6090  if (VM_Version::supports_cmov()) {
6091    cmovl(cc, dst, src);
6092  } else {
6093    Label L;
6094    jccb(negate_condition(cc), L);
6095    movl(dst, src);
6096    bind(L);
6097  }
6098}
6099
6100void MacroAssembler::cmov32(Condition cc, Register dst, Register src) {
6101  if (VM_Version::supports_cmov()) {
6102    cmovl(cc, dst, src);
6103  } else {
6104    Label L;
6105    jccb(negate_condition(cc), L);
6106    movl(dst, src);
6107    bind(L);
6108  }
6109}
6110
6111void MacroAssembler::verify_oop(Register reg, const char* s) {
6112  if (!VerifyOops) return;
6113
6114  // Pass register number to verify_oop_subroutine
6115  const char* b = NULL;
6116  {
6117    ResourceMark rm;
6118    stringStream ss;
6119    ss.print("verify_oop: %s: %s", reg->name(), s);
6120    b = code_string(ss.as_string());
6121  }
6122  BLOCK_COMMENT("verify_oop {");
6123#ifdef _LP64
6124  push(rscratch1);                    // save r10, trashed by movptr()
6125#endif
6126  push(rax);                          // save rax
6127  push(reg);                          // pass register argument
6128  ExternalAddress buffer((address) b);
6129  // avoid using pushptr, as it modifies scratch registers
6130  // and our contract is not to modify anything
6131  movptr(rax, buffer.addr());
6132  push(rax);
6133  // call indirectly to solve generation ordering problem
6134  movptr(rax, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
6135  call(rax);
6136  // Caller pops the arguments (oop, message) and restores rax, r10
6137  BLOCK_COMMENT("} verify_oop");
6138}
6139
6140
6141RegisterOrConstant MacroAssembler::delayed_value_impl(intptr_t* delayed_value_addr,
6142                                                      Register tmp,
6143                                                      int offset) {
6144  intptr_t value = *delayed_value_addr;
6145  if (value != 0)
6146    return RegisterOrConstant(value + offset);
6147
6148  // load indirectly to solve generation ordering problem
6149  movptr(tmp, ExternalAddress((address) delayed_value_addr));
6150
6151#ifdef ASSERT
6152  { Label L;
6153    testptr(tmp, tmp);
6154    if (WizardMode) {
6155      const char* buf = NULL;
6156      {
6157        ResourceMark rm;
6158        stringStream ss;
6159        ss.print("DelayedValue=" INTPTR_FORMAT, delayed_value_addr[1]);
6160        buf = code_string(ss.as_string());
6161      }
6162      jcc(Assembler::notZero, L);
6163      STOP(buf);
6164    } else {
6165      jccb(Assembler::notZero, L);
6166      hlt();
6167    }
6168    bind(L);
6169  }
6170#endif
6171
6172  if (offset != 0)
6173    addptr(tmp, offset);
6174
6175  return RegisterOrConstant(tmp);
6176}
6177
6178
6179Address MacroAssembler::argument_address(RegisterOrConstant arg_slot,
6180                                         int extra_slot_offset) {
6181  // cf. TemplateTable::prepare_invoke(), if (load_receiver).
6182  int stackElementSize = Interpreter::stackElementSize;
6183  int offset = Interpreter::expr_offset_in_bytes(extra_slot_offset+0);
6184#ifdef ASSERT
6185  int offset1 = Interpreter::expr_offset_in_bytes(extra_slot_offset+1);
6186  assert(offset1 - offset == stackElementSize, "correct arithmetic");
6187#endif
6188  Register             scale_reg    = noreg;
6189  Address::ScaleFactor scale_factor = Address::no_scale;
6190  if (arg_slot.is_constant()) {
6191    offset += arg_slot.as_constant() * stackElementSize;
6192  } else {
6193    scale_reg    = arg_slot.as_register();
6194    scale_factor = Address::times(stackElementSize);
6195  }
6196  offset += wordSize;           // return PC is on stack
6197  return Address(rsp, scale_reg, scale_factor, offset);
6198}
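// For a constant arg_slot the returned operand reduces to the following
// (a sketch of the arithmetic above, assuming the usual
// Interpreter::expr_offset_in_bytes(i) == i * stackElementSize):
//
//   Address(rsp, (arg_slot + extra_slot_offset) * Interpreter::stackElementSize
//                + wordSize /* skip the return PC */);
//
// For a register arg_slot the same displacement is formed with a scaled index.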
6199
6200
6201void MacroAssembler::verify_oop_addr(Address addr, const char* s) {
6202  if (!VerifyOops) return;
6203
6204  // Address adjust(addr.base(), addr.index(), addr.scale(), addr.disp() + BytesPerWord);
6205  // Pass register number to verify_oop_subroutine
6206  const char* b = NULL;
6207  {
6208    ResourceMark rm;
6209    stringStream ss;
6210    ss.print("verify_oop_addr: %s", s);
6211    b = code_string(ss.as_string());
6212  }
6213#ifdef _LP64
6214  push(rscratch1);                    // save r10, trashed by movptr()
6215#endif
6216  push(rax);                          // save rax
6217  // addr may contain rsp, so we will have to adjust it based on the push
6218  // we just did (and on 64-bit we do two pushes).
6219  // NOTE: the 64-bit code seems to have had a bug here: it did movq(addr, rax), which
6220  // stores rax into addr, the reverse of what was intended.
6221  if (addr.uses(rsp)) {
6222    lea(rax, addr);
6223    pushptr(Address(rax, LP64_ONLY(2 *) BytesPerWord));
6224  } else {
6225    pushptr(addr);
6226  }
6227
6228  ExternalAddress buffer((address) b);
6229  // pass msg argument
6230  // avoid using pushptr, as it modifies scratch registers
6231  // and our contract is not to modify anything
6232  movptr(rax, buffer.addr());
6233  push(rax);
6234
6235  // call indirectly to solve generation ordering problem
6236  movptr(rax, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
6237  call(rax);
6238  // Caller pops the arguments (addr, message) and restores rax, r10.
6239}
6240
6241void MacroAssembler::verify_tlab() {
6242#ifdef ASSERT
6243  if (UseTLAB && VerifyOops) {
6244    Label next, ok;
6245    Register t1 = rsi;
6246    Register thread_reg = NOT_LP64(rbx) LP64_ONLY(r15_thread);
6247
6248    push(t1);
6249    NOT_LP64(push(thread_reg));
6250    NOT_LP64(get_thread(thread_reg));
6251
6252    movptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_top_offset())));
6253    cmpptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_start_offset())));
6254    jcc(Assembler::aboveEqual, next);
6255    STOP("assert(top >= start)");
6256    should_not_reach_here();
6257
6258    bind(next);
6259    movptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_end_offset())));
6260    cmpptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_top_offset())));
6261    jcc(Assembler::aboveEqual, ok);
6262    STOP("assert(top <= end)");
6263    should_not_reach_here();
6264
6265    bind(ok);
6266    NOT_LP64(pop(thread_reg));
6267    pop(t1);
6268  }
6269#endif
6270}
6271
6272class ControlWord {
6273 public:
6274  int32_t _value;
6275
6276  int  rounding_control() const        { return  (_value >> 10) & 3      ; }
6277  int  precision_control() const       { return  (_value >>  8) & 3      ; }
6278  bool precision() const               { return ((_value >>  5) & 1) != 0; }
6279  bool underflow() const               { return ((_value >>  4) & 1) != 0; }
6280  bool overflow() const                { return ((_value >>  3) & 1) != 0; }
6281  bool zero_divide() const             { return ((_value >>  2) & 1) != 0; }
6282  bool denormalized() const            { return ((_value >>  1) & 1) != 0; }
6283  bool invalid() const                 { return ((_value >>  0) & 1) != 0; }
6284
6285  void print() const {
6286    // rounding control
6287    const char* rc;
6288    switch (rounding_control()) {
6289      case 0: rc = "round near"; break;
6290      case 1: rc = "round down"; break;
6291      case 2: rc = "round up  "; break;
6292      case 3: rc = "chop      "; break;
6293    };
6294    // precision control
6295    const char* pc;
6296    switch (precision_control()) {
6297      case 0: pc = "24 bits "; break;
6298      case 1: pc = "reserved"; break;
6299      case 2: pc = "53 bits "; break;
6300      case 3: pc = "64 bits "; break;
6301    };
6302    // flags
6303    char f[9];
6304    f[0] = ' ';
6305    f[1] = ' ';
6306    f[2] = (precision   ()) ? 'P' : 'p';
6307    f[3] = (underflow   ()) ? 'U' : 'u';
6308    f[4] = (overflow    ()) ? 'O' : 'o';
6309    f[5] = (zero_divide ()) ? 'Z' : 'z';
6310    f[6] = (denormalized()) ? 'D' : 'd';
6311    f[7] = (invalid     ()) ? 'I' : 'i';
6312    f[8] = '\x0';
6313    // output
6314    printf("%04x  masks = %s, %s, %s", _value & 0xFFFF, f, rc, pc);
6315  }
6316
6317};
6318
6319class StatusWord {
6320 public:
6321  int32_t _value;
6322
6323  bool busy() const                    { return ((_value >> 15) & 1) != 0; }
6324  bool C3() const                      { return ((_value >> 14) & 1) != 0; }
6325  bool C2() const                      { return ((_value >> 10) & 1) != 0; }
6326  bool C1() const                      { return ((_value >>  9) & 1) != 0; }
6327  bool C0() const                      { return ((_value >>  8) & 1) != 0; }
6328  int  top() const                     { return  (_value >> 11) & 7      ; }
6329  bool error_status() const            { return ((_value >>  7) & 1) != 0; }
6330  bool stack_fault() const             { return ((_value >>  6) & 1) != 0; }
6331  bool precision() const               { return ((_value >>  5) & 1) != 0; }
6332  bool underflow() const               { return ((_value >>  4) & 1) != 0; }
6333  bool overflow() const                { return ((_value >>  3) & 1) != 0; }
6334  bool zero_divide() const             { return ((_value >>  2) & 1) != 0; }
6335  bool denormalized() const            { return ((_value >>  1) & 1) != 0; }
6336  bool invalid() const                 { return ((_value >>  0) & 1) != 0; }
6337
6338  void print() const {
6339    // condition codes
6340    char c[5];
6341    c[0] = (C3()) ? '3' : '-';
6342    c[1] = (C2()) ? '2' : '-';
6343    c[2] = (C1()) ? '1' : '-';
6344    c[3] = (C0()) ? '0' : '-';
6345    c[4] = '\x0';
6346    // flags
6347    char f[9];
6348    f[0] = (error_status()) ? 'E' : '-';
6349    f[1] = (stack_fault ()) ? 'S' : '-';
6350    f[2] = (precision   ()) ? 'P' : '-';
6351    f[3] = (underflow   ()) ? 'U' : '-';
6352    f[4] = (overflow    ()) ? 'O' : '-';
6353    f[5] = (zero_divide ()) ? 'Z' : '-';
6354    f[6] = (denormalized()) ? 'D' : '-';
6355    f[7] = (invalid     ()) ? 'I' : '-';
6356    f[8] = '\x0';
6357    // output
6358    printf("%04x  flags = %s, cc =  %s, top = %d", _value & 0xFFFF, f, c, top());
6359  }
6360
6361};
6362
6363class TagWord {
6364 public:
6365  int32_t _value;
6366
6367  int tag_at(int i) const              { return (_value >> (i*2)) & 3; }
6368
6369  void print() const {
6370    printf("%04x", _value & 0xFFFF);
6371  }
6372
6373};
6374
6375class FPU_Register {
6376 public:
6377  int32_t _m0;
6378  int32_t _m1;
6379  int16_t _ex;
6380
6381  bool is_indefinite() const           {
6382    return _ex == -1 && _m1 == (int32_t)0xC0000000 && _m0 == 0;
6383  }
6384
6385  void print() const {
6386    char  sign = (_ex < 0) ? '-' : '+';
6387    const char* kind = (_ex == 0x7FFF || _ex == (int16_t)-1) ? "NaN" : "   ";
6388    printf("%c%04hx.%08x%08x  %s", sign, _ex, _m1, _m0, kind);
6389  };
6390
6391};
6392
6393class FPU_State {
6394 public:
6395  enum {
6396    register_size       = 10,
6397    number_of_registers =  8,
6398    register_mask       =  7
6399  };
6400
6401  ControlWord  _control_word;
6402  StatusWord   _status_word;
6403  TagWord      _tag_word;
6404  int32_t      _error_offset;
6405  int32_t      _error_selector;
6406  int32_t      _data_offset;
6407  int32_t      _data_selector;
6408  int8_t       _register[register_size * number_of_registers];
6409
6410  int tag_for_st(int i) const          { return _tag_word.tag_at((_status_word.top() + i) & register_mask); }
6411  FPU_Register* st(int i) const        { return (FPU_Register*)&_register[register_size * i]; }
6412
6413  const char* tag_as_string(int tag) const {
6414    switch (tag) {
6415      case 0: return "valid";
6416      case 1: return "zero";
6417      case 2: return "special";
6418      case 3: return "empty";
6419    }
6420    ShouldNotReachHere();
6421    return NULL;
6422  }
6423
6424  void print() const {
6425    // print computation registers
6426    { int t = _status_word.top();
6427      for (int i = 0; i < number_of_registers; i++) {
6428        int j = (i - t) & register_mask;
6429        printf("%c r%d = ST%d = ", (j == 0 ? '*' : ' '), i, j);
6430        st(j)->print();
6431        printf(" %s\n", tag_as_string(_tag_word.tag_at(i)));
6432      }
6433    }
6434    printf("\n");
6435    // print control registers
6436    printf("ctrl = "); _control_word.print(); printf("\n");
6437    printf("stat = "); _status_word .print(); printf("\n");
6438    printf("tags = "); _tag_word    .print(); printf("\n");
6439  }
6440
6441};
6442
6443class Flag_Register {
6444 public:
6445  int32_t _value;
6446
6447  bool overflow() const                { return ((_value >> 11) & 1) != 0; }
6448  bool direction() const               { return ((_value >> 10) & 1) != 0; }
6449  bool sign() const                    { return ((_value >>  7) & 1) != 0; }
6450  bool zero() const                    { return ((_value >>  6) & 1) != 0; }
6451  bool auxiliary_carry() const         { return ((_value >>  4) & 1) != 0; }
6452  bool parity() const                  { return ((_value >>  2) & 1) != 0; }
6453  bool carry() const                   { return ((_value >>  0) & 1) != 0; }
6454
6455  void print() const {
6456    // flags
6457    char f[8];
6458    f[0] = (overflow       ()) ? 'O' : '-';
6459    f[1] = (direction      ()) ? 'D' : '-';
6460    f[2] = (sign           ()) ? 'S' : '-';
6461    f[3] = (zero           ()) ? 'Z' : '-';
6462    f[4] = (auxiliary_carry()) ? 'A' : '-';
6463    f[5] = (parity         ()) ? 'P' : '-';
6464    f[6] = (carry          ()) ? 'C' : '-';
6465    f[7] = '\x0';
6466    // output
6467    printf("%08x  flags = %s", _value, f);
6468  }
6469
6470};
6471
6472class IU_Register {
6473 public:
6474  int32_t _value;
6475
6476  void print() const {
6477    printf("%08x  %11d", _value, _value);
6478  }
6479
6480};
6481
6482class IU_State {
6483 public:
6484  Flag_Register _eflags;
6485  IU_Register   _rdi;
6486  IU_Register   _rsi;
6487  IU_Register   _rbp;
6488  IU_Register   _rsp;
6489  IU_Register   _rbx;
6490  IU_Register   _rdx;
6491  IU_Register   _rcx;
6492  IU_Register   _rax;
6493
6494  void print() const {
6495    // computation registers
6496    printf("rax  = "); _rax.print(); printf("\n");
6497    printf("rbx  = "); _rbx.print(); printf("\n");
6498    printf("rcx  = "); _rcx.print(); printf("\n");
6499    printf("rdx  = "); _rdx.print(); printf("\n");
6500    printf("rdi  = "); _rdi.print(); printf("\n");
6501    printf("rsi  = "); _rsi.print(); printf("\n");
6502    printf("rbp  = "); _rbp.print(); printf("\n");
6503    printf("rsp  = "); _rsp.print(); printf("\n");
6504    printf("\n");
6505    // control registers
6506    printf("flgs = "); _eflags.print(); printf("\n");
6507  }
6508};
6509
6510
6511class CPU_State {
6512 public:
6513  FPU_State _fpu_state;
6514  IU_State  _iu_state;
6515
6516  void print() const {
6517    printf("--------------------------------------------------\n");
6518    _iu_state .print();
6519    printf("\n");
6520    _fpu_state.print();
6521    printf("--------------------------------------------------\n");
6522  }
6523
6524};
6525
6526
6527static void _print_CPU_state(CPU_State* state) {
6528  state->print();
6529};
6530
6531
6532void MacroAssembler::print_CPU_state() {
6533  push_CPU_state();
6534  push(rsp);                // pass CPU state
6535  call(RuntimeAddress(CAST_FROM_FN_PTR(address, _print_CPU_state)));
6536  addptr(rsp, wordSize);       // discard argument
6537  pop_CPU_state();
6538}
6539
6540
6541static bool _verify_FPU(int stack_depth, char* s, CPU_State* state) {
6542  static int counter = 0;
6543  FPU_State* fs = &state->_fpu_state;
6544  counter++;
6545  // For leaf calls, only verify that the top few elements remain empty.
6546  // We only need 1 empty at the top for C2 code.
6547  if( stack_depth < 0 ) {
6548    if( fs->tag_for_st(7) != 3 ) {
6549      printf("FPR7 not empty\n");
6550      state->print();
6551      assert(false, "error");
6552      return false;
6553    }
6554    return true;                // All other stack states do not matter
6555  }
6556
6557  assert((fs->_control_word._value & 0xffff) == StubRoutines::_fpu_cntrl_wrd_std,
6558         "bad FPU control word");
6559
6560  // compute stack depth
6561  int i = 0;
6562  while (i < FPU_State::number_of_registers && fs->tag_for_st(i)  < 3) i++;
6563  int d = i;
6564  while (i < FPU_State::number_of_registers && fs->tag_for_st(i) == 3) i++;
6565  // verify findings
6566  if (i != FPU_State::number_of_registers) {
6567    // stack not contiguous
6568    printf("%s: stack not contiguous at ST%d\n", s, i);
6569    state->print();
6570    assert(false, "error");
6571    return false;
6572  }
6573  // check if computed stack depth corresponds to expected stack depth
6574  if (stack_depth < 0) {
6575    // expected stack depth is -stack_depth or less
6576    if (d > -stack_depth) {
6577      // too many elements on the stack
6578      printf("%s: <= %d stack elements expected but found %d\n", s, -stack_depth, d);
6579      state->print();
6580      assert(false, "error");
6581      return false;
6582    }
6583  } else {
6584    // expected stack depth is stack_depth
6585    if (d != stack_depth) {
6586      // wrong stack depth
6587      printf("%s: %d stack elements expected but found %d\n", s, stack_depth, d);
6588      state->print();
6589      assert(false, "error");
6590      return false;
6591    }
6592  }
6593  // everything is cool
6594  return true;
6595}
6596
6597
6598void MacroAssembler::verify_FPU(int stack_depth, const char* s) {
6599  if (!VerifyFPU) return;
6600  push_CPU_state();
6601  push(rsp);                // pass CPU state
6602  ExternalAddress msg((address) s);
6603  // pass message string s
6604  pushptr(msg.addr());
6605  push(stack_depth);        // pass stack depth
6606  call(RuntimeAddress(CAST_FROM_FN_PTR(address, _verify_FPU)));
6607  addptr(rsp, 3 * wordSize);   // discard arguments
6608  // check for error
6609  { Label L;
6610    testl(rax, rax);
6611    jcc(Assembler::notZero, L);
6612    int3();                  // break if error condition
6613    bind(L);
6614  }
6615  pop_CPU_state();
6616}
6617
6618void MacroAssembler::restore_cpu_control_state_after_jni() {
6619  // Either restore the MXCSR register after returning from the JNI Call
6620  // or verify that it wasn't changed (with -Xcheck:jni flag).
6621  if (VM_Version::supports_sse()) {
6622    if (RestoreMXCSROnJNICalls) {
6623      ldmxcsr(ExternalAddress(StubRoutines::addr_mxcsr_std()));
6624    } else if (CheckJNICalls) {
6625      call(RuntimeAddress(StubRoutines::x86::verify_mxcsr_entry()));
6626    }
6627  }
6628  if (VM_Version::supports_avx()) {
6629    // Clear upper bits of YMM registers to avoid SSE <-> AVX transition penalty.
6630    vzeroupper();
6631  }
6632
6633#ifndef _LP64
6634  // Either restore the x87 floating pointer control word after returning
6635  // from the JNI call or verify that it wasn't changed.
6636  if (CheckJNICalls) {
6637    call(RuntimeAddress(StubRoutines::x86::verify_fpu_cntrl_wrd_entry()));
6638  }
6639#endif // _LP64
6640}
6641
6642
6643void MacroAssembler::load_klass(Register dst, Register src) {
6644#ifdef _LP64
6645  if (UseCompressedClassPointers) {
6646    movl(dst, Address(src, oopDesc::klass_offset_in_bytes()));
6647    decode_klass_not_null(dst);
6648  } else
6649#endif
6650    movptr(dst, Address(src, oopDesc::klass_offset_in_bytes()));
6651}
6652
6653void MacroAssembler::load_prototype_header(Register dst, Register src) {
6654  load_klass(dst, src);
6655  movptr(dst, Address(dst, Klass::prototype_header_offset()));
6656}
6657
6658void MacroAssembler::store_klass(Register dst, Register src) {
6659#ifdef _LP64
6660  if (UseCompressedClassPointers) {
6661    encode_klass_not_null(src);
6662    movl(Address(dst, oopDesc::klass_offset_in_bytes()), src);
6663  } else
6664#endif
6665    movptr(Address(dst, oopDesc::klass_offset_in_bytes()), src);
6666}
6667
6668void MacroAssembler::load_heap_oop(Register dst, Address src) {
6669#ifdef _LP64
6670  // FIXME: Must change all places where we try to load the klass.
6671  if (UseCompressedOops) {
6672    movl(dst, src);
6673    decode_heap_oop(dst);
6674  } else
6675#endif
6676    movptr(dst, src);
6677}
6678
6679// Doesn't do verification; generates fixed-size code
6680void MacroAssembler::load_heap_oop_not_null(Register dst, Address src) {
6681#ifdef _LP64
6682  if (UseCompressedOops) {
6683    movl(dst, src);
6684    decode_heap_oop_not_null(dst);
6685  } else
6686#endif
6687    movptr(dst, src);
6688}
6689
6690void MacroAssembler::store_heap_oop(Address dst, Register src) {
6691#ifdef _LP64
6692  if (UseCompressedOops) {
6693    assert(!dst.uses(src), "not enough registers");
6694    encode_heap_oop(src);
6695    movl(dst, src);
6696  } else
6697#endif
6698    movptr(dst, src);
6699}
6700
6701void MacroAssembler::cmp_heap_oop(Register src1, Address src2, Register tmp) {
6702  assert_different_registers(src1, tmp);
6703#ifdef _LP64
6704  if (UseCompressedOops) {
6705    bool did_push = false;
6706    if (tmp == noreg) {
6707      tmp = rax;
6708      push(tmp);
6709      did_push = true;
6710      assert(!src2.uses(rsp), "can't push");
6711    }
6712    load_heap_oop(tmp, src2);
6713    cmpptr(src1, tmp);
6714    if (did_push)  pop(tmp);
6715  } else
6716#endif
6717    cmpptr(src1, src2);
6718}
6719
6720// Used for storing NULLs.
6721void MacroAssembler::store_heap_oop_null(Address dst) {
6722#ifdef _LP64
6723  if (UseCompressedOops) {
6724    movl(dst, (int32_t)NULL_WORD);
6725  } else {
6726    movslq(dst, (int32_t)NULL_WORD);
6727  }
6728#else
6729  movl(dst, (int32_t)NULL_WORD);
6730#endif
6731}
6732
6733#ifdef _LP64
6734void MacroAssembler::store_klass_gap(Register dst, Register src) {
6735  if (UseCompressedClassPointers) {
6736    // Store to klass gap in destination
6737    movl(Address(dst, oopDesc::klass_gap_offset_in_bytes()), src);
6738  }
6739}
6740
6741#ifdef ASSERT
6742void MacroAssembler::verify_heapbase(const char* msg) {
6743  assert (UseCompressedOops, "should be compressed");
6744  assert (Universe::heap() != NULL, "java heap should be initialized");
6745  if (CheckCompressedOops) {
6746    Label ok;
6747    push(rscratch1); // cmpptr trashes rscratch1
6748    cmpptr(r12_heapbase, ExternalAddress((address)Universe::narrow_ptrs_base_addr()));
6749    jcc(Assembler::equal, ok);
6750    STOP(msg);
6751    bind(ok);
6752    pop(rscratch1);
6753  }
6754}
6755#endif
6756
6757// Algorithm must match oop.inline.hpp encode_heap_oop.
6758void MacroAssembler::encode_heap_oop(Register r) {
6759#ifdef ASSERT
6760  verify_heapbase("MacroAssembler::encode_heap_oop: heap base corrupted?");
6761#endif
6762  verify_oop(r, "broken oop in encode_heap_oop");
6763  if (Universe::narrow_oop_base() == NULL) {
6764    if (Universe::narrow_oop_shift() != 0) {
6765      assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
6766      shrq(r, LogMinObjAlignmentInBytes);
6767    }
6768    return;
6769  }
6770  testq(r, r);
6771  cmovq(Assembler::equal, r, r12_heapbase);
6772  subq(r, r12_heapbase);
6773  shrq(r, LogMinObjAlignmentInBytes);
6774}
6775
6776void MacroAssembler::encode_heap_oop_not_null(Register r) {
6777#ifdef ASSERT
6778  verify_heapbase("MacroAssembler::encode_heap_oop_not_null: heap base corrupted?");
6779  if (CheckCompressedOops) {
6780    Label ok;
6781    testq(r, r);
6782    jcc(Assembler::notEqual, ok);
6783    STOP("null oop passed to encode_heap_oop_not_null");
6784    bind(ok);
6785  }
6786#endif
6787  verify_oop(r, "broken oop in encode_heap_oop_not_null");
6788  if (Universe::narrow_oop_base() != NULL) {
6789    subq(r, r12_heapbase);
6790  }
6791  if (Universe::narrow_oop_shift() != 0) {
6792    assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
6793    shrq(r, LogMinObjAlignmentInBytes);
6794  }
6795}
6796
6797void MacroAssembler::encode_heap_oop_not_null(Register dst, Register src) {
6798#ifdef ASSERT
6799  verify_heapbase("MacroAssembler::encode_heap_oop_not_null2: heap base corrupted?");
6800  if (CheckCompressedOops) {
6801    Label ok;
6802    testq(src, src);
6803    jcc(Assembler::notEqual, ok);
6804    STOP("null oop passed to encode_heap_oop_not_null2");
6805    bind(ok);
6806  }
6807#endif
6808  verify_oop(src, "broken oop in encode_heap_oop_not_null2");
6809  if (dst != src) {
6810    movq(dst, src);
6811  }
6812  if (Universe::narrow_oop_base() != NULL) {
6813    subq(dst, r12_heapbase);
6814  }
6815  if (Universe::narrow_oop_shift() != 0) {
6816    assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
6817    shrq(dst, LogMinObjAlignmentInBytes);
6818  }
6819}
6820
6821void  MacroAssembler::decode_heap_oop(Register r) {
6822#ifdef ASSERT
6823  verify_heapbase("MacroAssembler::decode_heap_oop: heap base corrupted?");
6824#endif
6825  if (Universe::narrow_oop_base() == NULL) {
6826    if (Universe::narrow_oop_shift() != 0) {
6827      assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
6828      shlq(r, LogMinObjAlignmentInBytes);
6829    }
6830  } else {
6831    Label done;
6832    shlq(r, LogMinObjAlignmentInBytes);
6833    jccb(Assembler::equal, done);
6834    addq(r, r12_heapbase);
6835    bind(done);
6836  }
6837  verify_oop(r, "broken oop in decode_heap_oop");
6838}
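// Together, encode_heap_oop and decode_heap_oop implement the usual
// compressed-oop mapping, sketched here standalone (base and shift stand for
// Universe::narrow_oop_base() and Universe::narrow_oop_shift()):
//
//   uint32_t encode(uint64_t oop)    { return oop    == 0 ? 0 : (uint32_t)((oop - base) >> shift); }
//   uint64_t decode(uint32_t narrow) { return narrow == 0 ? 0 : base + ((uint64_t)narrow << shift); }
//
// The cmovq above handles NULL on encode; decode relies on shlq leaving ZF set
// for a zero narrow oop so that the base add is skipped.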
6839
6840void  MacroAssembler::decode_heap_oop_not_null(Register r) {
6841  // Note: it will change flags
6842  assert (UseCompressedOops, "should only be used for compressed headers");
6843  assert (Universe::heap() != NULL, "java heap should be initialized");
6844  // Cannot assert, unverified entry point counts instructions (see .ad file)
6845  // vtableStubs also counts instructions in pd_code_size_limit.
6846  // Also do not verify_oop as this is called by verify_oop.
6847  if (Universe::narrow_oop_shift() != 0) {
6848    assert(LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
6849    shlq(r, LogMinObjAlignmentInBytes);
6850    if (Universe::narrow_oop_base() != NULL) {
6851      addq(r, r12_heapbase);
6852    }
6853  } else {
6854    assert (Universe::narrow_oop_base() == NULL, "sanity");
6855  }
6856}
6857
6858void  MacroAssembler::decode_heap_oop_not_null(Register dst, Register src) {
6859  // Note: it will change flags
6860  assert (UseCompressedOops, "should only be used for compressed headers");
6861  assert (Universe::heap() != NULL, "java heap should be initialized");
6862  // Cannot assert, unverified entry point counts instructions (see .ad file)
6863  // vtableStubs also counts instructions in pd_code_size_limit.
6864  // Also do not verify_oop as this is called by verify_oop.
6865  if (Universe::narrow_oop_shift() != 0) {
6866    assert(LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
6867    if (LogMinObjAlignmentInBytes == Address::times_8) {
6868      leaq(dst, Address(r12_heapbase, src, Address::times_8, 0));
6869    } else {
6870      if (dst != src) {
6871        movq(dst, src);
6872      }
6873      shlq(dst, LogMinObjAlignmentInBytes);
6874      if (Universe::narrow_oop_base() != NULL) {
6875        addq(dst, r12_heapbase);
6876      }
6877    }
6878  } else {
6879    assert (Universe::narrow_oop_base() == NULL, "sanity");
6880    if (dst != src) {
6881      movq(dst, src);
6882    }
6883  }
6884}
6885
6886void MacroAssembler::encode_klass_not_null(Register r) {
6887  if (Universe::narrow_klass_base() != NULL) {
6888    // Use r12 as a scratch register in which to temporarily load the narrow_klass_base.
6889    assert(r != r12_heapbase, "Encoding a klass in r12");
6890    mov64(r12_heapbase, (int64_t)Universe::narrow_klass_base());
6891    subq(r, r12_heapbase);
6892  }
6893  if (Universe::narrow_klass_shift() != 0) {
6894    assert (LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
6895    shrq(r, LogKlassAlignmentInBytes);
6896  }
6897  if (Universe::narrow_klass_base() != NULL) {
6898    reinit_heapbase();
6899  }
6900}
6901
6902void MacroAssembler::encode_klass_not_null(Register dst, Register src) {
6903  if (dst == src) {
6904    encode_klass_not_null(src);
6905  } else {
6906    if (Universe::narrow_klass_base() != NULL) {
6907      mov64(dst, (int64_t)Universe::narrow_klass_base());
6908      negq(dst);
6909      addq(dst, src);
6910    } else {
6911      movptr(dst, src);
6912    }
6913    if (Universe::narrow_klass_shift() != 0) {
6914      assert (LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
6915      shrq(dst, LogKlassAlignmentInBytes);
6916    }
6917  }
6918}
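// The narrow-klass mapping mirrors the oop one but uses the class-space base
// and shift; a sketch (klass_base and klass_shift stand for
// Universe::narrow_klass_base() and Universe::narrow_klass_shift()):
//
//   narrow_klass = (uint32_t)((klass - klass_base) >> klass_shift);
//   klass        = klass_base + ((uint64_t)narrow_klass << klass_shift);
//
// There is no NULL special case: these are the *_not_null variants.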
6919
6920// Function instr_size_for_decode_klass_not_null() counts the instructions
6921// generated by decode_klass_not_null(register r) and reinit_heapbase(),
6922// when (Universe::heap() != NULL).  Hence, if the instructions they
6923// generate change, then this method needs to be updated.
6924int MacroAssembler::instr_size_for_decode_klass_not_null() {
6925  assert (UseCompressedClassPointers, "only for compressed klass ptrs");
6926  if (Universe::narrow_klass_base() != NULL) {
6927    // mov64 + addq + shlq? + mov64  (for reinit_heapbase()).
6928    return (Universe::narrow_klass_shift() == 0 ? 20 : 24);
6929  } else {
6930    // longest load decode klass function, mov64, leaq
6931    return 16;
6932  }
6933}
6934
6935// !!! If the instructions that get generated here change then function
6936// instr_size_for_decode_klass_not_null() needs to get updated.
6937void  MacroAssembler::decode_klass_not_null(Register r) {
6938  // Note: it will change flags
6939  assert (UseCompressedClassPointers, "should only be used for compressed headers");
6940  assert(r != r12_heapbase, "Decoding a klass in r12");
6941  // Cannot assert, unverified entry point counts instructions (see .ad file)
6942  // vtableStubs also counts instructions in pd_code_size_limit.
6943  // Also do not verify_oop as this is called by verify_oop.
6944  if (Universe::narrow_klass_shift() != 0) {
6945    assert(LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
6946    shlq(r, LogKlassAlignmentInBytes);
6947  }
6948  // Use r12 as a scratch register in which to temporarily load the narrow_klass_base.
6949  if (Universe::narrow_klass_base() != NULL) {
6950    mov64(r12_heapbase, (int64_t)Universe::narrow_klass_base());
6951    addq(r, r12_heapbase);
6952    reinit_heapbase();
6953  }
6954}
6955
6956void  MacroAssembler::decode_klass_not_null(Register dst, Register src) {
6957  // Note: it will change flags
6958  assert (UseCompressedClassPointers, "should only be used for compressed headers");
6959  if (dst == src) {
6960    decode_klass_not_null(dst);
6961  } else {
6962    // Cannot assert, unverified entry point counts instructions (see .ad file)
6963    // vtableStubs also counts instructions in pd_code_size_limit.
6964    // Also do not verify_oop as this is called by verify_oop.
6965    mov64(dst, (int64_t)Universe::narrow_klass_base());
6966    if (Universe::narrow_klass_shift() != 0) {
6967      assert(LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
6968      assert(LogKlassAlignmentInBytes == Address::times_8, "klass not aligned on 64bits?");
6969      leaq(dst, Address(dst, src, Address::times_8, 0));
6970    } else {
6971      addq(dst, src);
6972    }
6973  }
6974}
6975
6976void  MacroAssembler::set_narrow_oop(Register dst, jobject obj) {
6977  assert (UseCompressedOops, "should only be used for compressed headers");
6978  assert (Universe::heap() != NULL, "java heap should be initialized");
6979  assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
6980  int oop_index = oop_recorder()->find_index(obj);
6981  RelocationHolder rspec = oop_Relocation::spec(oop_index);
6982  mov_narrow_oop(dst, oop_index, rspec);
6983}
6984
6985void  MacroAssembler::set_narrow_oop(Address dst, jobject obj) {
6986  assert (UseCompressedOops, "should only be used for compressed headers");
6987  assert (Universe::heap() != NULL, "java heap should be initialized");
6988  assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
6989  int oop_index = oop_recorder()->find_index(obj);
6990  RelocationHolder rspec = oop_Relocation::spec(oop_index);
6991  mov_narrow_oop(dst, oop_index, rspec);
6992}
6993
6994void  MacroAssembler::set_narrow_klass(Register dst, Klass* k) {
6995  assert (UseCompressedClassPointers, "should only be used for compressed headers");
6996  assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
6997  int klass_index = oop_recorder()->find_index(k);
6998  RelocationHolder rspec = metadata_Relocation::spec(klass_index);
6999  mov_narrow_oop(dst, Klass::encode_klass(k), rspec);
7000}
7001
7002void  MacroAssembler::set_narrow_klass(Address dst, Klass* k) {
7003  assert (UseCompressedClassPointers, "should only be used for compressed headers");
7004  assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
7005  int klass_index = oop_recorder()->find_index(k);
7006  RelocationHolder rspec = metadata_Relocation::spec(klass_index);
7007  mov_narrow_oop(dst, Klass::encode_klass(k), rspec);
7008}
7009
7010void  MacroAssembler::cmp_narrow_oop(Register dst, jobject obj) {
7011  assert (UseCompressedOops, "should only be used for compressed headers");
7012  assert (Universe::heap() != NULL, "java heap should be initialized");
7013  assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
7014  int oop_index = oop_recorder()->find_index(obj);
7015  RelocationHolder rspec = oop_Relocation::spec(oop_index);
7016  Assembler::cmp_narrow_oop(dst, oop_index, rspec);
7017}
7018
7019void  MacroAssembler::cmp_narrow_oop(Address dst, jobject obj) {
7020  assert (UseCompressedOops, "should only be used for compressed headers");
7021  assert (Universe::heap() != NULL, "java heap should be initialized");
7022  assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
7023  int oop_index = oop_recorder()->find_index(obj);
7024  RelocationHolder rspec = oop_Relocation::spec(oop_index);
7025  Assembler::cmp_narrow_oop(dst, oop_index, rspec);
7026}
7027
7028void  MacroAssembler::cmp_narrow_klass(Register dst, Klass* k) {
7029  assert (UseCompressedClassPointers, "should only be used for compressed headers");
7030  assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
7031  int klass_index = oop_recorder()->find_index(k);
7032  RelocationHolder rspec = metadata_Relocation::spec(klass_index);
7033  Assembler::cmp_narrow_oop(dst, Klass::encode_klass(k), rspec);
7034}
7035
7036void  MacroAssembler::cmp_narrow_klass(Address dst, Klass* k) {
7037  assert (UseCompressedClassPointers, "should only be used for compressed headers");
7038  assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
7039  int klass_index = oop_recorder()->find_index(k);
7040  RelocationHolder rspec = metadata_Relocation::spec(klass_index);
7041  Assembler::cmp_narrow_oop(dst, Klass::encode_klass(k), rspec);
7042}
7043
7044void MacroAssembler::reinit_heapbase() {
7045  if (UseCompressedOops || UseCompressedClassPointers) {
7046    if (Universe::heap() != NULL) {
7047      if (Universe::narrow_oop_base() == NULL) {
7048        MacroAssembler::xorptr(r12_heapbase, r12_heapbase);
7049      } else {
7050        mov64(r12_heapbase, (int64_t)Universe::narrow_ptrs_base());
7051      }
7052    } else {
7053      movptr(r12_heapbase, ExternalAddress((address)Universe::narrow_ptrs_base_addr()));
7054    }
7055  }
7056}
7057
7058#endif // _LP64
7059
7060
7061// C2 compiled method's prolog code.
7062void MacroAssembler::verified_entry(int framesize, int stack_bang_size, bool fp_mode_24b) {
7063
7064  // WARNING: Initial instruction MUST be 5 bytes or longer so that
7065  // NativeJump::patch_verified_entry will be able to patch out the entry
7066  // code safely. The push to verify stack depth is ok at 5 bytes,
7067  // the frame allocation can be either 3 or 6 bytes. So if we don't do
7068  // stack bang then we must use the 6 byte frame allocation even if
7069  // we have no frame. :-(
7070  assert(stack_bang_size >= framesize || stack_bang_size <= 0, "stack bang size incorrect");
7071
7072  assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
7073  // Remove word for return addr
7074  framesize -= wordSize;
7075  stack_bang_size -= wordSize;
7076
7077  // Calls to C2R adapters often do not accept exceptional returns.
7078  // We require that their callers must bang for them.  But be careful, because
7079  // some VM calls (such as call site linkage) can use several kilobytes of
7080  // stack.  But the stack safety zone should account for that.
7081  // See bugs 4446381, 4468289, 4497237.
7082  if (stack_bang_size > 0) {
7083    generate_stack_overflow_check(stack_bang_size);
7084
7085    // We always push rbp so that on return to the interpreter rbp will be
7086    // restored correctly and we can correct the stack.
7087    push(rbp);
7088    // Save caller's stack pointer into RBP if the frame pointer is preserved.
7089    if (PreserveFramePointer) {
7090      mov(rbp, rsp);
7091    }
7092    // Remove word for ebp
7093    framesize -= wordSize;
7094
7095    // Create frame
7096    if (framesize) {
7097      subptr(rsp, framesize);
7098    }
7099  } else {
7100    // Create frame (force generation of a 4 byte immediate value)
7101    subptr_imm32(rsp, framesize);
7102
7103    // Save RBP register now.
7104    framesize -= wordSize;
7105    movptr(Address(rsp, framesize), rbp);
7106    // Save caller's stack pointer into RBP if the frame pointer is preserved.
7107    if (PreserveFramePointer) {
7108      movptr(rbp, rsp);
7109      if (framesize > 0) {
7110        addptr(rbp, framesize);
7111      }
7112    }
7113  }
7114
7115  if (VerifyStackAtCalls) { // Majik cookie to verify stack depth
7116    framesize -= wordSize;
7117    movptr(Address(rsp, framesize), (int32_t)0xbadb100d);
7118  }
7119
7120#ifndef _LP64
7121  // If method sets FPU control word do it now
7122  if (fp_mode_24b) {
7123    fldcw(ExternalAddress(StubRoutines::addr_fpu_cntrl_wrd_24()));
7124  }
7125  if (UseSSE >= 2 && VerifyFPU) {
7126    verify_FPU(0, "FPU stack must be clean on entry");
7127  }
7128#endif
7129
7130#ifdef ASSERT
7131  if (VerifyStackAtCalls) {
7132    Label L;
7133    push(rax);
7134    mov(rax, rsp);
7135    andptr(rax, StackAlignmentInBytes-1);
7136    cmpptr(rax, StackAlignmentInBytes-wordSize);
7137    pop(rax);
7138    jcc(Assembler::equal, L);
7139    STOP("Stack is not properly aligned!");
7140    bind(L);
7141  }
7142#endif
7143
7144}
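// The prolog above leaves a frame of the following shape (a sketch for the
// stack-banging path; the non-banging path produces the same layout, and
// framesize here means the incoming argument):
//
//   [ return address          ]   pushed by the caller's call
//   [ saved rbp               ]
//   [ framesize - 2*wordSize  ]   spill slots and locals (plus the 0xbadb100d
//   [   bytes of frame body   ]   cookie when VerifyStackAtCalls is set)
//                                 <-- rsp after the prolog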
7145
7146void MacroAssembler::clear_mem(Register base, Register cnt, Register tmp) {
7147  // cnt - number of qwords (8-byte words).
7148  // base - start address, qword aligned.
7149  assert(base==rdi, "base register must be edi for rep stos");
7150  assert(tmp==rax,   "tmp register must be eax for rep stos");
7151  assert(cnt==rcx,   "cnt register must be ecx for rep stos");
7152
7153  xorptr(tmp, tmp);
7154  if (UseFastStosb) {
7155    shlptr(cnt,3); // convert to number of bytes
7156    rep_stosb();
7157  } else {
7158    NOT_LP64(shlptr(cnt,1);) // convert to number of dwords for 32-bit VM
7159    rep_stos();
7160  }
7161}
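// The net effect of clear_mem is roughly the following (a sketch that ignores
// the register pinning required by rep stos):
//
//   memset(base, 0, cnt * BytesPerLong);   // zero cnt qwords starting at base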
7162
7163#ifdef COMPILER2
7164
7165// IndexOf for constant substrings with size >= 8 chars
7166// which don't need to be loaded through stack.
7167void MacroAssembler::string_indexofC8(Register str1, Register str2,
7168                                      Register cnt1, Register cnt2,
7169                                      int int_cnt2,  Register result,
7170                                      XMMRegister vec, Register tmp,
7171                                      int ae) {
7172  ShortBranchVerifier sbv(this);
7173  assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
7174  assert(UseSSE >= 4, "SSE4 must be enabled for SSE4.2 intrinsics to be available");
7175  assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
7176
7177  // This method uses the pcmpestri instruction with bound registers
7178  //   inputs:
7179  //     xmm - substring
7180  //     rax - substring length (elements count)
7181  //     mem - scanned string
7182  //     rdx - string length (elements count)
7183  //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
7184  //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
7185  //   outputs:
7186  //     rcx - matched index in string
7187  assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
7188  int mode   = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
7189  int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
7190  Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
7191  Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
7192
7193  Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR,
7194        RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR,
7195        MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE;
7196
7197  // Note, inline_string_indexOf() generates checks:
7198  // if (substr.count > string.count) return -1;
7199  // if (substr.count == 0) return 0;
7200  assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars");
7201
7202  // Load substring.
7203  if (ae == StrIntrinsicNode::UL) {
7204    pmovzxbw(vec, Address(str2, 0));
7205  } else {
7206    movdqu(vec, Address(str2, 0));
7207  }
7208  movl(cnt2, int_cnt2);
7209  movptr(result, str1); // string addr
7210
7211  if (int_cnt2 > stride) {
7212    jmpb(SCAN_TO_SUBSTR);
7213
7214    // Reload substr for rescan, this code
7215    // is executed only for large substrings (> 8 chars)
7216    bind(RELOAD_SUBSTR);
7217    if (ae == StrIntrinsicNode::UL) {
7218      pmovzxbw(vec, Address(str2, 0));
7219    } else {
7220      movdqu(vec, Address(str2, 0));
7221    }
7222    negptr(cnt2); // Jumped here with negative cnt2, convert to positive
7223
7224    bind(RELOAD_STR);
7225    // We came here after the beginning of the substring was
7226    // matched but the rest of it was not, so we need to search
7227    // again. Start from the next element after the previous match.
7228
7229    // cnt2 is the number of remaining substring elements and
7230    // cnt1 is the number of remaining string elements when the compare failed.
7231    // Restored cnt1 = cnt1 - cnt2 + int_cnt2
7232    subl(cnt1, cnt2);
7233    addl(cnt1, int_cnt2);
7234    movl(cnt2, int_cnt2); // Now restore cnt2
7235
7236    decrementl(cnt1);     // Shift to next element
7237    cmpl(cnt1, cnt2);
7238    jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
7239
7240    addptr(result, (1<<scale1));
7241
7242  } // (int_cnt2 > 8)
7243
7244  // Scan string for start of substr in 16-byte vectors
7245  bind(SCAN_TO_SUBSTR);
7246  pcmpestri(vec, Address(result, 0), mode);
7247  jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
7248  subl(cnt1, stride);
7249  jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
7250  cmpl(cnt1, cnt2);
7251  jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
7252  addptr(result, 16);
7253  jmpb(SCAN_TO_SUBSTR);
7254
7255  // Found a potential substr
7256  bind(FOUND_CANDIDATE);
7257  // Matched whole vector if first element matched (tmp(rcx) == 0).
7258  if (int_cnt2 == stride) {
7259    jccb(Assembler::overflow, RET_FOUND);    // OF == 1
7260  } else { // int_cnt2 > 8
7261    jccb(Assembler::overflow, FOUND_SUBSTR);
7262  }
7263  // After pcmpestri tmp(rcx) contains matched element index
7264  // Compute start addr of substr
7265  lea(result, Address(result, tmp, scale1));
7266
7267  // Make sure string is still long enough
7268  subl(cnt1, tmp);
7269  cmpl(cnt1, cnt2);
7270  if (int_cnt2 == stride) {
7271    jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
7272  } else { // int_cnt2 > 8
7273    jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD);
7274  }
7275  // Left less than substring.
7276
7277  bind(RET_NOT_FOUND);
7278  movl(result, -1);
7279  jmpb(EXIT);
7280
7281  if (int_cnt2 > stride) {
7282    // This code is optimized for the case when whole substring
7283    // is matched if its head is matched.
7284    bind(MATCH_SUBSTR_HEAD);
7285    pcmpestri(vec, Address(result, 0), mode);
7286    // Reload only the string if it does not match
7287    jccb(Assembler::noOverflow, RELOAD_STR); // OF == 0
7288
7289    Label CONT_SCAN_SUBSTR;
7290    // Compare the rest of substring (> 8 chars).
7291    bind(FOUND_SUBSTR);
7292    // First 8 chars are already matched.
7293    negptr(cnt2);
7294    addptr(cnt2, stride);
7295
7296    bind(SCAN_SUBSTR);
7297    subl(cnt1, stride);
7298    cmpl(cnt2, -stride); // Do not read beyond substring
7299    jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR);
7300    // Back-up strings to avoid reading beyond substring:
7301    // cnt1 = cnt1 - cnt2 + 8
7302    addl(cnt1, cnt2); // cnt2 is negative
7303    addl(cnt1, stride);
7304    movl(cnt2, stride); negptr(cnt2);
7305    bind(CONT_SCAN_SUBSTR);
7306    if (int_cnt2 < (int)G) {
7307      int tail_off1 = int_cnt2<<scale1;
7308      int tail_off2 = int_cnt2<<scale2;
7309      if (ae == StrIntrinsicNode::UL) {
7310        pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2));
7311      } else {
7312        movdqu(vec, Address(str2, cnt2, scale2, tail_off2));
7313      }
7314      pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode);
7315    } else {
7316      // calculate index in register to avoid integer overflow (int_cnt2*2)
7317      movl(tmp, int_cnt2);
7318      addptr(tmp, cnt2);
7319      if (ae == StrIntrinsicNode::UL) {
7320        pmovzxbw(vec, Address(str2, tmp, scale2, 0));
7321      } else {
7322        movdqu(vec, Address(str2, tmp, scale2, 0));
7323      }
7324      pcmpestri(vec, Address(result, tmp, scale1, 0), mode);
7325    }
7326    // Need to reload the string pointers if the whole vector did not match
7327    jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
7328    addptr(cnt2, stride);
7329    jcc(Assembler::negative, SCAN_SUBSTR);
7330    // Fall through if found full substring
7331
7332  } // (int_cnt2 > 8)
7333
7334  bind(RET_FOUND);
7335  // Found result if we matched full small substring.
7336  // Compute substr offset
7337  subptr(result, str1);
7338  if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
7339    shrl(result, 1); // index
7340  }
7341  bind(EXIT);
7342
7343} // string_indexofC8
7344
7345// Small strings are loaded through the stack if they cross a page boundary.
7346void MacroAssembler::string_indexof(Register str1, Register str2,
7347                                    Register cnt1, Register cnt2,
7348                                    int int_cnt2,  Register result,
7349                                    XMMRegister vec, Register tmp,
7350                                    int ae) {
7351  ShortBranchVerifier sbv(this);
7352  assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
7353  assert(UseSSE >= 4, "SSE4 must be enabled for SSE4.2 intrinsics to be available");
7354  assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
7355
7356  //
7357  // int_cnt2 is the length of a small (< 8 chars) constant substring,
7358  // or -1 for a non-constant substring, in which case its length
7359  // is in the cnt2 register.
7360  //
7361  // Note, inline_string_indexOf() generates checks:
7362  // if (substr.count > string.count) return -1;
7363  // if (substr.count == 0) return 0;
7364  //
7365  int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
7366  assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0");
7367  // This method uses the pcmpestri instruction with bound registers
7368  //   inputs:
7369  //     xmm - substring
7370  //     rax - substring length (elements count)
7371  //     mem - scanned string
7372  //     rdx - string length (elements count)
7373  //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
7374  //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
7375  //   outputs:
7376  //     rcx - matched index in string
7377  assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
7378  int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
7379  Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
7380  Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
7381
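  // Rough shape of the generated search (a sketch only, not emitted literally):
  //   result = str1;
  //   while (cnt1 >= cnt2) {
  //     pcmpestri(substr_head, [result]);           // look for a candidate start
  //     if (candidate found) verify the remaining tail, reloading if needed;
  //     result += 16; cnt1 -= stride;
  //   }
  //   return found ? element index of the match : -1;
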
7382  Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR,
7383        RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR,
7384        FOUND_CANDIDATE;
7385
7386  { //========================================================
7387    // We don't know where these strings are located
7388    // and we can't read beyond them. Load them through the stack.
7389    Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR;
7390
7391    movptr(tmp, rsp); // save old SP
7392
7393    if (int_cnt2 > 0) {     // small (< 8 chars) constant substring
7394      if (int_cnt2 == (1>>scale2)) { // One byte
7395        assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding");
7396        load_unsigned_byte(result, Address(str2, 0));
7397        movdl(vec, result); // move 32 bits
7398      } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) {  // Three bytes
7399        // Not enough header space in 32-bit VM: 12+3 = 15.
7400        movl(result, Address(str2, -1));
7401        shrl(result, 8);
7402        movdl(vec, result); // move 32 bits
7403      } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) {  // One char
7404        load_unsigned_short(result, Address(str2, 0));
7405        movdl(vec, result); // move 32 bits
7406      } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars
7407        movdl(vec, Address(str2, 0)); // move 32 bits
7408      } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars
7409        movq(vec, Address(str2, 0));  // move 64 bits
7410      } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 = {2, ..., 7})
7411        // Array header size is 12 bytes in 32-bit VM
7412        // + 6 bytes for 3 chars == 18 bytes,
7413        // enough space to load vec and shift.
7414        assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity");
7415        if (ae == StrIntrinsicNode::UL) {
7416          int tail_off = int_cnt2-8;
7417          pmovzxbw(vec, Address(str2, tail_off));
7418          psrldq(vec, -2*tail_off);
7419        }
7420        else {
7421          int tail_off = int_cnt2*(1<<scale2);
7422          movdqu(vec, Address(str2, tail_off-16));
7423          psrldq(vec, 16-tail_off);
7424        }
7425      }
7426    } else { // not constant substring
7427      cmpl(cnt2, stride);
7428      jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough
7429
7430      // We can read beyond the string if str+16 does not cross a page boundary
7431      // since heaps are aligned and mapped by pages.
7432      assert(os::vm_page_size() < (int)G, "default page should be small");
7433      movl(result, str2); // We need only low 32 bits
7434      andl(result, (os::vm_page_size()-1));
7435      cmpl(result, (os::vm_page_size()-16));
7436      jccb(Assembler::belowEqual, CHECK_STR);
7437
7438      // Move small strings to the stack to allow loading 16 bytes into vec.
7439      subptr(rsp, 16);
7440      int stk_offset = wordSize-(1<<scale2);
7441      push(cnt2);
7442
7443      bind(COPY_SUBSTR);
7444      if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) {
7445        load_unsigned_byte(result, Address(str2, cnt2, scale2, -1));
7446        movb(Address(rsp, cnt2, scale2, stk_offset), result);
7447      } else if (ae == StrIntrinsicNode::UU) {
7448        load_unsigned_short(result, Address(str2, cnt2, scale2, -2));
7449        movw(Address(rsp, cnt2, scale2, stk_offset), result);
7450      }
7451      decrement(cnt2);
7452      jccb(Assembler::notZero, COPY_SUBSTR);
7453
7454      pop(cnt2);
7455      movptr(str2, rsp);  // New substring address
7456    } // non constant
7457
7458    bind(CHECK_STR);
7459    cmpl(cnt1, stride);
7460    jccb(Assembler::aboveEqual, BIG_STRINGS);
7461
7462    // Check cross page boundary.
7463    movl(result, str1); // We need only low 32 bits
7464    andl(result, (os::vm_page_size()-1));
7465    cmpl(result, (os::vm_page_size()-16));
7466    jccb(Assembler::belowEqual, BIG_STRINGS);
7467
7468    subptr(rsp, 16);
7469    int stk_offset = -(1<<scale1);
7470    if (int_cnt2 < 0) { // not constant
7471      push(cnt2);
7472      stk_offset += wordSize;
7473    }
7474    movl(cnt2, cnt1);
7475
7476    bind(COPY_STR);
7477    if (ae == StrIntrinsicNode::LL) {
7478      load_unsigned_byte(result, Address(str1, cnt2, scale1, -1));
7479      movb(Address(rsp, cnt2, scale1, stk_offset), result);
7480    } else {
7481      load_unsigned_short(result, Address(str1, cnt2, scale1, -2));
7482      movw(Address(rsp, cnt2, scale1, stk_offset), result);
7483    }
7484    decrement(cnt2);
7485    jccb(Assembler::notZero, COPY_STR);
7486
7487    if (int_cnt2 < 0) { // not constant
7488      pop(cnt2);
7489    }
7490    movptr(str1, rsp);  // New string address
7491
7492    bind(BIG_STRINGS);
7493    // Load substring.
7494    if (int_cnt2 < 0) { // -1
7495      if (ae == StrIntrinsicNode::UL) {
7496        pmovzxbw(vec, Address(str2, 0));
7497      } else {
7498        movdqu(vec, Address(str2, 0));
7499      }
7500      push(cnt2);       // substr count
7501      push(str2);       // substr addr
7502      push(str1);       // string addr
7503    } else {
7504      // Small (< 8 chars) constant substrings are loaded already.
7505      movl(cnt2, int_cnt2);
7506    }
7507    push(tmp);  // original SP
7508
7509  } // Finished loading
7510
7511  //========================================================
7512  // Start search
7513  //
7514
7515  movptr(result, str1); // string addr
7516
7517  if (int_cnt2  < 0) {  // Only for non constant substring
7518    jmpb(SCAN_TO_SUBSTR);
7519
7520    // SP saved at sp+0
7521    // String saved at sp+1*wordSize
7522    // Substr saved at sp+2*wordSize
7523    // Substr count saved at sp+3*wordSize
7524
7525    // Reload substr for rescan; this code
7526    // is executed only for large substrings (> 8 chars)
7527    bind(RELOAD_SUBSTR);
7528    movptr(str2, Address(rsp, 2*wordSize));
7529    movl(cnt2, Address(rsp, 3*wordSize));
7530    if (ae == StrIntrinsicNode::UL) {
7531      pmovzxbw(vec, Address(str2, 0));
7532    } else {
7533      movdqu(vec, Address(str2, 0));
7534    }
7535    // We came here after the beginning of the substring was
7536    // matched but the rest of it was not, so we need to search
7537    // again. Start from the next element after the previous match.
7538    subptr(str1, result); // Restore counter
7539    if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
7540      shrl(str1, 1);
7541    }
7542    addl(cnt1, str1);
7543    decrementl(cnt1);   // Shift to next element
7544    cmpl(cnt1, cnt2);
7545    jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
7546
7547    addptr(result, (1<<scale1));
7548  } // non constant
7549
7550  // Scan string for start of substr in 16-byte vectors
7551  bind(SCAN_TO_SUBSTR);
7552  assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
7553  pcmpestri(vec, Address(result, 0), mode);
7554  jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
7555  subl(cnt1, stride);
7556  jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
7557  cmpl(cnt1, cnt2);
7558  jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
7559  addptr(result, 16);
7560
7561  bind(ADJUST_STR);
7562  cmpl(cnt1, stride); // Do not read beyond string
7563  jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
7564  // Back-up string to avoid reading beyond string.
7565  lea(result, Address(result, cnt1, scale1, -16));
7566  movl(cnt1, stride);
7567  jmpb(SCAN_TO_SUBSTR);
7568
7569  // Found a potential substr
7570  bind(FOUND_CANDIDATE);
7571  // After pcmpestri tmp(rcx) contains matched element index
7572
7573  // Make sure string is still long enough
7574  subl(cnt1, tmp);
7575  cmpl(cnt1, cnt2);
7576  jccb(Assembler::greaterEqual, FOUND_SUBSTR);
7577  // Left less than substring.
7578
7579  bind(RET_NOT_FOUND);
7580  movl(result, -1);
7581  jmpb(CLEANUP);
7582
7583  bind(FOUND_SUBSTR);
7584  // Compute start addr of substr
7585  lea(result, Address(result, tmp, scale1));
7586  if (int_cnt2 > 0) { // Constant substring
7587    // Repeat search for small substring (< 8 chars)
7588    // from new point without reloading substring.
7589    // Have to check that we don't read beyond string.
7590    cmpl(tmp, stride-int_cnt2);
7591    jccb(Assembler::greater, ADJUST_STR);
7592    // Fall through if matched whole substring.
7593  } else { // non constant
7594    assert(int_cnt2 == -1, "should be != 0");
7595
7596    addl(tmp, cnt2);
7597    // Found result if we matched whole substring.
7598    cmpl(tmp, stride);
7599    jccb(Assembler::lessEqual, RET_FOUND);
7600
7601    // Repeat search for small substring (<= 8 chars)
7602    // from new point 'str1' without reloading substring.
7603    cmpl(cnt2, stride);
7604    // Have to check that we don't read beyond string.
7605    jccb(Assembler::lessEqual, ADJUST_STR);
7606
7607    Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG;
7608    // Compare the rest of substring (> 8 chars).
7609    movptr(str1, result);
7610
7611    cmpl(tmp, cnt2);
7612    // First 8 chars are already matched.
7613    jccb(Assembler::equal, CHECK_NEXT);
7614
7615    bind(SCAN_SUBSTR);
7616    pcmpestri(vec, Address(str1, 0), mode);
7617    // Need to reload string pointers if the whole vector did not match
7618    jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
7619
7620    bind(CHECK_NEXT);
7621    subl(cnt2, stride);
7622    jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring
7623    addptr(str1, 16);
7624    if (ae == StrIntrinsicNode::UL) {
7625      addptr(str2, 8);
7626    } else {
7627      addptr(str2, 16);
7628    }
7629    subl(cnt1, stride);
7630    cmpl(cnt2, stride); // Do not read beyond substring
7631    jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR);
7632    // Back-up strings to avoid reading beyond substring.
7633
7634    if (ae == StrIntrinsicNode::UL) {
7635      lea(str2, Address(str2, cnt2, scale2, -8));
7636      lea(str1, Address(str1, cnt2, scale1, -16));
7637    } else {
7638      lea(str2, Address(str2, cnt2, scale2, -16));
7639      lea(str1, Address(str1, cnt2, scale1, -16));
7640    }
7641    subl(cnt1, cnt2);
7642    movl(cnt2, stride);
7643    addl(cnt1, stride);
7644    bind(CONT_SCAN_SUBSTR);
7645    if (ae == StrIntrinsicNode::UL) {
7646      pmovzxbw(vec, Address(str2, 0));
7647    } else {
7648      movdqu(vec, Address(str2, 0));
7649    }
7650    jmpb(SCAN_SUBSTR);
7651
7652    bind(RET_FOUND_LONG);
7653    movptr(str1, Address(rsp, wordSize));
7654  } // non constant
7655
7656  bind(RET_FOUND);
7657  // Compute substr offset
7658  subptr(result, str1);
7659  if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
7660    shrl(result, 1); // index
7661  }
7662  bind(CLEANUP);
7663  pop(rsp); // restore SP
7664
7665} // string_indexof
7666
7667void MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result,
7668                                         XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
7669  ShortBranchVerifier sbv(this);
7670  assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
7671  assert(UseSSE >= 4, "SSE4 must be enabled for SSE4.2 intrinsics to be available");
7672
7673  int stride = 8;
7674
7675  Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP,
7676        SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP,
7677        RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT,
7678        FOUND_SEQ_CHAR, DONE_LABEL;
7679
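  // Strategy: with AVX2, scan 16 chars (32 bytes) per iteration using vpcmpeqw/vptest,
  // then fall back to 8-char (16-byte) SSE4.2 vectors, and finish the tail with a
  // scalar char-by-char loop.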
7680  movptr(result, str1);
7681  if (UseAVX >= 2) {
7682    cmpl(cnt1, stride);
7683    jccb(Assembler::less, SCAN_TO_CHAR_LOOP);
7684    cmpl(cnt1, 2*stride);
7685    jccb(Assembler::less, SCAN_TO_8_CHAR_INIT);
7686    movdl(vec1, ch);
7687    vpbroadcastw(vec1, vec1);
7688    vpxor(vec2, vec2);
7689    movl(tmp, cnt1);
7690    andl(tmp, 0xFFFFFFF0);  // vector count (in chars)
7691    andl(cnt1, 0x0000000F); // tail count (in chars)
7692
7693    bind(SCAN_TO_16_CHAR_LOOP);
7694    vmovdqu(vec3, Address(result, 0));
7695    vpcmpeqw(vec3, vec3, vec1, 1);
7696    vptest(vec2, vec3);
7697    jcc(Assembler::carryClear, FOUND_CHAR);
7698    addptr(result, 32);
7699    subl(tmp, 2*stride);
7700    jccb(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);
7701    jmp(SCAN_TO_8_CHAR);
7702    bind(SCAN_TO_8_CHAR_INIT);
7703    movdl(vec1, ch);
7704    pshuflw(vec1, vec1, 0x00);
7705    pshufd(vec1, vec1, 0);
7706    pxor(vec2, vec2);
7707  }
7708  bind(SCAN_TO_8_CHAR);
7709  cmpl(cnt1, stride);
7710  if (UseAVX >= 2) {
7711    jccb(Assembler::less, SCAN_TO_CHAR);
7712  } else {
7713    jccb(Assembler::less, SCAN_TO_CHAR_LOOP);
7714    movdl(vec1, ch);
7715    pshuflw(vec1, vec1, 0x00);
7716    pshufd(vec1, vec1, 0);
7717    pxor(vec2, vec2);
7718  }
7719  movl(tmp, cnt1);
7720  andl(tmp, 0xFFFFFFF8);  // vector count (in chars)
7721  andl(cnt1, 0x00000007); // tail count (in chars)
7722
7723  bind(SCAN_TO_8_CHAR_LOOP);
7724  movdqu(vec3, Address(result, 0));
7725  pcmpeqw(vec3, vec1);
7726  ptest(vec2, vec3);
7727  jcc(Assembler::carryClear, FOUND_CHAR);
7728  addptr(result, 16);
7729  subl(tmp, stride);
7730  jccb(Assembler::notZero, SCAN_TO_8_CHAR_LOOP);
7731  bind(SCAN_TO_CHAR);
7732  testl(cnt1, cnt1);
7733  jcc(Assembler::zero, RET_NOT_FOUND);
7734  bind(SCAN_TO_CHAR_LOOP);
7735  load_unsigned_short(tmp, Address(result, 0));
7736  cmpl(ch, tmp);
7737  jccb(Assembler::equal, FOUND_SEQ_CHAR);
7738  addptr(result, 2);
7739  subl(cnt1, 1);
7740  jccb(Assembler::zero, RET_NOT_FOUND);
7741  jmp(SCAN_TO_CHAR_LOOP);
7742
7743  bind(RET_NOT_FOUND);
7744  movl(result, -1);
7745  jmpb(DONE_LABEL);
7746
7747  bind(FOUND_CHAR);
7748  if (UseAVX >= 2) {
7749    vpmovmskb(tmp, vec3);
7750  } else {
7751    pmovmskb(tmp, vec3);
7752  }
7753  bsfl(ch, tmp);
7754  addl(result, ch);
7755
7756  bind(FOUND_SEQ_CHAR);
7757  subptr(result, str1);
7758  shrl(result, 1);
7759
7760  bind(DONE_LABEL);
7761} // string_indexof_char
7762
7763// helper function for string_compare
7764void MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2,
7765                                        Address::ScaleFactor scale, Address::ScaleFactor scale1,
7766                                        Address::ScaleFactor scale2, Register index, int ae) {
7767  if (ae == StrIntrinsicNode::LL) {
7768    load_unsigned_byte(elem1, Address(str1, index, scale, 0));
7769    load_unsigned_byte(elem2, Address(str2, index, scale, 0));
7770  } else if (ae == StrIntrinsicNode::UU) {
7771    load_unsigned_short(elem1, Address(str1, index, scale, 0));
7772    load_unsigned_short(elem2, Address(str2, index, scale, 0));
7773  } else {
7774    load_unsigned_byte(elem1, Address(str1, index, scale1, 0));
7775    load_unsigned_short(elem2, Address(str2, index, scale2, 0));
7776  }
7777}
7778
7779// Compare strings, used for char[] and byte[].
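// The value returned in 'result' follows the String.compareTo convention: the
// difference of the first mismatching elements, or the length difference when
// one string is a prefix of the other (zero when the compared ranges are equal).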
7780void MacroAssembler::string_compare(Register str1, Register str2,
7781                                    Register cnt1, Register cnt2, Register result,
7782                                    XMMRegister vec1, int ae) {
7783  ShortBranchVerifier sbv(this);
7784  Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL;
7785  Label COMPARE_WIDE_VECTORS_LOOP_FAILED;  // used only _LP64 && AVX3
7786  int stride, stride2, adr_stride, adr_stride1, adr_stride2;
7787  int stride2x2 = 0x40;
7788  Address::ScaleFactor scale, scale1, scale2;
7789
7790  if (ae != StrIntrinsicNode::LL) {
7791    stride2x2 = 0x20;
7792  }
7793
7794  if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) {
7795    shrl(cnt2, 1);
7796  }
7797  // Compute the minimum of the string lengths and the
7798  // difference of the string lengths (stack).
7799  // Do the conditional move stuff
7800  movl(result, cnt1);
7801  subl(cnt1, cnt2);
7802  push(cnt1);
7803  cmov32(Assembler::lessEqual, cnt2, result);    // cnt2 = min(cnt1, cnt2)
7804
7805  // Is the minimum length zero?
7806  testl(cnt2, cnt2);
7807  jcc(Assembler::zero, LENGTH_DIFF_LABEL);
7808  if (ae == StrIntrinsicNode::LL) {
7809    // Load first bytes
7810    load_unsigned_byte(result, Address(str1, 0));  // result = str1[0]
7811    load_unsigned_byte(cnt1, Address(str2, 0));    // cnt1   = str2[0]
7812  } else if (ae == StrIntrinsicNode::UU) {
7813    // Load first characters
7814    load_unsigned_short(result, Address(str1, 0));
7815    load_unsigned_short(cnt1, Address(str2, 0));
7816  } else {
7817    load_unsigned_byte(result, Address(str1, 0));
7818    load_unsigned_short(cnt1, Address(str2, 0));
7819  }
7820  subl(result, cnt1);
7821  jcc(Assembler::notZero,  POP_LABEL);
7822
7823  if (ae == StrIntrinsicNode::UU) {
7824    // Divide length by 2 to get number of chars
7825    shrl(cnt2, 1);
7826  }
7827  cmpl(cnt2, 1);
7828  jcc(Assembler::equal, LENGTH_DIFF_LABEL);
7829
7830  // Check if the strings start at the same location and setup scale and stride
7831  if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
7832    cmpptr(str1, str2);
7833    jcc(Assembler::equal, LENGTH_DIFF_LABEL);
7834    if (ae == StrIntrinsicNode::LL) {
7835      scale = Address::times_1;
7836      stride = 16;
7837    } else {
7838      scale = Address::times_2;
7839      stride = 8;
7840    }
7841  } else {
7842    scale = Address::no_scale;  // not used
7843    scale1 = Address::times_1;
7844    scale2 = Address::times_2;
7845    stride = 8;
7846  }
7847
7848  if (UseAVX >= 2 && UseSSE42Intrinsics) {
7849    assert(UseSSE >= 4, "SSE4 must be enabled for SSE4.2 intrinsics to be available");
7850    Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR;
7851    Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR;
7852    Label COMPARE_WIDE_VECTORS_LOOP_AVX2;
7853    Label COMPARE_TAIL_LONG;
7854    Label COMPARE_WIDE_VECTORS_LOOP_AVX3;  // used only _LP64 && AVX3
7855
7856    int pcmpmask = 0x19;
7857    if (ae == StrIntrinsicNode::LL) {
7858      pcmpmask &= ~0x01;
7859    }
7860
7861    // Setup to compare 16-char (32-byte) vectors,
7862    // start from first character again because it has aligned address.
7863    if (ae == StrIntrinsicNode::LL) {
7864      stride2 = 32;
7865    } else {
7866      stride2 = 16;
7867    }
7868    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
7869      adr_stride = stride << scale;
7870    } else {
7871      adr_stride1 = 8;  //stride << scale1;
7872      adr_stride2 = 16; //stride << scale2;
7873    }
7874
7875    assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
7876    // rax and rdx are used by pcmpestri as element counters
7877    movl(result, cnt2);
7878    andl(cnt2, ~(stride2-1));   // cnt2 holds the vector count
7879    jcc(Assembler::zero, COMPARE_TAIL_LONG);
7880
7881    // fast path : compare first 2 8-char vectors.
7882    bind(COMPARE_16_CHARS);
7883    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
7884      movdqu(vec1, Address(str1, 0));
7885    } else {
7886      pmovzxbw(vec1, Address(str1, 0));
7887    }
7888    pcmpestri(vec1, Address(str2, 0), pcmpmask);
7889    jccb(Assembler::below, COMPARE_INDEX_CHAR);
7890
7891    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
7892      movdqu(vec1, Address(str1, adr_stride));
7893      pcmpestri(vec1, Address(str2, adr_stride), pcmpmask);
7894    } else {
7895      pmovzxbw(vec1, Address(str1, adr_stride1));
7896      pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask);
7897    }
7898    jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS);
7899    addl(cnt1, stride);
7900
7901    // Compare the characters at index in cnt1
7902    bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character
7903    load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
7904    subl(result, cnt2);
7905    jmp(POP_LABEL);
7906
7907    // Setup the registers to start vector comparison loop
7908    bind(COMPARE_WIDE_VECTORS);
7909    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
7910      lea(str1, Address(str1, result, scale));
7911      lea(str2, Address(str2, result, scale));
7912    } else {
7913      lea(str1, Address(str1, result, scale1));
7914      lea(str2, Address(str2, result, scale2));
7915    }
7916    subl(result, stride2);
7917    subl(cnt2, stride2);
7918    jcc(Assembler::zero, COMPARE_WIDE_TAIL);
7919    negptr(result);
7920
7921    //  In a loop, compare 16 chars (32 bytes) at once using (vpxor+vptest)
7922    bind(COMPARE_WIDE_VECTORS_LOOP);
7923
7924#ifdef _LP64
7925    if (VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
7926      cmpl(cnt2, stride2x2);
7927      jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2);
7928      testl(cnt2, stride2x2-1);   // cnt2 holds the vector count
7929      jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2);   // means we cannot subtract by 0x40
7930
7931      bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
7932      if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
7933        evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit);
7934        evpcmpeqb(k7, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
7935      } else {
7936        vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit);
7937        evpcmpeqb(k7, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
7938      }
7939      kortestql(k7, k7);
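      // kortestql sets CF only when the mask is all ones (every element equal),
      // so carryClear/aboveEqual below means at least one element differed.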
7940      jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED);     // miscompare
7941      addptr(result, stride2x2);  // update since we already compared at this addr
7942      subl(cnt2, stride2x2);      // and sub the size too
7943      jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3);
7944
7945      vpxor(vec1, vec1);
7946      jmpb(COMPARE_WIDE_TAIL);
7947    }//if (VM_Version::supports_avx512vlbw())
7948#endif // _LP64
7949
7950
7951    bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
7952    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
7953      vmovdqu(vec1, Address(str1, result, scale));
7954      vpxor(vec1, Address(str2, result, scale));
7955    } else {
7956      vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit);
7957      vpxor(vec1, Address(str2, result, scale2));
7958    }
7959    vptest(vec1, vec1);
7960    jcc(Assembler::notZero, VECTOR_NOT_EQUAL);
7961    addptr(result, stride2);
7962    subl(cnt2, stride2);
7963    jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP);
7964    // clean upper bits of YMM registers
7965    vpxor(vec1, vec1);
7966
7967    // compare wide vectors tail
7968    bind(COMPARE_WIDE_TAIL);
7969    testptr(result, result);
7970    jccb(Assembler::zero, LENGTH_DIFF_LABEL);
7971
7972    movl(result, stride2);
7973    movl(cnt2, result);
7974    negptr(result);
7975    jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2);
7976
7977    // Identifies the mismatching (higher or lower) 16 bytes in the 32-byte vectors.
7978    bind(VECTOR_NOT_EQUAL);
7979    // clean upper bits of YMM registers
7980    vpxor(vec1, vec1);
7981    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
7982      lea(str1, Address(str1, result, scale));
7983      lea(str2, Address(str2, result, scale));
7984    } else {
7985      lea(str1, Address(str1, result, scale1));
7986      lea(str2, Address(str2, result, scale2));
7987    }
7988    jmp(COMPARE_16_CHARS);
7989
7990    // Compare tail chars, length between 1 and 15 chars
7991    bind(COMPARE_TAIL_LONG);
7992    movl(cnt2, result);
7993    cmpl(cnt2, stride);
7994    jccb(Assembler::less, COMPARE_SMALL_STR);
7995
7996    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
7997      movdqu(vec1, Address(str1, 0));
7998    } else {
7999      pmovzxbw(vec1, Address(str1, 0));
8000    }
8001    pcmpestri(vec1, Address(str2, 0), pcmpmask);
8002    jcc(Assembler::below, COMPARE_INDEX_CHAR);
8003    subptr(cnt2, stride);
8004    jccb(Assembler::zero, LENGTH_DIFF_LABEL);
8005    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
8006      lea(str1, Address(str1, result, scale));
8007      lea(str2, Address(str2, result, scale));
8008    } else {
8009      lea(str1, Address(str1, result, scale1));
8010      lea(str2, Address(str2, result, scale2));
8011    }
8012    negptr(cnt2);
8013    jmpb(WHILE_HEAD_LABEL);
8014
8015    bind(COMPARE_SMALL_STR);
8016  } else if (UseSSE42Intrinsics) {
8017    assert(UseSSE >= 4, "SSE4 must be enabled for SSE4.2 intrinsics to be available");
8018    Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
8019    int pcmpmask = 0x19;
8020    // Setup to compare 8-char (16-byte) vectors,
8021    // start from first character again because it has aligned address.
8022    movl(result, cnt2);
8023    andl(cnt2, ~(stride - 1));   // cnt2 holds the vector count
8024    if (ae == StrIntrinsicNode::LL) {
8025      pcmpmask &= ~0x01;
8026    }
8027    jccb(Assembler::zero, COMPARE_TAIL);
8028    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
8029      lea(str1, Address(str1, result, scale));
8030      lea(str2, Address(str2, result, scale));
8031    } else {
8032      lea(str1, Address(str1, result, scale1));
8033      lea(str2, Address(str2, result, scale2));
8034    }
8035    negptr(result);
8036
8037    // pcmpestri
8038    //   inputs:
8039    //     vec1- substring
8040    //     rax - negative string length (elements count)
8041    //     mem - scanned string
8042    //     rdx - string length (elements count)
8043    //     pcmpmask - cmp mode: 11000 (string compare with negated result)
8044    //               + 00 (unsigned bytes) or  + 01 (unsigned shorts)
8045    //   outputs:
8046    //     rcx - first mismatched element index
8047    assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
8048
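    // With this imm8, rcx receives the index of the first mismatching element,
    // and CF ('below') is set when a mismatch exists within the compared lengths.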
8049    bind(COMPARE_WIDE_VECTORS);
8050    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
8051      movdqu(vec1, Address(str1, result, scale));
8052      pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
8053    } else {
8054      pmovzxbw(vec1, Address(str1, result, scale1));
8055      pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
8056    }
8057    // After pcmpestri cnt1(rcx) contains mismatched element index
8058
8059    jccb(Assembler::below, VECTOR_NOT_EQUAL);  // CF==1
8060    addptr(result, stride);
8061    subptr(cnt2, stride);
8062    jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
8063
8064    // compare wide vectors tail
8065    testptr(result, result);
8066    jccb(Assembler::zero, LENGTH_DIFF_LABEL);
8067
8068    movl(cnt2, stride);
8069    movl(result, stride);
8070    negptr(result);
8071    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
8072      movdqu(vec1, Address(str1, result, scale));
8073      pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
8074    } else {
8075      pmovzxbw(vec1, Address(str1, result, scale1));
8076      pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
8077    }
8078    jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL);
8079
8080    // Mismatched characters in the vectors
8081    bind(VECTOR_NOT_EQUAL);
8082    addptr(cnt1, result);
8083    load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
8084    subl(result, cnt2);
8085    jmpb(POP_LABEL);
8086
8087    bind(COMPARE_TAIL); // limit is zero
8088    movl(cnt2, result);
8089    // Fallthru to tail compare
8090  }
8091  // Shift str2 and str1 to the end of the arrays, negate min
8092  if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
8093    lea(str1, Address(str1, cnt2, scale));
8094    lea(str2, Address(str2, cnt2, scale));
8095  } else {
8096    lea(str1, Address(str1, cnt2, scale1));
8097    lea(str2, Address(str2, cnt2, scale2));
8098  }
8099  decrementl(cnt2);  // first character was compared already
8100  negptr(cnt2);
8101
8102  // Compare the rest of the elements
8103  bind(WHILE_HEAD_LABEL);
8104  load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae);
8105  subl(result, cnt1);
8106  jccb(Assembler::notZero, POP_LABEL);
8107  increment(cnt2);
8108  jccb(Assembler::notZero, WHILE_HEAD_LABEL);
8109
8110  // Strings are equal up to min length.  Return the length difference.
8111  bind(LENGTH_DIFF_LABEL);
8112  pop(result);
8113  if (ae == StrIntrinsicNode::UU) {
8114    // Divide diff by 2 to get number of chars
8115    sarl(result, 1);
8116  }
8117  jmpb(DONE_LABEL);
8118
8119#ifdef _LP64
8120  if (VM_Version::supports_avx512vlbw()) {
8121
8122    bind(COMPARE_WIDE_VECTORS_LOOP_FAILED);
8123
8124    kmovql(cnt1, k7);
8125    notq(cnt1);
8126    bsfq(cnt2, cnt1);
8127    if (ae != StrIntrinsicNode::LL) {
8128      // Divide diff by 2 to get number of chars
8129      sarl(cnt2, 1);
8130    }
8131    addq(result, cnt2);
8132    if (ae == StrIntrinsicNode::LL) {
8133      load_unsigned_byte(cnt1, Address(str2, result));
8134      load_unsigned_byte(result, Address(str1, result));
8135    } else if (ae == StrIntrinsicNode::UU) {
8136      load_unsigned_short(cnt1, Address(str2, result, scale));
8137      load_unsigned_short(result, Address(str1, result, scale));
8138    } else {
8139      load_unsigned_short(cnt1, Address(str2, result, scale2));
8140      load_unsigned_byte(result, Address(str1, result, scale1));
8141    }
8142    subl(result, cnt1);
8143    jmpb(POP_LABEL);
8144  }//if (VM_Version::supports_avx512vlbw())
8145#endif // _LP64
8146
8147  // Discard the stored length difference
8148  bind(POP_LABEL);
8149  pop(cnt1);
8150
8151  // That's it
8152  bind(DONE_LABEL);
8153  if(ae == StrIntrinsicNode::UL) {
8154    negl(result);
8155  }
8156
8157}
8158
8159// Search for a non-ASCII character (negative byte value) in a byte array;
8160// return true if one is found and false otherwise.
8161void MacroAssembler::has_negatives(Register ary1, Register len,
8162                                   Register result, Register tmp1,
8163                                   XMMRegister vec1, XMMRegister vec2) {
8164
8165  // rsi: byte array
8166  // rcx: len
8167  // rax: result
8168  ShortBranchVerifier sbv(this);
8169  assert_different_registers(ary1, len, result, tmp1);
8170  assert_different_registers(vec1, vec2);
8171  Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE;
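
  // Strategy: AND the data against the 0x80 sign-bit mask (ptest/vptest on the
  // vector paths) in 32-, 16- and 4-byte strides, then check a 2-byte and a
  // 1-byte tail; any set sign bit indicates a negative (non-ASCII) byte.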
8172
8173  // len == 0
8174  testl(len, len);
8175  jcc(Assembler::zero, FALSE_LABEL);
8176
8177  movl(result, len); // copy
8178
8179  if (UseAVX >= 2 && UseSSE >= 2) {
8180    // With AVX2, use 32-byte vector compare
8181    Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
8182
8183    // Compare 32-byte vectors
8184    andl(result, 0x0000001f);  //   tail count (in bytes)
8185    andl(len, 0xffffffe0);   // vector count (in bytes)
8186    jccb(Assembler::zero, COMPARE_TAIL);
8187
8188    lea(ary1, Address(ary1, len, Address::times_1));
8189    negptr(len);
8190
8191    movl(tmp1, 0x80808080);   // create mask to test for negative (non-ASCII) bytes in vector
8192    movdl(vec2, tmp1);
8193    vpbroadcastd(vec2, vec2);
8194
8195    bind(COMPARE_WIDE_VECTORS);
8196    vmovdqu(vec1, Address(ary1, len, Address::times_1));
8197    vptest(vec1, vec2);
8198    jccb(Assembler::notZero, TRUE_LABEL);
8199    addptr(len, 32);
8200    jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
8201
8202    testl(result, result);
8203    jccb(Assembler::zero, FALSE_LABEL);
8204
8205    vmovdqu(vec1, Address(ary1, result, Address::times_1, -32));
8206    vptest(vec1, vec2);
8207    jccb(Assembler::notZero, TRUE_LABEL);
8208    jmpb(FALSE_LABEL);
8209
8210    bind(COMPARE_TAIL); // len is zero
8211    movl(len, result);
8212    // Fallthru to tail compare
8213  } else if (UseSSE42Intrinsics) {
8214    assert(UseSSE >= 4, "SSE4 must be enabled for SSE4.2 intrinsics to be available");
8215    // With SSE4.2, use double quad vector compare
8216    Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
8217
8218    // Compare 16-byte vectors
8219    andl(result, 0x0000000f);  //   tail count (in bytes)
8220    andl(len, 0xfffffff0);   // vector count (in bytes)
8221    jccb(Assembler::zero, COMPARE_TAIL);
8222
8223    lea(ary1, Address(ary1, len, Address::times_1));
8224    negptr(len);
8225
8226    movl(tmp1, 0x80808080);
8227    movdl(vec2, tmp1);
8228    pshufd(vec2, vec2, 0);
8229
8230    bind(COMPARE_WIDE_VECTORS);
8231    movdqu(vec1, Address(ary1, len, Address::times_1));
8232    ptest(vec1, vec2);
8233    jccb(Assembler::notZero, TRUE_LABEL);
8234    addptr(len, 16);
8235    jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
8236
8237    testl(result, result);
8238    jccb(Assembler::zero, FALSE_LABEL);
8239
8240    movdqu(vec1, Address(ary1, result, Address::times_1, -16));
8241    ptest(vec1, vec2);
8242    jccb(Assembler::notZero, TRUE_LABEL);
8243    jmpb(FALSE_LABEL);
8244
8245    bind(COMPARE_TAIL); // len is zero
8246    movl(len, result);
8247    // Fallthru to tail compare
8248  }
8249
8250  // Compare 4-byte vectors
8251  andl(len, 0xfffffffc); // vector count (in bytes)
8252  jccb(Assembler::zero, COMPARE_CHAR);
8253
8254  lea(ary1, Address(ary1, len, Address::times_1));
8255  negptr(len);
8256
8257  bind(COMPARE_VECTORS);
8258  movl(tmp1, Address(ary1, len, Address::times_1));
8259  andl(tmp1, 0x80808080);
8260  jccb(Assembler::notZero, TRUE_LABEL);
8261  addptr(len, 4);
8262  jcc(Assembler::notZero, COMPARE_VECTORS);
8263
8264  // Compare trailing char (final 2 bytes), if any
8265  bind(COMPARE_CHAR);
8266  testl(result, 0x2);   // tail  char
8267  jccb(Assembler::zero, COMPARE_BYTE);
8268  load_unsigned_short(tmp1, Address(ary1, 0));
8269  andl(tmp1, 0x00008080);
8270  jccb(Assembler::notZero, TRUE_LABEL);
8271  subptr(result, 2);
8272  lea(ary1, Address(ary1, 2));
8273
8274  bind(COMPARE_BYTE);
8275  testl(result, 0x1);   // tail  byte
8276  jccb(Assembler::zero, FALSE_LABEL);
8277  load_unsigned_byte(tmp1, Address(ary1, 0));
8278  andl(tmp1, 0x00000080);
8279  jccb(Assembler::notEqual, TRUE_LABEL);
8280  jmpb(FALSE_LABEL);
8281
8282  bind(TRUE_LABEL);
8283  movl(result, 1);   // return true
8284  jmpb(DONE);
8285
8286  bind(FALSE_LABEL);
8287  xorl(result, result); // return false
8288
8289  // That's it
8290  bind(DONE);
8291  if (UseAVX >= 2 && UseSSE >= 2) {
8292    // clean upper bits of YMM registers
8293    vpxor(vec1, vec1);
8294    vpxor(vec2, vec2);
8295  }
8296}
8297
8298// Compare char[] or byte[] arrays, aligned to 4 bytes, or substrings.
8299void MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2,
8300                                   Register limit, Register result, Register chr,
8301                                   XMMRegister vec1, XMMRegister vec2, bool is_char) {
8302  ShortBranchVerifier sbv(this);
8303  Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE;
8304
8305  int length_offset  = arrayOopDesc::length_offset_in_bytes();
8306  int base_offset    = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE);
8307
8308  if (is_array_equ) {
8309    // Check the input args
8310    cmpptr(ary1, ary2);
8311    jcc(Assembler::equal, TRUE_LABEL);
8312
8313    // Need additional checks for arrays_equals.
8314    testptr(ary1, ary1);
8315    jcc(Assembler::zero, FALSE_LABEL);
8316    testptr(ary2, ary2);
8317    jcc(Assembler::zero, FALSE_LABEL);
8318
8319    // Check the lengths
8320    movl(limit, Address(ary1, length_offset));
8321    cmpl(limit, Address(ary2, length_offset));
8322    jcc(Assembler::notEqual, FALSE_LABEL);
8323  }
8324
8325  // count == 0
8326  testl(limit, limit);
8327  jcc(Assembler::zero, TRUE_LABEL);
8328
8329  if (is_array_equ) {
8330    // Load array address
8331    lea(ary1, Address(ary1, base_offset));
8332    lea(ary2, Address(ary2, base_offset));
8333  }
8334
8335  if (is_array_equ && is_char) {
8336    // arrays_equals when used for char[].
8337    shll(limit, 1);      // byte count != 0
8338  }
8339  movl(result, limit); // copy
8340
8341  if (UseAVX >= 2) {
8342    // With AVX2, use 32-byte vector compare
8343    Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
8344
8345    // Compare 32-byte vectors
8346    andl(result, 0x0000001f);  //   tail count (in bytes)
8347    andl(limit, 0xffffffe0);   // vector count (in bytes)
8348    jcc(Assembler::zero, COMPARE_TAIL);
8349
8350    lea(ary1, Address(ary1, limit, Address::times_1));
8351    lea(ary2, Address(ary2, limit, Address::times_1));
8352    negptr(limit);
8353
8354    bind(COMPARE_WIDE_VECTORS);
8355
8356#ifdef _LP64
8357    if (VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
8358      Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3;
8359
8360      cmpl(limit, -64);
8361      jccb(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2);
8362
8363      bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
8364
8365      evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit);
8366      evpcmpeqb(k7, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit);
8367      kortestql(k7, k7);
8368      jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
8369      addptr(limit, 64);  // update since we already compared at this addr
8370      cmpl(limit, -64);
8371      jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3);
8372
8373      // At this point we may still need to compare -limit+result bytes.
8374      // We could execute the next two instructions and just continue via the non-wide path:
8375      //  cmpl(limit, 0);
8376      //  jcc(Assembler::equal, COMPARE_TAIL);  // true
8377      // But since we stopped at the points ary{1,2}+limit which are
8378      // not farther than 64 bytes from the ends of arrays ary{1,2}+result
8379      // (|limit| <= 32 and result < 32),
8380      // we may just compare the last 64 bytes.
8381      //
8382      addptr(result, -64);   // it is safe, because we just came from this area
8383      evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit);
8384      evpcmpeqb(k7, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit);
8385      kortestql(k7, k7);
8386      jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
8387
8388      jmp(TRUE_LABEL);
8389
8390      bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
8391
8392    }//if (VM_Version::supports_avx512vlbw())
8393#endif //_LP64
8394
8395    vmovdqu(vec1, Address(ary1, limit, Address::times_1));
8396    vmovdqu(vec2, Address(ary2, limit, Address::times_1));
8397    vpxor(vec1, vec2);
8398
8399    vptest(vec1, vec1);
8400    jccb(Assembler::notZero, FALSE_LABEL);
8401    addptr(limit, 32);
8402    jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
8403
8404    testl(result, result);
8405    jccb(Assembler::zero, TRUE_LABEL);
8406
8407    vmovdqu(vec1, Address(ary1, result, Address::times_1, -32));
8408    vmovdqu(vec2, Address(ary2, result, Address::times_1, -32));
8409    vpxor(vec1, vec2);
8410
8411    vptest(vec1, vec1);
8412    jccb(Assembler::notZero, FALSE_LABEL);
8413    jmpb(TRUE_LABEL);
8414
8415    bind(COMPARE_TAIL); // limit is zero
8416    movl(limit, result);
8417    // Fallthru to tail compare
8418  } else if (UseSSE42Intrinsics) {
8419    assert(UseSSE >= 4, "SSE4 must be enabled for SSE4.2 intrinsics to be available");
8420    // With SSE4.2, use double quad vector compare
8421    Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
8422
8423    // Compare 16-byte vectors
8424    andl(result, 0x0000000f);  //   tail count (in bytes)
8425    andl(limit, 0xfffffff0);   // vector count (in bytes)
8426    jccb(Assembler::zero, COMPARE_TAIL);
8427
8428    lea(ary1, Address(ary1, limit, Address::times_1));
8429    lea(ary2, Address(ary2, limit, Address::times_1));
8430    negptr(limit);
8431
8432    bind(COMPARE_WIDE_VECTORS);
8433    movdqu(vec1, Address(ary1, limit, Address::times_1));
8434    movdqu(vec2, Address(ary2, limit, Address::times_1));
8435    pxor(vec1, vec2);
8436
8437    ptest(vec1, vec1);
8438    jccb(Assembler::notZero, FALSE_LABEL);
8439    addptr(limit, 16);
8440    jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
8441
8442    testl(result, result);
8443    jccb(Assembler::zero, TRUE_LABEL);
8444
8445    movdqu(vec1, Address(ary1, result, Address::times_1, -16));
8446    movdqu(vec2, Address(ary2, result, Address::times_1, -16));
8447    pxor(vec1, vec2);
8448
8449    ptest(vec1, vec1);
8450    jccb(Assembler::notZero, FALSE_LABEL);
8451    jmpb(TRUE_LABEL);
8452
8453    bind(COMPARE_TAIL); // limit is zero
8454    movl(limit, result);
8455    // Fallthru to tail compare
8456  }
8457
8458  // Compare 4-byte vectors
8459  andl(limit, 0xfffffffc); // vector count (in bytes)
8460  jccb(Assembler::zero, COMPARE_CHAR);
8461
8462  lea(ary1, Address(ary1, limit, Address::times_1));
8463  lea(ary2, Address(ary2, limit, Address::times_1));
8464  negptr(limit);
8465
8466  bind(COMPARE_VECTORS);
8467  movl(chr, Address(ary1, limit, Address::times_1));
8468  cmpl(chr, Address(ary2, limit, Address::times_1));
8469  jccb(Assembler::notEqual, FALSE_LABEL);
8470  addptr(limit, 4);
8471  jcc(Assembler::notZero, COMPARE_VECTORS);
8472
8473  // Compare trailing char (final 2 bytes), if any
8474  bind(COMPARE_CHAR);
8475  testl(result, 0x2);   // tail  char
8476  jccb(Assembler::zero, COMPARE_BYTE);
8477  load_unsigned_short(chr, Address(ary1, 0));
8478  load_unsigned_short(limit, Address(ary2, 0));
8479  cmpl(chr, limit);
8480  jccb(Assembler::notEqual, FALSE_LABEL);
8481
8482  if (is_array_equ && is_char) {
8483    bind(COMPARE_BYTE);
8484  } else {
8485    lea(ary1, Address(ary1, 2));
8486    lea(ary2, Address(ary2, 2));
8487
8488    bind(COMPARE_BYTE);
8489    testl(result, 0x1);   // tail  byte
8490    jccb(Assembler::zero, TRUE_LABEL);
8491    load_unsigned_byte(chr, Address(ary1, 0));
8492    load_unsigned_byte(limit, Address(ary2, 0));
8493    cmpl(chr, limit);
8494    jccb(Assembler::notEqual, FALSE_LABEL);
8495  }
8496  bind(TRUE_LABEL);
8497  movl(result, 1);   // return true
8498  jmpb(DONE);
8499
8500  bind(FALSE_LABEL);
8501  xorl(result, result); // return false
8502
8503  // That's it
8504  bind(DONE);
8505  if (UseAVX >= 2) {
8506    // clean upper bits of YMM registers
8507    vpxor(vec1, vec1);
8508    vpxor(vec2, vec2);
8509  }
8510}
8511
8512#endif
8513
8514void MacroAssembler::generate_fill(BasicType t, bool aligned,
8515                                   Register to, Register value, Register count,
8516                                   Register rtmp, XMMRegister xtmp) {
8517  ShortBranchVerifier sbv(this);
8518  assert_different_registers(to, value, count, rtmp);
8519  Label L_exit, L_skip_align1, L_skip_align2, L_fill_byte;
8520  Label L_fill_2_bytes, L_fill_4_bytes;
8521
8522  int shift = -1;
8523  switch (t) {
8524    case T_BYTE:
8525      shift = 2;
8526      break;
8527    case T_SHORT:
8528      shift = 1;
8529      break;
8530    case T_INT:
8531      shift = 0;
8532      break;
8533    default: ShouldNotReachHere();
8534  }
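
  // 'shift' converts counts of 4-byte words into element counts: (n << shift)
  // elements occupy n*4 bytes (shift is 2 for bytes, 1 for shorts, 0 for ints).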
8535
8536  if (t == T_BYTE) {
8537    andl(value, 0xff);
8538    movl(rtmp, value);
8539    shll(rtmp, 8);
8540    orl(value, rtmp);
8541  }
8542  if (t == T_SHORT) {
8543    andl(value, 0xffff);
8544  }
8545  if (t == T_BYTE || t == T_SHORT) {
8546    movl(rtmp, value);
8547    shll(rtmp, 16);
8548    orl(value, rtmp);
8549  }
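  // At this point 'value' holds a 32-bit fill pattern, e.g. a T_BYTE fill of
  // 0xAB becomes 0xABABABAB and a T_SHORT fill of 0x1234 becomes 0x12341234.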
8550
8551  cmpl(count, 2<<shift); // Short arrays (< 8 bytes) are filled by element
8552  jcc(Assembler::below, L_fill_4_bytes); // use unsigned cmp
8553  if (!UseUnalignedLoadStores && !aligned && (t == T_BYTE || t == T_SHORT)) {
8554    // align the destination address at a 4-byte boundary
8555    if (t == T_BYTE) {
8556      // A one-byte misalignment happens only for byte arrays
8557      testptr(to, 1);
8558      jccb(Assembler::zero, L_skip_align1);
8559      movb(Address(to, 0), value);
8560      increment(to);
8561      decrement(count);
8562      BIND(L_skip_align1);
8563    }
8564    // A two-byte misalignment happens only for byte and short (char) arrays
8565    testptr(to, 2);
8566    jccb(Assembler::zero, L_skip_align2);
8567    movw(Address(to, 0), value);
8568    addptr(to, 2);
8569    subl(count, 1<<(shift-1));
8570    BIND(L_skip_align2);
8571  }
8572  if (UseSSE < 2) {
8573    Label L_fill_32_bytes_loop, L_check_fill_8_bytes, L_fill_8_bytes_loop, L_fill_8_bytes;
8574    // Fill 32-byte chunks
8575    subl(count, 8 << shift);
8576    jcc(Assembler::less, L_check_fill_8_bytes);
8577    align(16);
8578
8579    BIND(L_fill_32_bytes_loop);
8580
8581    for (int i = 0; i < 32; i += 4) {
8582      movl(Address(to, i), value);
8583    }
8584
8585    addptr(to, 32);
8586    subl(count, 8 << shift);
8587    jcc(Assembler::greaterEqual, L_fill_32_bytes_loop);
8588    BIND(L_check_fill_8_bytes);
8589    addl(count, 8 << shift);
8590    jccb(Assembler::zero, L_exit);
8591    jmpb(L_fill_8_bytes);
8592
8593    //
8594    // length is too short, just fill qwords
8595    //
8596    BIND(L_fill_8_bytes_loop);
8597    movl(Address(to, 0), value);
8598    movl(Address(to, 4), value);
8599    addptr(to, 8);
8600    BIND(L_fill_8_bytes);
8601    subl(count, 1 << (shift + 1));
8602    jcc(Assembler::greaterEqual, L_fill_8_bytes_loop);
8603    // fall through to fill 4 bytes
8604  } else {
8605    Label L_fill_32_bytes;
8606    if (!UseUnalignedLoadStores) {
8607      // align to 8 bytes; we know we are 4-byte aligned to start
8608      testptr(to, 4);
8609      jccb(Assembler::zero, L_fill_32_bytes);
8610      movl(Address(to, 0), value);
8611      addptr(to, 4);
8612      subl(count, 1<<shift);
8613    }
8614    BIND(L_fill_32_bytes);
8615    {
8616      assert( UseSSE >= 2, "supported cpu only" );
8617      Label L_fill_32_bytes_loop, L_check_fill_8_bytes, L_fill_8_bytes_loop, L_fill_8_bytes;
8618      if (UseAVX > 2) {
8619        movl(rtmp, 0xffff);
8620        kmovwl(k1, rtmp);
8621      }
8622      movdl(xtmp, value);
8623      if (UseAVX > 2 && UseUnalignedLoadStores) {
8624        // Fill 64-byte chunks
8625        Label L_fill_64_bytes_loop, L_check_fill_32_bytes;
8626        evpbroadcastd(xtmp, xtmp, Assembler::AVX_512bit);
8627
8628        subl(count, 16 << shift);
8629        jcc(Assembler::less, L_check_fill_32_bytes);
8630        align(16);
8631
8632        BIND(L_fill_64_bytes_loop);
8633        evmovdqul(Address(to, 0), xtmp, Assembler::AVX_512bit);
8634        addptr(to, 64);
8635        subl(count, 16 << shift);
8636        jcc(Assembler::greaterEqual, L_fill_64_bytes_loop);
8637
8638        BIND(L_check_fill_32_bytes);
8639        addl(count, 8 << shift);
8640        jccb(Assembler::less, L_check_fill_8_bytes);
8641        vmovdqu(Address(to, 0), xtmp);
8642        addptr(to, 32);
8643        subl(count, 8 << shift);
8644
8645        BIND(L_check_fill_8_bytes);
8646      } else if (UseAVX == 2 && UseUnalignedLoadStores) {
8647        // Fill 64-byte chunks
8648        Label L_fill_64_bytes_loop, L_check_fill_32_bytes;
8649        vpbroadcastd(xtmp, xtmp);
8650
8651        subl(count, 16 << shift);
8652        jcc(Assembler::less, L_check_fill_32_bytes);
8653        align(16);
8654
8655        BIND(L_fill_64_bytes_loop);
8656        vmovdqu(Address(to, 0), xtmp);
8657        vmovdqu(Address(to, 32), xtmp);
8658        addptr(to, 64);
8659        subl(count, 16 << shift);
8660        jcc(Assembler::greaterEqual, L_fill_64_bytes_loop);
8661
8662        BIND(L_check_fill_32_bytes);
8663        addl(count, 8 << shift);
8664        jccb(Assembler::less, L_check_fill_8_bytes);
8665        vmovdqu(Address(to, 0), xtmp);
8666        addptr(to, 32);
8667        subl(count, 8 << shift);
8668
8669        BIND(L_check_fill_8_bytes);
8670        // clean upper bits of YMM registers
8671        movdl(xtmp, value);
8672        pshufd(xtmp, xtmp, 0);
8673      } else {
8674        // Fill 32-byte chunks
8675        pshufd(xtmp, xtmp, 0);
8676
8677        subl(count, 8 << shift);
8678        jcc(Assembler::less, L_check_fill_8_bytes);
8679        align(16);
8680
8681        BIND(L_fill_32_bytes_loop);
8682
8683        if (UseUnalignedLoadStores) {
8684          movdqu(Address(to, 0), xtmp);
8685          movdqu(Address(to, 16), xtmp);
8686        } else {
8687          movq(Address(to, 0), xtmp);
8688          movq(Address(to, 8), xtmp);
8689          movq(Address(to, 16), xtmp);
8690          movq(Address(to, 24), xtmp);
8691        }
8692
8693        addptr(to, 32);
8694        subl(count, 8 << shift);
8695        jcc(Assembler::greaterEqual, L_fill_32_bytes_loop);
8696
8697        BIND(L_check_fill_8_bytes);
8698      }
8699      addl(count, 8 << shift);
8700      jccb(Assembler::zero, L_exit);
8701      jmpb(L_fill_8_bytes);
8702
8703      //
8704      // length is too short, just fill qwords
8705      //
8706      BIND(L_fill_8_bytes_loop);
8707      movq(Address(to, 0), xtmp);
8708      addptr(to, 8);
8709      BIND(L_fill_8_bytes);
8710      subl(count, 1 << (shift + 1));
8711      jcc(Assembler::greaterEqual, L_fill_8_bytes_loop);
8712    }
8713  }
8714  // fill trailing 4 bytes
8715  BIND(L_fill_4_bytes);
8716  testl(count, 1<<shift);
8717  jccb(Assembler::zero, L_fill_2_bytes);
8718  movl(Address(to, 0), value);
8719  if (t == T_BYTE || t == T_SHORT) {
8720    addptr(to, 4);
8721    BIND(L_fill_2_bytes);
8722    // fill trailing 2 bytes
8723    testl(count, 1<<(shift-1));
8724    jccb(Assembler::zero, L_fill_byte);
8725    movw(Address(to, 0), value);
8726    if (t == T_BYTE) {
8727      addptr(to, 2);
8728      BIND(L_fill_byte);
8729      // fill trailing byte
8730      testl(count, 1);
8731      jccb(Assembler::zero, L_exit);
8732      movb(Address(to, 0), value);
8733    } else {
8734      BIND(L_fill_byte);
8735    }
8736  } else {
8737    BIND(L_fill_2_bytes);
8738  }
8739  BIND(L_exit);
8740}
8741
8742// encode char[] to byte[] in ISO_8859_1
8743void MacroAssembler::encode_iso_array(Register src, Register dst, Register len,
8744                                      XMMRegister tmp1Reg, XMMRegister tmp2Reg,
8745                                      XMMRegister tmp3Reg, XMMRegister tmp4Reg,
8746                                      Register tmp5, Register result) {
8747  // rsi: src
8748  // rdi: dst
8749  // rdx: len
8750  // rcx: tmp5
8751  // rax: result
8752  ShortBranchVerifier sbv(this);
8753  assert_different_registers(src, dst, len, tmp5, result);
8754  Label L_done, L_copy_1_char, L_copy_1_char_exit;
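
  // Strategy: test 8/16/32-char chunks against the 0xff00ff00 mask; chunks with
  // no char above 0xFF are narrowed with (v)packuswb, otherwise control falls
  // through to the per-char loop. 'result' ends up holding the number of chars
  // actually encoded.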
8755
8756  // set result
8757  xorl(result, result);
8758  // check for zero length
8759  testl(len, len);
8760  jcc(Assembler::zero, L_done);
8761  movl(result, len);
8762
8763  // Setup pointers
8764  lea(src, Address(src, len, Address::times_2)); // char[]
8765  lea(dst, Address(dst, len, Address::times_1)); // byte[]
8766  negptr(len);
8767
8768  if (UseSSE42Intrinsics || UseAVX >= 2) {
8769    assert(UseSSE42Intrinsics ? UseSSE >= 4 : true, "SSE4 must be enabled for SSE4.2 intrinsics to be available");
8770    Label L_chars_8_check, L_copy_8_chars, L_copy_8_chars_exit;
8771    Label L_chars_16_check, L_copy_16_chars, L_copy_16_chars_exit;
8772
8773    if (UseAVX >= 2) {
8774      Label L_chars_32_check, L_copy_32_chars, L_copy_32_chars_exit;
8775      movl(tmp5, 0xff00ff00);   // create mask to test for Unicode chars in vector
8776      movdl(tmp1Reg, tmp5);
8777      vpbroadcastd(tmp1Reg, tmp1Reg);
8778      jmpb(L_chars_32_check);
8779
8780      bind(L_copy_32_chars);
8781      vmovdqu(tmp3Reg, Address(src, len, Address::times_2, -64));
8782      vmovdqu(tmp4Reg, Address(src, len, Address::times_2, -32));
8783      vpor(tmp2Reg, tmp3Reg, tmp4Reg, /* vector_len */ 1);
8784      vptest(tmp2Reg, tmp1Reg);       // check for Unicode chars in vector
8785      jccb(Assembler::notZero, L_copy_32_chars_exit);
8786      vpackuswb(tmp3Reg, tmp3Reg, tmp4Reg, /* vector_len */ 1);
8787      vpermq(tmp4Reg, tmp3Reg, 0xD8, /* vector_len */ 1);
8788      vmovdqu(Address(dst, len, Address::times_1, -32), tmp4Reg);
8789
8790      bind(L_chars_32_check);
8791      addptr(len, 32);
8792      jccb(Assembler::lessEqual, L_copy_32_chars);
8793
8794      bind(L_copy_32_chars_exit);
8795      subptr(len, 16);
8796      jccb(Assembler::greater, L_copy_16_chars_exit);
8797
8798    } else if (UseSSE42Intrinsics) {
8799      movl(tmp5, 0xff00ff00);   // create mask to test for Unicode chars in vector
8800      movdl(tmp1Reg, tmp5);
8801      pshufd(tmp1Reg, tmp1Reg, 0);
8802      jmpb(L_chars_16_check);
8803    }
8804
8805    bind(L_copy_16_chars);
8806    if (UseAVX >= 2) {
8807      vmovdqu(tmp2Reg, Address(src, len, Address::times_2, -32));
8808      vptest(tmp2Reg, tmp1Reg);
8809      jccb(Assembler::notZero, L_copy_16_chars_exit);
8810      vpackuswb(tmp2Reg, tmp2Reg, tmp1Reg, /* vector_len */ 1);
8811      vpermq(tmp3Reg, tmp2Reg, 0xD8, /* vector_len */ 1);
8812    } else {
8813      if (UseAVX > 0) {
8814        movdqu(tmp3Reg, Address(src, len, Address::times_2, -32));
8815        movdqu(tmp4Reg, Address(src, len, Address::times_2, -16));
8816        vpor(tmp2Reg, tmp3Reg, tmp4Reg, /* vector_len */ 0);
8817      } else {
8818        movdqu(tmp3Reg, Address(src, len, Address::times_2, -32));
8819        por(tmp2Reg, tmp3Reg);
8820        movdqu(tmp4Reg, Address(src, len, Address::times_2, -16));
8821        por(tmp2Reg, tmp4Reg);
8822      }
8823      ptest(tmp2Reg, tmp1Reg);       // check for Unicode chars in vector
8824      jccb(Assembler::notZero, L_copy_16_chars_exit);
8825      packuswb(tmp3Reg, tmp4Reg);
8826    }
8827    movdqu(Address(dst, len, Address::times_1, -16), tmp3Reg);
8828
8829    bind(L_chars_16_check);
8830    addptr(len, 16);
8831    jccb(Assembler::lessEqual, L_copy_16_chars);
8832
8833    bind(L_copy_16_chars_exit);
8834    if (UseAVX >= 2) {
8835      // clean upper bits of YMM registers
8836      vpxor(tmp2Reg, tmp2Reg);
8837      vpxor(tmp3Reg, tmp3Reg);
8838      vpxor(tmp4Reg, tmp4Reg);
8839      movdl(tmp1Reg, tmp5);
8840      pshufd(tmp1Reg, tmp1Reg, 0);
8841    }
8842    subptr(len, 8);
8843    jccb(Assembler::greater, L_copy_8_chars_exit);
8844
8845    bind(L_copy_8_chars);
8846    movdqu(tmp3Reg, Address(src, len, Address::times_2, -16));
8847    ptest(tmp3Reg, tmp1Reg);
8848    jccb(Assembler::notZero, L_copy_8_chars_exit);
8849    packuswb(tmp3Reg, tmp1Reg);
8850    movq(Address(dst, len, Address::times_1, -8), tmp3Reg);
8851    addptr(len, 8);
8852    jccb(Assembler::lessEqual, L_copy_8_chars);
8853
8854    bind(L_copy_8_chars_exit);
8855    subptr(len, 8);
8856    jccb(Assembler::zero, L_done);
8857  }
8858
8859  bind(L_copy_1_char);
8860  load_unsigned_short(tmp5, Address(src, len, Address::times_2, 0));
8861  testl(tmp5, 0xff00);      // check if Unicode char
8862  jccb(Assembler::notZero, L_copy_1_char_exit);
8863  movb(Address(dst, len, Address::times_1, 0), tmp5);
8864  addptr(len, 1);
8865  jccb(Assembler::less, L_copy_1_char);
8866
8867  bind(L_copy_1_char_exit);
8868  addptr(result, len); // len is the negative count of unprocessed elements
8869  bind(L_done);
8870}
8871
8872#ifdef _LP64
8873/**
8874 * Helper for multiply_to_len().
8875 */
8876void MacroAssembler::add2_with_carry(Register dest_hi, Register dest_lo, Register src1, Register src2) {
8877  addq(dest_lo, src1);
8878  adcq(dest_hi, 0);
8879  addq(dest_lo, src2);
8880  adcq(dest_hi, 0);
8881}
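
// Illustrative sketch of the helper above (not emitted code), using the same
// 128-bit helper type as the comments below, with src1/src2 zero-extended:
//   huge_128 t = (((huge_128)dest_hi << 64) | dest_lo) + src1 + src2;
//   dest_lo = (jlong)t;
//   dest_hi = (jlong)(t >>> 64);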
8882
8883/**
8884 * Multiply 64 bit by 64 bit first loop.
8885 */
8886void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
8887                                           Register y, Register y_idx, Register z,
8888                                           Register carry, Register product,
8889                                           Register idx, Register kdx) {
8890  //
8891  //  jlong carry, x[], y[], z[];
8892  //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
8893  //    huge_128 product = y[idx] * x[xstart] + carry;
8894  //    z[kdx] = (jlong)product;
8895  //    carry  = (jlong)(product >>> 64);
8896  //  }
8897  //  z[xstart] = carry;
8898  //
8899
8900  Label L_first_loop, L_first_loop_exit;
8901  Label L_one_x, L_one_y, L_multiply;
8902
8903  decrementl(xstart);
8904  jcc(Assembler::negative, L_one_x);
8905
8906  movq(x_xstart, Address(x, xstart, Address::times_4,  0));
8907  rorq(x_xstart, 32); // convert big-endian to little-endian
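  // (The int arrays store their 32-bit words most-significant first, so the two
  //  halves of a 64-bit load are swapped to form a little-endian 64-bit value.)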
8908
8909  bind(L_first_loop);
8910  decrementl(idx);
8911  jcc(Assembler::negative, L_first_loop_exit);
8912  decrementl(idx);
8913  jcc(Assembler::negative, L_one_y);
8914  movq(y_idx, Address(y, idx, Address::times_4,  0));
8915  rorq(y_idx, 32); // convert big-endian to little-endian
8916  bind(L_multiply);
8917  movq(product, x_xstart);
8918  mulq(y_idx); // product(rax) * y_idx -> rdx:rax
8919  addq(product, carry);
8920  adcq(rdx, 0);
8921  subl(kdx, 2);
8922  movl(Address(z, kdx, Address::times_4,  4), product);
8923  shrq(product, 32);
8924  movl(Address(z, kdx, Address::times_4,  0), product);
8925  movq(carry, rdx);
8926  jmp(L_first_loop);
8927
8928  bind(L_one_y);
8929  movl(y_idx, Address(y,  0));
8930  jmp(L_multiply);
8931
8932  bind(L_one_x);
8933  movl(x_xstart, Address(x,  0));
8934  jmp(L_first_loop);
8935
8936  bind(L_first_loop_exit);
8937}
8938
8939/**
8940 * Multiply 64 bit by 64 bit and add 128 bit.
8941 */
8942void MacroAssembler::multiply_add_128_x_128(Register x_xstart, Register y, Register z,
8943                                            Register yz_idx, Register idx,
8944                                            Register carry, Register product, int offset) {
8945  //     huge_128 product = (y[idx] * x_xstart) + z[kdx] + carry;
8946  //     z[kdx] = (jlong)product;
8947
8948  movq(yz_idx, Address(y, idx, Address::times_4,  offset));
8949  rorq(yz_idx, 32); // convert big-endian to little-endian
8950  movq(product, x_xstart);
8951  mulq(yz_idx);     // product(rax) * yz_idx -> rdx:product(rax)
8952  movq(yz_idx, Address(z, idx, Address::times_4,  offset));
8953  rorq(yz_idx, 32); // convert big-endian to little-endian
8954
8955  add2_with_carry(rdx, product, carry, yz_idx);
8956
8957  movl(Address(z, idx, Address::times_4,  offset+4), product);
8958  shrq(product, 32);
8959  movl(Address(z, idx, Address::times_4,  offset), product);
8960
8961}
8962
8963/**
8964 * Multiply 128 bit by 128 bit. Unrolled inner loop.
8965 */
8966void MacroAssembler::multiply_128_x_128_loop(Register x_xstart, Register y, Register z,
8967                                             Register yz_idx, Register idx, Register jdx,
8968                                             Register carry, Register product,
8969                                             Register carry2) {
8970  //   jlong carry, x[], y[], z[];
8971  //   int kdx = ystart+1;
8972  //   for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
8973  //     huge_128 product = (y[idx+1] * x_xstart) + z[kdx+idx+1] + carry;
8974  //     z[kdx+idx+1] = (jlong)product;
8975  //     jlong carry2  = (jlong)(product >>> 64);
8976  //     product = (y[idx] * x_xstart) + z[kdx+idx] + carry2;
8977  //     z[kdx+idx] = (jlong)product;
8978  //     carry  = (jlong)(product >>> 64);
8979  //   }
8980  //   idx += 2;
8981  //   if (idx > 0) {
8982  //     product = (y[idx] * x_xstart) + z[kdx+idx] + carry;
8983  //     z[kdx+idx] = (jlong)product;
8984  //     carry  = (jlong)(product >>> 64);
8985  //   }
8986  //
8987
8988  Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
8989
8990  movl(jdx, idx);
8991  andl(jdx, 0xFFFFFFFC);
8992  shrl(jdx, 2);
8993
8994  bind(L_third_loop);
8995  subl(jdx, 1);
8996  jcc(Assembler::negative, L_third_loop_exit);
8997  subl(idx, 4);
8998
8999  multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product, 8);
9000  movq(carry2, rdx);
9001
9002  multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry2, product, 0);
9003  movq(carry, rdx);
9004  jmp(L_third_loop);
9005
9006  bind (L_third_loop_exit);
9007
9008  andl (idx, 0x3);
9009  jcc(Assembler::zero, L_post_third_loop_done);
9010
9011  Label L_check_1;
9012  subl(idx, 2);
9013  jcc(Assembler::negative, L_check_1);
9014
9015  multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product, 0);
9016  movq(carry, rdx);
9017
9018  bind (L_check_1);
9019  addl (idx, 0x2);
9020  andl (idx, 0x1);
9021  subl(idx, 1);
9022  jcc(Assembler::negative, L_post_third_loop_done);
9023
9024  movl(yz_idx, Address(y, idx, Address::times_4,  0));
9025  movq(product, x_xstart);
9026  mulq(yz_idx); // product(rax) * yz_idx -> rdx:product(rax)
9027  movl(yz_idx, Address(z, idx, Address::times_4,  0));
9028
9029  add2_with_carry(rdx, product, yz_idx, carry);
9030
9031  movl(Address(z, idx, Address::times_4,  0), product);
9032  shrq(product, 32);
9033
9034  shlq(rdx, 32);
9035  orq(product, rdx);
9036  movq(carry, product);
9037
9038  bind(L_post_third_loop_done);
9039}
9040
9041/**
9042 * Multiply 128 bit by 128 bit using BMI2. Unrolled inner loop.
9043 *
9044 */
9045void MacroAssembler::multiply_128_x_128_bmi2_loop(Register y, Register z,
9046                                                  Register carry, Register carry2,
9047                                                  Register idx, Register jdx,
9048                                                  Register yz_idx1, Register yz_idx2,
9049                                                  Register tmp, Register tmp3, Register tmp4) {
9050  assert(UseBMI2Instructions, "should be used only when BMI2 is available");
9051
9052  //   jlong carry, x[], y[], z[];
9053  //   int kdx = ystart+1;
9054  //   for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
9055  //     huge_128 tmp3 = (y[idx+1] * rdx) + z[kdx+idx+1] + carry;
9056  //     jlong carry2  = (jlong)(tmp3 >>> 64);
9057  //     huge_128 tmp4 = (y[idx]   * rdx) + z[kdx+idx] + carry2;
9058  //     carry  = (jlong)(tmp4 >>> 64);
9059  //     z[kdx+idx+1] = (jlong)tmp3;
9060  //     z[kdx+idx] = (jlong)tmp4;
9061  //   }
9062  //   idx += 2;
9063  //   if (idx > 0) {
9064  //     yz_idx1 = (y[idx] * rdx) + z[kdx+idx] + carry;
9065  //     z[kdx+idx] = (jlong)yz_idx1;
9066  //     carry  = (jlong)(yz_idx1 >>> 64);
9067  //   }
9068  //
9069
9070  Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
9071
9072  movl(jdx, idx);
9073  andl(jdx, 0xFFFFFFFC);
9074  shrl(jdx, 2);
9075
9076  bind(L_third_loop);
9077  subl(jdx, 1);
9078  jcc(Assembler::negative, L_third_loop_exit);
9079  subl(idx, 4);
9080
9081  movq(yz_idx1,  Address(y, idx, Address::times_4,  8));
9082  rorxq(yz_idx1, yz_idx1, 32); // convert big-endian to little-endian
9083  movq(yz_idx2, Address(y, idx, Address::times_4,  0));
9084  rorxq(yz_idx2, yz_idx2, 32);
9085
9086  mulxq(tmp4, tmp3, yz_idx1);  //  yz_idx1 * rdx -> tmp4:tmp3
9087  mulxq(carry2, tmp, yz_idx2); //  yz_idx2 * rdx -> carry2:tmp
9088
9089  movq(yz_idx1,  Address(z, idx, Address::times_4,  8));
9090  rorxq(yz_idx1, yz_idx1, 32);
9091  movq(yz_idx2, Address(z, idx, Address::times_4,  0));
9092  rorxq(yz_idx2, yz_idx2, 32);
9093
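  // With ADX, adcx uses only CF and adox uses only OF, so the two carry chains
  // below can be interleaved without serializing on the flags register.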
9094  if (VM_Version::supports_adx()) {
9095    adcxq(tmp3, carry);
9096    adoxq(tmp3, yz_idx1);
9097
9098    adcxq(tmp4, tmp);
9099    adoxq(tmp4, yz_idx2);
9100
9101    movl(carry, 0); // does not affect flags
9102    adcxq(carry2, carry);
9103    adoxq(carry2, carry);
9104  } else {
9105    add2_with_carry(tmp4, tmp3, carry, yz_idx1);
9106    add2_with_carry(carry2, tmp4, tmp, yz_idx2);
9107  }
9108  movq(carry, carry2);
9109
9110  movl(Address(z, idx, Address::times_4, 12), tmp3);
9111  shrq(tmp3, 32);
9112  movl(Address(z, idx, Address::times_4,  8), tmp3);
9113
9114  movl(Address(z, idx, Address::times_4,  4), tmp4);
9115  shrq(tmp4, 32);
9116  movl(Address(z, idx, Address::times_4,  0), tmp4);
9117
9118  jmp(L_third_loop);
9119
9120  bind (L_third_loop_exit);
9121
9122  andl (idx, 0x3);
9123  jcc(Assembler::zero, L_post_third_loop_done);
9124
9125  Label L_check_1;
9126  subl(idx, 2);
9127  jcc(Assembler::negative, L_check_1);
9128
9129  movq(yz_idx1, Address(y, idx, Address::times_4,  0));
9130  rorxq(yz_idx1, yz_idx1, 32);
9131  mulxq(tmp4, tmp3, yz_idx1); //  yz_idx1 * rdx -> tmp4:tmp3
9132  movq(yz_idx2, Address(z, idx, Address::times_4,  0));
9133  rorxq(yz_idx2, yz_idx2, 32);
9134
9135  add2_with_carry(tmp4, tmp3, carry, yz_idx2);
9136
9137  movl(Address(z, idx, Address::times_4,  4), tmp3);
9138  shrq(tmp3, 32);
9139  movl(Address(z, idx, Address::times_4,  0), tmp3);
9140  movq(carry, tmp4);
9141
9142  bind (L_check_1);
9143  addl (idx, 0x2);
9144  andl (idx, 0x1);
9145  subl(idx, 1);
9146  jcc(Assembler::negative, L_post_third_loop_done);
9147  movl(tmp4, Address(y, idx, Address::times_4,  0));
9148  mulxq(carry2, tmp3, tmp4);  //  tmp4 * rdx -> carry2:tmp3
9149  movl(tmp4, Address(z, idx, Address::times_4,  0));
9150
9151  add2_with_carry(carry2, tmp3, tmp4, carry);
9152
9153  movl(Address(z, idx, Address::times_4,  0), tmp3);
9154  shrq(tmp3, 32);
9155
9156  shlq(carry2, 32);
9157  orq(tmp3, carry2);
9158  movq(carry, tmp3);
9159
9160  bind(L_post_third_loop_done);
9161}
9162
9163/**
9164 * Code for BigInteger::multiplyToLen() intrinsic.
9165 *
9166 * rdi: x
9167 * rax: xlen
9168 * rsi: y
9169 * rcx: ylen
9170 * r8:  z
9171 * r11: zlen
9172 * r12: tmp1
9173 * r13: tmp2
9174 * r14: tmp3
9175 * r15: tmp4
9176 * rbx: tmp5
9177 *
9178 */
9179void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Register ylen, Register z, Register zlen,
9180                                     Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5) {
9181  ShortBranchVerifier sbv(this);
9182  assert_different_registers(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, rdx);
9183
9184  push(tmp1);
9185  push(tmp2);
9186  push(tmp3);
9187  push(tmp4);
9188  push(tmp5);
9189
9190  push(xlen);
9191  push(zlen);
9192
9193  const Register idx = tmp1;
9194  const Register kdx = tmp2;
9195  const Register xstart = tmp3;
9196
9197  const Register y_idx = tmp4;
9198  const Register carry = tmp5;
9199  const Register product  = xlen;
9200  const Register x_xstart = zlen;  // reuse register
9201
9202  // First Loop.
9203  //
9204  //  final static long LONG_MASK = 0xffffffffL;
9205  //  int xstart = xlen - 1;
9206  //  int ystart = ylen - 1;
9207  //  long carry = 0;
9208  //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
9209  //    long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
9210  //    z[kdx] = (int)product;
9211  //    carry = product >>> 32;
9212  //  }
9213  //  z[xstart] = (int)carry;
9214  //
9215
9216  movl(idx, ylen);      // idx = ylen;
9217  movl(kdx, zlen);      // kdx = xlen+ylen;
9218  xorq(carry, carry);   // carry = 0;
9219
9220  Label L_done;
9221
9222  movl(xstart, xlen);
9223  decrementl(xstart);
9224  jcc(Assembler::negative, L_done);
9225
9226  multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, carry, product, idx, kdx);
9227
9228  Label L_second_loop;
9229  testl(kdx, kdx);
9230  jcc(Assembler::zero, L_second_loop);
9231
9232  Label L_carry;
9233  subl(kdx, 1);
9234  jcc(Assembler::zero, L_carry);
9235
9236  movl(Address(z, kdx, Address::times_4,  0), carry);
9237  shrq(carry, 32);
9238  subl(kdx, 1);
9239
9240  bind(L_carry);
9241  movl(Address(z, kdx, Address::times_4,  0), carry);
9242
9243  // Second and third (nested) loops.
9244  //
9245  // for (int i = xstart-1; i >= 0; i--) { // Second loop
9246  //   carry = 0;
9247  //   for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
9248  //     long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
9249  //                    (z[k] & LONG_MASK) + carry;
9250  //     z[k] = (int)product;
9251  //     carry = product >>> 32;
9252  //   }
9253  //   z[i] = (int)carry;
9254  // }
9255  //
9256  // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = rdx
9257
9258  const Register jdx = tmp1;
9259
9260  bind(L_second_loop);
9261  xorl(carry, carry);    // carry = 0;
9262  movl(jdx, ylen);       // j = ystart+1
9263
9264  subl(xstart, 1);       // i = xstart-1;
9265  jcc(Assembler::negative, L_done);
9266
9267  push (z);
9268
9269  Label L_last_x;
9270  lea(z, Address(z, xstart, Address::times_4, 4)); // z = z + k - j
9271  subl(xstart, 1);       // i = xstart-1;
9272  jcc(Assembler::negative, L_last_x);
9273
9274  if (UseBMI2Instructions) {
9275    movq(rdx,  Address(x, xstart, Address::times_4,  0));
9276    rorxq(rdx, rdx, 32); // convert big-endian to little-endian
9277  } else {
9278    movq(x_xstart, Address(x, xstart, Address::times_4,  0));
9279    rorq(x_xstart, 32);  // convert big-endian to little-endian
9280  }
9281
9282  Label L_third_loop_prologue;
9283  bind(L_third_loop_prologue);
9284
9285  push (x);
9286  push (xstart);
9287  push (ylen);
9288
9289
9290  if (UseBMI2Instructions) {
9291    multiply_128_x_128_bmi2_loop(y, z, carry, x, jdx, ylen, product, tmp2, x_xstart, tmp3, tmp4);
9292  } else { // !UseBMI2Instructions
9293    multiply_128_x_128_loop(x_xstart, y, z, y_idx, jdx, ylen, carry, product, x);
9294  }
9295
9296  pop(ylen);
9297  pop(xlen);
9298  pop(x);
9299  pop(z);
9300
9301  movl(tmp3, xlen);
9302  addl(tmp3, 1);
9303  movl(Address(z, tmp3, Address::times_4,  0), carry);
9304  subl(tmp3, 1);
9305  jccb(Assembler::negative, L_done);
9306
9307  shrq(carry, 32);
9308  movl(Address(z, tmp3, Address::times_4,  0), carry);
9309  jmp(L_second_loop);
9310
9311  // The following infrequently executed code is moved outside of the loops.
9312  bind(L_last_x);
9313  if (UseBMI2Instructions) {
9314    movl(rdx, Address(x,  0));
9315  } else {
9316    movl(x_xstart, Address(x,  0));
9317  }
9318  jmp(L_third_loop_prologue);
9319
9320  bind(L_done);
9321
9322  pop(zlen);
9323  pop(xlen);
9324
9325  pop(tmp5);
9326  pop(tmp4);
9327  pop(tmp3);
9328  pop(tmp2);
9329  pop(tmp1);
9330}
9331
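// Compares 'length' elements of obja and objb (element size given by
// log2_array_indxscale). On return, result holds the index of the first
// mismatching element, or -1 if the ranges are equal.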
9332void MacroAssembler::vectorized_mismatch(Register obja, Register objb, Register length, Register log2_array_indxscale,
9333  Register result, Register tmp1, Register tmp2, XMMRegister rymm0, XMMRegister rymm1, XMMRegister rymm2){
9334  assert(UseSSE42Intrinsics, "SSE4.2 must be enabled.");
9335  Label VECTOR32_LOOP, VECTOR16_LOOP, VECTOR8_LOOP, VECTOR4_LOOP;
9336  Label VECTOR16_TAIL, VECTOR8_TAIL, VECTOR4_TAIL;
9337  Label VECTOR32_NOT_EQUAL, VECTOR16_NOT_EQUAL, VECTOR8_NOT_EQUAL, VECTOR4_NOT_EQUAL;
9338  Label SAME_TILL_END, DONE;
9339  Label BYTES_LOOP, BYTES_TAIL, BYTES_NOT_EQUAL;
9340
9341  // scale is in rcx for both Win64 and Unix, so the shift-by-CL instructions below (shlq/shrq) use it directly
9342  ShortBranchVerifier sbv(this);
9343
9344  shlq(length);
9345  xorq(result, result);
9346
9347  cmpq(length, 8);
9348  jcc(Assembler::equal, VECTOR8_LOOP);
9349  jcc(Assembler::less, VECTOR4_TAIL);
9350
9351  if (UseAVX >= 2){
9352
9353    cmpq(length, 16);
9354    jcc(Assembler::equal, VECTOR16_LOOP);
9355    jcc(Assembler::less, VECTOR8_LOOP);
9356
9357    cmpq(length, 32);
9358    jccb(Assembler::less, VECTOR16_TAIL);
9359
9360    subq(length, 32);
9361    bind(VECTOR32_LOOP);
9362    vmovdqu(rymm0, Address(obja, result));
9363    vmovdqu(rymm1, Address(objb, result));
9364    vpxor(rymm2, rymm0, rymm1, Assembler::AVX_256bit);
9365    vptest(rymm2, rymm2);
9366    jcc(Assembler::notZero, VECTOR32_NOT_EQUAL);//mismatch found
9367    addq(result, 32);
9368    subq(length, 32);
9369    jccb(Assembler::greaterEqual, VECTOR32_LOOP);
9370    addq(length, 32);
9371    jcc(Assembler::equal, SAME_TILL_END);
9372    //falling through if less than 32 bytes left //close the branch here.
9373
9374    bind(VECTOR16_TAIL);
9375    cmpq(length, 16);
9376    jccb(Assembler::less, VECTOR8_TAIL);
9377    bind(VECTOR16_LOOP);
9378    movdqu(rymm0, Address(obja, result));
9379    movdqu(rymm1, Address(objb, result));
9380    vpxor(rymm2, rymm0, rymm1, Assembler::AVX_128bit);
9381    ptest(rymm2, rymm2);
9382    jcc(Assembler::notZero, VECTOR16_NOT_EQUAL);//mismatch found
9383    addq(result, 16);
9384    subq(length, 16);
9385    jcc(Assembler::equal, SAME_TILL_END);
9386    //falling through if less than 16 bytes left
9387  } else {//regular intrinsics
9388
9389    cmpq(length, 16);
9390    jccb(Assembler::less, VECTOR8_TAIL);
9391
9392    subq(length, 16);
9393    bind(VECTOR16_LOOP);
9394    movdqu(rymm0, Address(obja, result));
9395    movdqu(rymm1, Address(objb, result));
9396    pxor(rymm0, rymm1);
9397    ptest(rymm0, rymm0);
9398    jcc(Assembler::notZero, VECTOR16_NOT_EQUAL);//mismatch found
9399    addq(result, 16);
9400    subq(length, 16);
9401    jccb(Assembler::greaterEqual, VECTOR16_LOOP);
9402    addq(length, 16);
9403    jcc(Assembler::equal, SAME_TILL_END);
9404    //falling through if less than 16 bytes left
9405  }
9406
9407  bind(VECTOR8_TAIL);
9408  cmpq(length, 8);
9409  jccb(Assembler::less, VECTOR4_TAIL);
9410  bind(VECTOR8_LOOP);
9411  movq(tmp1, Address(obja, result));
9412  movq(tmp2, Address(objb, result));
9413  xorq(tmp1, tmp2);
9414  testq(tmp1, tmp1);
9415  jcc(Assembler::notZero, VECTOR8_NOT_EQUAL);//mismatch found
9416  addq(result, 8);
9417  subq(length, 8);
9418  jcc(Assembler::equal, SAME_TILL_END);
9419  //falling through if less than 8 bytes left
9420
9421  bind(VECTOR4_TAIL);
9422  cmpq(length, 4);
9423  jccb(Assembler::less, BYTES_TAIL);
9424  bind(VECTOR4_LOOP);
9425  movl(tmp1, Address(obja, result));
9426  xorl(tmp1, Address(objb, result));
9427  testl(tmp1, tmp1);
9428  jcc(Assembler::notZero, VECTOR4_NOT_EQUAL);//mismatch found
9429  addq(result, 4);
9430  subq(length, 4);
9431  jcc(Assembler::equal, SAME_TILL_END);
9432  //falling through if less than 4 bytes left
9433
9434  bind(BYTES_TAIL);
9435  bind(BYTES_LOOP);
9436  load_unsigned_byte(tmp1, Address(obja, result));
9437  load_unsigned_byte(tmp2, Address(objb, result));
9438  xorl(tmp1, tmp2);
9439  testl(tmp1, tmp1);
9440  jccb(Assembler::notZero, BYTES_NOT_EQUAL);//mismatch found
9441  decq(length);
9442  jccb(Assembler::zero, SAME_TILL_END);
9443  incq(result);
9444  load_unsigned_byte(tmp1, Address(obja, result));
9445  load_unsigned_byte(tmp2, Address(objb, result));
9446  xorl(tmp1, tmp2);
9447  testl(tmp1, tmp1);
9448  jccb(Assembler::notZero, BYTES_NOT_EQUAL);//mismatch found
9449  decq(length);
9450  jccb(Assembler::zero, SAME_TILL_END);
9451  incq(result);
9452  load_unsigned_byte(tmp1, Address(obja, result));
9453  load_unsigned_byte(tmp2, Address(objb, result));
9454  xorl(tmp1, tmp2);
9455  testl(tmp1, tmp1);
9456  jccb(Assembler::notZero, BYTES_NOT_EQUAL);//mismatch found
9457  jmpb(SAME_TILL_END);
9458
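  // A mismatch was found in a vector chunk: build a byte-equality mask with
  // (v)pcmpeqb, invert it, extract it with (v)pmovmskb, and use bsf to find the
  // first differing byte; that offset is added to result, which is then shifted
  // right by the scale to yield an element index.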
9459  if (UseAVX >= 2){
9460    bind(VECTOR32_NOT_EQUAL);
9461    vpcmpeqb(rymm2, rymm2, rymm2, Assembler::AVX_256bit);
9462    vpcmpeqb(rymm0, rymm0, rymm1, Assembler::AVX_256bit);
9463    vpxor(rymm0, rymm0, rymm2, Assembler::AVX_256bit);
9464    vpmovmskb(tmp1, rymm0);
9465    bsfq(tmp1, tmp1);
9466    addq(result, tmp1);
9467    shrq(result);
9468    jmpb(DONE);
9469  }
9470
9471  bind(VECTOR16_NOT_EQUAL);
9472  if (UseAVX >= 2){
9473    vpcmpeqb(rymm2, rymm2, rymm2, Assembler::AVX_128bit);
9474    vpcmpeqb(rymm0, rymm0, rymm1, Assembler::AVX_128bit);
9475    pxor(rymm0, rymm2);
9476  } else {
9477    pcmpeqb(rymm2, rymm2);
9478    pxor(rymm0, rymm1);
9479    pcmpeqb(rymm0, rymm1);
9480    pxor(rymm0, rymm2);
9481  }
9482  pmovmskb(tmp1, rymm0);
9483  bsfq(tmp1, tmp1);
9484  addq(result, tmp1);
9485  shrq(result);
9486  jmpb(DONE);
9487
9488  bind(VECTOR8_NOT_EQUAL);
9489  bind(VECTOR4_NOT_EQUAL);
9490  bsfq(tmp1, tmp1);
9491  shrq(tmp1, 3);
9492  addq(result, tmp1);
9493  bind(BYTES_NOT_EQUAL);
9494  shrq(result);
9495  jmpb(DONE);
9496
9497  bind(SAME_TILL_END);
9498  mov64(result, -1);
9499
9500  bind(DONE);
9501}
9502
9503
9504//Helper functions for square_to_len()
9505
9506/**
9507 * Store the squares of x[], right shifted one bit (divided by 2) into z[]
9508 * Preserves x and z and modifies rest of the registers.
9509 */
9510void MacroAssembler::square_rshift(Register x, Register xlen, Register z, Register tmp1, Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
9511  // Perform square and right shift by 1
9512  // Handle odd xlen case first, then for even xlen do the following
9513  // jlong carry = 0;
9514  // for (int j=0, i=0; j < xlen; j+=2, i+=4) {
9515  //     huge_128 product = x[j:j+1] * x[j:j+1];
9516  //     z[i:i+1] = (carry << 63) | (jlong)(product >>> 65);
9517  //     z[i+2:i+3] = (jlong)(product >>> 1);
9518  //     carry = (jlong)product;
9519  // }
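  //
  // The diagonal squares are stored pre-halved because square_to_len() later
  // doubles the entire result (lshift_by_1) once the off-diagonal products have
  // been added in; the dropped low bit is restored from x[len-1] & 1 at the end.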
9520
9521  xorq(tmp5, tmp5);     // carry
9522  xorq(rdxReg, rdxReg);
9523  xorl(tmp1, tmp1);     // index for x
9524  xorl(tmp4, tmp4);     // index for z
9525
9526  Label L_first_loop, L_first_loop_exit;
9527
9528  testl(xlen, 1);
9529  jccb(Assembler::zero, L_first_loop); //jump if xlen is even
9530
9531  // Square and right shift by 1 the odd element using 32 bit multiply
9532  movl(raxReg, Address(x, tmp1, Address::times_4, 0));
9533  imulq(raxReg, raxReg);
9534  shrq(raxReg, 1);
9535  adcq(tmp5, 0);
9536  movq(Address(z, tmp4, Address::times_4, 0), raxReg);
9537  incrementl(tmp1);
9538  addl(tmp4, 2);
9539
9540  // Square and  right shift by 1 the rest using 64 bit multiply
9541  bind(L_first_loop);
9542  cmpptr(tmp1, xlen);
9543  jccb(Assembler::equal, L_first_loop_exit);
9544
9545  // Square
9546  movq(raxReg, Address(x, tmp1, Address::times_4,  0));
9547  rorq(raxReg, 32);    // convert big-endian to little-endian
9548  mulq(raxReg);        // 64-bit multiply rax * rax -> rdx:rax
9549
9550  // Right shift by 1 and save carry
9551  shrq(tmp5, 1);       // rdx:rax:tmp5 = (tmp5:rdx:rax) >>> 1
9552  rcrq(rdxReg, 1);
9553  rcrq(raxReg, 1);
9554  adcq(tmp5, 0);
9555
9556  // Store result in z
9557  movq(Address(z, tmp4, Address::times_4, 0), rdxReg);
9558  movq(Address(z, tmp4, Address::times_4, 8), raxReg);
9559
9560  // Update indices for x and z
9561  addl(tmp1, 2);
9562  addl(tmp4, 4);
9563  jmp(L_first_loop);
9564
9565  bind(L_first_loop_exit);
9566}
9567
9568
9569/**
9570 * Perform the following multiply add operation using BMI2 instructions
9571 * carry:sum = sum + op1*op2 + carry
9572 * op2 should be in rdx
9573 * op2 is preserved, all other registers are modified
9574 */
9575void MacroAssembler::multiply_add_64_bmi2(Register sum, Register op1, Register op2, Register carry, Register tmp2) {
9576  // assert op2 is rdx
9577  mulxq(tmp2, op1, op1);  //  op1 * op2 -> tmp2:op1
9578  addq(sum, carry);
9579  adcq(tmp2, 0);
9580  addq(sum, op1);
9581  adcq(tmp2, 0);
9582  movq(carry, tmp2);
9583}
9584
9585/**
9586 * Perform the following multiply add operation:
9587 * carry:sum = sum + op1*op2 + carry
9588 * Preserves op1, op2 and modifies rest of registers
9589 */
9590void MacroAssembler::multiply_add_64(Register sum, Register op1, Register op2, Register carry, Register rdxReg, Register raxReg) {
9591  // rdx:rax = op1 * op2
9592  movq(raxReg, op2);
9593  mulq(op1);
9594
9595  //  rdx:rax = sum + carry + rdx:rax
9596  addq(sum, carry);
9597  adcq(rdxReg, 0);
9598  addq(sum, raxReg);
9599  adcq(rdxReg, 0);
9600
9601  // carry:sum = rdx:sum
9602  movq(carry, rdxReg);
9603}
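
// Both multiply_add_64 variants above implement the same update; an
// illustrative sketch with a 128-bit helper type (not emitted code):
//   huge_128 t = (huge_128)op1 * op2 + sum + carry;
//   sum   = (jlong)t;
//   carry = (jlong)(t >>> 64);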
9604
9605/**
9606 * Add 64 bit long carry into z[] with carry propagation.
9607 * Preserves z and carry register values and modifies rest of registers.
9608 *
9609 */
9610void MacroAssembler::add_one_64(Register z, Register zlen, Register carry, Register tmp1) {
9611  Label L_fourth_loop, L_fourth_loop_exit;
9612
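  // Sketch of the code below (z[] is 32-bit words, most significant first):
  //   int i = zlen - 2;
  //   z[i:i+1] += carry;                    // 64-bit add
  //   while (the add carried out && (i -= 2) >= 0) {
  //     z[i:i+1] += 1;                      // propagate into the next more significant word pair
  //   }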
9613  movl(tmp1, 1);
9614  subl(zlen, 2);
9615  addq(Address(z, zlen, Address::times_4, 0), carry);
9616
9617  bind(L_fourth_loop);
9618  jccb(Assembler::carryClear, L_fourth_loop_exit);
9619  subl(zlen, 2);
9620  jccb(Assembler::negative, L_fourth_loop_exit);
9621  addq(Address(z, zlen, Address::times_4, 0), tmp1);
9622  jmp(L_fourth_loop);
9623  bind(L_fourth_loop_exit);
9624}
9625
9626/**
9627 * Shift z[] left by 1 bit.
9628 * Preserves x, len, z and zlen registers and modifies rest of the registers.
9629 *
9630 */
9631void MacroAssembler::lshift_by_1(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2, Register tmp3, Register tmp4) {
9632
9633  Label L_fifth_loop, L_fifth_loop_exit;
9634
9635  // Fifth loop
9636  // Perform primitiveLeftShift(z, zlen, 1)
9637
9638  const Register prev_carry = tmp1;
9639  const Register new_carry = tmp4;
9640  const Register value = tmp2;
9641  const Register zidx = tmp3;
9642
9643  // int zidx, carry;
9644  // long value;
9645  // carry = 0;
9646  // for (zidx = zlen-2; zidx >=0; zidx -= 2) {
9647  //    (carry:value)  = (z[i] << 1) | carry ;
9648  //    z[i] = value;
9649  // }
9650
9651  movl(zidx, zlen);
9652  xorl(prev_carry, prev_carry); // clear carry flag and prev_carry register
9653
9654  bind(L_fifth_loop);
9655  decl(zidx);  // Use decl to preserve carry flag
9656  decl(zidx);
9657  jccb(Assembler::negative, L_fifth_loop_exit);
9658
9659  if (UseBMI2Instructions) {
9660     movq(value, Address(z, zidx, Address::times_4, 0));
9661     rclq(value, 1);
9662     rorxq(value, value, 32);
9663     movq(Address(z, zidx, Address::times_4,  0), value);  // Store back in big endian form
9664  }
9665  else {
9666    // clear new_carry
9667    xorl(new_carry, new_carry);
9668
9669    // Shift z[i] by 1, or in previous carry and save new carry
9670    movq(value, Address(z, zidx, Address::times_4, 0));
9671    shlq(value, 1);
9672    adcl(new_carry, 0);
9673
9674    orq(value, prev_carry);
9675    rorq(value, 0x20);
9676    movq(Address(z, zidx, Address::times_4,  0), value);  // Store back in big endian form
9677
9678    // Set previous carry = new carry
9679    movl(prev_carry, new_carry);
9680  }
9681  jmp(L_fifth_loop);
9682
9683  bind(L_fifth_loop_exit);
9684}
9685
9686
9687/**
9688 * Code for BigInteger::squareToLen() intrinsic
9689 *
9690 * rdi: x
9691 * rsi: len
9692 * r8:  z
9693 * rcx: zlen
9694 * r12: tmp1
9695 * r13: tmp2
9696 * r14: tmp3
9697 * r15: tmp4
9698 * rbx: tmp5
9699 *
9700 */
9701void MacroAssembler::square_to_len(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
9702
9703  Label L_second_loop, L_second_loop_exit, L_third_loop, L_third_loop_exit, fifth_loop, fifth_loop_exit, L_last_x, L_multiply;
9704  push(tmp1);
9705  push(tmp2);
9706  push(tmp3);
9707  push(tmp4);
9708  push(tmp5);
9709
9710  // First loop
9711  // Store the squares, right shifted one bit (i.e., divided by 2).
9712  square_rshift(x, len, z, tmp1, tmp3, tmp4, tmp5, rdxReg, raxReg);
9713
9714  // Add in off-diagonal sums.
9715  //
9716  // Second, third (nested) and fourth loops.
9717  // zlen +=2;
9718  // for (int xidx=len-2,zidx=zlen-4; xidx > 0; xidx-=2,zidx-=4) {
9719  //    carry = 0;
9720  //    long op2 = x[xidx:xidx+1];
9721  //    for (int j=xidx-2,k=zidx; j >= 0; j-=2) {
9722  //       k -= 2;
9723  //       long op1 = x[j:j+1];
9724  //       long sum = z[k:k+1];
9725  //       carry:sum = multiply_add_64(sum, op1, op2, carry, tmp_regs);
9726  //       z[k:k+1] = sum;
9727  //    }
9728  //    add_one_64(z, k, carry, tmp_regs);
9729  // }
9730
9731  const Register carry = tmp5;
9732  const Register sum = tmp3;
9733  const Register op1 = tmp4;
9734  Register op2 = tmp2;
9735
9736  push(zlen);
9737  push(len);
9738  addl(zlen,2);
9739  bind(L_second_loop);
9740  xorq(carry, carry);
9741  subl(zlen, 4);
9742  subl(len, 2);
9743  push(zlen);
9744  push(len);
9745  cmpl(len, 0);
9746  jccb(Assembler::lessEqual, L_second_loop_exit);
9747
9748  // Multiply an array by one 64 bit long.
9749  if (UseBMI2Instructions) {
9750    op2 = rdxReg;
9751    movq(op2, Address(x, len, Address::times_4,  0));
9752    rorxq(op2, op2, 32);
9753  }
9754  else {
9755    movq(op2, Address(x, len, Address::times_4,  0));
9756    rorq(op2, 32);
9757  }
9758
9759  bind(L_third_loop);
9760  decrementl(len);
9761  jccb(Assembler::negative, L_third_loop_exit);
9762  decrementl(len);
9763  jccb(Assembler::negative, L_last_x);
9764
9765  movq(op1, Address(x, len, Address::times_4,  0));
9766  rorq(op1, 32);
9767
9768  bind(L_multiply);
9769  subl(zlen, 2);
9770  movq(sum, Address(z, zlen, Address::times_4,  0));
9771
9772  // Multiply 64 bit by 64 bit and add 64 bits lower half and upper 64 bits as carry.
9773  if (UseBMI2Instructions) {
9774    multiply_add_64_bmi2(sum, op1, op2, carry, tmp2);
9775  }
9776  else {
9777    multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
9778  }
9779
9780  movq(Address(z, zlen, Address::times_4, 0), sum);
9781
9782  jmp(L_third_loop);
9783  bind(L_third_loop_exit);
9784
9785  // Fourth loop
9786  // Add 64 bit long carry into z with carry propagation.
9787  // Uses the offset-adjusted zlen.
9788  add_one_64(z, zlen, carry, tmp1);
9789
9790  pop(len);
9791  pop(zlen);
9792  jmp(L_second_loop);
9793
9794  // The following infrequently executed code is moved outside of the loops.
9795  bind(L_last_x);
9796  movl(op1, Address(x, 0));
9797  jmp(L_multiply);
9798
9799  bind(L_second_loop_exit);
9800  pop(len);
9801  pop(zlen);
9802  pop(len);
9803  pop(zlen);
9804
9805  // Fifth loop
9806  // Shift z left 1 bit.
9807  lshift_by_1(x, len, z, zlen, tmp1, tmp2, tmp3, tmp4);
9808
9809  // z[zlen-1] |= x[len-1] & 1;
9810  movl(tmp3, Address(x, len, Address::times_4, -4));
9811  andl(tmp3, 1);
9812  orl(Address(z, zlen, Address::times_4,  -4), tmp3);
9813
9814  pop(tmp5);
9815  pop(tmp4);
9816  pop(tmp3);
9817  pop(tmp2);
9818  pop(tmp1);
9819}
9820
9821/**
9822 * Helper function for mul_add()
9823 * Multiply in[] by int k and add the result to out[] starting at offset offs, using
9824 * a 128 bit by 32 bit multiply, and return the carry in tmp5.
9825 * Only the quad-int-aligned portion of in[] is processed by this function.
9826 * k is in rdxReg when BMI2 instructions are used; otherwise it is in tmp2.
9827 * This function preserves the out, in and k registers.
9828 * len and offset point to the appropriate indices in "in" and "out" respectively.
9829 * tmp5 holds the carry.
9830 * The other registers are temporaries and are modified.
9831 *
9832 */
9833void MacroAssembler::mul_add_128_x_32_loop(Register out, Register in,
9834  Register offset, Register len, Register tmp1, Register tmp2, Register tmp3,
9835  Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
9836
9837  Label L_first_loop, L_first_loop_exit;
9838
9839  movl(tmp1, len);
9840  shrl(tmp1, 2);
9841
9842  bind(L_first_loop);
9843  subl(tmp1, 1);
9844  jccb(Assembler::negative, L_first_loop_exit);
9845
9846  subl(len, 4);
9847  subl(offset, 4);
9848
9849  Register op2 = tmp2;
9850  const Register sum = tmp3;
9851  const Register op1 = tmp4;
9852  const Register carry = tmp5;
9853
9854  if (UseBMI2Instructions) {
9855    op2 = rdxReg;
9856  }
9857
9858  movq(op1, Address(in, len, Address::times_4,  8));
9859  rorq(op1, 32);
9860  movq(sum, Address(out, offset, Address::times_4,  8));
9861  rorq(sum, 32);
9862  if (UseBMI2Instructions) {
9863    multiply_add_64_bmi2(sum, op1, op2, carry, raxReg);
9864  }
9865  else {
9866    multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
9867  }
9868  // Store back in big endian from little endian
9869  rorq(sum, 0x20);
9870  movq(Address(out, offset, Address::times_4,  8), sum);
9871
9872  movq(op1, Address(in, len, Address::times_4,  0));
9873  rorq(op1, 32);
9874  movq(sum, Address(out, offset, Address::times_4,  0));
9875  rorq(sum, 32);
9876  if (UseBMI2Instructions) {
9877    multiply_add_64_bmi2(sum, op1, op2, carry, raxReg);
9878  }
9879  else {
9880    multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
9881  }
9882  // Store back in big endian from little endian
9883  rorq(sum, 0x20);
9884  movq(Address(out, offset, Address::times_4,  0), sum);
9885
9886  jmp(L_first_loop);
9887  bind(L_first_loop_exit);
9888}
9889
9890/**
9891 * Code for BigInteger::mulAdd() intrinsic
9892 *
9893 * rdi: out
9894 * rsi: in
9895 * r11: offs (out.length - offset)
9896 * rcx: len
9897 * r8:  k
9898 * r12: tmp1
9899 * r13: tmp2
9900 * r14: tmp3
9901 * r15: tmp4
9902 * rbx: tmp5
9903 * Multiply the in[] by word k and add to out[], return the carry in rax
9904 */
9905void MacroAssembler::mul_add(Register out, Register in, Register offs,
9906   Register len, Register k, Register tmp1, Register tmp2, Register tmp3,
9907   Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
9908
9909  Label L_carry, L_last_in, L_done;
9910
9911// carry = 0;
9912// for (int j=len-1; j >= 0; j--) {
9913//    long product = (in[j] & LONG_MASK) * kLong +
9914//                   (out[offs] & LONG_MASK) + carry;
9915//    out[offs--] = (int)product;
9916//    carry = product >>> 32;
9917// }
9918//
9919  push(tmp1);
9920  push(tmp2);
9921  push(tmp3);
9922  push(tmp4);
9923  push(tmp5);
9924
9925  Register op2 = tmp2;
9926  const Register sum = tmp3;
9927  const Register op1 = tmp4;
9928  const Register carry =  tmp5;
9929
9930  if (UseBMI2Instructions) {
9931    op2 = rdxReg;
9932    movl(op2, k);
9933  }
9934  else {
9935    movl(op2, k);
9936  }
9937
9938  xorq(carry, carry);
9939
9940  //First loop
9941
9942  //Multiply in[] by k in a 4 way unrolled loop using 128 bit by 32 bit multiply
9943  //The carry is in tmp5
9944  mul_add_128_x_32_loop(out, in, offs, len, tmp1, tmp2, tmp3, tmp4, tmp5, rdxReg, raxReg);
9945
9946  //Multiply the trailing in[] entry using 64 bit by 32 bit, if any
9947  decrementl(len);
9948  jccb(Assembler::negative, L_carry);
9949  decrementl(len);
9950  jccb(Assembler::negative, L_last_in);
9951
9952  movq(op1, Address(in, len, Address::times_4,  0));
9953  rorq(op1, 32);
9954
9955  subl(offs, 2);
9956  movq(sum, Address(out, offs, Address::times_4,  0));
9957  rorq(sum, 32);
9958
9959  if (UseBMI2Instructions) {
9960    multiply_add_64_bmi2(sum, op1, op2, carry, raxReg);
9961  }
9962  else {
9963    multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
9964  }
9965
9966  // Store back in big endian from little endian
9967  rorq(sum, 0x20);
9968  movq(Address(out, offs, Address::times_4,  0), sum);
9969
9970  testl(len, len);
9971  jccb(Assembler::zero, L_carry);
9972
9973  //Multiply the last in[] entry, if any
9974  bind(L_last_in);
9975  movl(op1, Address(in, 0));
9976  movl(sum, Address(out, offs, Address::times_4,  -4));
9977
9978  movl(raxReg, k);
9979  mull(op1); //tmp4 * eax -> edx:eax
9980  addl(sum, carry);
9981  adcl(rdxReg, 0);
9982  addl(sum, raxReg);
9983  adcl(rdxReg, 0);
9984  movl(carry, rdxReg);
9985
9986  movl(Address(out, offs, Address::times_4,  -4), sum);
9987
9988  bind(L_carry);
9989  //return tmp5/carry as carry in rax
9990  movl(rax, carry);
9991
9992  bind(L_done);
9993  pop(tmp5);
9994  pop(tmp4);
9995  pop(tmp3);
9996  pop(tmp2);
9997  pop(tmp1);
9998}
9999#endif
10000
10001/**
10002 * Emits code to update CRC-32 with a byte value according to constants in table
10003 *
10004 * @param [in,out]crc   Register containing the crc.
10005 * @param [in]val       Register containing the byte to fold into the CRC.
10006 * @param [in]table     Register containing the table of crc constants.
10007 *
10008 * uint32_t crc;
10009 * val = crc_table[(val ^ crc) & 0xFF];
10010 * crc = val ^ (crc >> 8);
10011 *
10012 */
10013void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
10014  xorl(val, crc);
10015  andl(val, 0xFF);
10016  shrl(crc, 8); // unsigned shift
10017  xorl(crc, Address(table, val, Address::times_4, 0));
10018}
10019
10020/**
10021 * Fold 128-bit data chunk
10022 */
10023void MacroAssembler::fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, Register buf, int offset) {
10024  if (UseAVX > 0) {
10025    vpclmulhdq(xtmp, xK, xcrc); // [123:64]
10026    vpclmulldq(xcrc, xK, xcrc); // [63:0]
10027    vpxor(xcrc, xcrc, Address(buf, offset), 0 /* vector_len */);
10028    pxor(xcrc, xtmp);
10029  } else {
10030    movdqa(xtmp, xcrc);
10031    pclmulhdq(xtmp, xK);   // [123:64]
10032    pclmulldq(xcrc, xK);   // [63:0]
10033    pxor(xcrc, xtmp);
10034    movdqu(xtmp, Address(buf, offset));
10035    pxor(xcrc, xtmp);
10036  }
10037}
10038
10039void MacroAssembler::fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, XMMRegister xbuf) {
10040  if (UseAVX > 0) {
10041    vpclmulhdq(xtmp, xK, xcrc);
10042    vpclmulldq(xcrc, xK, xcrc);
10043    pxor(xcrc, xbuf);
10044    pxor(xcrc, xtmp);
10045  } else {
10046    movdqa(xtmp, xcrc);
10047    pclmulhdq(xtmp, xK);
10048    pclmulldq(xcrc, xK);
10049    pxor(xcrc, xbuf);
10050    pxor(xcrc, xtmp);
10051  }
10052}
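
// In the two folds above, xK holds a pair of precomputed folding constants
// (powers of x modulo the CRC polynomial, as in the usual PCLMULQDQ-based
// folding scheme): the low and high halves of the 128-bit CRC state are
// carry-less multiplied by them and XORed into the next 128 bits of input,
// which advances the state without changing the final CRC.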
10053
10054/**
10055 * 8-bit folds to compute 32-bit CRC
10056 *
10057 * uint64_t xcrc;
10058 * timesXtoThe32[xcrc & 0xFF] ^ (xcrc >> 8);
10059 */
10060void MacroAssembler::fold_8bit_crc32(XMMRegister xcrc, Register table, XMMRegister xtmp, Register tmp) {
10061  movdl(tmp, xcrc);
10062  andl(tmp, 0xFF);
10063  movdl(xtmp, Address(table, tmp, Address::times_4, 0));
10064  psrldq(xcrc, 1); // unsigned shift one byte
10065  pxor(xcrc, xtmp);
10066}
10067
10068/**
10069 * uint32_t crc;
10070 * timesXtoThe32[crc & 0xFF] ^ (crc >> 8);
10071 */
10072void MacroAssembler::fold_8bit_crc32(Register crc, Register table, Register tmp) {
10073  movl(tmp, crc);
10074  andl(tmp, 0xFF);
10075  shrl(crc, 8);
10076  xorl(crc, Address(table, tmp, Address::times_4, 0));
10077}
10078
10079/**
10080 * @param crc   register containing existing CRC (32-bit)
10081 * @param buf   register pointing to input byte buffer (byte*)
10082 * @param len   register containing number of bytes
10083 * @param table register that will contain address of CRC table
10084 * @param tmp   scratch register
10085 */
10086void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len, Register table, Register tmp) {
10087  assert_different_registers(crc, buf, len, table, tmp, rax);
10088
10089  Label L_tail, L_tail_restore, L_tail_loop, L_exit, L_align_loop, L_aligned;
10090  Label L_fold_tail, L_fold_128b, L_fold_512b, L_fold_512b_loop, L_fold_tail_loop;
10091
10092  // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
10093  // context for the registers used, where all instructions below are using 128-bit mode
10094  // On EVEX without VL and BW, these instructions will all be AVX.
10095  if (VM_Version::supports_avx512vlbw()) {
10096    movl(tmp, 0xffff);
10097    kmovwl(k1, tmp);
10098  }
10099
10100  lea(table, ExternalAddress(StubRoutines::crc_table_addr()));
10101  notl(crc); // ~crc
10102  cmpl(len, 16);
10103  jcc(Assembler::less, L_tail);
10104
10105  // Align buffer to 16 bytes
10106  movl(tmp, buf);
10107  andl(tmp, 0xF);
10108  jccb(Assembler::zero, L_aligned);
10109  subl(tmp,  16);
10110  addl(len, tmp);
10111
10112  align(4);
10113  BIND(L_align_loop);
10114  movsbl(rax, Address(buf, 0)); // load byte with sign extension
10115  update_byte_crc32(crc, rax, table);
10116  increment(buf);
10117  incrementl(tmp);
10118  jccb(Assembler::less, L_align_loop);
10119
10120  BIND(L_aligned);
10121  movl(tmp, len); // save
10122  shrl(len, 4);
10123  jcc(Assembler::zero, L_tail_restore);
10124
10125  // Fold crc into first bytes of vector
10126  movdqa(xmm1, Address(buf, 0));
10127  movdl(rax, xmm1);
10128  xorl(crc, rax);
10129  pinsrd(xmm1, crc, 0);
10130  addptr(buf, 16);
10131  subl(len, 4); // len > 0
10132  jcc(Assembler::less, L_fold_tail);
10133
10134  movdqa(xmm2, Address(buf,  0));
10135  movdqa(xmm3, Address(buf, 16));
10136  movdqa(xmm4, Address(buf, 32));
10137  addptr(buf, 48);
10138  subl(len, 3);
10139  jcc(Assembler::lessEqual, L_fold_512b);
10140
10141  // Fold total 512 bits of polynomial on each iteration,
10142  // 128 bits per each of 4 parallel streams.
10143  movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 32));
10144
10145  align(32);
10146  BIND(L_fold_512b_loop);
10147  fold_128bit_crc32(xmm1, xmm0, xmm5, buf,  0);
10148  fold_128bit_crc32(xmm2, xmm0, xmm5, buf, 16);
10149  fold_128bit_crc32(xmm3, xmm0, xmm5, buf, 32);
10150  fold_128bit_crc32(xmm4, xmm0, xmm5, buf, 48);
10151  addptr(buf, 64);
10152  subl(len, 4);
10153  jcc(Assembler::greater, L_fold_512b_loop);
10154
10155  // Fold 512 bits to 128 bits.
10156  BIND(L_fold_512b);
10157  movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 16));
10158  fold_128bit_crc32(xmm1, xmm0, xmm5, xmm2);
10159  fold_128bit_crc32(xmm1, xmm0, xmm5, xmm3);
10160  fold_128bit_crc32(xmm1, xmm0, xmm5, xmm4);
10161
10162  // Fold the rest of 128 bits data chunks
10163  BIND(L_fold_tail);
10164  addl(len, 3);
10165  jccb(Assembler::lessEqual, L_fold_128b);
10166  movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 16));
10167
10168  BIND(L_fold_tail_loop);
10169  fold_128bit_crc32(xmm1, xmm0, xmm5, buf,  0);
10170  addptr(buf, 16);
10171  decrementl(len);
10172  jccb(Assembler::greater, L_fold_tail_loop);
10173
10174  // Fold 128 bits in xmm1 down into 32 bits in crc register.
10175  BIND(L_fold_128b);
10176  movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr()));
10177  if (UseAVX > 0) {
10178    vpclmulqdq(xmm2, xmm0, xmm1, 0x1);
10179    vpand(xmm3, xmm0, xmm2, 0 /* vector_len */);
10180    vpclmulqdq(xmm0, xmm0, xmm3, 0x1);
10181  } else {
10182    movdqa(xmm2, xmm0);
10183    pclmulqdq(xmm2, xmm1, 0x1);
10184    movdqa(xmm3, xmm0);
10185    pand(xmm3, xmm2);
10186    pclmulqdq(xmm0, xmm3, 0x1);
10187  }
10188  psrldq(xmm1, 8);
10189  psrldq(xmm2, 4);
10190  pxor(xmm0, xmm1);
10191  pxor(xmm0, xmm2);
10192
10193  // 8 8-bit folds to compute 32-bit CRC.
10194  for (int j = 0; j < 4; j++) {
10195    fold_8bit_crc32(xmm0, table, xmm1, rax);
10196  }
10197  movdl(crc, xmm0); // mov 32 bits to general register
10198  for (int j = 0; j < 4; j++) {
10199    fold_8bit_crc32(crc, table, rax);
10200  }
10201
10202  BIND(L_tail_restore);
10203  movl(len, tmp); // restore
10204  BIND(L_tail);
10205  andl(len, 0xf);
10206  jccb(Assembler::zero, L_exit);
10207
10208  // Fold the rest of bytes
10209  align(4);
10210  BIND(L_tail_loop);
10211  movsbl(rax, Address(buf, 0)); // load byte with sign extension
10212  update_byte_crc32(crc, rax, table);
10213  increment(buf);
10214  decrementl(len);
10215  jccb(Assembler::greater, L_tail_loop);
10216
10217  BIND(L_exit);
10218  notl(crc); // ~c
10219}
10220
10221#ifdef _LP64
10222// S. Gueron / Information Processing Letters 112 (2012) 184
10223// Algorithm 4: Computing carry-less multiplication using a precomputed lookup table.
10224// Input: A 32 bit value B = [byte3, byte2, byte1, byte0].
10225// Output: the 64-bit carry-less product of B * CONST
10226void MacroAssembler::crc32c_ipl_alg4(Register in, uint32_t n,
10227                                     Register tmp1, Register tmp2, Register tmp3) {
10228  lea(tmp3, ExternalAddress(StubRoutines::crc32c_table_addr()));
10229  if (n > 0) {
10230    addq(tmp3, n * 256 * 8);
10231  }
10232  //    Q1 = TABLEExt[n][B & 0xFF];
10233  movl(tmp1, in);
10234  andl(tmp1, 0x000000FF);
10235  shll(tmp1, 3);
10236  addq(tmp1, tmp3);
10237  movq(tmp1, Address(tmp1, 0));
10238
10239  //    Q2 = TABLEExt[n][B >> 8 & 0xFF];
10240  movl(tmp2, in);
10241  shrl(tmp2, 8);
10242  andl(tmp2, 0x000000FF);
10243  shll(tmp2, 3);
10244  addq(tmp2, tmp3);
10245  movq(tmp2, Address(tmp2, 0));
10246
10247  shlq(tmp2, 8);
10248  xorq(tmp1, tmp2);
10249
10250  //    Q3 = TABLEExt[n][B >> 16 & 0xFF];
10251  movl(tmp2, in);
10252  shrl(tmp2, 16);
10253  andl(tmp2, 0x000000FF);
10254  shll(tmp2, 3);
10255  addq(tmp2, tmp3);
10256  movq(tmp2, Address(tmp2, 0));
10257
10258  shlq(tmp2, 16);
10259  xorq(tmp1, tmp2);
10260
10261  //    Q4 = TABLEExt[n][B >> 24 & 0xFF];
10262  shrl(in, 24);
10263  andl(in, 0x000000FF);
10264  shll(in, 3);
10265  addq(in, tmp3);
10266  movq(in, Address(in, 0));
10267
10268  shlq(in, 24);
10269  xorq(in, tmp1);
10270  //    return Q1 ^ Q2 << 8 ^ Q3 << 16 ^ Q4 << 24;
10271}
10272
10273void MacroAssembler::crc32c_pclmulqdq(XMMRegister w_xtmp1,
10274                                      Register in_out,
10275                                      uint32_t const_or_pre_comp_const_index, bool is_pclmulqdq_supported,
10276                                      XMMRegister w_xtmp2,
10277                                      Register tmp1,
10278                                      Register n_tmp2, Register n_tmp3) {
10279  if (is_pclmulqdq_supported) {
10280    movdl(w_xtmp1, in_out); // modified blindly
10281
10282    movl(tmp1, const_or_pre_comp_const_index);
10283    movdl(w_xtmp2, tmp1);
10284    pclmulqdq(w_xtmp1, w_xtmp2, 0);
10285
10286    movdq(in_out, w_xtmp1);
10287  } else {
10288    crc32c_ipl_alg4(in_out, const_or_pre_comp_const_index, tmp1, n_tmp2, n_tmp3);
10289  }
10290}
10291
10292// Recombination Alternative 2: No bit-reflections
10293// T1 = (CRC_A * U1) << 1
10294// T2 = (CRC_B * U2) << 1
10295// C1 = T1 >> 32
10296// C2 = T2 >> 32
10297// T1 = T1 & 0xFFFFFFFF
10298// T2 = T2 & 0xFFFFFFFF
10299// T1 = CRC32(0, T1)
10300// T2 = CRC32(0, T2)
10301// C1 = C1 ^ T1
10302// C2 = C2 ^ T2
10303// CRC = C1 ^ C2 ^ CRC_C
10304void MacroAssembler::crc32c_rec_alt2(uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported, Register in_out, Register in1, Register in2,
10305                                     XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
10306                                     Register tmp1, Register tmp2,
10307                                     Register n_tmp3) {
10308  crc32c_pclmulqdq(w_xtmp1, in_out, const_or_pre_comp_const_index_u1, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3);
10309  crc32c_pclmulqdq(w_xtmp2, in1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3);
10310  shlq(in_out, 1);
10311  movl(tmp1, in_out);
10312  shrq(in_out, 32);
10313  xorl(tmp2, tmp2);
10314  crc32(tmp2, tmp1, 4);
10315  xorl(in_out, tmp2); // we don't care about upper 32 bit contents here
10316  shlq(in1, 1);
10317  movl(tmp1, in1);
10318  shrq(in1, 32);
10319  xorl(tmp2, tmp2);
10320  crc32(tmp2, tmp1, 4);
10321  xorl(in1, tmp2);
10322  xorl(in_out, in1);
10323  xorl(in_out, in2);
10324}
10325
10326// Set N to a predefined value
10327// Subtract from the length of the buffer
10328// Execute in a loop:
10329// CRC_A = 0xFFFFFFFF, CRC_B = 0, CRC_C = 0
10330// for i = 1 to N do
10331//  CRC_A = CRC32(CRC_A, A[i])
10332//  CRC_B = CRC32(CRC_B, B[i])
10333//  CRC_C = CRC32(CRC_C, C[i])
10334// end for
10335// Recombine
10336void MacroAssembler::crc32c_proc_chunk(uint32_t size, uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported,
10337                                       Register in_out1, Register in_out2, Register in_out3,
10338                                       Register tmp1, Register tmp2, Register tmp3,
10339                                       XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
10340                                       Register tmp4, Register tmp5,
10341                                       Register n_tmp6) {
10342  Label L_processPartitions;
10343  Label L_processPartition;
10344  Label L_exit;
10345
10346  bind(L_processPartitions);
10347  cmpl(in_out1, 3 * size);
10348  jcc(Assembler::less, L_exit);
10349    xorl(tmp1, tmp1);
10350    xorl(tmp2, tmp2);
10351    movq(tmp3, in_out2);
10352    addq(tmp3, size);
10353
10354    bind(L_processPartition);
10355      crc32(in_out3, Address(in_out2, 0), 8);
10356      crc32(tmp1, Address(in_out2, size), 8);
10357      crc32(tmp2, Address(in_out2, size * 2), 8);
10358      addq(in_out2, 8);
10359      cmpq(in_out2, tmp3);
10360      jcc(Assembler::less, L_processPartition);
10361    crc32c_rec_alt2(const_or_pre_comp_const_index_u1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, in_out3, tmp1, tmp2,
10362            w_xtmp1, w_xtmp2, w_xtmp3,
10363            tmp4, tmp5,
10364            n_tmp6);
10365    addq(in_out2, 2 * size);
10366    subl(in_out1, 3 * size);
10367    jmp(L_processPartitions);
10368
10369  bind(L_exit);
10370}
10371#else
10372void MacroAssembler::crc32c_ipl_alg4(Register in_out, uint32_t n,
10373                                     Register tmp1, Register tmp2, Register tmp3,
10374                                     XMMRegister xtmp1, XMMRegister xtmp2) {
10375  lea(tmp3, ExternalAddress(StubRoutines::crc32c_table_addr()));
10376  if (n > 0) {
10377    addl(tmp3, n * 256 * 8);
10378  }
10379  //    Q1 = TABLEExt[n][B & 0xFF];
10380  movl(tmp1, in_out);
10381  andl(tmp1, 0x000000FF);
10382  shll(tmp1, 3);
10383  addl(tmp1, tmp3);
10384  movq(xtmp1, Address(tmp1, 0));
10385
10386  //    Q2 = TABLEExt[n][B >> 8 & 0xFF];
10387  movl(tmp2, in_out);
10388  shrl(tmp2, 8);
10389  andl(tmp2, 0x000000FF);
10390  shll(tmp2, 3);
10391  addl(tmp2, tmp3);
10392  movq(xtmp2, Address(tmp2, 0));
10393
10394  psllq(xtmp2, 8);
10395  pxor(xtmp1, xtmp2);
10396
10397  //    Q3 = TABLEExt[n][B >> 16 & 0xFF];
10398  movl(tmp2, in_out);
10399  shrl(tmp2, 16);
10400  andl(tmp2, 0x000000FF);
10401  shll(tmp2, 3);
10402  addl(tmp2, tmp3);
10403  movq(xtmp2, Address(tmp2, 0));
10404
10405  psllq(xtmp2, 16);
10406  pxor(xtmp1, xtmp2);
10407
10408  //    Q4 = TABLEExt[n][B >> 24 & 0xFF];
10409  shrl(in_out, 24);
10410  andl(in_out, 0x000000FF);
10411  shll(in_out, 3);
10412  addl(in_out, tmp3);
10413  movq(xtmp2, Address(in_out, 0));
10414
10415  psllq(xtmp2, 24);
10416  pxor(xtmp1, xtmp2); // Result in CXMM
10417  //    return Q1 ^ Q2 << 8 ^ Q3 << 16 ^ Q4 << 24;
10418}
10419
10420void MacroAssembler::crc32c_pclmulqdq(XMMRegister w_xtmp1,
10421                                      Register in_out,
10422                                      uint32_t const_or_pre_comp_const_index, bool is_pclmulqdq_supported,
10423                                      XMMRegister w_xtmp2,
10424                                      Register tmp1,
10425                                      Register n_tmp2, Register n_tmp3) {
10426  if (is_pclmulqdq_supported) {
10427    movdl(w_xtmp1, in_out);
10428
10429    movl(tmp1, const_or_pre_comp_const_index);
10430    movdl(w_xtmp2, tmp1);
10431    pclmulqdq(w_xtmp1, w_xtmp2, 0);
10432    // Keep result in XMM since GPR is 32 bit in length
10433  } else {
10434    crc32c_ipl_alg4(in_out, const_or_pre_comp_const_index, tmp1, n_tmp2, n_tmp3, w_xtmp1, w_xtmp2);
10435  }
10436}
10437
10438void MacroAssembler::crc32c_rec_alt2(uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported, Register in_out, Register in1, Register in2,
10439                                     XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
10440                                     Register tmp1, Register tmp2,
10441                                     Register n_tmp3) {
10442  crc32c_pclmulqdq(w_xtmp1, in_out, const_or_pre_comp_const_index_u1, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3);
10443  crc32c_pclmulqdq(w_xtmp2, in1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3);
10444
10445  psllq(w_xtmp1, 1);
10446  movdl(tmp1, w_xtmp1);
10447  psrlq(w_xtmp1, 32);
10448  movdl(in_out, w_xtmp1);
10449
10450  xorl(tmp2, tmp2);
10451  crc32(tmp2, tmp1, 4);
10452  xorl(in_out, tmp2);
10453
10454  psllq(w_xtmp2, 1);
10455  movdl(tmp1, w_xtmp2);
10456  psrlq(w_xtmp2, 32);
10457  movdl(in1, w_xtmp2);
10458
10459  xorl(tmp2, tmp2);
10460  crc32(tmp2, tmp1, 4);
10461  xorl(in1, tmp2);
10462  xorl(in_out, in1);
10463  xorl(in_out, in2);
10464}
10465
10466void MacroAssembler::crc32c_proc_chunk(uint32_t size, uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported,
10467                                       Register in_out1, Register in_out2, Register in_out3,
10468                                       Register tmp1, Register tmp2, Register tmp3,
10469                                       XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
10470                                       Register tmp4, Register tmp5,
10471                                       Register n_tmp6) {
10472  Label L_processPartitions;
10473  Label L_processPartition;
10474  Label L_exit;
10475
10476  bind(L_processPartitions);
10477  cmpl(in_out1, 3 * size);
10478  jcc(Assembler::less, L_exit);
10479    xorl(tmp1, tmp1);
10480    xorl(tmp2, tmp2);
10481    movl(tmp3, in_out2);
10482    addl(tmp3, size);
10483
10484    bind(L_processPartition);
10485      crc32(in_out3, Address(in_out2, 0), 4);
10486      crc32(tmp1, Address(in_out2, size), 4);
10487      crc32(tmp2, Address(in_out2, size*2), 4);
10488      crc32(in_out3, Address(in_out2, 0+4), 4);
10489      crc32(tmp1, Address(in_out2, size+4), 4);
10490      crc32(tmp2, Address(in_out2, size*2+4), 4);
10491      addl(in_out2, 8);
10492      cmpl(in_out2, tmp3);
10493      jcc(Assembler::less, L_processPartition);
10494
10495        push(tmp3);
10496        push(in_out1);
10497        push(in_out2);
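        // tmp3, in_out1 and in_out2 were saved on the stack above, so their
        // registers can stand in for tmp4, tmp5 and n_tmp6 during the
        // recombination call below; they are restored afterwards.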
10498        tmp4 = tmp3;
10499        tmp5 = in_out1;
10500        n_tmp6 = in_out2;
10501
10502      crc32c_rec_alt2(const_or_pre_comp_const_index_u1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, in_out3, tmp1, tmp2,
10503            w_xtmp1, w_xtmp2, w_xtmp3,
10504            tmp4, tmp5,
10505            n_tmp6);
10506
10507        pop(in_out2);
10508        pop(in_out1);
10509        pop(tmp3);
10510
10511    addl(in_out2, 2 * size);
10512    subl(in_out1, 3 * size);
10513    jmp(L_processPartitions);
10514
10515  bind(L_exit);
10516}
10517#endif //LP64
10518
10519#ifdef _LP64
10520// Algorithm 2: Pipelined usage of the CRC32 instruction.
10521// Input: A buffer I of L bytes.
10522// Output: the CRC32C value of the buffer.
10523// Notations:
10524// Write L = 24N + r, with N = floor (L/24).
10525// r = L mod 24 (0 <= r < 24).
10526// Consider I as the concatenation of A|B|C|R, where A, B and C each consist of
10527// N quadwords, and R consists of r bytes.
10528// A[j] = I [8j+7:8j], j= 0, 1, ..., N-1
10529// B[j] = I [N + 8j+7:N + 8j], j= 0, 1, ..., N-1
10530// C[j] = I [2N + 8j+7:2N + 8j], j= 0, 1, ..., N-1
10531// if r > 0 R[j] = I [3N +j], j= 0, 1, ...,r-1
10532void MacroAssembler::crc32c_ipl_alg2_alt2(Register in_out, Register in1, Register in2,
10533                                          Register tmp1, Register tmp2, Register tmp3,
10534                                          Register tmp4, Register tmp5, Register tmp6,
10535                                          XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
10536                                          bool is_pclmulqdq_supported) {
10537  uint32_t const_or_pre_comp_const_index[CRC32C_NUM_PRECOMPUTED_CONSTANTS];
10538  Label L_wordByWord;
10539  Label L_byteByByteProlog;
10540  Label L_byteByByte;
10541  Label L_exit;
10542
10543  if (is_pclmulqdq_supported ) {
10544    const_or_pre_comp_const_index[1] = *(uint32_t *)StubRoutines::_crc32c_table_addr;
10545    const_or_pre_comp_const_index[0] = *((uint32_t *)StubRoutines::_crc32c_table_addr+1);
10546
10547    const_or_pre_comp_const_index[3] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 2);
10548    const_or_pre_comp_const_index[2] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 3);
10549
10550    const_or_pre_comp_const_index[5] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 4);
10551    const_or_pre_comp_const_index[4] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 5);
10552    assert((CRC32C_NUM_PRECOMPUTED_CONSTANTS - 1 ) == 5, "Checking whether you declared all of the constants based on the number of \"chunks\"");
10553  } else {
10554    const_or_pre_comp_const_index[0] = 1;
10555    const_or_pre_comp_const_index[1] = 0;
10556
10557    const_or_pre_comp_const_index[2] = 3;
10558    const_or_pre_comp_const_index[3] = 2;
10559
10560    const_or_pre_comp_const_index[4] = 5;
10561    const_or_pre_comp_const_index[5] = 4;
10562  }
10563  crc32c_proc_chunk(CRC32C_HIGH, const_or_pre_comp_const_index[0], const_or_pre_comp_const_index[1], is_pclmulqdq_supported,
10564                    in2, in1, in_out,
10565                    tmp1, tmp2, tmp3,
10566                    w_xtmp1, w_xtmp2, w_xtmp3,
10567                    tmp4, tmp5,
10568                    tmp6);
10569  crc32c_proc_chunk(CRC32C_MIDDLE, const_or_pre_comp_const_index[2], const_or_pre_comp_const_index[3], is_pclmulqdq_supported,
10570                    in2, in1, in_out,
10571                    tmp1, tmp2, tmp3,
10572                    w_xtmp1, w_xtmp2, w_xtmp3,
10573                    tmp4, tmp5,
10574                    tmp6);
10575  crc32c_proc_chunk(CRC32C_LOW, const_or_pre_comp_const_index[4], const_or_pre_comp_const_index[5], is_pclmulqdq_supported,
10576                    in2, in1, in_out,
10577                    tmp1, tmp2, tmp3,
10578                    w_xtmp1, w_xtmp2, w_xtmp3,
10579                    tmp4, tmp5,
10580                    tmp6);
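  // Tail handling: in1 points at the unprocessed bytes and in2 holds their
  // count; tmp1 = in1 + (in2 & ~7) bounds a 4-bytes-at-a-time loop, after
  // which the remaining (in2 & 7) bytes are folded in one at a time.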
10581  movl(tmp1, in2);
10582  andl(tmp1, 0x00000007);
10583  negl(tmp1);
10584  addl(tmp1, in2);
10585  addq(tmp1, in1);
10586
10587  BIND(L_wordByWord);
10588  cmpq(in1, tmp1);
10589  jcc(Assembler::greaterEqual, L_byteByByteProlog);
10590    crc32(in_out, Address(in1, 0), 4);
10591    addq(in1, 4);
10592    jmp(L_wordByWord);
10593
10594  BIND(L_byteByByteProlog);
10595  andl(in2, 0x00000007);
10596  movl(tmp2, 1);
10597
10598  BIND(L_byteByByte);
10599  cmpl(tmp2, in2);
10600  jccb(Assembler::greater, L_exit);
10601    crc32(in_out, Address(in1, 0), 1);
10602    incq(in1);
10603    incl(tmp2);
10604    jmp(L_byteByByte);
10605
10606  BIND(L_exit);
10607}
10608#else
10609void MacroAssembler::crc32c_ipl_alg2_alt2(Register in_out, Register in1, Register in2,
10610                                          Register tmp1, Register tmp2, Register tmp3,
10611                                          Register tmp4, Register tmp5, Register tmp6,
10612                                          XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
10613                                          bool is_pclmulqdq_supported) {
10614  uint32_t const_or_pre_comp_const_index[CRC32C_NUM_PRECOMPUTED_CONSTANTS];
10615  Label L_wordByWord;
10616  Label L_byteByByteProlog;
10617  Label L_byteByByte;
10618  Label L_exit;
10619
10620  if (is_pclmulqdq_supported) {
10621    const_or_pre_comp_const_index[1] = *(uint32_t *)StubRoutines::_crc32c_table_addr;
10622    const_or_pre_comp_const_index[0] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 1);
10623
10624    const_or_pre_comp_const_index[3] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 2);
10625    const_or_pre_comp_const_index[2] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 3);
10626
10627    const_or_pre_comp_const_index[5] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 4);
10628    const_or_pre_comp_const_index[4] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 5);
10629  } else {
10630    const_or_pre_comp_const_index[0] = 1;
10631    const_or_pre_comp_const_index[1] = 0;
10632
10633    const_or_pre_comp_const_index[2] = 3;
10634    const_or_pre_comp_const_index[3] = 2;
10635
10636    const_or_pre_comp_const_index[4] = 5;
10637    const_or_pre_comp_const_index[5] = 4;
10638  }
10639  crc32c_proc_chunk(CRC32C_HIGH, const_or_pre_comp_const_index[0], const_or_pre_comp_const_index[1], is_pclmulqdq_supported,
10640                    in2, in1, in_out,
10641                    tmp1, tmp2, tmp3,
10642                    w_xtmp1, w_xtmp2, w_xtmp3,
10643                    tmp4, tmp5,
10644                    tmp6);
10645  crc32c_proc_chunk(CRC32C_MIDDLE, const_or_pre_comp_const_index[2], const_or_pre_comp_const_index[3], is_pclmulqdq_supported,
10646                    in2, in1, in_out,
10647                    tmp1, tmp2, tmp3,
10648                    w_xtmp1, w_xtmp2, w_xtmp3,
10649                    tmp4, tmp5,
10650                    tmp6);
10651  crc32c_proc_chunk(CRC32C_LOW, const_or_pre_comp_const_index[4], const_or_pre_comp_const_index[5], is_pclmulqdq_supported,
10652                    in2, in1, in_out,
10653                    tmp1, tmp2, tmp3,
10654                    w_xtmp1, w_xtmp2, w_xtmp3,
10655                    tmp4, tmp5,
10656                    tmp6);
10657  movl(tmp1, in2);
10658  andl(tmp1, 0x00000007);
10659  negl(tmp1);
10660  addl(tmp1, in2);
10661  addl(tmp1, in1);
10662
10663  BIND(L_wordByWord);
10664  cmpl(in1, tmp1);
10665  jcc(Assembler::greaterEqual, L_byteByByteProlog);
10666    crc32(in_out, Address(in1, 0), 4);
10667    addl(in1, 4);
10668    jmp(L_wordByWord);
10669
10670  BIND(L_byteByByteProlog);
10671  andl(in2, 0x00000007);
10672  movl(tmp2, 1);
10673
10674  BIND(L_byteByByte);
10675  cmpl(tmp2, in2);
10676  jccb(Assembler::greater, L_exit);
10677    movb(tmp1, Address(in1, 0));
10678    crc32(in_out, tmp1, 1);
10679    incl(in1);
10680    incl(tmp2);
10681    jmp(L_byteByByte);
10682
10683  BIND(L_exit);
10684}
10685#endif // _LP64
10686#undef BIND
10687#undef BLOCK_COMMENT
10688
10689
10690// Compress char[] array to byte[].
10691void MacroAssembler::char_array_compress(Register src, Register dst, Register len,
10692                                         XMMRegister tmp1Reg, XMMRegister tmp2Reg,
10693                                         XMMRegister tmp3Reg, XMMRegister tmp4Reg,
10694                                         Register tmp5, Register result) {
10695  Label copy_chars_loop, return_length, return_zero, done;
10696
10697  // rsi: src
10698  // rdi: dst
10699  // rdx: len
10700  // rcx: tmp5
10701  // rax: result
10702
10703  // rsi holds start addr of source char[] to be compressed
10704  // rdi holds start addr of destination byte[]
10705  // rdx holds length
10706
10707  assert(len != result, "len and result registers must differ");
10708
10709  // save length for return
10710  push(len);
10711
10712  if (UseSSE42Intrinsics) {
10713    assert(UseSSE >= 4, "SSE4 must be enabled for SSE4.2 intrinsics to be available");
10714    Label copy_32_loop, copy_16, copy_tail;
10715
10716    movl(result, len);
10717    movl(tmp5, 0xff00ff00);   // create mask to test for Unicode chars in vectors
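    // ptest against this mask sets ZF only when the high byte of every 16-bit
    // char is zero, i.e. when all chars in the vector fit in a single byte.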
10718
10719    // vectored compression
10720    andl(len, 0xfffffff0);    // vector count (in chars)
10721    andl(result, 0x0000000f);    // tail count (in chars)
10722    testl(len, len);
10723    jccb(Assembler::zero, copy_16);
10724
10725    // compress 16 chars per iter
10726    movdl(tmp1Reg, tmp5);
10727    pshufd(tmp1Reg, tmp1Reg, 0);   // store Unicode mask in tmp1Reg
10728    pxor(tmp4Reg, tmp4Reg);
10729
10730    lea(src, Address(src, len, Address::times_2));
10731    lea(dst, Address(dst, len, Address::times_1));
10732    negptr(len);
10733
10734    bind(copy_32_loop);
10735    movdqu(tmp2Reg, Address(src, len, Address::times_2));     // load 1st 8 characters
10736    por(tmp4Reg, tmp2Reg);
10737    movdqu(tmp3Reg, Address(src, len, Address::times_2, 16)); // load next 8 characters
10738    por(tmp4Reg, tmp3Reg);
10739    ptest(tmp4Reg, tmp1Reg);       // check for Unicode chars in next vector
10740    jcc(Assembler::notZero, return_zero);
10741    packuswb(tmp2Reg, tmp3Reg);    // only LATIN1 chars; compress each to 1 byte
10742    movdqu(Address(dst, len, Address::times_1), tmp2Reg);
10743    addptr(len, 16);
10744    jcc(Assembler::notZero, copy_32_loop);
10745
10746    // compress next vector of 8 chars (if any)
10747    bind(copy_16);
10748    movl(len, result);
10749    andl(len, 0xfffffff8);    // vector count (in chars)
10750    andl(result, 0x00000007);    // tail count (in chars)
10751    testl(len, len);
10752    jccb(Assembler::zero, copy_tail);
10753
10754    movdl(tmp1Reg, tmp5);
10755    pshufd(tmp1Reg, tmp1Reg, 0);   // store Unicode mask in tmp1Reg
10756    pxor(tmp3Reg, tmp3Reg);
10757
10758    movdqu(tmp2Reg, Address(src, 0));
10759    ptest(tmp2Reg, tmp1Reg);       // check for Unicode chars in vector
10760    jccb(Assembler::notZero, return_zero);
10761    packuswb(tmp2Reg, tmp3Reg);    // only LATIN1 chars; compress each to 1 byte
10762    movq(Address(dst, 0), tmp2Reg);
10763    addptr(src, 16);
10764    addptr(dst, 8);
10765
10766    bind(copy_tail);
10767    movl(len, result);
10768  }
10769  // compress 1 char per iter
10770  testl(len, len);
10771  jccb(Assembler::zero, return_length);
10772  lea(src, Address(src, len, Address::times_2));
10773  lea(dst, Address(dst, len, Address::times_1));
10774  negptr(len);
10775
10776  bind(copy_chars_loop);
10777  load_unsigned_short(result, Address(src, len, Address::times_2));
10778  testl(result, 0xff00);      // check if Unicode char
10779  jccb(Assembler::notZero, return_zero);
10780  movb(Address(dst, len, Address::times_1), result);  // LATIN1 char; compress to 1 byte
10781  increment(len);
10782  jcc(Assembler::notZero, copy_chars_loop);
10783
10784  // if compression succeeded, return length
10785  bind(return_length);
10786  pop(result);
10787  jmpb(done);
10788
10789  // if compression failed, return 0
10790  bind(return_zero);
10791  xorl(result, result);
10792  addptr(rsp, wordSize);
10793
10794  bind(done);
10795}
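
// A minimal scalar sketch of the contract implemented above (names here are
// illustrative only): the stub returns the original length when every char
// fits in one byte, and 0 as soon as a char with a non-zero high byte is seen.
//
//   static int char_compress_ref(const jchar* src, jbyte* dst, int len) {
//     for (int i = 0; i < len; i++) {
//       if (src[i] & 0xff00) return 0;     // char does not fit in a byte: fail
//       dst[i] = (jbyte)src[i];            // compress char to one byte
//     }
//     return len;
//   }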
10796
10797// Inflate byte[] array to char[].
10798void MacroAssembler::byte_array_inflate(Register src, Register dst, Register len,
10799                                        XMMRegister tmp1, Register tmp2) {
10800  Label copy_chars_loop, done;
10801
10802  // rsi: src
10803  // rdi: dst
10804  // rdx: len
10805  // rcx: tmp2
10806
10807  // rsi holds start addr of source byte[] to be inflated
10808  // rdi holds start addr of destination char[]
10809  // rdx holds length
10810  assert_different_registers(src, dst, len, tmp2);
10811
10812  if (UseSSE42Intrinsics) {
10813    assert(UseSSE >= 4, "SSE4 must be enabled for SSE4.2 intrinsics to be available");
10814    Label copy_8_loop, copy_bytes, copy_tail;
10815
10816    movl(tmp2, len);
10817    andl(tmp2, 0x00000007);   // tail count (in chars)
10818    andl(len, 0xfffffff8);    // vector count (in chars)
10819    jccb(Assembler::zero, copy_tail);
10820
10821    // vectored inflation
10822    lea(src, Address(src, len, Address::times_1));
10823    lea(dst, Address(dst, len, Address::times_2));
10824    negptr(len);
10825
10826    // inflate 8 chars per iter
10827    bind(copy_8_loop);
10828    pmovzxbw(tmp1, Address(src, len, Address::times_1));  // unpack to 8 words
10829    movdqu(Address(dst, len, Address::times_2), tmp1);
10830    addptr(len, 8);
10831    jcc(Assembler::notZero, copy_8_loop);
10832
10833    bind(copy_tail);
10834    movl(len, tmp2);
10835
10836    cmpl(len, 4);
10837    jccb(Assembler::less, copy_bytes);
10838
10839    movdl(tmp1, Address(src, 0));  // load 4 byte chars
10840    pmovzxbw(tmp1, tmp1);
10841    movq(Address(dst, 0), tmp1);
10842    subptr(len, 4);
10843    addptr(src, 4);
10844    addptr(dst, 8);
10845
10846    bind(copy_bytes);
10847  }
10848  testl(len, len);
10849  jccb(Assembler::zero, done);
10850  lea(src, Address(src, len, Address::times_1));
10851  lea(dst, Address(dst, len, Address::times_2));
10852  negptr(len);
10853
10854  // inflate 1 char per iter
10855  bind(copy_chars_loop);
10856  load_unsigned_byte(tmp2, Address(src, len, Address::times_1));  // load byte char
10857  movw(Address(dst, len, Address::times_2), tmp2);  // inflate byte char to word
10858  increment(len);
10859  jcc(Assembler::notZero, copy_chars_loop);
10860
10861  bind(done);
10862}
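
// Equivalent scalar behaviour, as an illustrative sketch: each byte is
// zero-extended to a 16-bit char, which is what pmovzxbw does eight at a time
// on the vectored path above.
//
//   static void byte_inflate_ref(const jbyte* src, jchar* dst, int len) {
//     for (int i = 0; i < len; i++) {
//       dst[i] = (jchar)(src[i] & 0xff);   // zero-extend byte to char
//     }
//   }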
10863
10864
10865Assembler::Condition MacroAssembler::negate_condition(Assembler::Condition cond) {
10866  switch (cond) {
10867    // Note some conditions are synonyms for others
10868    case Assembler::zero:         return Assembler::notZero;
10869    case Assembler::notZero:      return Assembler::zero;
10870    case Assembler::less:         return Assembler::greaterEqual;
10871    case Assembler::lessEqual:    return Assembler::greater;
10872    case Assembler::greater:      return Assembler::lessEqual;
10873    case Assembler::greaterEqual: return Assembler::less;
10874    case Assembler::below:        return Assembler::aboveEqual;
10875    case Assembler::belowEqual:   return Assembler::above;
10876    case Assembler::above:        return Assembler::belowEqual;
10877    case Assembler::aboveEqual:   return Assembler::below;
10878    case Assembler::overflow:     return Assembler::noOverflow;
10879    case Assembler::noOverflow:   return Assembler::overflow;
10880    case Assembler::negative:     return Assembler::positive;
10881    case Assembler::positive:     return Assembler::negative;
10882    case Assembler::parity:       return Assembler::noParity;
10883    case Assembler::noParity:     return Assembler::parity;
10884  }
10885  ShouldNotReachHere(); return Assembler::overflow;
10886}
10887
10888SkipIfEqual::SkipIfEqual(
10889    MacroAssembler* masm, const bool* flag_addr, bool value) {
10890  _masm = masm;
10891  _masm->cmp8(ExternalAddress((address)flag_addr), value);
10892  _masm->jcc(Assembler::equal, _label);
10893}
10894
10895SkipIfEqual::~SkipIfEqual() {
10896  _masm->bind(_label);
10897}
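
// Typical use: the code emitted between construction of the SkipIfEqual and
// the end of its scope is jumped over at run time whenever the byte at
// flag_addr equals 'value' (the flag name below is purely illustrative):
//
//   {
//     SkipIfEqual skip(masm, &SomeDevelopFlag, false);
//     // ... instructions emitted here execute only when SomeDevelopFlag is true
//   }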
10898
10899// 32-bit Windows has its own fast-path implementation
10900// of get_thread
10901#if !defined(WIN32) || defined(_LP64)
10902
10903// This is simply a call to Thread::current()
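// All registers the C calling convention may clobber (the caller-saved ones)
// are preserved around the call so that values the caller keeps in them survive.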
10904void MacroAssembler::get_thread(Register thread) {
10905  if (thread != rax) {
10906    push(rax);
10907  }
10908  LP64_ONLY(push(rdi);)
10909  LP64_ONLY(push(rsi);)
10910  push(rdx);
10911  push(rcx);
10912#ifdef _LP64
10913  push(r8);
10914  push(r9);
10915  push(r10);
10916  push(r11);
10917#endif
10918
10919  MacroAssembler::call_VM_leaf_base(CAST_FROM_FN_PTR(address, Thread::current), 0);
10920
10921#ifdef _LP64
10922  pop(r11);
10923  pop(r10);
10924  pop(r9);
10925  pop(r8);
10926#endif
10927  pop(rcx);
10928  pop(rdx);
10929  LP64_ONLY(pop(rsi);)
10930  LP64_ONLY(pop(rdi);)
10931  if (thread != rax) {
10932    mov(thread, rax);
10933    pop(rax);
10934  }
10935}
10936
10937#endif
10938